In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


In [77]:
import pickle

with open("df_lines_per_decade.pkl", "rb") as file:
    df_lines_per_decade = pickle.load(file)


In [79]:
df_lines_per_decade
# concatenate text
df_lines_per_decade.groupby('meta.release_decade')['text'].apply(' '.join).reset_index()

Unnamed: 0,meta.release_decade,text
0,192,-- but don't you remember? i'm already dead. ...
1,193,"yeah - sure! yes - sure if she can't pay, i'l..."
2,194,"-- he's just a lucky guy. monsieur rick, i -- ..."
3,195,and his 'egghead' son! we'll give 'em a <u>ro...
4,196,we're trying to get there. i hope we can. cont...
5,197,"not at all, sir. i have a pair of good pistol..."
6,198,"great, just great. that we do. and we put air ..."
7,199,they do not! they do to! i hope so. she okay? ...
8,200,"officers, there's your killer, do your duty, a..."
9,201,no!! i can prove it to you. i'll take you to...


In [81]:
# find None lines
df_lines_per_decade[df_lines_per_decade['text'].isnull()]

# drop None lines which would cause errors
df_lines_per_decade = df_lines_per_decade[df_lines_per_decade['text'].notnull()]

# concatenate text
df_lines_per_decade.groupby('meta.release_decade')['text'].apply(' '.join).reset_index()

Unnamed: 0,meta.release_decade,text
0,192,-- but don't you remember? i'm already dead. ...
1,193,"yeah - sure! yes - sure if she can't pay, i'l..."
2,194,"-- he's just a lucky guy. monsieur rick, i -- ..."
3,195,and his 'egghead' son! we'll give 'em a <u>ro...
4,196,we're trying to get there. i hope we can. cont...
5,197,"not at all, sir. i have a pair of good pistol..."
6,198,"great, just great. that we do. and we put air ..."
7,199,they do not! they do to! i hope so. she okay? ...
8,200,"officers, there's your killer, do your duty, a..."
9,201,no!! i can prove it to you. i'll take you to...


In [None]:
df_lines_per_decade = df_lines_per_decade[df_lines_per_decade['meta.release_decade'].isin(['193','194','195','196','197','198','199','200'])]

In [72]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_lines_per_decade, test_size=0.33, random_state=42)

In [73]:
print('text sample:', train['text'].iloc[0])
print('release_decade:', train['meta.release_decade'].iloc[0])
print('Training Data Shape:', train.shape)
print('Testing Data Shape:', test.shape)

text sample: um... harry?  did i ever keep the books here?
release_decade: 200
Training Data Shape: (203385, 7)
Testing Data Shape: (100176, 7)


In [74]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import string
import re
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

In [76]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

my_stop_words = "well i'm"
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS) + my_stop_words.split()) 
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”", "--"]

In [78]:
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text

def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    return tokens

In [80]:
def printNMostInformative(vectorizer, clf, N):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)

vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
clf = LinearSVC()
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# data
train1 = train['text'].tolist()
labelsTrain1 = train['meta.release_decade'].tolist()

test1 = test['text'].tolist()
labelsTest1 = test['meta.release_decade'].tolist()
# train
pipe.fit(train1, labelsTrain1)

# test
preds = pipe.predict(test1)
print("accuracy:", accuracy_score(labelsTest1, preds))
print("Top 10 features used to predict: ")

printNMostInformative(vectorizer, clf, 10)

pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
transform = pipe.fit_transform(train1, labelsTrain1)
vocab = vectorizer.get_feature_names()

for i in range(len(train1)):
    s = ""
    indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]
    numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]
    for idx, num in zip(indexIntoVocab, numOccurences):
        s += str((vocab[idx], num))

accuracy: 0.4680961507746366
Top 10 features used to predict: 
Class 1 best: 
(-1.7834419219227404, 'musta')
(-1.3919580095526651, 'wade')
(-1.216202029583079, 'commissioner')
(-1.2043261637862812, 'rick')
(-1.1737497409992987, 'bein')
(-1.1611666443731221, 'ed')
(-1.1406432553760812, 'dalai')
(-1.1333215366132756, 'fortress')
(-1.126606302715779, 'venezuela')
(-1.1105454941127495, 'johnny')
Class 2 best: 
(1.979610617824478, 'conway')
(1.930361604287997, 'dickson')
(1.9078225481139643, 'ninotchka')
(1.8909517459200749, 'studsy')
(1.876728927496195, 'kringelein')
(1.8752636304946455, 'wynant')
(1.8574730878414625, 'anything\x97')
(1.8316175188791801, 'schuyler')
(1.8268551908662585, 'preysing')
(1.825137314736557, 'swana')


In [82]:
from sklearn import metrics
print(metrics.classification_report(labelsTest1, preds, 
                                    target_names=df_lines_per_decade['meta.release_decade'].unique()))

ValueError: Number of classes, 8, does not match size of target_names, 10. Try specifying the labels parameter