In [1]:
import time
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# flag to determine whether to use TF or TF-IDF
idf = True

In [3]:
# loads data (all of Jerry, George, Elaine, and Kramer's lines)
df = pd.read_csv('jgek_lines.csv', index_col=0)

In [4]:
# intitalizes nltk stopword list
stop = stopwords.words('english')

In [5]:
# initializes CountVectorizer or TfidfVectorizer
if idf:
    vectorizer = TfidfVectorizer(stop_words=stop)
else:
    vectorizer = CountVectorizer(stop_words=stop)

In [6]:
# fits vectorizer to all data for use with classification
tf_vec = vectorizer.fit_transform(df['line'])
# takes only arrays from tf_vec to make classification simpler
tf = tf_vec.toarray()

In [7]:
# creates list of which character said which line in order
target = list(df['character'])

In [8]:
# splits term frequency lists and character labels into training and test data
tf_train, tf_test, target_train, target_test = train_test_split(tf, target, test_size=0.2, random_state=0)

In [9]:
start_time = time.time()
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
rf_classifier.fit(tf_train, target_train) 
end_time = time.time()
print(end_time - start_time)

894.0180718898773


In [10]:
target_pred = rf_classifier.predict(tf_test)

In [11]:
print(confusion_matrix(target_test,target_pred))

[[ 226  181 1000  126]
 [ 143  401 1106  157]
 [ 244  364 2018  172]
 [ 110  163  769  279]]


In [12]:
print(classification_report(target_test,target_pred))

              precision    recall  f1-score   support

      ELAINE       0.31      0.15      0.20      1533
      GEORGE       0.36      0.22      0.28      1807
       JERRY       0.41      0.72      0.52      2798
      KRAMER       0.38      0.21      0.27      1321

    accuracy                           0.39      7459
   macro avg       0.37      0.33      0.32      7459
weighted avg       0.37      0.39      0.35      7459



In [13]:
print(accuracy_score(target_test, target_pred))

0.3920096527684676


In [14]:
(vectorizer.vocabulary_)

{'know': 6832,
 'one': 8557,
 'single': 11237,
 'enjoyable': 4147,
 'experiences': 4347,
 'life': 7123,
 'people': 9037,
 'ever': 4265,
 'hear': 5670,
 'talking': 12293,
 'go': 5234,
 'whole': 13617,
 'thing': 12496,
 'home': 5882,
 'person': 9091,
 'tryin': 12856,
 'find': 4633,
 'us': 13144,
 'ring': 10390,
 'tell': 12393,
 'going': 5251,
 'must': 8136,
 'gone': 5260,
 'wanna': 13422,
 'see': 10851,
 'button': 1829,
 'worst': 13786,
 'possible': 9463,
 'spot': 11652,
 'second': 10836,
 'literally': 7198,
 'makes': 7447,
 'breaks': 1622,
 'shirt': 11058,
 'look': 7263,
 'high': 5793,
 'man': 7464,
 'land': 6908,
 'like': 7143,
 'live': 7206,
 'mother': 8043,
 'course': 2935,
 'try': 12855,
 'buy': 1832,
 'yes': 13908,
 'purple': 9790,
 'liked': 7144,
 'actually': 316,
 'recall': 10011,
 'considering': 2756,
 'buttons': 1830,
 'oh': 8513,
 'uh': 12956,
 'time': 12581,
 'well': 13537,
 'senator': 10893,
 'knew': 6815,
 'sure': 12122,
 'decaf': 3266,
 'orange': 8627,
 'indicator': 6228,
