In [1]:
import time
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# flag to determine whether to use TF or TF-IDF
idf = True

In [3]:
# loads data (all of Jerry, George, Elaine, and Kramer's lines)
df = pd.read_csv('jgek_lines.csv', index_col=0)

In [4]:
# intitalizes nltk stopword list
stop = stopwords.words('english')

In [5]:
# initializes CountVectorizer or TfidfVectorizer
if idf:
    vectorizer = TfidfVectorizer(stop_words=stop)
else:
    vectorizer = CountVectorizer(stop_words=stop)

In [6]:
# fits vectorizer to all data for use with classification
tf_vec = vectorizer.fit_transform(df['line'])
# takes only arrays from tf_vec to make classification simpler
tf = tf_vec.toarray()

In [7]:
# creates list of which character said which line in order
target = list(df['character'])

In [8]:
# splits term frequency lists and character labels into training and test data
tf_train, tf_test, target_train, target_test = train_test_split(tf, target, test_size=0.2, random_state=0)

In [9]:
start_time = time.time()
sv_classifier = svm.SVC()
sv_classifier.fit(tf_train, target_train) 
end_time = time.time()
print(end_time - start_time)

24477.415349006653


In [10]:
target_pred = sv_classifier.predict(tf_test)

In [11]:
print(confusion_matrix(target_test,target_pred))

[[ 177  120 1167   69]
 [  70  342 1306   89]
 [ 117  230 2343  108]
 [  59  115  921  226]]


In [12]:
print(classification_report(target_test,target_pred))

              precision    recall  f1-score   support

      ELAINE       0.42      0.12      0.18      1533
      GEORGE       0.42      0.19      0.26      1807
       JERRY       0.41      0.84      0.55      2798
      KRAMER       0.46      0.17      0.25      1321

    accuracy                           0.41      7459
   macro avg       0.43      0.33      0.31      7459
weighted avg       0.42      0.41      0.35      7459



In [13]:
print(accuracy_score(target_test, target_pred))

0.41399651427805334
