In [1]:
#Predict which main character spoke a given line, using generated TF-IDF values.
#Note that I opted to use scikit-learn's TF-IDF functionality (together with NLTK's stop words and lemmatizer) over my implemented solution,
#partly because I wanted to be more familiar with it but also because it can be easily combined with the rest of scikit-learn.

In [2]:
import string
import re
import time

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords

In [3]:
from nltk.stem import WordNetLemmatizer

#Helper to clean the line of white spaces, etc.
def clean_line(line):
    """Normalize one transcript line into lowercase, lemmatized, space-separated words.

    Parameters
    ----------
    line : str
        Raw text of a spoken line. Non-string values (e.g. None, or the float
        NaN that pandas uses for empty cells) are treated as an empty line.

    Returns
    -------
    str
        The lemmatized words joined by single spaces, or '' if no words remain.
    """
    #Missing cells arrive as NaN/None; re.sub would raise TypeError on them
    if not isinstance(line, str):
        return ''

    #\W turns every non-word character (punctuation and whitespace alike) into a space.
    #Underscore is the only punctuation mark that counts as a word character, so it is
    #the only one still present afterwards; it is deleted rather than replaced by a space.
    output = re.sub(r'\W', ' ', line).replace('_', '').lower()

    #split() collapses runs of spaces and trims both ends
    words = output.split()
    if not words:
        return ''

    #Lemmatize the words
    stemmer = WordNetLemmatizer()
    return ' '.join(stemmer.lemmatize(word) for word in words)

In [4]:
#Main cast; a name's position in this list is the integer label used for that character
main_chars = [
    "Leslie Knope",
    "Tom Haverford",
    "Ron Swanson",
    "April Ludgate",
    "Andy Dwyer",
    "Ben Wyatt",
    "Chris Traeger",
    "Jerry Gergich",
    "Donna Meagle",
]

#Load the transcript into DataFrame, remove empty lines and lines acting as episode labels; accepts a boolean for
#whether we want to isolate the data and predictions to only the main characters.
def gather_data(main_only = True, path = 'data/p_r_scripts.csv'):
    """Load the transcript CSV and return it with cleaned lines and integer speaker labels.

    Parameters
    ----------
    main_only : bool
        If True, keep only rows spoken by a character in ``main_chars`` and label
        each row with that character's position in ``main_chars``. If False, keep
        every speaker and label them 0..k-1 in order of first appearance.
    path : str
        Location of the CSV; it must provide 'Character' and 'Line' columns.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with 'Line' passed through ``clean_line`` and
        'Character' replaced by its integer label.
    """
    print("Cleaning data...")
    #Forward slash works on every platform; the old 'data\p...' literal only worked on Windows
    full_trans = pd.read_csv(path)
    #Rows without a speaker are episode labels/blank lines; rows without text cannot be cleaned
    full_trans = full_trans.dropna(subset=['Character', 'Line'])

    #Filter out other people's lines if we only want the main characters of the show
    if main_only:
        full_trans = full_trans.loc[full_trans.Character.isin(main_chars)]

    #Work on an independent copy so the column assignments below do not target a slice
    full_trans = full_trans.copy()
    full_trans['Line'] = full_trans['Line'].apply(clean_line)

    #Convert the characters to integers (values will vary depending on whether or not to include others)
    if main_only:
        label_of = {name: idx for idx, name in enumerate(main_chars)}
        full_trans['Character'] = full_trans['Character'].map(label_of)
    else:
        #unique() returns a NumPy array, which has no .index(); factorize yields the same
        #first-appearance numbering directly
        codes, _ = pd.factorize(full_trans['Character'])
        full_trans['Character'] = codes

    print("Data cleaned.")
    return full_trans

In [5]:
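#Vectorize the lines with TF-IDF and split them into train/test sets (the model itself is trained in the cells below)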
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#Fixed seed so the train/test split (and therefore the reported scores) can be reproduced
SPLIT_SEED = 42

data = gather_data(True)
lines = data.Line.values
speakers = data.Character.values

#NLTK's stop-word file ids are lowercase; 'English' may only resolve on case-insensitive file systems
tfidf_convert = TfidfVectorizer(min_df = 2, max_df = 1.0, stop_words = stopwords.words('english'))
#NOTE(review): the vectorizer is fitted on every line before the split, so test rows influence
#the vocabulary and IDF weights -- consider fitting on the training lines only.
X = tfidf_convert.fit_transform(lines).toarray()

#Separate into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, speakers, test_size = 0.2, random_state = SPLIT_SEED)

#get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
if hasattr(tfidf_convert, 'get_feature_names_out'):
    feature_names = list(tfidf_convert.get_feature_names_out())
else:
    feature_names = tfidf_convert.get_feature_names()
#Show the vocabulary size and a sample rather than dumping every term
print(len(feature_names), "features, e.g.:", feature_names[:20])

Cleaning data...
Data cleaned.


In [6]:
#Different classifier functions to try

#Returns a classifier trained.
def random_forest(n_est=10, max_d=2, random_state=None):
    """Fit a random forest on the notebook's training split and report the fitting time.

    Reads the module-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    n_est : int
        Number of trees (passed as ``n_estimators``).
    max_d : int or None
        Maximum depth of each tree (passed as ``max_depth``).
    random_state : int or None
        Seed forwarded to the classifier; None keeps the previous unseeded behaviour.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    #Imported before the timer starts so the reported figure covers building and fitting only
    from sklearn.ensemble import RandomForestClassifier

    #perf_counter is a monotonic clock meant for measuring elapsed time
    start = time.perf_counter()
    classifier = RandomForestClassifier(n_estimators=n_est, max_depth=max_d, random_state=random_state)
    classifier.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    print(str(n_est) + " estimators and " + str(max_d) + " Max depth takes: " + str(elapsed) + " seconds.")
    return classifier


#Gets a prediction based on the model requested
def predict(kind='random forest', line=None, model=None):
    """Predict which main character spoke a line.

    Parameters
    ----------
    kind : str
        Which classifier to use; only 'random forest' is implemented.
    line : str or None
        The line to classify. If None, the user is prompted for one.
    model : fitted classifier or None
        A previously trained model to reuse. If None, a new random forest with the
        default settings is trained via ``random_forest()``.

    Returns
    -------
    str
        The predicted speaker's name, looked up in ``main_chars``.

    Raises
    ------
    ValueError
        If ``kind`` is not a supported model name.
    """
    #Reject unknown kinds before prompting, instead of silently returning None afterwards
    if kind != 'random forest':
        raise ValueError("Unsupported model kind: " + repr(kind))

    if line is None:
        line = input("Enter a line: ")

    #Apply the same cleaning the training lines received so the features are comparable
    features = tfidf_convert.transform([clean_line(str(line))])

    if model is None:
        model = random_forest()
    prediction = model.predict(features)
    return main_chars[prediction[0]]

In [7]:
#Test the different methods
from sklearn.metrics import accuracy_score

#Fit a 20-tree, depth-7 forest and label the held-out lines
y_pred = random_forest(n_est=20, max_d=7).predict(X_test)

#Was label 1 (the second entry of main_chars) ever predicted?
print(bool((y_pred == 1).any()))
#Fraction of held-out lines attributed to the correct speaker
print(accuracy_score(y_true=y_test, y_pred=y_pred))

20 estimators and 7 Max depth takes: 6.625388145446777 seconds.
False
0.36475869809203143


In [8]:
#Prompt for a line and show the predicted speaker; with no arguments predict() fits a
#default random forest before answering
predict()

Enter a line: Testing line
10 estimators and 2 Max depth takes: 1.8779897689819336 seconds.


'Leslie Knope'