In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import featuretools as ft


Read in the data from the CSV file

In [3]:
shake = pd.read_csv("./data/raw/Shakespeare_data.csv")
shake

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,"


Drop the useless Column of data

In [4]:
df_shake = shake.drop(columns="Dataline")
df_shake['Player'].replace(np.nan, 'No Character', inplace = True)

Gets list of unique players

In [5]:
df_PlayerNumber = df_shake.groupby('Player').nunique()

Get the list of times the player talks

In [6]:
val = shake['Player'].value_counts()
val

GLOUCESTER         1920
HAMLET             1582
IAGO               1161
FALSTAFF           1117
KING HENRY V       1086
                   ... 
Mariners              1
First Murder          1
VAUGHAN               1
Second Knight         1
Second murderer       1
Name: Player, Length: 934, dtype: int64

In [7]:
df = val.rename_axis('Player').reset_index(name='Number of Appearances')
df

Unnamed: 0,Player,Number of Appearances
0,GLOUCESTER,1920
1,HAMLET,1582
2,IAGO,1161
3,FALSTAFF,1117
4,KING HENRY V,1086
5,BRUTUS,1051
6,OTHELLO,928
7,MARK ANTONY,927
8,KING HENRY VI,917
9,DUKE VINCENTIO,909


In [8]:
play_grouping = df_shake.groupby(['Play','Player' ]).count()
play_grouping

Unnamed: 0_level_0,Unnamed: 1_level_0,PlayerLinenumber,ActSceneLine,PlayerLine
Play,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Comedy of Errors,ADRIANA,284,276,284
A Comedy of Errors,AEGEON,150,147,150
A Comedy of Errors,AEMELIA,75,73,75
A Comedy of Errors,ANGELO,99,96,99
A Comedy of Errors,ANTIPHOLUS,6,6,6
A Comedy of Errors,BALTHAZAR,31,31,31
A Comedy of Errors,Courtezan,43,40,43
A Comedy of Errors,DROMIO OF EPHESUS,191,187,191
A Comedy of Errors,DROMIO OF SYRACUSE,323,314,323
A Comedy of Errors,DUKE SOLINUS,97,93,97


Imports for both the decision tree and the random forest implmentation

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn import metrics 

So prior to being able to run the classification things, you need to make sure that the processor is able to take in the values. In this case it only takes floats so we need to turn all of our strings into floats by using the label encoder.

In [10]:
le = preprocessing.LabelEncoder()
le.fit(df_shake['Player'])
df_shake['Player'] = le.transform(df_shake['Player'])
le.fit(df_shake['Play'])
df_shake['Play'] = le.transform(df_shake['Play'])

df_shake['ActSceneLine'] = df_shake['ActSceneLine'].astype(str)

le.fit(df_shake['ActSceneLine'])
df_shake['ActSceneLine'] = le.transform(df_shake['ActSceneLine'])

This takes in the X and y classification that we will be using and splitting it into training and testing sets. We will be using Play and ActSceneLine to determine the player.



In [11]:
X= df_shake[['Play', 'ActSceneLine']]
y= df_shake['Player']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Takes in the training and testing set to predict the outcome from our testing set. The accuracy we get returned averages around 60%. The decision tree is quick, but often leads to overfitting of the data.

In [12]:
decision_tree = DecisionTreeClassifier()
clf = decision_tree.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6038599640933573


Like the Decision tree the random forest classifcation takes in the training and testing set to predict the outcome from our testing set. The accuracy we get returned averages around 60%. The random forest classification is again very accurate and fast, but falls into the trap of scaling and the larger the number of trees the slower the classifcation, our dataset begins to show growing pains.

In [14]:
my_model = RandomForestClassifier()
my_model.fit(X_train, y_train)
y_pred = my_model.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))



MemoryError: could not allocate 483393536 bytes

All in all we were able to take the shakespeare play and create some feature engineering models to help establish future ways to explain our data. If I had to add one more model, I would have done a way to analyze the playerline in the equation. I think adding that part can lead to an accuracy of over 90%