In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, plot_confusion_matrix
import matplotlib.pyplot as plt
import os

In [2]:
# Read the title bigrams (with their POS tags and stringified word vectors)
# and preview the first rows.
titles_csv = '../data/bigram_titles_pos_w2v.csv'
bitles = pd.read_csv(titles_csv)
bitles.head()

Unnamed: 0,title,good_title,pos,pos1,pos2,title_token,word_vec
0,mingled yarn,1,"['VBN', 'NN']",VBN,NN,"['mingled', 'yarn']",[[-2.34753769e-02 -3.81344706e-02 5.21896631e...
1,stiff news,1,"['JJ', 'NN']",JJ,NN,"['stiff', 'news']",[[-1.87895857e-02 -9.64995101e-02 1.73002873e...
2,salad days,1,"['JJ', 'NNS']",JJ,NNS,"['salad', 'days']",[[ 3.79228336e-03 -1.64979380e-02 1.30106723e...
3,gaudy night,1,"['NN', 'NN']",NN,NN,"['gaudy', 'night']",[[-1.18637476e-02 -2.42617801e-02 2.91478410e...
4,immortal longings,1,"['JJ', 'NNS']",JJ,NNS,"['immortal', 'longings']",[[-6.05380274e-02 -7.34784156e-02 8.35435167e...


In [3]:
# Read the Shakespeare bigrams and look at five randomly chosen rows.
shakespeare_csv = '../data/shakespearean_bigrams_pos_w2v.csv'
bishake = pd.read_csv(shakespeare_csv)
bishake.sample(n=5)

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token,word_vec
197210,think'st it,"['NN', 'PRP']",2,NN,PRP,"[""think'st"", 'it']",[[-7.71375895e-02 -1.30776465e-01 1.00374825e...
166509,sharp-quill 'd,"['JJ', 'MD']",2,JJ,MD,"['sharp-quill', ""'d""]",[[ 3.35064740e-03 -1.73842418e-03 -7.01316632e...
28676,bold and,"['NN', 'CC']",2,NN,CC,"['bold', 'and']",[[-1.28817648e-01 -3.42830151e-01 -2.81210333e...
225,? loved,"['.', 'VBD']",2,.,VBD,"['?', 'loved']",[[-1.07928395e-01 1.39279291e-01 -2.83154815e...
161677,savage bull,"['NN', 'NN']",2,NN,NN,"['savage', 'bull']",[[-2.93833390e-02 -9.46322158e-02 1.45298153e...


I need to be very careful with the one-hot encoding because the title and Shakespeare datasets have different POS tags. I'll need to filter out all the POS tags in the Shakespeare dataset that do not appear in the titles dataset.

In [4]:
# Distinct POS tags found in each position of the title bigrams.
all_bitles_pos1 = set(bitles['pos1'].tolist())
all_bitles_pos2 = set(bitles['pos2'].tolist())

for message, tags in (
    ('Number of pos tags in titles pos1:', all_bitles_pos1),
    ('Number of pos tags in titles pos2:', all_bitles_pos2),
):
    print(message, len(tags))

Number of pos tags in titles pos1: 24
Number of pos tags in titles pos2: 27


In [5]:
# Distinct POS tags found in each position of the Shakespeare bigrams.
all_bishake_pos1 = set(bishake['pos1'].tolist())
all_bishake_pos2 = set(bishake['pos2'].tolist())

for message, tags in (
    ('Number of pos tags in shakespeare pos1:', all_bishake_pos1),
    ('Number of pos tags in shakespeare pos2:', all_bishake_pos2),
):
    print(message, len(tags))

Number of pos tags in shakespeare pos1: 32
Number of pos tags in shakespeare pos2: 36


In [6]:
# First-position tags that occur in Shakespeare but never in the titles.
pos1_rm = list(all_bishake_pos1 - all_bitles_pos1)
pos1_rm

['TO', 'UH', 'RP', 'EX', 'FW', 'WP$', 'PDT', 'RBS', '.']

In [7]:
# Second-position tags that occur in Shakespeare but never in the titles.
pos2_rm = list(all_bishake_pos2 - all_bitles_pos2)
pos2_rm

['POS', 'RP', 'EX', 'WP$', 'PDT', '""', '``', 'JJS', 'FW']

In [8]:
print('Starting number of bigrams', len(bishake))

# Keep only the bigrams whose first tag is not listed in pos1_rm.
keep_pos1 = ~bishake['pos1'].isin(pos1_rm)
bishake = bishake.loc[keep_pos1]
len(bishake)

Starting number of bigrams 236631


232924

In [9]:
print('Starting number of bigrams', len(bishake))

# Keep only the bigrams whose second tag is not listed in pos2_rm.
keep_pos2 = ~bishake['pos2'].isin(pos2_rm)
bishake = bishake.loc[keep_pos2]
len(bishake)

Starting number of bigrams 232924


228508

In [10]:
# Recompute the Shakespeare tag sets from the current (row-filtered) frame.
all_bishake_pos1 = set(bishake['pos1'].tolist())
all_bishake_pos2 = set(bishake['pos2'].tolist())

for message, tags in (
    ('Number of pos tags in shakespeare pos1:', all_bishake_pos1),
    ('Number of pos tags in shakespeare pos2:', all_bishake_pos2),
):
    print(message, len(tags))

Number of pos tags in shakespeare pos1: 23
Number of pos tags in shakespeare pos2: 27


In [11]:
# First-position tags that occur in the titles but not in the Shakespeare bigrams.
pos1_rm2 = list(all_bitles_pos1 - all_bishake_pos1)
pos1_rm2

['POS']

In [12]:
# Show the title rows whose first-position tag is listed in pos1_rm2.
# Uses the computed list instead of a hard-coded tag, so the view stays
# correct if the set of unmatched tags changes.
bitles[bitles['pos1'].isin(pos1_rm2)]


Unnamed: 0,title,good_title,pos,pos1,pos2,title_token,word_vec
123,'s kingdom,0,"['POS', 'NN']",POS,NN,"[""'s"", 'kingdom']",[[ 5.33072054e-01 4.56044883e-01 8.39015186e...


In [13]:
# Remove the title rows whose first-position tag is listed in pos1_rm2.
# Filtering on the tag (as done for bishake) instead of dropping index label
# 123 in place keeps this cell re-runnable -- a second in-place drop of the
# same label raises KeyError -- and does not depend on the row order.
bitles = bitles[~bitles['pos1'].isin(pos1_rm2)]

POS tags synced! Now to reformat the `word_vec` column from a string to an array

In [14]:
# Turn each stringified vector in `word_vec` into a flat NumPy array: strip the
# newlines and square brackets, squeeze doubled spaces, then let NumPy split
# the remaining text on whitespace.
result1 = bitles['word_vec'].apply(
    lambda vec_text: np.fromstring(
        vec_text.replace('\n', '')
                .replace('[', '')
                .replace(']', '')
                .replace('  ', ' '),
        sep=' ',
    )
)
# Positional access: does not rely on a row labelled 0 still being in the index.
len(result1.iloc[0])

400

In [15]:
# Attach the parsed vectors to the titles frame and spot-check one random row.
bitles['result'] = result1
bitles.sample(n=1)

Unnamed: 0,title,good_title,pos,pos1,pos2,title_token,word_vec,result
156,his qualities,0,"['PRP$', 'NNS']",PRP$,NNS,"['his', 'qualities']",[[-4.69330192e-01 1.26165497e+00 7.18420982e...,"[-0.469330192, 1.26165497, 0.718420982, 0.5546..."


In [16]:
# Turn each stringified vector in `word_vec` into a flat NumPy array: strip the
# newlines and square brackets, squeeze doubled spaces, then let NumPy split
# the remaining text on whitespace.
result2 = bishake['word_vec'].apply(
    lambda vec_text: np.fromstring(
        vec_text.replace('\n', '')
                .replace('[', '')
                .replace(']', '')
                .replace('  ', ' '),
        sep=' ',
    )
)
# bishake has had rows filtered out, so a row labelled 0 is not guaranteed to
# exist; take the first remaining row by position instead of by label.
len(result2.iloc[0])

400

In [17]:
# Attach the parsed vectors to the Shakespeare frame and spot-check one random row.
bishake['result'] = result2
bishake.sample(n=1)

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token,word_vec,result
84525,hearts and,"['NNS', 'CC']",2,NNS,CC,"['hearts', 'and']",[[-9.39506143e-02 -2.44569719e-01 -2.53253490e...,"[-0.0939506143, -0.244569719, -0.25325349, -0...."


Now to create dummy variables from the POS tags

In [18]:
# One-hot encode both tag positions of the title bigrams.
# dtype is pinned to uint8 so the dummies are 0/1 integers regardless of the
# installed pandas version (pandas >= 2.0 defaults to bool).
bitles_pos = pd.get_dummies(
    bitles[['pos1', 'pos2']],
    prefix=['pos1', 'pos2'],
    columns=['pos1', 'pos2'],
    dtype=np.uint8,
)

print('bitles_pos.shape:', bitles_pos.shape)
bitles_pos.sample()

bitles_pos.shape: (244, 50)


Unnamed: 0,pos1_CC,pos1_CD,pos1_DT,pos1_IN,pos1_JJ,pos1_JJR,pos1_JJS,pos1_MD,pos1_NN,pos1_NNS,...,pos2_TO,pos2_VB,pos2_VBD,pos2_VBG,pos2_VBN,pos2_VBP,pos2_VBZ,pos2_WDT,pos2_WP,pos2_WRB
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Convert the one-hot frame into a plain list of per-row lists.
all_bitles_pos = bitles_pos.values.tolist()

n_title_rows = len(all_bitles_pos)
n_title_tags = len(all_bitles_pos[0])
print('all_bitles_pos list of lists length:', n_title_rows)
print('length of inner lists aka number of pos tags:', n_title_tags)

all_bitles_pos list of lists length: 244
length of inner lists aka number of pos tags: 50


In [20]:
# One-hot encode both tag positions of the Shakespeare bigrams.
# dtype is pinned to uint8 so the dummies are 0/1 integers regardless of the
# installed pandas version (pandas >= 2.0 defaults to bool).
bishake_pos = pd.get_dummies(
    bishake[['pos1', 'pos2']],
    prefix=['pos1', 'pos2'],
    columns=['pos1', 'pos2'],
    dtype=np.uint8,
)

print('bishake_pos.shape:', bishake_pos.shape)
bishake_pos.sample()

bishake_pos.shape: (228508, 50)


Unnamed: 0,pos1_CC,pos1_CD,pos1_DT,pos1_IN,pos1_JJ,pos1_JJR,pos1_JJS,pos1_MD,pos1_NN,pos1_NNS,...,pos2_TO,pos2_VB,pos2_VBD,pos2_VBG,pos2_VBN,pos2_VBP,pos2_VBZ,pos2_WDT,pos2_WP,pos2_WRB
224387,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Convert the one-hot frame into a plain list of per-row lists.
all_bishake_pos = bishake_pos.values.tolist()

n_shake_rows = len(all_bishake_pos)
n_shake_tags = len(all_bishake_pos[0])
print('all_bishake_pos list of lists length:', n_shake_rows)
print('length of inner lists aka number of pos tags:', n_shake_tags)

all_bishake_pos list of lists length: 228508
length of inner lists aka number of pos tags: 50


In [22]:
# Column labels of the two one-hot frames, kept for the comparison that follows.
title_col, shake_col = bitles_pos.columns, bishake_pos.columns

In [23]:
# Check that both one-hot frames expose the same columns in the same order.
# The column count is taken from the data instead of a fixed 50, so a frame
# with fewer columns no longer raises IndexError and extra columns are no
# longer skipped silently.
if len(title_col) != len(shake_col):
    print('Alas! column counts differ:', len(title_col), len(shake_col))

for title_name, shake_name in zip(title_col, shake_col):
    if title_name != shake_name:
        print('Alas!', title_name, shake_name)

In [24]:
# Store each row's full one-hot POS encoding as a list in a new column,
# then spot-check one random row.
bitles['all_pos'] = all_bitles_pos
bitles.sample(n=1)

Unnamed: 0,title,good_title,pos,pos1,pos2,title_token,word_vec,result,all_pos
169,envy could,0,"['NN', 'MD']",NN,MD,"['envy', 'could']",[[-1.04356455e-02 -1.08434558e-01 7.23581389e...,"[-0.0104356455, -0.108434558, 0.0723581389, -0...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [25]:
# Store each row's full one-hot POS encoding as a list in a new column,
# then spot-check one random row.
bishake['all_pos'] = all_bishake_pos
bishake.sample(n=1)

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token,word_vec,result,all_pos
23855,beautiful ?,"['NN', '.']",2,NN,.,"['beautiful', '?']",[[-6.15859358e-03 -1.01133250e-01 7.16529712e...,"[-0.00615859358, -0.10113325, 0.0716529712, -0...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [26]:
# One-hot encode the title tags again, this time dropping the first level of
# each position (drop_first=True).
# dtype is pinned to uint8 so the dummies are 0/1 integers regardless of the
# installed pandas version (pandas >= 2.0 defaults to bool).
bitles_pos2 = pd.get_dummies(
    bitles[['pos1', 'pos2']],
    prefix=['pos1', 'pos2'],
    columns=['pos1', 'pos2'],
    drop_first=True,
    dtype=np.uint8,
)

print('bitles_pos2.shape:', bitles_pos2.shape)

dp1_bitles_pos = bitles_pos2.values.tolist()
# The label now names the variable actually being reported (dp1_bitles_pos).
print('dp1_bitles_pos list of lists length:', len(dp1_bitles_pos))
print('length of inner lists aka number of pos tags:', len(dp1_bitles_pos[0]))

bitles_pos2.shape: (244, 48)
all_bitles_pos list of lists length: 244
length of inner lists aka number of pos tags: 48


In [27]:
# One-hot encode the Shakespeare tags again, this time dropping the first
# level of each position (drop_first=True).
# dtype is pinned to uint8 so the dummies are 0/1 integers regardless of the
# installed pandas version (pandas >= 2.0 defaults to bool).
bishake_pos2 = pd.get_dummies(
    bishake[['pos1', 'pos2']],
    prefix=['pos1', 'pos2'],
    columns=['pos1', 'pos2'],
    drop_first=True,
    dtype=np.uint8,
)

print('bishake_pos2.shape:', bishake_pos2.shape)

dp1_bishake_pos = bishake_pos2.values.tolist()
print('dp1_bishake_pos list of lists length:', len(dp1_bishake_pos))
print('length of inner lists aka number of pos tags:', len(dp1_bishake_pos[0]))

bishake_pos2.shape: (228508, 48)
dp1_bishake_pos list of lists length: 228508
length of inner lists aka number of pos tags: 48


In [28]:
# Store the drop-first encodings as list-valued columns on both frames.
bishake['dp1_pos'] = dp1_bishake_pos
bitles['dp1_pos'] = dp1_bitles_pos

In [29]:
# Spot-check one random title row with all derived columns attached.
bitles.sample(n=1)

Unnamed: 0,title,good_title,pos,pos1,pos2,title_token,word_vec,result,all_pos,dp1_pos
138,been studied,0,"['VBN', 'VBN']",VBN,VBN,"['been', 'studied']",[[ 3.30702662e-01 -8.10150981e-01 -5.32813907e...,"[0.330702662, -0.810150981, -0.532813907, -0.8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [31]:
# Spot-check one random Shakespeare row with all derived columns attached.
bishake.sample(n=1)

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token,word_vec,result,all_pos,dp1_pos
220052,well now,"['RB', 'RB']",2,RB,RB,"['well', 'now']",[[ 7.37399161e-02 -1.16558611e+00 -1.33562589e...,"[0.0737399161, -1.16558611, -1.33562589, -0.05...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [32]:
# Persist both processed frames as pickles under the data directory.
datapath = '../data'

datapath_bishake = os.path.join(datapath, 'shakespearean_bigrams.pkl')
datapath_bitles = os.path.join(datapath, 'bigram_titles.pkl')

# Written in the same order as before: Shakespeare first, then titles.
for frame, target in ((bishake, datapath_bishake), (bitles, datapath_bitles)):
    frame.to_pickle(target)