In [1]:
import gensim
import numpy as np
import pandas as pd
import os
import re
from nltk.tokenize import TreebankWordTokenizer

# Show full cell contents in DataFrame displays instead of truncating wide columns.
pd.options.display.max_colwidth = None

In [2]:
# Load the bigram table; the path is relative to the notebook's working directory.
shake_bi = pd.read_csv('../data/shakespearean_bigrams_pos.csv')

In [3]:
# Peek at five random rows. No random_state is given, so the rows shown differ on each run.
shake_bi.sample(5)

Unnamed: 0,title,title_tokenized,pos_tag,pos,pos_len,pos1,pos2
205373,thou note,"['thou', 'note']","['NN', 'NN']","['NN', 'NN']",2,NN,NN
192052,the blackest,"['the', 'blackest']","['DT', 'JJS']","['DT', 'JJS']",2,DT,JJS
114350,look they,"['look', 'they']","['NN', 'PRP']","['NN', 'PRP']",2,NN,PRP
182886,strange insurrections,"['strange', 'insurrections']","['JJ', 'NNS']","['JJ', 'NNS']",2,JJ,NNS
152805,pluck 'd,"['pluck', ""'d""]","['NN', 'MD']","['NN', 'MD']",2,NN,MD


In [4]:
# Discard the two list-like columns that came in from the CSV; tokens are rebuilt in the next cell.
shake_bi = shake_bi.drop(['title_tokenized', 'pos_tag'], axis=1)

In [5]:
# Tokenize every title with NLTK's Treebank tokenizer and keep the token lists in a new column.
twd = TreebankWordTokenizer()
shake_bi['title_token'] = shake_bi['title'].apply(twd.tokenize)
shake_bi.sample(5)

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token
198858,their tender,"['PRP$', 'NN']",2,PRP$,NN,"[their, tender]"
101585,instigated by,"['VBN', 'IN']",2,VBN,IN,"[instigated, by]"
30054,bought mine,"['NN', 'NN']",2,NN,NN,"[bought, mine]"
18986,as they,"['IN', 'PRP']",2,IN,PRP,"[as, they]"
144203,or from,"['CC', 'IN']",2,CC,IN,"[or, from]"


Previous work has demonstrated that words with a period at the end (such as `soldier.`) will not have an accompanying word vector, which would result in the loss of thousands of entries. The code below removes the period at the end of such words, but retains other punctuation and standalone periods (tokens that consist of just `.`).

In [16]:
# Pull the token lists out of the frame as a plain Python list.
titles = list(shake_bi['title_token'])

In [7]:
# Split each token list into its first and second token.
# Any tokens beyond the second are ignored.
first = [tokens[0] for tokens in titles]
second = [tokens[1] for tokens in titles]
        

In [9]:
# Remove exactly one trailing period from each first token (e.g. 'soldier.' -> 'soldier').
# Tokens made up solely of periods ('.', '...') are left untouched so they are not
# collapsed to an empty string; the previous `rstrip` call stripped *every* trailing
# period, turning '...' into ''.
new_first = [
    word[:-1] if word.endswith('.') and word.strip('.') else word
    for word in first
]
        

In [10]:
# Remove exactly one trailing period from each second token (e.g. 'soldier.' -> 'soldier').
# Tokens made up solely of periods ('.', '...') are left untouched so they are not
# collapsed to an empty string; the previous `rstrip` call stripped *every* trailing
# period, turning '...' into ''.
new_second = [
    word[:-1] if word.endswith('.') and word.strip('.') else word
    for word in second
]

In [11]:
# Re-pair the cleaned first and second tokens into two-element lists.
title_clean = [[left, right] for left, right in zip(new_first, new_second)]

In [14]:
# Spot-check a hand-picked slice of the cleaned pairs (the recorded output shows 'soldier' entries).
title_clean[177240:177245]

[['soldier', 'with'],
 ['soldier', 'you'],
 ['soldier-like', 'phrase'],
 ['soldier', 'and'],
 ['soldier', 'let']]

In [15]:
# Attach the cleaned token pairs as a new column; `title_clean` is in the same row order as `shake_bi`
# because it was built from `shake_bi.title_token` without reordering.
shake_bi['title_clean'] = title_clean
# Unseeded sample: the rows displayed change on each run.
shake_bi.sample(5)

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token,title_clean
210605,to example,"['TO', 'NN']",2,TO,NN,"[to, example]","[to, example]"
82764,hate ',"['NN', '""""']",2,NN,"""""","[hate, ']","[hate, ']"
93104,hit the,"['VB', 'DT']",2,VB,DT,"[hit, the]","[hit, the]"
61280,every coast,"['DT', 'NN']",2,DT,NN,"[every, coast]","[every, coast]"
105881,jove and,"['NN', 'CC']",2,NN,CC,"[jove, and]","[jove, and]"


Now replace every word in each bigram with its learned word vector.

In [18]:
# Load the saved word vectors; mmap='r' asks gensim to memory-map the arrays read-only
# where possible. The path is relative to the notebook's working directory.
wv = gensim.models.KeyedVectors.load(
    "shake_w2v.wordsvectors",
    mmap='r',
)

In [19]:
# Build a set of the vocabulary for fast membership tests.
# gensim 4 renamed `index2word` to `index_to_key` (the old attribute now raises
# AttributeError), so prefer the new name and fall back to the old one on gensim 3.x.
vocab = getattr(wv, 'index_to_key', None)
if vocab is None:
    vocab = wv.index2word
words = set(vocab)
len(words)

26304

In [20]:
# For every cleaned bigram, stack the vectors of the tokens that exist in the vocabulary.
# Tokens missing from `words` are skipped, so an entry may hold fewer than two vectors.
bigram_vect = []
for tokens in shake_bi['title_clean']:
    known_vectors = [wv[token] for token in tokens if token in words]
    bigram_vect.append(np.array(known_vectors))

In [23]:
# Positions of bigrams that did not get a vector for both tokens
# (this covers zero matches as well as a single match).
loss_title_clean = [
    position
    for position, vectors in enumerate(bigram_vect)
    if len(vectors) != 2
]

print('Number of bigrams with no word matches form title_clean', len(loss_title_clean))

Number of bigrams with no word matches form title_clean 67


In [24]:
# Same lookup as for the cleaned tokens, but on the untouched tokenizer output.
# Tokens missing from `words` are skipped, so an entry may hold fewer than two vectors.
bigram_vect2 = []
for tokens in shake_bi['title_token']:
    known_vectors = [wv[token] for token in tokens if token in words]
    bigram_vect2.append(np.array(known_vectors))

In [25]:
# Positions of bigrams that did not get a vector for both tokens
# (this covers zero matches as well as a single match).
loss_title_token = [
    position
    for position, vectors in enumerate(bigram_vect2)
    if len(vectors) != 2
]

print('Number of bigrams with no word matches form title_token', len(loss_title_token))

Number of bigrams with no word matches form title_token 5970


After all the above coding, I think I would rather lose the 5,970 titles. Now that it is morning and I am thinking *clearly*, I would not want most of those titles anyway because they fall at the end of a sentence. My original belief, I think, was that gensim took sentence ends into consideration. Reversing course below...

In [29]:
# Remove the period-stripped column again (in place); the analysis continues with `title_token`.
del shake_bi['title_clean']
shake_bi.sample()

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token
120676,me something,"['PRP', 'NN']",2,PRP,NN,"[me, something]"


In [30]:
# Attach the per-bigram vector arrays built from `title_token`. Entries are numpy arrays and
# may contain fewer than two vectors where a token was missing from the vocabulary.
shake_bi['word_vec'] = bigram_vect2
# Display one random row (unseeded, so it changes on each run).
shake_bi.sample()

Unnamed: 0,title,pos,pos_len,pos1,pos2,title_token,word_vec
237925,ye done,"['NN', 'VBN']",2,NN,VBN,"[ye, done]","[[0.38721955, -0.6298508, -0.64152753, 0.5480759, -0.27110818, -0.39844513, -0.043123763, -0.7947163, 0.72984946, -0.16564111, -0.33992615, -0.92562103, 0.14775954, -0.36516213, -0.5071994, 0.099314995, -0.22760622, 0.3127083, 0.07279155, -0.5925049, 0.19713917, -0.09201206, 0.11940089, -0.17175232, 0.54297876, 0.31815404, -0.19652726, 0.5672226, 0.78805965, 0.24863446, 0.85366786, 0.31839156, -0.29994288, 0.48651826, -0.16115738, 0.30141068, 0.8820832, -0.791511, -0.13478667, -0.34277233, 0.43665951, -0.39968136, 0.1799807, -0.36762482, 0.3285105, -0.0061254804, -0.088755235, 0.29515928, -0.016218198, 0.013438691, 0.13234694, 0.072043456, -0.117082514, 0.18616527, -0.49476078, -0.15034261, 0.34449473, -0.7079535, -0.0849729, 0.7924161, 0.53180283, 0.022599643, -0.73220396, -0.3911725, 0.48320225, 0.20673294, 0.7856577, 0.011493572, 0.43976203, -0.32445842, 0.08919123, -0.6732706, 0.003777869, 0.56608033, 0.9314576, -0.13060005, -0.088135466, -0.108308144, 0.27884117, -0.46055293, 0.07342696, 0.0559225, 0.69816613, 0.4011239, -0.028544694, 0.08064641, 0.026036564, -0.19206068, -0.41460633, -0.4037335, 0.2626454, -0.12135888, 0.3745098, -0.13898301, 0.054465998, -0.116109446, 0.11705433, 0.4324808, -0.12951922, 0.22345495, ...], [-0.07660817, -0.46201295, -0.5043615, -0.37414637, -0.31470814, -0.8197248, -0.44553378, -0.5021222, -0.2418622, -0.031998783, -0.23411427, -0.55792576, 0.64886415, -0.70458436, -0.76247084, 0.19106038, 0.41632313, 0.26451474, -0.6341871, -0.67780197, 0.07453282, -0.50505286, -0.2803194, -0.08616192, 0.52132565, 0.36279613, 0.06505542, 0.8476284, 0.5180436, -0.052286968, 1.2920185, 0.014844066, 0.4599209, 0.7517877, -0.2820122, -0.13168254, 0.8589992, -0.9272397, -0.062209684, 0.22712871, -0.05800619, -0.94080347, 1.1490519, -0.070918664, 0.4371037, 0.16446914, -0.8361203, 0.871916, 0.2964518, -0.35617283, -0.12215503, 0.37325692, -0.031303022, -0.11569968, -0.93556637, -0.35087806, 
0.4073037, -0.37156284, 0.17356054, -0.02665485, 0.7202497, 1.1636028, -1.4689933, 0.16169229, 0.5691571, 0.39378658, 1.2258487, -0.65096074, -0.007456261, -0.08383923, 0.19063438, -0.20051263, -0.54743105, -0.023379356, 0.9503865, -0.4447622, 0.17243233, 0.30253276, 0.042590097, 0.07329144, 0.17095493, 0.3200955, -0.2536034, 0.06708895, -0.5525636, -0.22286516, -0.2790356, -0.37580344, 0.0032717872, -0.3346582, -0.7029885, 0.49254662, 0.25746125, 0.0010136539, -0.70784163, 0.6414307, -0.046651535, 0.5646374, -1.0406041, -0.37364417, ...]]"


In [31]:
# Frame size before the rows without two word vectors are removed.
shake_bi.shape

(242601, 7)

In [32]:
# Number of rows about to be dropped (bigrams lacking a vector for both tokens).
len(loss_title_token)

5970

In [33]:
# Drop every bigram that did not receive a vector for both tokens.
# The labels to drop are derived from the `word_vec` column itself rather than from a
# list of positions computed earlier: that keeps the cell correct even if the index is
# not 0..n-1, and makes it safe to re-run (a second run finds nothing left to drop).
incomplete_rows = shake_bi.index[shake_bi['word_vec'].map(len) != 2]
shake_bi.drop(index=incomplete_rows, inplace=True)
shake_bi.shape

(236631, 7)

In [34]:
# Renumber the remaining rows 0..n-1 in place, discarding the old labels.
shake_bi.index = pd.RangeIndex(len(shake_bi))

In [35]:
# Write the filtered frame to the data directory without the index column.
# NOTE(review): `word_vec` holds numpy arrays and `title_token` holds lists; CSV stores only
# their string form, and numpy abbreviates large arrays with '...' once they exceed its print
# threshold. Confirm the vectors can be parsed back, or TODO consider a binary format.
datapath = '../data'

datapath_shake_bi = os.path.join(datapath, 'shakespearean_bigrams_pos_w2v.csv')
shake_bi.to_csv(datapath_shake_bi, index=False)