In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
import os

In [2]:
tritles = pd.read_csv('../data/trigram_titles_pos_w2v.csv')
tritles.head()

Unnamed: 0,title,good_title,pos,pos1,pos2,pos3,title_token,word_vec
0,all's well,1,"['DT', 'POS', 'NN']",DT,POS,NN,"['all', ""'s"", 'well']",[[-1.81372836e-01 1.97823837e-01 -7.73134887e...
1,the devil drives,1,"['DT', 'NN', 'NNS']",DT,NN,NNS,"['the', 'devil', 'drives']",[[-9.59224224e-01 1.22396159e+00 -6.39906406e...
2,edge of hazard,1,"['NN', 'IN', 'NN']",NN,IN,NN,"['edge', 'of', 'hazard']",[[-2.87398398e-02 -7.36244917e-02 1.42647075e...
3,a mingled yarn,1,"['DT', 'VBN', 'NN']",DT,VBN,NN,"['a', 'mingled', 'yarn']",[[ 3.78270075e-03 3.85284066e-01 -8.90050769e...
4,love is enough,1,"['NN', 'VBZ', 'JJ']",NN,VBZ,JJ,"['love', 'is', 'enough']",[[-6.29614413e-01 -7.17607141e-01 -1.16408551e...


In [3]:
trishake = pd.read_csv('../data/shakespearean_trigrams_pos.csv')
trishake.sample(5)

Unnamed: 0,title,title_tokenized,pos_tag,pos,pos_len,pos1,pos2,pos3
135813,from whence his,"['from', 'whence', 'his']","['IN', 'NN', 'PRP$']","['IN', 'NN', 'PRP$']",3,IN,NN,PRP$
62030,blind too .,"['blind', 'too', '.']","['VB', 'RB', '.']","['VB', 'RB', '.']",3,VB,RB,.
175004,his passion is,"['his', 'passion', 'is']","['PRP$', 'NN', 'VBZ']","['PRP$', 'NN', 'VBZ']",3,PRP$,NN,VBZ
314745,quickly to sir,"['quickly', 'to', 'sir']","['RB', 'TO', 'VB']","['RB', 'TO', 'VB']",3,RB,TO,VB
296069,our captain 's,"['our', 'captain', ""'s""]","['PRP$', 'NN', 'POS']","['PRP$', 'NN', 'POS']",3,PRP$,NN,POS


The trigrams still need word embedding

In [4]:
trishake = trishake.drop(columns=['title_tokenized', 'pos_tag', 'pos_len'])

In [5]:
twd = TreebankWordTokenizer()

trishake['title_token'] = trishake['title'].apply(lambda x: twd.tokenize(x))
trishake.sample(5)

Unnamed: 0,title,pos,pos1,pos2,pos3,title_token
397231,thick-grown brake we,"['JJ', 'NN', 'PRP']",JJ,NN,PRP,"[thick-grown, brake, we]"
194285,in continual practise,"['IN', 'JJ', 'NN']",IN,JJ,NN,"[in, continual, practise]"
133874,friends ! friends,"['NNS', '.', 'NNS']",NNS,.,NNS,"[friends, !, friends]"
122545,"fifty , or","['NN', ',', 'CC']",NN,",",CC,"[fifty, ,, or]"
394928,these letters are,"['DT', 'NNS', 'VBP']",DT,NNS,VBP,"[these, letters, are]"


Now to replace every word in each trigram with the learned word vector

In [6]:
wv = KeyedVectors.load("../feature_engineering/shake_w2v.wordsvectors", mmap='r')

In [7]:
words = set(wv.index2word)
len(words)

26304

In [8]:
trigram_vect = [np.array([wv[i] for i in ls if i in words])
                                for ls in trishake['title_token']]

In [9]:
trishake['word_vec'] = trigram_vect

In [10]:
loss_title_token = []

for i,v in enumerate(trigram_vect):
    if len(v) != 3:
        loss_title_token.append(i)

print('Number of trigrams with no word matches from title_token', len(loss_title_token))

Number of trigrams with no word matches from title_token 12284


Previous wrod has shown that many of these title losses are due to pwrds with a period at the end, such as:
`a bawd. hold` or ` about him. in`. These occur at sentence breaks, which are areas the we would not want titles generated from.
    

In [11]:
trishake.iloc[11170]

title                                            absence. let me
pos                                          ['NN', 'VB', 'PRP']
pos1                                                          NN
pos2                                                          VB
pos3                                                         PRP
title_token                                  [absence., let, me]
word_vec       [[-1.7783487, 0.5082124, -0.66882604, -0.21298...
Name: 11170, dtype: object

In [12]:
trishake.drop(index=loss_title_token, inplace=True)
trishake.shape

(480426, 7)

In [13]:
trishake.reset_index(drop=True, inplace=True)
trishake.sample(3)

Unnamed: 0,title,pos,pos1,pos2,pos3,title_token,word_vec
430131,was ever the,"['VBD', 'RB', 'DT']",VBD,RB,DT,"[was, ever, the]","[[-0.027510468, -0.4950884, -0.21331064, -2.44..."
348189,summers have enrich,"['NNS', 'VBP', 'VBN']",NNS,VBP,VBN,"[summers, have, enrich]","[[0.017781138, -0.039706722, 0.049680747, -0.0..."
179688,"i compile ,","['NN', 'NN', ',']",NN,NN,",","[i, compile, ,]","[[-0.38634965, -0.51687634, -1.2370201, 0.8931..."


I need to to be very careful with the one hot encoding because the title and shakespeare datasets will have different pos tags. I'll need to filter out all the pos tags in the shakespeare dataset that do not appear in the titles dataset, and visa versa

In [14]:
all_tritles_pos1 = set(tritles.pos1.tolist())
all_tritles_pos2 = set(tritles.pos2.tolist())
all_tritles_pos3 = set(tritles.pos3.tolist())

print('Number of pos tags in titles pos1:', len(all_tritles_pos1))
print('Number of pos tags in titles pos2:', len(all_tritles_pos2))
print('Number of pos tags in titles pos3:', len(all_tritles_pos3))

all_trishake_pos1 = set(trishake.pos1.tolist())
all_trishake_pos2 = set(trishake.pos2.tolist())
all_trishake_pos3 = set(trishake.pos3.tolist())

print('Number of pos tags in shakespeare pos1:', len(all_trishake_pos1))
print('Number of pos tags in shakespeare pos2:', len(all_trishake_pos2))
print('Number of pos tags in shakespeare pos3:', len(all_trishake_pos3))

Number of pos tags in titles pos1: 25
Number of pos tags in titles pos2: 29
Number of pos tags in titles pos3: 27
Number of pos tags in shakespeare pos1: 32
Number of pos tags in shakespeare pos2: 37
Number of pos tags in shakespeare pos3: 37


In [15]:
pos1_rm = list(all_trishake_pos1.difference(all_tritles_pos1))
print(pos1_rm)

pos2_rm = list(all_trishake_pos2.difference(all_tritles_pos2))
print(pos2_rm)

pos3_rm = list(all_trishake_pos3.difference(all_tritles_pos3))
print(pos3_rm)

pos4_rm = list(all_tritles_pos1.difference(all_trishake_pos1))
print(pos4_rm)

pos5_rm = list(all_tritles_pos2.difference(all_trishake_pos2))
print(pos5_rm)

pos6_rm = list(all_tritles_pos3.difference(all_trishake_pos3))
print(pos6_rm)

['PDT', 'FW', '.', 'UH', 'RP', 'RBS', 'WP']
['PDT', 'FW', ':', 'UH', 'WP', '""', 'NNP', 'WP$']
['PDT', 'EX', 'FW', ':', 'UH', 'JJS', 'RP', 'WP', '""', 'SYM']
[]
[]
[]


In [16]:
print('Starting number of trigrams', trishake.shape[0])
trishake = trishake[~trishake['pos1'].isin(pos1_rm)]
trishake.shape[0]

Starting number of trigrams 480426


474371

In [17]:
print('Starting number of trigrams', trishake.shape[0])
trishake = trishake[~trishake['pos2'].isin(pos2_rm)]
trishake.shape[0]

Starting number of trigrams 474371


471567

In [18]:
print('Starting number of trigrams', trishake.shape[0])
trishake = trishake[~trishake['pos3'].isin(pos3_rm)]
trishake.shape[0]

Starting number of trigrams 471567


463010

In [19]:
all_trishake_pos1 = set(trishake.pos1.tolist())
all_trishake_pos2 = set(trishake.pos2.tolist())
all_trishake_pos3 = set(trishake.pos3.tolist())

print('Number of pos tags in shakespeare pos1:', len(all_trishake_pos1))
print('Number of pos tags in shakespeare pos2:', len(all_trishake_pos2))
print('Number of pos tags in shakespeare pos3:', len(all_trishake_pos3))

Number of pos tags in shakespeare pos1: 25
Number of pos tags in shakespeare pos2: 29
Number of pos tags in shakespeare pos3: 27


In [20]:
all_tritles_pos1 = set(tritles.pos1.tolist())
all_tritles_pos2 = set(tritles.pos2.tolist())
all_tritles_pos3 = set(tritles.pos3.tolist())

print('Number of pos tags in titles pos1:', len(all_tritles_pos1))
print('Number of pos tags in titles pos2:', len(all_tritles_pos2))
print('Number of pos tags in titles pos3:', len(all_tritles_pos3))

all_trishake_pos1 = set(trishake.pos1.tolist())
all_trishake_pos2 = set(trishake.pos2.tolist())
all_trishake_pos3 = set(trishake.pos3.tolist())

print('Number of pos tags in shakespeare pos1:', len(all_trishake_pos1))
print('Number of pos tags in shakespeare pos2:', len(all_trishake_pos2))
print('Number of pos tags in shakespeare pos3:', len(all_trishake_pos3))

Number of pos tags in titles pos1: 25
Number of pos tags in titles pos2: 29
Number of pos tags in titles pos3: 27
Number of pos tags in shakespeare pos1: 25
Number of pos tags in shakespeare pos2: 29
Number of pos tags in shakespeare pos3: 27


In [21]:
pos1_rm = list(all_trishake_pos1.difference(all_tritles_pos1))
print(pos1_rm)

pos2_rm = list(all_trishake_pos2.difference(all_tritles_pos2))
print(pos2_rm)

pos3_rm = list(all_trishake_pos3.difference(all_tritles_pos3))
print(pos3_rm)

[]
[]
[]


POS tags synced! Now to create dummy variables from the POS tags.

In [22]:
tritles_pos = tritles[['pos1', 'pos2', 'pos3']]
tritles_pos = pd.get_dummies(tritles_pos, prefix=['pos1', 'pos2', 'pos3'], columns=['pos1', 'pos2', 'pos3'])
print('tritles_pos.shape:', tritles_pos.shape)
tritles_pos.sample()

tritles_pos.shape: (389, 81)


Unnamed: 0,pos1_CC,pos1_CD,pos1_DT,pos1_EX,pos1_IN,pos1_JJ,pos1_JJR,pos1_JJS,pos1_MD,pos1_NN,...,pos3_TO,pos3_VB,pos3_VBD,pos3_VBG,pos3_VBN,pos3_VBP,pos3_VBZ,pos3_WDT,pos3_WP$,pos3_WRB
212,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
all_tritles_pos = tritles_pos.values.tolist()
print('all_tritles_pos list of lists length:', len(all_tritles_pos))
print('length of inner lists aka number of pos tags:', len(all_tritles_pos[0]))

all_tritles_pos list of lists length: 389
length of inner lists aka number of pos tags: 81


In [24]:
trishake_pos = trishake[['pos1', 'pos2', 'pos3']]
trishake_pos = pd.get_dummies(trishake_pos, prefix=['pos1', 'pos2', 'pos3'], columns=['pos1', 'pos2', 'pos3'])
print('trishake_pos.shape:', trishake_pos.shape)
trishake_pos.sample()

trishake_pos.shape: (463010, 81)


Unnamed: 0,pos1_CC,pos1_CD,pos1_DT,pos1_EX,pos1_IN,pos1_JJ,pos1_JJR,pos1_JJS,pos1_MD,pos1_NN,...,pos3_TO,pos3_VB,pos3_VBD,pos3_VBG,pos3_VBN,pos3_VBP,pos3_VBZ,pos3_WDT,pos3_WP$,pos3_WRB
213802,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
all_trishake_pos = trishake_pos.values.tolist()
print('all_trishake_pos list of lists length:', len(all_trishake_pos))
print('length of inner lists aka number of pos tags:', len(all_trishake_pos[0]))

all_trishake_pos list of lists length: 463010
length of inner lists aka number of pos tags: 81


In [26]:
title_col = tritles_pos.columns
shake_col = trishake_pos.columns

for i in range(0, 81):
    if title_col[i] != shake_col[i]:
        print('Alas!', title_col[i], shake_col[i])

In [27]:
tritles['all_pos'] = all_tritles_pos
tritles.sample()

Unnamed: 0,title,good_title,pos,pos1,pos2,pos3,title_token,word_vec,all_pos
373,facile question bear,0,"['JJ', 'NN', 'NN']",JJ,NN,NN,"['facile', 'question', 'bear']",[[ 7.76453083e-03 -1.61159341e-03 -1.35662211e...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [28]:
trishake['all_pos'] = all_trishake_pos
trishake.sample()

Unnamed: 0,title,pos,pos1,pos2,pos3,title_token,word_vec,all_pos
194636,in your deeds,"['IN', 'PRP$', 'NNS']",IN,PRP$,NNS,"[in, your, deeds]","[[-1.1804608, 0.88281506, 0.58544093, -1.35429...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


And now to create a second dataset by removing the first one-hot encoded category

In [29]:
tritles_pos2 = tritles[['pos1', 'pos2', 'pos3']]
tritles_pos2 = pd.get_dummies(tritles_pos2, prefix=['pos1', 'pos2', 'pos3'], columns=['pos1', 'pos2', 'pos3'], drop_first=True)
print('tritles_pos2.shape:', tritles_pos2.shape)
tritles_pos2.sample()

tritles_pos2.shape: (389, 78)


Unnamed: 0,pos1_CD,pos1_DT,pos1_EX,pos1_IN,pos1_JJ,pos1_JJR,pos1_JJS,pos1_MD,pos1_NN,pos1_NNS,...,pos3_TO,pos3_VB,pos3_VBD,pos3_VBG,pos3_VBN,pos3_VBP,pos3_VBZ,pos3_WDT,pos3_WP$,pos3_WRB
237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
dp1_tritles_pos = tritles_pos2.values.tolist()
print('dp1_tritles_pos list of lists length:', len(dp1_tritles_pos))
print('length of inner lists aka number of pos tags:', len(dp1_tritles_pos[0]))

dp1_tritles_pos list of lists length: 389
length of inner lists aka number of pos tags: 78


In [31]:
trishake_pos2 = trishake[['pos1', 'pos2', 'pos3']]
trishake_pos2 = pd.get_dummies(trishake_pos2, prefix=['pos1', 'pos2', 'pos3'], columns=['pos1', 'pos2', 'pos3'], drop_first=True)
print('trishake_pos2.shape:', trishake_pos2.shape)
trishake_pos2.sample()

trishake_pos2.shape: (463010, 78)


Unnamed: 0,pos1_CD,pos1_DT,pos1_EX,pos1_IN,pos1_JJ,pos1_JJR,pos1_JJS,pos1_MD,pos1_NN,pos1_NNS,...,pos3_TO,pos3_VB,pos3_VBD,pos3_VBG,pos3_VBN,pos3_VBP,pos3_VBZ,pos3_WDT,pos3_WP$,pos3_WRB
91362,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
dp1_trishake_pos = trishake_pos2.values.tolist()
print('dp1_trishake_pos list of lists length:', len(dp1_trishake_pos))
print('length of inner lists aka number of pos tags:', len(dp1_trishake_pos[0]))

dp1_trishake_pos list of lists length: 463010
length of inner lists aka number of pos tags: 78


In [33]:
title_col2 = tritles_pos2.columns
shake_col2 = trishake_pos2.columns

for i in range(0, 78):
    if title_col2[i] != shake_col2[i]:
        print('Alas!', title_col2[i], shake_col2[i])

In [34]:
tritles['dp1_pos'] = dp1_tritles_pos
tritles.sample()

Unnamed: 0,title,good_title,pos,pos1,pos2,pos3,title_token,word_vec,all_pos,dp1_pos
222,but throw her,0,"['CC', 'VB', 'PRP$']",CC,VB,PRP$,"['but', 'throw', 'her']",[[-4.21053231e-01 -8.56678188e-01 -8.04431915e...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [35]:
trishake['dp1_pos'] = dp1_trishake_pos
trishake.sample()

Unnamed: 0,title,pos,pos1,pos2,pos3,title_token,word_vec,all_pos,dp1_pos
136416,given him any,"['VBN', 'PRP', 'DT']",VBN,PRP,DT,"[given, him, any]","[[-0.3254524, 0.10828757, 0.021896513, -0.2253...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


The vector in `word_vec` is stored as a string. Need to convert into a 600 dimension word vector.

In [36]:
result = tritles['word_vec'].apply(lambda x: 
                           np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))

len(result[0])

600

In [37]:
tritles['w2v'] = result
tritles.sample()

Unnamed: 0,title,good_title,pos,pos1,pos2,pos3,title_token,word_vec,all_pos,dp1_pos,w2v
25,the brightest heaven,1,"['DT', 'JJS', 'NN']",DT,JJS,NN,"['the', 'brightest', 'heaven']",[[-9.59224224e-01 1.22396159e+00 -6.39906406e...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.959224224, 1.22396159, -0.639906406, -0.51..."


Combine the `word_vec` and `pos` arrays 

In [38]:
w2v_all_pos = tritles[['w2v', 'all_pos']]
w2v_all = w2v_all_pos.values.tolist()

w2v_dp1_pos = tritles[['w2v', 'dp1_pos']]
w2v_dp1 = w2v_dp1_pos.values.tolist()

print('Length of w2v_all list:', len(w2v_all))
print('Length of first element of w2v_all list:', len(w2v_all[0]))
print('Length of first element of inner list:', len(w2v_all[0][0]))
print('Type of first element of inner list:', type(w2v_all[0][0]))
print('Length of seceond element of inner list', len(w2v_all[0][1]))
print('Type of element of inner list', type(w2v_all[0][1]))

Length of w2v_all list: 389
Length of first element of w2v_all list: 2
Length of first element of inner list: 600
Type of first element of inner list: <class 'numpy.ndarray'>
Length of seceond element of inner list 81
Type of element of inner list <class 'list'>


In [39]:
new_w2v_all = []

for i in range(len(w2v_all)):
    new_w2v_all.append(list(w2v_all[i][0]) + w2v_all[i][1])
    
print('Length of new_w2v_all', len(new_w2v_all))
print('Length of inner list', len(new_w2v_all[0]))

Length of new_w2v_all 389
Length of inner list 681


In [40]:
new_w2v_dp1 = []

for i in range(len(w2v_dp1)):
    new_w2v_dp1.append(list(w2v_dp1[i][0]) + w2v_dp1[i][1])
    
print('Length of new_w2v_dp1', len(new_w2v_dp1))
print('Length of inner list', len(new_w2v_dp1[0]))

Length of new_w2v_dp1 389
Length of inner list 678


In [41]:
tritles['w2v_all'] = new_w2v_all
tritles['w2v_dp1'] = new_w2v_dp1
tritles.sample()

Unnamed: 0,title,good_title,pos,pos1,pos2,pos3,title_token,word_vec,all_pos,dp1_pos,w2v,w2v_all,w2v_dp1
160,perchance to dream,1,"['NN', 'TO', 'VB']",NN,TO,VB,"['perchance', 'to', 'dream']",[[-3.04288212e-02 -1.39931336e-01 -2.82580741e...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[-0.0304288212, -0.139931336, -0.0282580741, 0...","[-0.0304288212, -0.139931336, -0.0282580741, 0...","[-0.0304288212, -0.139931336, -0.0282580741, 0..."


Do the same for the shakepeare trigrams `word_vec` and `pos` arrays. Also need to flatten `word_vec` which is an array of three 200 dimension word vectors.

In [42]:
result = trishake['word_vec'].apply(lambda x: x.flatten())

len(result)

463010

In [43]:
trishake['w2v'] = result
trishake.sample()

Unnamed: 0,title,pos,pos1,pos2,pos3,title_token,word_vec,all_pos,dp1_pos,w2v
139976,good mistress page,"['JJ', 'NN', 'NN']",JJ,NN,NN,"[good, mistress, page]","[[0.89183116, -0.21151099, -0.4573949, 0.00399...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.89183116, -0.21151099, -0.4573949, 0.003994..."


In [44]:
datapath = '../data'

datapath_trishake = os.path.join(datapath, 'shakespearean_trigrams.pkl')
trishake.to_pickle(datapath_trishake)

datapath_tritles = os.path.join(datapath, 'trigram_titles.pkl')
tritles.to_pickle(datapath_tritles)

In [72]:
# w2v_all_pos = trishake[['w2v', 'all_pos']]
# w2v_all = w2v_all_pos.values.tolist()

# w2v_dp1_pos = trishake[['w2v', 'dp1_pos']]
# w2v_dp1 = w2v_dp1_pos.values.tolist()

# print('Length of w2v_all list:', len(w2v_all))
# print('Length of first element of w2v_all list:', len(w2v_all[0]))
# print('Length of first element of inner list:', len(w2v_all[0][0]))
# print('Type of first element of inner list:', type(w2v_all[0][0]))
# print('Length of seceond element of inner list', len(w2v_all[0][1]))
# print('Type of element of inner list', type(w2v_all[0][1]))

Length of w2v_all list: 463010
Length of first element of w2v_all list: 2
Length of first element of inner list: 600
Type of first element of inner list: <class 'numpy.ndarray'>
Length of seceond element of inner list 81
Type of element of inner list <class 'list'>


These dataframes are getting too large for my processor, so I am splitting them now:

**trigram_df1**, which will retain the dropped first one-hot encoding and **trigram_df2** which will retain all of the pos encoding

In [45]:
trigram_df1 = trishake[['title', 'w2v', 'dp1_pos']]
trigram_df1.sample()

Unnamed: 0,title,w2v,dp1_pos
82164,come to cressid,"[-0.43055856, 0.2836218, -1.549295, 0.44880795...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [48]:
word_vec2 = trigram_df1['w2v'].apply(lambda x: list(x))

In [49]:
trigram_df1['w2v'] = word_vec2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trigram_df1['w2v'] = word_vec2


In [46]:
trigram_df2 = trishake[['title', 'w2v', 'all_pos']]
trigram_df2.sample()

Unnamed: 0,title,w2v,all_pos
463302,ye should jar,"[0.38721955, -0.6298508, -0.64152753, 0.548075...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [50]:
w2v_dp1_pos = trigram_df1[['w2v', 'dp1_pos']]
w2v_dp1 = w2v_dp1_pos.values.tolist()

print('Length of w2v_dp1 list:', len(w2v_dp1))
print('Length of first element of w2v_dp1 list:', len(w2v_dp1[0]))
print('Length of first element of inner list:', len(w2v_dp1[0][0]))
print('Type of first element of inner list:', type(w2v_dp1[0][0]))
print('Length of seceond element of inner list', len(w2v_dp1[0][1]))
print('Type of element of inner list', type(w2v_dp1[0][1]))

Length of w2v_dp1 list: 463010
Length of first element of w2v_dp1 list: 2
Length of first element of inner list: 600
Type of first element of inner list: <class 'list'>
Length of seceond element of inner list 78
Type of element of inner list <class 'list'>


In [51]:
new_w2v_dp1 = []

for i in range(len(w2v_dp1)):
    new_w2v_dp1.append(w2v_dp1[i][0] + w2v_dp1[i][1])
    
print('Length of new_w2v_dp1', len(new_w2v_dp1))
print('Length of inner list', len(new_w2v_dp1[0]))

Length of new_w2v_dp1 463010
Length of inner list 678


In [52]:
trigram_df1['w2v_dp1'] = new_w2v_dp1
trigram_df1.sample()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trigram_df1['w2v_dp1'] = new_w2v_dp1


Unnamed: 0,title,w2v,dp1_pos,w2v_dp1
188486,impudence to gainsay,"[0.00053952844, -0.018305901, 0.015596594, -0....","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0.00053952844, -0.018305901, 0.015596594, -0...."


In [53]:
print(trigram_df1.iloc[15654,3])

[0.0807096, -1.3587284, -1.0823939, -1.0976366, 0.047064725, -0.06557366, 0.2576524, -1.729977, -0.08876638, -0.7768364, -1.1120688, 0.1703223, -0.80719894, -1.0509889, -0.9496145, 1.399656, -0.61601216, 1.1853921, 0.14824629, -0.118394315, -0.9897278, 1.070386, -0.7828252, -1.138204, -0.53862935, -0.24603614, 0.5009268, 0.17365944, 1.430671, -0.15076216, -1.0192459, -1.5335596, 0.24154729, -1.075438, 0.8162576, 0.033223614, 2.2081532, -0.732324, -0.2924743, 0.8710904, -0.48896503, -1.2354227, 1.1289583, -0.0779926, -0.82045686, -0.4268056, -0.33609322, -0.17917156, -0.035220284, 0.4964373, 0.004385288, -0.58764887, -0.29082775, -0.02412139, -0.6514667, -0.35490793, 1.027432, -1.621269, -1.1602274, 1.2093196, -1.1650782, 0.06542528, -1.1833655, -0.2689617, -0.7662771, -0.02973663, 0.7769583, -0.3336884, -2.047132, -1.342562, 2.3502333, -0.9453741, 0.015789187, 0.32739648, 1.1115769, -0.16643466, 0.5801253, 0.15593454, 0.6774502, -0.2628041, 0.29531765, 0.43133, 0.92916787, -0.1544743, 

In [54]:
print(trigram_df1.iloc[15654,0])

am dancing ,


In [None]:
datapath_trigram_df1 = os.path.join(datapath, 'trigram_df1.pkl')
trigram_df1.to_pickle(datapath_trigram_df1)

Create train and test sets

In [56]:
X = tritles[['title', 'w2v_all', 'w2v_dp1']]
y = tritles[['title', 'good_title']]