<h3>word2vec / doc2vec</h3>

In [1]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import NLP
from matplotlib import pyplot as plt
import pickle
from pyemd import emd
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from random import shuffle

In [2]:
# connect to postgresql db
username = 'kimberly'
dbname = 'medium'

dbe = create_engine('postgres://%s@localhost/%s'%(username,dbname))

# get articles df, drop missing data
dfA = pd.read_sql('articles', dbe, index_col='postid')
dfA = dfA.dropna(axis=0,how='any')

dfS = pd.read_sql('sentences', dbe, index_col='level_0')

<h3>doc2vec tutorial</h3>

The first goal is to create `sentences`, a list of lists of words in each sentence.
This is the same as the sentence corpus I created earlier....
Use the NLCorporizer to get this list.

In [3]:
mText = NLP.NLCorporizer(list(dfA.text))
mText.process_text(break_on=['.'], init_split_on='database', origdb=list(dfA.origdb), to_stem=False)
mText.make_ddiv_count()
sent_count = mText.get_ddiv_count()
#print(sent_count)
print(len(sent_count))
textdf = mText.get_text(ttype='tokenized',output_type='dataframe')


4640


In [4]:
# get sentences list made!!!
sentences_a = []
for art in textdf.index:
    sentences_a.extend(list(textdf.iloc[art])[0:sent_count[art]])   
print(len(sentences_a))
print(sentences_a[1])

434792
['today', 'air', 'force', 'one', 'touched', 'havana', 'first', 'time', 'history']


Now, let word2vec run....

In [44]:
# run w2v model

w2v = Word2Vec(sentences_a, workers=2, size=100, min_count=2, window=10, sample=1e-3)
w2v.save('./sentencesw2v_nostem.d2v')
print('saved')

X = w2v[w2v.wv.vocab]



saved


In [45]:
X.shape

(50188, 100)

In [75]:
print(sentences_a[1])
w2v.wv.most_similar(positive=sentences_a[1])

['today', 'air', 'force', 'one', 'touched', 'havana', 'first', 'time', 'history']


[('tumultuous', 0.7570525407791138),
 ('witnessed', 0.7554850578308105),
 ('layoff', 0.7460023760795593),
 ('contemplated', 0.7436095476150513),
 ('embattled', 0.7392175793647766),
 ('shattering', 0.7386874556541443),
 ('calamity', 0.738664448261261),
 ('snuffed', 0.7384495735168457),
 ('woe', 0.7366495132446289),
 ('scariest', 0.7341961860656738)]

...and we have successfully run w2v on our sentences!

<h3>doc2vec</h3>

Work through the doc2vec tutorial.

In [47]:
# class for labeled line sentences

class LabeledLineSentenceMany(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [61]:
class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            yield LabeledSentence(words=line.split(), tags=['SENT_%s' % uid])

In [49]:
# save sentences as a file
with open('sentences_nostem_lls.txt','a+') as f:
    for s in sentences:
        f.write(' '.join(s) + '\n')

In [62]:
sentences = LabeledLineSentence('sentences_nostem_lls.txt')


In [63]:
d2v = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=2)

d2v.build_vocab(sentences)


In [72]:
#d2v.train(sentences,total_examples=d2v.corpus_count,epochs=10)
    
#d2v.save('./sentences_nostem_d2v.d2v')

array([ 0.53315175, -0.59312069, -0.3701601 , -0.18196325,  0.28388122,
       -0.74992895,  0.68897438,  0.26536128, -0.34383121,  1.07788527,
        0.37506256, -0.05320381, -0.41084877, -1.07210004,  0.44800216,
       -0.56467682, -0.35093901,  0.0502545 ,  0.50935429, -0.7060861 ,
       -1.94781554, -0.10304946, -0.26417047,  0.92790186,  0.32236648,
        0.29425913, -0.16162297, -0.38732916,  1.25885463,  0.58431602,
       -0.5896669 ,  0.3234182 , -0.1274462 ,  0.48359117, -0.87190038,
        1.24142945,  0.05691811, -0.28647289, -0.01585779,  1.43177879,
       -0.01189147, -0.53295171, -1.13922834,  1.11891568, -0.82652575,
        0.17162588,  0.3701334 , -0.36291724,  1.0749253 ,  0.11640415,
        0.959647  ,  1.10030985,  0.49011269,  0.41076243, -0.82315665,
       -0.41916588,  0.43981299, -0.40636227,  1.38032138,  0.10013141,
       -0.59827274,  0.50330585,  0.1101383 , -0.62858713, -0.51197654,
       -0.27379429, -0.50591344,  0.53733039, -0.34335226,  0.72

Now I can compute distances between documents!!

In [None]:
d2v = Doc2Vec.load('./sentences_nostem_d2v.d2v') 

In [15]:
d2v.wv.wmdistance(sentences_a[1],sentences_a[2])

8.903405465534675

In [21]:
# get article-wise text
mText.flatten_text()
articles_a=mText.get_text(ttype='flat')


In [32]:
ix1 = [0] + sent_count
ix1 = ix1[0:len(ix1)-1]
ix2 = sent_count

# for each article

# for each sentence in article

# compare sentence to article

swmdist = []
for ax,i1,i2 in zip(range(len(ix1)),ix1,ix2):
    for s in sentences_a[i1:i2]:
        swmdist.append(d2v.wv.wmdistance(s,articles_a[ax]))

KeyboardInterrupt: 

In [33]:
print(len(swmdist))

8817


In [36]:
 # pickle again
output = open('swmdist.pkl', 'wb')
pickle.dump(swmdist, output)
output.close()    

In [38]:
swmdist[0:10]

[7.362201324461683,
 7.2909977437455975,
 8.22981427988547,
 5.92001179102847,
 6.204813485985964,
 7.3937105626288195,
 7.549260488532161,
 7.7831144329862605,
 7.763446417078926,
 7.7728531862715995]

<h3>descope...</h3>
This is going to take about 10 days to run.

We need to use only the `startup` things.

In [44]:
pkl_file = open('startup_index.pkl', 'rb')
startup_ax = pickle.load(pkl_file)
pkl_file.close() 
print(len(startup_ax))

986


In [62]:
swmdist1 = []
for ax,i1,i2 in zip(range(len(ix1)),ix1,ix2):
    print(ax)
    if ax in startup_ax:
        for s in sentences_a[i1:i2]:
            swmdist1.append(d2v.wv.wmdistance(s,articles_a[ax]))
            
output = open('swmdist1.pkl', 'wb')
pickle.dump(swmdist1, output)
output.close()  

swmdist2 = []
for ax,i1,i2 in zip(range(len(ix1)),ix1,ix2):
    print(ax)    
    if ax not in startup_ax:
        for s in sentences_a[i1:i2]:
            swmdist2.append(d2v.wv.wmdistance(s,articles_a[ax]))
            
output = open('swmdist2.pkl', 'wb')
pickle.dump(swmdist2, output)
output.close() 

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

KeyboardInterrupt: 

In [63]:
print(len(swmdist1))

15477


In [64]:
output = open('swmdist1.pkl', 'wb')
pickle.dump(swmdist1, output)
output.close()  

In [65]:
print(len(swmdist2))

0


In [66]:
output = open('swmdist2.pkl', 'wb')
pickle.dump(swmdist2, output)
output.close() 

In [67]:
ax

1552

In [69]:
startup_ax

[1,
 7,
 8,
 11,
 17,
 22,
 29,
 31,
 39,
 40,
 41,
 42,
 52,
 71,
 72,
 74,
 78,
 89,
 95,
 100,
 102,
 106,
 108,
 111,
 113,
 120,
 122,
 129,
 130,
 137,
 139,
 143,
 144,
 150,
 154,
 155,
 162,
 168,
 170,
 174,
 187,
 189,
 191,
 195,
 208,
 224,
 225,
 229,
 237,
 239,
 241,
 243,
 244,
 246,
 247,
 248,
 249,
 251,
 256,
 259,
 261,
 267,
 268,
 270,
 272,
 277,
 279,
 283,
 284,
 287,
 289,
 291,
 299,
 301,
 309,
 326,
 328,
 330,
 334,
 346,
 347,
 350,
 354,
 357,
 369,
 372,
 380,
 387,
 393,
 399,
 404,
 406,
 415,
 420,
 423,
 429,
 438,
 439,
 443,
 445,
 446,
 448,
 455,
 456,
 459,
 461,
 463,
 469,
 472,
 485,
 488,
 492,
 494,
 507,
 510,
 516,
 519,
 536,
 544,
 547,
 550,
 559,
 566,
 567,
 570,
 575,
 586,
 589,
 590,
 591,
 596,
 599,
 600,
 612,
 615,
 617,
 625,
 628,
 638,
 639,
 646,
 649,
 652,
 659,
 661,
 665,
 666,
 674,
 675,
 676,
 686,
 688,
 689,
 691,
 693,
 696,
 697,
 699,
 706,
 709,
 716,
 723,
 726,
 729,
 731,
 739,
 740,
 745,
 749,
 750,
 7

In [70]:
startup_ax.index(1552)

335

In [73]:
swmdist1[0:10]

[8.104330753999673,
 6.97040145378549,
 7.285465398900232,
 8.470926167843567,
 7.150231570589457,
 6.620841151639415,
 7.865514095681986,
 10.747019802365745,
 7.8445831221358135,
 7.601060865850559]

<h3>Test the ones I already have...</h3>

In [75]:
startup_ax_done = startup_ax[0:335]

In [77]:
len(swmdist1)

15477

In [81]:
outlier_articles = mText.get_removed_articles()

In [82]:
print(outlier_articles)

[2298, 2399, 2454, 2607, 2808, 2992, 3167, 3283, 4590]


In [83]:
output = open('outlier_articles.pkl', 'wb')
pickle.dump(outlier_articles, output)
output.close() 

In [85]:
print([a for a in outlier_articles if a in startup_ax])

[2399, 2808, 2992, 4590]


In [86]:
ax

1552

In [97]:
dfA.shape

(4649, 14)

In [88]:
len(startup_ax)

986

In [89]:
max(startup_ax)

4637

In [93]:
is_startup = [(a in startup_ax) for a in range(dfA.shape[0])]
print(len(is_startup))
print(is_startup[0])
dfA['startup'] = is_startup

4649
False


In [94]:
dfA.head()

Unnamed: 0_level_0,title,popdate,url,userid,username,highlight,nlikes,ncomments,ntags,origdb,tags,text,npar,startup
postid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1015a0f4961d,Day One: President Obama and the First Family ...,2016-03-21,https://medium.com/@ObamaWhiteHouse/day-one-pr...,ca9f8f16893b,The Obama White House,"Today, Air Force One touched down here in Hava...",336,15,3.0,3.0,"Cuba,Twitter,Cuba Trip","¡Hola desde cuba! Today, Air Force One touched...",20.0,False
101a407e8c61,You don’t ‘make it’ — it makes you.,2016-06-02,https://medium.com/the-mission/you-dont-make-i...,5ce28105ffbc,Jon Westenberg,You don’t ‘make it’ — it makes you.,549,37,3.0,3.0,"Entrepreneurship,Startup,Life",I always wanted to make it. I grew up dreaming...,21.0,True
1030d29376f1,UX: Infinite Scrolling vs. Pagination,2016-05-02,https://uxplanet.org/ux-infinite-scrolling-vs-...,bcab753a4d4e,Nick Babich,There are only a few instances where infinite ...,1.91K,46,4.0,3.0,"UX,Design,User Experience,UX Design",“Should I use Infinite scrolling or Pagination...,34.0,False
10315016b299,A Lesson on Stereotypes,2016-08-20,https://medium.com/@mramsburg85/a-lesson-on-st...,d38709ba4e06,Michael Ramsburg,"Stereotypes strip you of your culture, like ou...",583,103,5.0,3.0,"Stereotypes,Appalachia,Culture,Essay,Opinion","Stereotypes./nMrs. Mitchell, my sixth grade te...",12.0,False
10321e751c6d,"For This Republican, Never Trump Means “I’m Wi...",2016-07-30,https://medium.com/@ccmccain/for-this-republic...,4e965facd5f9,Caroline McCain,"Trump’s statement, in my view, is unforgivable...",2.5K,302,5.0,3.0,"Hillary Clinton,Donald Trump,Never Trump,2016 ...","If you know me at all, you know I am a woman f...",45.0,False


In [95]:
dfA.to_sql('articles_s', dbe)

In [113]:
dfTemp = pd.merge(dfS,dfA,how='outer',left_on='postid',left_index=False,right_index=True)
print(dfTemp.shape)
dfTemp = dfTemp[dfTemp.startup==True]
print(dfTemp.shape)

(434801, 21)
(95635, 21)


In [115]:
startup_sent_count = [s for sx,s in enumerate(sent_count) if sx in startup_ax]

In [102]:
len(startup_sent_count)

986

In [116]:
sum(startup_sent_count)

93384

In [117]:
dfA[dfA.startup==True].shape

(986, 14)

In [114]:
len(dfTemp.groupby('postid').size())

986

In [130]:
ax=0
startindex = [0] + startup_sent_count
startindex = startindex[0:len(startup_sent_count)]
print(startindex[ax],startup_sent_count[ax])
len(swmdist1[startindex[ax]:startup_sent_count[ax]])

0 65


65

In [120]:
swmdists_final = []
startindex = [0] + startup_sent_count
startindex = startindex[0:len(startup_sent_count)]
for ax,dfs,axs in zip(range(len(startup_ax)),dfTemp.groupby('postid').size(),startup_sent_count):
    if dfs==axs:
        swmdists_final.append(swmdist1[startindex[ax]:startup_sent_count[ax]])
    else:
        swmdists_final.
        

65 65
80 80
9 9
153 153
85 85
47 47
245 245
50 43
58 50
71 58
43 71
65 108
205 65
109 205
40 109
85 40
112 85
108 112
169 169
103 103
61 61
41 41
34 34
64 64
83 83
36 36
110 110
124 124
26 26
51 51
125 125
55 343
52 114
1 55
343 52
114 1
21 21
61 61
59 59
39 39
135 135
165 165
61 61
72 72
86 86
134 134
439 439
71 71
47 47
58 58
147 147
68 68
182 182
59 59
82 82
100 100
89 89
68 68
42 42
85 85
41 41
92 92
44 44
124 124
71 71
90 90
61 61
70 70
90 90
34 34
147 147
75 75
51 51
117 117
155 155
42 42
143 143
80 80
3 3
99 99
263 263
54 54
113 113
106 106
78 78
35 35
167 167
203 203
96 96
162 162
82 82
62 62
30 30
72 72
87 87
44 44
158 158
156 156
191 191
66 66
95 95
83 83
56 56
73 73
68 68
74 74
28 28
71 71
14 14
271 271
47 47
30 30
58 58
44 44
67 67
117 117
97 97
64 64
28 28
279 279
62 62
83 83
55 55
75 75
50 50
34 34
59 59
103 103
586 586
132 132
90 90
76 76
131 131
39 39
58 58
87 87
101 101
109 109
41 41
105 105
19 19
74 74
320 320
588 588
274 274
102 102
174 174
54 54
244 244
184 184
26 2

In [128]:
print(startup_ax)

[1, 7, 8, 11, 17, 22, 29, 31, 39, 40, 41, 42, 52, 71, 72, 74, 78, 89, 95, 100, 102, 106, 108, 111, 113, 120, 122, 129, 130, 137, 139, 143, 144, 150, 154, 155, 162, 168, 170, 174, 187, 189, 191, 195, 208, 224, 225, 229, 237, 239, 241, 243, 244, 246, 247, 248, 249, 251, 256, 259, 261, 267, 268, 270, 272, 277, 279, 283, 284, 287, 289, 291, 299, 301, 309, 326, 328, 330, 334, 346, 347, 350, 354, 357, 369, 372, 380, 387, 393, 399, 404, 406, 415, 420, 423, 429, 438, 439, 443, 445, 446, 448, 455, 456, 459, 461, 463, 469, 472, 485, 488, 492, 494, 507, 510, 516, 519, 536, 544, 547, 550, 559, 566, 567, 570, 575, 586, 589, 590, 591, 596, 599, 600, 612, 615, 617, 625, 628, 638, 639, 646, 649, 652, 659, 661, 665, 666, 674, 675, 676, 686, 688, 689, 691, 693, 696, 697, 699, 706, 709, 716, 723, 726, 729, 731, 739, 740, 745, 749, 750, 754, 761, 762, 768, 775, 789, 790, 794, 805, 807, 808, 809, 821, 834, 836, 848, 860, 869, 872, 882, 886, 887, 888, 893, 899, 902, 905, 907, 917, 928, 933, 936, 944, 948, 9