In [3]:
import glob
import os
import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Load the function corpus. The column names are supplied here, so the file is
# read as header-less: its first row is treated as data.
functions = pd.read_csv(
    'functions.csv',
    header=None,
    names=['id', 'document'],
)

In [5]:
# Whitespace-tokenise every document.
texts = [document.split() for document in functions['document']]

# Corpus-wide occurrence count of each token.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur more than once in the whole corpus.
# A document may end up as an empty token list after this step.
kept_tokens = {tok for tok, count in frequency.items() if count > 1}
texts = [[tok for tok in tokens if tok in kept_tokens] for tokens in texts]

# Token <-> id mapping and the bag-of-words representation of each document.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [4]:
# LSI model: project the bag-of-words corpus onto 200 latent topics and store
# one dense topic vector per document.

# Both save calls below write into model/; create it so a fresh checkout works.
os.makedirs('model', exist_ok=True)

lsi_model = models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=200)
lsi_model.save('model/lsi.model')

corpus_lsi = lsi_model[corpus]
# corpus2dense lays documents out as columns; transpose so each row is a document.
dv = gensim.matutils.corpus2dense(corpus_lsi, num_terms=lsi_model.num_topics).T
# np.save appends ".npy" when the name lacks it, so this lands in model/lsi.dv.npy.
np.save('model/lsi.dv', dv)

In [19]:
# LDA model: infer a 100-topic distribution per document and store it as a
# dense matrix with one row per document.

# np.save below writes into model/; create it so a fresh checkout works.
os.makedirs('model', exist_ok=True)

lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=100,
    dtype=np.float64,
    # Fixed seed so repeated runs give the same topics (the Doc2Vec cells are
    # seeded as well).
    random_state=1,
    # The default threshold (0.01) drops low-probability topics from
    # lda_model[corpus], which would leave truncated rows in the dense matrix.
    # Keep the full distribution instead.
    minimum_probability=0.0,
)

corpus_lda = lda_model[corpus]
# corpus2dense lays documents out as columns; transpose so each row is a document.
dv = gensim.matutils.corpus2dense(corpus_lda, num_terms=lda_model.num_topics).T
# np.save appends ".npy" when the name lacks it, so this lands in model/lda.dv.npy.
np.save('model/lda.dv', dv)

In [44]:
# PV-DBOW (Doc2Vec with dm=0): every token list is tagged with its position in
# `texts`, so row i of the saved matrix belongs to document i.
sentences = [
    TaggedDocument(words=tokens, tags=[doc_idx])
    for doc_idx, tokens in enumerate(texts)
]

# NOTE(review): gensim documents that `seed` alone is not enough for a fully
# deterministic run when workers > 1 -- confirm whether that matters here.
model = Doc2Vec(
    sentences,
    dm=0,
    vector_size=300,
    window=15,
    min_count=1,
    workers=4,
    epochs=20,
    sample=1e-3,
    seed=1,
)

# Persist the model, then read it back; the vectors below come from the
# reloaded instance.
model.save("model/pvdbow.model")
model = Doc2Vec.load("model/pvdbow.model")

dv = model.docvecs.vectors_docs
np.save('model/pvdbow.dv', dv)

In [43]:
# PV-DM (Doc2Vec with dm=1): every token list is tagged with its position in
# `texts`, so row i of the saved matrix belongs to document i.
sentences = [
    TaggedDocument(words=tokens, tags=[doc_idx])
    for doc_idx, tokens in enumerate(texts)
]

# NOTE(review): gensim documents that `seed` alone is not enough for a fully
# deterministic run when workers > 1 -- confirm whether that matters here.
model = Doc2Vec(
    sentences,
    dm=1,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
    epochs=20,
    sample=1e-3,
    seed=11,
)
model.save('model/pvdm.model')

dv = model.docvecs.vectors_docs
np.save('model/pvdm.dv', dv)

In [51]:
# WV-avg: represent each document by the sum of its word vectors, scaled to
# unit L2 length (documents with no remaining tokens stay all-zero).
num_features = 300
sentences = [doc.split() for doc in functions['document']]

# The word2vec model and the output matrix both live in model/.
os.makedirs('model', exist_ok=True)

# Reuse a previously trained model when present; otherwise train and cache it
# so the cell also runs on a fresh checkout.
if os.path.exists("model/word2vec.model"):
    model = Word2Vec.load("model/word2vec.model")
else:
    # NOTE(review): `iter` and `size` are the gensim 3.x keyword names, matching
    # the rest of this notebook -- confirm against the installed gensim version.
    model = Word2Vec(sentences, workers=4, hs=0, sg=1, negative=10, iter=25,
                     size=num_features, min_count=1, window=10, sample=1e-3, seed=1)
    model.save("model/word2vec.model")

wv = model.wv
# Size the accumulator from the model actually loaded, not from num_features:
# a cached model with a different dimensionality would otherwise fail on `+=`.
size = model.vector_size

dv = []
for text in texts:
    vec = np.zeros(size, dtype="float32")
    for word in text:
        vec += wv[word]
    norm = np.linalg.norm(vec)
    # Skip the division for zero vectors (e.g. documents left empty by pruning).
    if norm != 0:
        vec /= norm
    dv.append(vec)

# np.save appends ".npy" when the name lacks it, so this lands in model/avgvec.dv.npy.
np.save('model/avgvec.dv', np.array(dv))

In [22]:
# Function id stored at row label 10.
functions.loc[10, 'id']

5610

In [32]:
# Row label of the first entry whose id is 5610 (raises IndexError if absent).
functions.index[functions['id'] == 5610][0]

10

In [38]:
# Number of rows whose id is 53609 (0 means the id is absent).
# DataFrame.size would count cells (rows x columns), reporting 2 per match.
int((functions['id'] == 53609).sum())

0