### Sentiment analysis Doc2Vec

In [1]:
import gensim
from sklearn.model_selection import train_test_split  
import numpy as np
import glob
import os 

#### Read data

In [2]:
data_path = '../../imdb'
train_pos_path = os.path.join(data_path,'train','pos')
train_neg_path = os.path.join(data_path,'train','neg')
train_unsup_path = os.path.join(data_path,'train','unsup')
#test_pos_path = os.path.join(data_path,'test','pos')
#test_neg_path = os.path.join(data_path,'test','neg')
#test_unsup_path = os.path.join(data_path,'test','unsup')

In [3]:
def read_text(path):
    files = glob.glob(path+'/*.txt')
    sentances = list()
    for fi in files:
        with open(fi) as f:
            text = f.read()
        sentances.append(text)
    return sentances

In [4]:
train_pos_reviews = read_text(train_pos_path)
train_neg_reviews = read_text(train_neg_path)
train_unsup_reviews = read_text(train_unsup_path)
#test_pos_reviews = read_text(test_pos_path)
#test_neg_reviews = read_text(test_neg_path)
#test_unsup_reviews = read_text(test_unsup_path)

In [5]:
def clean_text(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]
    
    # treat punctuation as individual words 
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus  

In [6]:
x = np.concatenate((train_pos_reviews, train_neg_reviews),axis=0)
y = np.concatenate((np.ones(len(train_pos_reviews)), np.zeros(len(train_neg_reviews))))
x= clean_text(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
print(len(x_train)," ",len(y_train))

20000   20000


Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
a dummy index of the review.

In [8]:
## use labeledSentence object
#LabeledSentence = gensim.models.doc2vec.LabeledSentence
from gensim.models.doc2vec import TaggedDocument

In [9]:
def read_corpus(reviews):
    for i,doc in enumerate(reviews):
        yield TaggedDocument(doc,[i])

train_corpus = list(read_corpus(x_train))

In [12]:
import random
from gensim.models import Doc2Vec
import multiprocessing

In [14]:
cores = multiprocessing.cpu_count()
size = 400

model = Doc2Vec(size=size, min_count=2, iter=50)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

185751837

In [46]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

185762610

In [28]:
model.docvecs.most_similar(positive=[1],topn=10)

[(16000, 0.5273270010948181),
 (7256, 0.4762829840183258),
 (10868, 0.47512879967689514),
 (11088, 0.4725697338581085),
 (18840, 0.4579414427280426),
 (12502, 0.45151767134666443),
 (7169, 0.4514364004135132),
 (1340, 0.4424135684967041),
 (3230, 0.4380508065223694),
 (5637, 0.4369805157184601)]

In [45]:
model.wv.most_similar("stupid",topn=10)

[('bad', 0.4046088755130768),
 ('dumb', 0.3904520273208618),
 ('silly', 0.3591079115867615),
 ('lame', 0.3406074345111847),
 ('ridiculous', 0.3390560746192932),
 ('implausible', 0.30531254410743713),
 ('corny', 0.2954520285129547),
 ('unbelievable', 0.290984183549881),
 ('laughable', 0.2862737476825714),
 ('boring', 0.2851186692714691)]

In [59]:
model.wv.most_similar("stupid",topn=10)

[('dumb', 0.3871752917766571),
 ('bad', 0.38552621006965637),
 ('silly', 0.3671604096889496),
 ('ridiculous', 0.33124151825904846),
 ('laughable', 0.30433395504951477),
 ('boring', 0.3040004074573517),
 ('lame', 0.2810211777687073),
 ('predictable', 0.2790890336036682),
 ('awful', 0.27542972564697266),
 ('confusing', 0.2697737514972687)]

In [58]:
#Get training set vectors from our models
def getVecs(model, corpus, size):
    vecs = [model.docvecs[z].reshape((1, size)) for z in range(len(x_train))]
    return np.concatenate(vecs)

In [60]:
train_vecs_dm = getVecs(model,x_train,size)

In [66]:
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs_dm,y_train )

print ('Test Accuracy: %.2f'%lr.score(train_vecs_dm, y_train))

Test Accuracy: 0.83
