### Sentiment analysis Doc2Vec

In [1]:
import gensim
from sklearn.model_selection import train_test_split  
import numpy as np
import glob
import os 
import imdb_preprocess

#### Read data

In [2]:
data_path = '../../imdb'
file_path = os.path.join(data_path,'imdb.p')
if os.path.exists(file_path): 
    print('load pickle data from disk')
    x,y = imdb_preprocess.load_processed_data(file_path)
else:
    print('process and load data')
    imdb_preprocess.save_processed_data(data_path)
    x,y = imdb_preprocess.load_processed_data(file_path)

load pickle data from disk


In [3]:
## train and test split data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(len(x_train)," ",len(y_train))

20000   20000


### Gensim doc2vec model

Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
a dummy index of the review.

In [4]:
## use labeledSentence object
#LabeledSentence = gensim.models.doc2vec.LabeledSentence
from gensim.models.doc2vec import TaggedDocument
import random
from gensim.models import Doc2Vec

In [5]:
def read_corpus(reviews):
    for i,doc in enumerate(reviews):
        yield TaggedDocument(doc,[i])

train_corpus = list(read_corpus(x_train))

In [None]:
cores = os.cpu_count()
size = 200
window = 5 
iteration = 20 

model = Doc2Vec(size=size, min_count=2, window = window,iter=iteration)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

## after you trained your model, you can train them again 
# model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

In [None]:
model.save(os.path.join(data_path,'trained','imdb.d2v'))
## save model 
if not os.path.exists(os.path.join(data_path,"trained")):
    os.makedirs(os.path.join(data_path,"trained"))
    model.save(os.path.join(data_path,'trained','imdb.d2v'))
else:
    model = Doc2Vec.load(os.path.join(data_path,'trained','imdb.d2v'))

### look at some samples 

In [8]:
model.docvecs.most_similar(positive=[1],topn=10)

[(4528, 0.6481720805168152),
 (6347, 0.6399887800216675),
 (7782, 0.6349166035652161),
 (280, 0.6317951083183289),
 (10009, 0.6220613718032837),
 (7220, 0.6116548180580139),
 (1867, 0.605474591255188),
 (7188, 0.604194164276123),
 (19419, 0.6015190482139587),
 (854, 0.5951851606369019)]

In [9]:
print(' '.join(x[1]) , '\n\n')
print(' '.join(x[4528]))

i had the tv on for a white noise companion and heard" $400 for a fully furnished apartment" so i ran into the tv room expecting another 70's flick and got much more . luckily , i could rewind to the beginning ( dvr buffer ) and hit the record button to watch it entirely . ( cinemax uncut and in hd no less ! ) aside from some holes in the story and intermittent improbable dialog/events , this is an effective thriller worthy of your time to watch . pretty creepy and progressive at times : beverly d'angelo's character masturbates in front of alison parker , played adroitly by cristina raines , parker stabs , in very gory fashion , her father , an explicit menage a trios scene . ( don't let the kids watch ) the film is totally 70's full of bad clothes ( polyester suits and tacky ascots ) and decor , bad hair , over bloated music score , and familiar looking cinematography . the cast is excellent , take a second on this film's home page to check it out . it was a surprise to see christophe

In [10]:
infer_review = 'I love this movie'

In [11]:
model.infer_vector(infer_review.split())

AttributeError: 'Doc2Vec' object has no attribute 'syn1neg'

### Use document vector for sentiment analysis

In [None]:
#Get training set vectors from our models
def getVecs(model, corpus, size):
    vecs = [model.docvecs[z].reshape((1, size)) for z in range(len(x_train))]
    return np.concatenate(vecs)

In [None]:
train_vecs_dm = getVecs(model,x_train,size)

In [None]:
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs_dm,y_train )

print ('Test Accuracy: %.2f'%lr.score(train_vecs_dm, y_train))