### Sentiment analysis Doc2Vec

In [1]:
import gensim
from sklearn.model_selection import train_test_split  
import numpy as np
import glob
import os 

#### Read data

In [2]:
data_path = '../../imdb'
train_pos_path = os.path.join(data_path,'train','pos')
train_neg_path = os.path.join(data_path,'train','neg')
train_unsup_path = os.path.join(data_path,'train','unsup')
#test_pos_path = os.path.join(data_path,'test','pos')
#test_neg_path = os.path.join(data_path,'test','neg')
#test_unsup_path = os.path.join(data_path,'test','unsup')

In [3]:
def read_text(path):
    files = glob.glob(path+'/*.txt')
    sentances = list()
    for fi in files:
        with open(fi) as f:
            text = f.read()
        sentances.append(text)
    return sentances

In [4]:
train_pos_reviews = read_text(train_pos_path)
train_neg_reviews = read_text(train_neg_path)
train_unsup_reviews = read_text(train_unsup_path)
#test_pos_reviews = read_text(test_pos_path)
#test_neg_reviews = read_text(test_neg_path)
#test_unsup_reviews = read_text(test_unsup_path)

In [5]:
def clean_text(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]
    
    # treat punctuation as individual words 
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus  

In [6]:
x = np.concatenate((train_pos_reviews, train_neg_reviews),axis=0)
y = np.concatenate((np.ones(len(train_pos_reviews)), np.zeros(len(train_neg_reviews))))
x= clean_text(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
print(len(x_train)," ",len(y_train))

20000   20000


Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
a dummy index of the review.

In [8]:
## use labeledSentence object
LabeledSentence = gensim.models.doc2vec.LabeledSentence

In [9]:
def labelizeReviews(reviews, label_type):
    labelized = []
    for i,v in enumerate(reviews):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeReviews(x_train, 'TRAIN')
x_test = labelizeReviews(x_test, 'TEST')

In [10]:
import random
from gensim.models import Doc2Vec
import multiprocessing

In [11]:

cores = multiprocessing.cpu_count()
size = 400

#instantiate our DM and DBOW models

# PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
model_dm = Doc2Vec(size=size, window=5, negative=5, hs=0, min_count=2, sample=1e-3,workers=cores)
# PV-DBOW 
model_dbow = Doc2Vec(size=size, negative=5, hs=0, min_count=2, sample=1e-3, workers=cores)


In [12]:
#build vocab over all reviews
model_dm.build_vocab(x_train+x_test)
model_dbow.build_vocab(x_train+x_test)

In [24]:

for epoch in range(10):
    input_data = random.shuffle(x_train)
    model_dm.train(total_examples=model_dm.corpus_count,epochs=model_dm.iter)
    model_dbow.train(input_data,total_examples=model_dm.corpus_count,epochs=model_dm.iter)

TypeError: train() missing 1 required positional argument: 'sentences'

In [18]:
perm = np.random.permutation(len(x_train))
perm.tolist()

[10260,
 9565,
 598,
 17803,
 3408,
 12838,
 635,
 6290,
 1674,
 4777,
 3416,
 17983,
 2518,
 491,
 2785,
 927,
 19077,
 13197,
 11808,
 3968,
 17155,
 16153,
 5652,
 15835,
 6737,
 14446,
 4341,
 15277,
 293,
 16213,
 12279,
 17950,
 4650,
 3631,
 16807,
 14006,
 3511,
 4456,
 6344,
 7518,
 7608,
 13829,
 8384,
 12053,
 6563,
 4130,
 18890,
 19370,
 1111,
 4271,
 7122,
 9661,
 13513,
 15025,
 4979,
 16693,
 13570,
 5440,
 8716,
 291,
 4779,
 19715,
 6445,
 7740,
 17424,
 4721,
 508,
 10590,
 6365,
 9561,
 4273,
 6577,
 7259,
 14424,
 11445,
 2914,
 1281,
 2578,
 3078,
 9202,
 14224,
 947,
 10845,
 11824,
 6855,
 5716,
 6313,
 19488,
 17568,
 1275,
 5018,
 11709,
 9889,
 17668,
 9755,
 12896,
 13385,
 15127,
 750,
 12834,
 10840,
 5741,
 13608,
 19463,
 2623,
 19546,
 6311,
 6994,
 813,
 1608,
 613,
 14406,
 11161,
 19606,
 19396,
 15661,
 21,
 5557,
 16597,
 2856,
 1942,
 4024,
 4554,
 1348,
 8761,
 12152,
 5228,
 11229,
 17437,
 8058,
 15297,
 1000,
 13443,
 5552,
 2161,
 12301,
 174

In [22]:
x_train[[0,1]]

TypeError: list indices must be integers or slices, not list