In [18]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# random
from random import shuffle

# classifier
from sklearn.linear_model import LogisticRegression

In [19]:
# the class to load the data
class LabeledLineSentence(object):
    def __init__(self,sources):
        self.sources = sources
        
        # to make sure that keys are unique
        flipped = {}
        for key,value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
                
    def __iter__(self):
        for source,prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no,line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(),[prefix + '_%s' % item_no])
                    
    def to_array(self):
        self.sentences = []
        for source,prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no,line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(),[prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentence_perm(self):
        shuffle(self.sentences)
        return self.sentences
    
    def data_size(self):
        self.size = len(self.sentences)
        return self.size

In [20]:
sources = {'test-neg.txt':'TEST_NEG','test-pos.txt':'TEST_POS','train-neg.txt':'TRAIN_NEG','train-pos.txt':'TRAIN_POS','train-unsup.txt':'TRAIN_UNS'}
sentences = LabeledLineSentence(sources)

In [21]:
model = Doc2Vec(min_count=1,window=10,size=100,sample=1e-4,negative=5,workers=8)

model.build_vocab(sentences.to_array())



In [23]:
model.train(sentences.sentence_perm(),sentences.data_size(),epochs=10)

In [24]:
model.most_similar('good')

  """Entry point for launching an IPython kernel.


[('great', 0.7953191995620728),
 ('decent', 0.7480731010437012),
 ('bad', 0.7437841296195984),
 ('nice', 0.7267850637435913),
 ('really', 0.6648749113082886),
 ('but', 0.6587772965431213),
 ('fine', 0.6404907703399658),
 ('okay', 0.6154918670654297),
 ('ok', 0.6082793474197388),
 ('solid', 0.6078991293907166)]

In [25]:
model['TRAIN_NEG_0']

array([-8.1449628e-02, -1.2080881e-01, -1.6393578e-02, -3.2506067e-02,
       -1.1495809e-01,  4.1353401e-02, -1.7466456e-01,  2.5273144e-01,
        1.2826523e-01,  6.5643370e-02, -1.3284428e-01,  9.6570075e-02,
        2.4982466e-01, -2.9778197e-02, -2.1619467e-02, -1.3065597e-01,
        9.4116375e-02, -3.3287060e-02,  2.8271976e-01, -3.5795206e-01,
       -2.4556493e-02,  1.5345135e-01, -8.8969700e-02,  2.9506502e-01,
       -5.3405393e-02,  1.7110260e-01, -1.2507200e-01, -5.5582793e-03,
       -1.5700355e-01,  1.4176077e-02,  2.1745965e-01,  1.8602523e-01,
       -7.8293987e-02,  1.1045813e-01,  2.7930164e-03, -3.8413096e-02,
       -1.5410967e-04, -3.4835632e-03, -1.9273332e-01, -1.1314550e-01,
        7.5516471e-04, -3.4545515e-02,  1.0198006e-01, -6.9790304e-02,
        6.7190170e-02,  5.7642076e-02,  2.9831430e-02,  4.6888407e-02,
       -1.7281073e-01,  7.2444782e-02, -2.5020510e-01,  2.5306782e-04,
        1.5087141e-01, -6.4567998e-02, -2.5147947e-02, -1.7890921e-01,
      

In [26]:
model['TRAIN_NEG_1']

array([ 0.41424492, -0.22864938,  0.56521887,  0.09944438,  0.18952218,
       -0.00676533, -0.13417138,  0.79027873,  0.17253539,  0.04632005,
       -0.35245946,  0.17284836,  0.7389112 , -0.26180723,  0.01310386,
       -0.08789073, -0.31169692, -0.04699206,  0.9776067 , -0.5450748 ,
       -0.05935624, -0.15328032, -0.7886594 ,  0.9146491 ,  0.3332732 ,
       -0.03769577, -0.16853772,  0.3795821 , -0.02370242,  0.16264299,
        0.23740599,  0.4598789 ,  0.37023434,  0.12731685, -0.43478373,
       -0.46394598, -0.28952488,  0.16053693, -0.41447666,  0.16364542,
        0.35293925, -0.0305146 , -0.35157254,  0.40345263, -0.27706018,
        0.5159742 ,  0.83918136, -0.32310107, -0.13694666,  0.30507025,
       -0.29070017,  0.57944703,  0.24109359,  0.04800017,  0.2511528 ,
       -0.62800866,  0.83147895, -0.13214229, -0.03927199, -0.50905716,
       -0.14243847,  0.2190909 , -0.46284288, -0.13849251,  0.20964257,
       -0.0161692 ,  0.321623  , -0.431447  , -0.07591576, -0.48

In [27]:
model.save('./imdb.d2v')

In [29]:
model = Doc2Vec.load('./imdb.d2v')

In [32]:
# to construct the dataset of classification
train_arrays = numpy.zeros((25000,100))
train_labels = numpy.zeros(25000)

for i in range(12500):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model[prefix_train_pos]
    train_arrays[12500 + i] = model[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0

In [34]:
print(train_arrays)

[[ 0.09243025 -0.24560335  0.28744516 ... -0.17074208 -0.10332243
   0.23346883]
 [ 0.2137818  -0.53485626  0.03239717 ... -0.21181569 -0.07500508
   0.17904328]
 [-0.13854299 -0.02330245  0.15953882 ... -0.12011004  0.026761
  -0.18551254]
 ...
 [ 0.21171957 -0.01099468  0.35562286 ... -0.04002251 -0.04518218
   0.05994232]
 [ 0.23212874 -0.27640426  0.14737687 ... -0.01750329  0.03695849
   0.02526121]
 [ 0.13487029 -0.25067493  0.13125062 ...  0.04233909 -0.02014057
   0.08592685]]


In [35]:
print(train_labels)

[1. 1. 1. ... 0. 0. 0.]


In [36]:
test_arrays = numpy.zeros((25000,100))
test_labels = numpy.zeros(25000)

for i in range(12500):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i] = model[prefix_test_pos]
    test_arrays[12500 + i] = model[prefix_test_neg]
    test_labels[i] = 1
    test_labels[12500 + i] = 0

In [37]:
# to construct the model of classification
classifier = LogisticRegression()
classifier.fit(train_arrays,train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
classifier.score(test_arrays,test_labels)

0.84988