In [1]:
import gensim
from sklearn.model_selection import train_test_split  
import numpy as np
import glob
import os 

### Data pre-process

In [2]:
data_path = '../../imdb'
train_pos_path = os.path.join(data_path,'train','pos')
train_neg_path = os.path.join(data_path,'train','neg')
train_unsup_path = os.path.join(data_path,'train','unsup')

In [3]:
def read_text(path):
    files = glob.glob(path+'/*.txt')
    sentances = list()
    for fi in files:
        with open(fi) as f:
            text = f.read()
        sentances.append(text)
    return sentances

In [4]:
train_pos_reviews = read_text(train_pos_path)
train_neg_reviews = read_text(train_neg_path)
train_unsup_reviews = read_text(train_unsup_path)

In [5]:
def clean_text(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]
    
    # treat punctuation as individual words 
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus  

In [6]:
x = np.concatenate((train_pos_reviews, train_neg_reviews),axis=0)
y = np.concatenate((np.ones(len(train_pos_reviews)), np.zeros(len(train_neg_reviews))))
x= clean_text(x)

In [10]:
len(x)

25000

### Now we can do doc2vect model 

In [13]:
from gensim import corpora, models
from gensim.models.doc2vec import TaggedDocument
import random
from gensim.models import Doc2Vec
#LabeledSentence = gensim.models.doc2vec.LabeledSentence

label all sentances 

In [12]:
def read_corpus(reviews):
    for i,doc in enumerate(reviews):
        yield TaggedDocument(doc,[i])

train_corpus = list(read_corpus(x))

In [14]:
cores = os.cpu_count()
size = 300
window = 10

model = Doc2Vec(size=size, min_count=4, window =window,iter=50)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

229839184

In [15]:
model.save('../../reviews.d2v')

In [41]:
model.docvecs.most_similar([now_doc],topn=10)

[(19237, 0.6252765655517578),
 (10212, 0.6087512969970703),
 (23786, 0.6073160767555237),
 (4257, 0.5982194542884827),
 (21356, 0.5819867849349976),
 (1090, 0.5811036229133606),
 (11509, 0.5805668234825134),
 (19927, 0.572235643863678),
 (19059, 0.5641859173774719),
 (1546, 0.5593593120574951)]

In [42]:
now_doc = model.infer_vector('this is a very interesting'.split())

In [43]:
now_doc

array([-0.10814448,  0.10383542, -0.0092326 , -0.09356308, -0.01124847,
       -0.03740203,  0.01803833, -0.05689717, -0.09740145, -0.04742213,
        0.06144497, -0.0207079 ,  0.08344452, -0.0102215 ,  0.01805961,
       -0.0563374 , -0.03795331, -0.02004271,  0.00814117,  0.00222164,
       -0.01514807, -0.05275096,  0.01659757, -0.05877526,  0.00673218,
       -0.24246559,  0.07701737, -0.13610932, -0.09650938,  0.03345409,
       -0.04590155, -0.22094624,  0.04422341, -0.09393533,  0.14385749,
        0.00740965,  0.09581789, -0.04488746,  0.01246224, -0.15887953,
       -0.00963537, -0.04535013, -0.07856664, -0.04914945, -0.05787019,
        0.12106705,  0.01034486, -0.03578044,  0.13452947, -0.00391577,
        0.14465956,  0.03680795, -0.02507572,  0.09778269, -0.08615296,
        0.02163956,  0.00292641, -0.02043228,  0.0804169 ,  0.12413591,
        0.0187016 ,  0.05675929,  0.03213715,  0.00652719,  0.0433692 ,
        0.01553363, -0.0638222 , -0.0215292 ,  0.09417614,  0.07

In [37]:
model.docvecs[1]

array([-0.67711502,  1.31172943, -0.072905  , -0.89918149, -0.8678568 ,
       -0.4304364 ,  0.93772459, -0.68208808,  0.37590054,  0.27221611,
       -0.05383255, -0.12778024, -0.4114587 , -0.20941962,  0.28855571,
        0.66473871, -0.79857826,  0.01300741, -0.14620067, -0.16101973,
       -1.42005539, -0.15119924, -0.11553811, -0.6446625 ,  0.7714867 ,
        0.55380434, -0.32386684,  0.59924847,  0.39897364, -0.84701961,
        0.88661528, -1.47857201,  0.4564923 ,  0.43879887,  0.59963828,
       -0.40678221,  0.94591957, -0.48353058, -0.72126746, -1.92311585,
       -1.81732571,  0.08316034,  0.49982268, -0.17522204, -0.17699605,
        0.67652327,  0.25639775,  0.08452005, -0.95197713,  1.34113288,
       -0.39507714,  0.06607235,  0.98143154, -0.20966935,  0.24187106,
        0.78817004, -0.62818182,  1.23096454,  0.10810765, -0.13854726,
        0.87149864,  0.82891649,  1.54508841, -0.2048618 , -0.36379874,
        0.05074835, -1.96004784,  0.23594297, -0.30373374,  0.14

In [38]:
model.docvecs.most_similar(1)

[(8105, 0.46626079082489014),
 (19927, 0.43985748291015625),
 (6419, 0.4132939279079437),
 (14796, 0.4051125943660736),
 (14649, 0.399934321641922),
 (7059, 0.39813166856765747),
 (5489, 0.3968348205089569),
 (17792, 0.39266782999038696),
 (9992, 0.3916124701499939),
 (286, 0.3911878168582916)]

In [40]:
gensim.similarity(now_doc,model.docvecs[1])

AttributeError: module 'gensim' has no attribute 'similarity'