# CS5339 Project

# Part 1. Data Cleaning

In [9]:
import pandas as pd

from tempfile import mkstemp
from shutil import move, copymode
from os import fdopen, remove
import re
from sklearn.linear_model import LogisticRegression

def replace(file_path):
    #Create temp file
    fh, abs_path = mkstemp()
    with fdopen(fh,'w') as new_file:
        with open(file_path) as old_file:
            for line in old_file:
                new_file.write(re.sub("  " , " ", line))

    #Copy the file permissions from the old file to the new file
    copymode(file_path, abs_path)
    #Remove original file
    remove(file_path)
    #Move new file
    move(abs_path, file_path)

replace('aclImdb/test-neg.txt')
replace('aclImdb/train-pos.txt')
replace('aclImdb/test-pos.txt')
replace('aclImdb/test-neg.txt')
replace('aclImdb/train-unsup.txt')

train_neg = pd.read_csv('aclImdb/train-neg.txt', header = None, delimiter = "\n")
train_pos = pd.read_csv('aclImdb/train-pos.txt', header = None, delimiter = "\n")
train_unsup = pd.read_csv('aclImdb/train-unsup.txt', header = None, delimiter = "\n")
test_pos = pd.read_csv('aclImdb/test-pos.txt', header = None, delimiter = "\n")
test_neg = pd.read_csv('aclImdb/test-neg.txt', header = None, delimiter = "\n")


all_train = pd.concat([train_neg, train_pos, train_unsup], ignore_index=True)

all_train.head(10)

Unnamed: 0,0
0,story of a man who has unnatural feelings for ...
1,airport '77 starts as a brand new luxury 747 p...
2,this film lacked something i couldn't put my f...
3,"sorry everyone , , , i know this is suppose..."
4,when i was little my parents took me along to ...
5,""" it appears that many critics find the idea ..."
6,the second attempt by a new york intellectual ...
7,"i don't know who to blame , the timid writers..."
8,this film is mediocre at best . angie harmon ...
9,the film is bad . there is no other way to sa...


In [3]:
type(all_train)

pandas.core.frame.DataFrame

## Load dataset
https://ai.stanford.edu/~amaas/data/sentiment/

In [329]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # will hold all docs in original order
with open('aclImdb/alldata-id.txt') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','test','extra','extra'][line_no//25000]  # 25k train, 25k test, 50k unlabled data
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # for reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

100000 docs: 25000 train-sentiment, 25000 test-sentiment


In [202]:
# Check first document 
doc_list[0]

SentimentDocument(words=['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'teachers', '"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'teachers', '"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '

# Part 2. Model

## two main categories of doc vector models: PV-DM and PV-DBOW

In [336]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
print(cores)
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM/concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

8




Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)


In [337]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

# Like the author mentioned in the paper, we can concatenate different Doc2Vec models
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

print(models_by_name)

OrderedDict([('Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)', <gensim.models.doc2vec.Doc2Vec object at 0x2b183b510>), ('Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)', <gensim.models.doc2vec.Doc2Vec object at 0x2b183b5d0>), ('Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)', <gensim.models.doc2vec.Doc2Vec object at 0x2b183b650>), ('dbow+dmm', <gensim.test.test_doc2vec.ConcatenatedDoc2Vec object at 0x2b1839650>), ('dbow+dmc', <gensim.test.test_doc2vec.ConcatenatedDoc2Vec object at 0x227bda150>)])


## Prediction

In [469]:
from collections import defaultdict
best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved

import numpy as np
import statsmodels.api as sm
from random import sample

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    #print(predictor.summary())
    return predictor

# to train a logistic regressor
def sk_logistic_regressor(y, X ):
    lr = LogisticRegression()
    lr.fit(X, y)
    return lr

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    
#     predictor = logistic_predictor_from_data(train_targets, train_regressors)
    predictor = sk_logistic_regressor(train_targets, train_regressors)  
    # use doccument vector in training set to train logistic regressor

    test_data = test_set
    
    # case 1. for inference, a given sentence is feeded to the model, and a inferred doc vector will be given
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    
    # case 2. for evaluation, we use doc vectors in testing set to examine whether our logistic regressor performs well
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_set]
    test_regressors = sm.add_constant(test_regressors)
    
    # predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

## Training word vectors and doc vectors

In [339]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 10)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results
    
    for name, train_model in models_by_name.items():
        # train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list,total_examples=100000, epochs=epoch)
            duration = '%.1f' % elapsed()
            
        # evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2020-04-18 23:45:15.543186
*0.506000 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 0.0s 0.3s
*0.512000 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 0.0s 8.9s
*0.506000 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 0.0s 0.2s
*0.498800 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 0.0s 32.4s
*0.506000 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 0.0s 0.3s
*0.498800 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 0.0s 4.5s
*0.506040 : 1 passes : dbow+dmm 0.0s 0.6s
*0.490800 : 1 passes : dbow+dmm_inferred 0.0s 7.1s
*0.506040 : 1 passes : dbow+dmc 0.0s 0.6s
*0.492800 : 1 passes : dbow+dmc_inferred 0.0s 12.8s
completed pass 1 at alpha 0.025000
*0.441840 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 25.9s 0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.263840 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 13.8s 0.5s
*0.265000 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 21.5s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.225280 : 2 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.263080 : 2 passes : dbow+dmc 0.0s 1.0s
completed pass 2 at alpha 0.022600


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.337840 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 50.4s 0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.130160 : 3 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 28.1s 0.5s
*0.191560 : 3 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 41.7s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.125520 : 3 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.128840 : 3 passes : dbow+dmc 0.0s 0.9s
completed pass 3 at alpha 0.020200
*0.256840 : 4 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 69.4s 0.3s
*0.112600 : 4 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 40.7s 16.3s
*0.168760 : 4 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 65.3s 0.3s
*0.111200 : 4 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.111720 : 4 passes : dbow+dmc 0.0s 1.1s
completed pass 4 at alpha 0.017800
*0.209080 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 108.3s 0.5s
*0.220400 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 108.3s 8.7s
*0.104960 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 61.3s 0.4s
*0.113200 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 61.3s 3.5s
*0.160840 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 91.7s 0.3s
*0.201200 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 91.7s 4.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.105360 : 5 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.112400 : 5 passes : dbow+dmm_inferred 0.0s 7.7s
*0.105080 : 5 passes : dbow+dmc 0.0s 0.8s
*0.104400 : 5 passes : dbow+dmc_inferred 0.0s 11.9s
completed pass 5 at alpha 0.015400
*0.186240 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 128.5s 0.3s
*0.103080 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 79.7s 0.4s
*0.154200 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 112.6s 0.3s
*0.104320 : 6 passes : dbow+dmm 0.0s 1.0s
*0.102760 : 6 passes : dbow+dmc 0.0s 31.7s
completed pass 6 at alpha 0.013000
*0.177320 : 7 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 157.7s 0.5s
 0.103560 : 7 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 92.8s 0.4s
*0.151720 : 7 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 138.5s 0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.104080 : 7 passes : dbow+dmm 0.0s 1.0s
 0.103880 : 7 passes : dbow+dmc 0.0s 1.1s
completed pass 7 at alpha 0.010600


KeyboardInterrupt: 

In [417]:
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))

0.102760 dbow+dmc
0.103080 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
0.104080 dbow+dmm
0.104400 dbow+dmc_inferred
0.112400 dbow+dmm_inferred
0.113200 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred
0.151720 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)
0.177320 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
0.201200 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred
0.207000 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred


In [341]:
# save the model
count = 0
for model in simple_models:
    model.save("models/Doc2Vec_{}.model".format(str(count+1) ))
    count += 1

#### Conclusion: 
#### First, combined Doc2Vec model outperforms single Doc2Vec model. 
#### Second, different from the conclusion in original paper, PV-DBOW outperforms than PV-DM.

# Part 3. Evaluation

### Experiment (1): Given a sentence in corpus, let's see whether an inferred sentence is closed to precalculated sentence

In [352]:
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)

# have a look at what this randomly picked sentence is
sentence = " ".join( doc_list[doc_id].words )
print( "This sentence is : \n %s" % (sentence) )

for doc 85840...
This sentence is : 
 in the middle of nowhere at a bar , the costumers including barkeeper ( clu gulagar ) are enjoying the booze until a mysterious man breaks in and warns them of creatures coming to kill them . they don't believe in him until a bizarre creature kills one of the costumers as each person must try to stay alive and inside the bar as possible to ward off these flesh-eating beasts . exciting and gory-as-hell action-horror comedy is one of the better horror flicks of late . from executive producers wes craven , chris moore , ben affleck and matt damon this is an adrenaline pumping roller-coaster that harks back to the old school style monster movie days . sure the movie may seem plot less even with no explanation of where the monsters come from until the sequels , but it's good quality entertainment for fans of the genre . the film co-stars pulp fiction's dwayne witaker , judah friedlander , henry rollins and balthazar getty ( " alias " ) as they give all 

In [353]:
# from each doc vector model, find the doc vector most similar to the inferred doc vector of given sentence
print('for doc (doc_id=) %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))

for doc (doc_id=) 85840...
Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8):
 [(85840, 0.8668636083602905), (38419, 0.4256206154823303), (15248, 0.4232979118824005)]
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8):
 [(85840, 0.9439302682876587), (27649, 0.7646265625953674), (81444, 0.7600319385528564)]
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8):
 [(28388, 0.9203461408615112), (27931, 0.8856551647186279), (13568, 0.8851773738861084)]


### Experiment (2): Given a sentence the model has never seen, find the most similar sentence in corpus

In [348]:
# We try to build a new sentence by substuting some words in given sentences.
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc; re-run cell for more examples

word_vocab = simple_models[1].wv.index2word  # the vocabulary containing all words
word_split = alldocs[doc_id].words

change_ratio = 0.1
num_change = int( len(word_split) * change_ratio ) # number of exchange randomly
print("Length of doc(doc_id={}) is {}, change_ratio = {}, number of changed word = {}\n".format
      (doc_id, len(word_split), change_ratio, num_change ) )

print("Before change : \n %s" % (" ".join(word_split)) )

for i in range(0, num_change):
    # 1. random choose 2 different indices（对应选中两个不同位置的单词）
    a = np.random.randint(0, len(word_split) )
    b = np.random.randint(0, len(word_vocab) )

    # 2. exchange the corresponding words of these 2 indices
    word_split[a] = word_vocab[b]

print("\nAfter change : \n %s" % (" ".join(word_split)) )

Length of doc(doc_id=28891) is 156, change_ratio = 0.1, number of changed word = 15

Before change : 
 i went to see this because i have some friends in the ukraine . but the film moved me beyond what i expected by turning out to be a perfect blend of belly holding laughs ( alex's strange use of english ) situational comedy and heaviness bordering on depressing . i loved the range . it made me want to jump in to an old car and hit the road for the ukraine . alex ( hutz ) plays his guide part perfectly and provides a great counterpoint to elijah woods' poker faced earnestness . the film shows the positive side of humanity when ppl of differing cultures can bond and do the right thing when they feel the sincerity of the situation , even when they went into it with preconceived notions and prejudices , and how this can open up doorways into deeply buried memories . there is a lot in this film .

After change : 
 i went to see this because i ironside some friends in the ukraine . but the f

In [350]:
print("Length of doc(doc_id={}) is {}, change_ratio = {}, number of changed word = {}\n".format
      (doc_id, len(word_split), change_ratio, num_change ) )

for model in simple_models:
    inferred_docvec3 = model.infer_vector( word_split )
    print( "%s :\n %s" % (model, model.docvecs.most_similar( [inferred_docvec3], topn=3 )) )

Length of doc(doc_id=28891) is 156, change_ratio = 0.1, number of changed word = 15

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) :
 [(28891, 0.7933717966079712), (63339, 0.4593014121055603), (97994, 0.4356613755226135)]
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) :
 [(28891, 0.9143658876419067), (66472, 0.5530301332473755), (30821, 0.5515593886375427)]
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) :
 [(28388, 0.7515352368354797), (34497, 0.7475504279136658), (97459, 0.7347123026847839)]


#### Repaet this replacing experiment for multiple times, get average result

In [351]:
exp_num = 100  # numebr of experiment
change_ratio = 0.2

counts_model = [0, 0, 0]

for i in range(exp_num):
    doc_id = np.random.randint(simple_models[0].docvecs.count)
    
    word_vocab = simple_models[1].wv.index2word  # the vocabulary containing all words(list)
    word_split = alldocs[doc_id].words

    num_change = int( len(word_split) * change_ratio ) # number of exchange randomly

    for i in range(0, num_change):
        # 1. random choose 2 different indices（对应选中两个不同位置的单词）
        a = np.random.randint(0, len(word_split) )
        b = np.random.randint(0, len(word_vocab) )

        # 2. exchange the corresponding words of these 2 indices
        word_split[a] = word_vocab[b]

#     print("Experiment.no:{}, Length of doc(doc_id={}) is {}, change_ratio = {}, number of changed word = {}\n".format
#       (i+1, doc_id, len(word_split), change_ratio, num_change ) )
    
    # do inferring for each model
    count = 0
    for model in simple_models:
        inferred_docvec3 = model.infer_vector( word_split )
        similar_results = model.docvecs.most_similar( [inferred_docvec3], topn=3 )
        
        if doc_id in [ tup[0] for tup in similar_results ]:
            counts_model[count] += 1
        
#         print( "%s :\n %s" % (model,  similar_results ))
        
        count +=1 
#     print("\n\n")
    
print(counts_model)
print("Accuracy:{}".format( [ ele/exp_num for ele in counts_model ] ))

[100, 100, 45]
Accuracy:[1.0, 1.0, 0.45]


# ！！！ to do！！！
### Experiment(3): Does Doc2vec models perform well for inferring never-seen-before paragraphs?

#### (1) split data

In [431]:
import copy

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs2 = []  # will hold all docs in original order
with open('aclImdb/alldata-id.txt') as alldata:
    for line_no, line in enumerate(alldata):
        # 0 <= line_no < 100000 
        
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # `tags = [tokens[0]]` would also work at extra memory cost
        
        if line_no >= 0 and line_no < 5000:
            split = 'unseen_pos'
        elif line_no >= 12500 and line_no < 17500:
            split = 'unseen_neg'
        elif line_no >= 50000:
            split = 'extra'  # 25k train, 25k test, 50k unlabled data
        elif line_no >= 5000 and line_no < 12500:  # 7500
            split = 'test'
        elif line_no >= 17500 and line_no < 25000:
            split = 'test'
        else:
            split = 'train'
        
        # 0～12499：positive; 12500~24999:negetive; 25000~37499:positive; 37500~49999: negative
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        
        alldocs2.append(SentimentDocument(words, tags, split, sentiment))

train_docs2 = [doc for doc in alldocs2 if doc.split == 'train']
test_doc2 = [doc for doc in alldocs2 if doc.split == 'test']
# test_docs2 = [doc for doc in alldocs if doc.split == 'test']

unseen_pos_docs = [doc for doc in alldocs2 if doc.split == 'unseen_pos']
unseen_neg_docs = [doc for doc in alldocs2 if doc.split == 'unseen_neg']

doc_seen = [ele for ele in alldocs2 if
            (ele.split == 'train' or ele.split == 'test' or ele.split == 'extra')]  # the document the model has ever "seen"

print('%d docs: %d train-sentiment, %d test-sentiment, %d seen docs, %d unseen docs' % (len(alldocs2), len(train_docs2),
                                                                    len(test_doc2), len(doc_seen),
                                                                     len(unseen_pos_docs) + len(unseen_neg_docs)))



100000 docs: 25000 train-sentiment, 15000 test-sentiment, 90000 seen docs, 10000 unseen docs


In [432]:
doc_unseen = copy.deepcopy(unseen_pos_docs)
doc_unseen.extend( copy.deepcopy(unseen_neg_docs) )
shuffle(doc_seen)

print(doc_unseen.__len__())

10000


#### (2). build model

In [433]:
cores = multiprocessing.cpu_count()

simple_models2 = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),

    # PV-DBOW
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),

    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

simple_models2[0].build_vocab(alldocs2)  # PV-DM/concat requires one special NULL word so it serves as template

# speed setup by sharing results of 1st model's vocabulary scan
for model in simple_models2[1:]:
    model.reset_from(simple_models2[0])
    print(model)

models_by_name2 = OrderedDict((str(model), model) for model in simple_models2)  # 把三个单一模型加入进去

# # 把两个合成模型加入进去
models_by_name2['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models2[1], simple_models2[2]])
models_by_name2['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models2[1], simple_models2[0]])



Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)


In [434]:
simple_models2

[<gensim.models.doc2vec.Doc2Vec at 0x486f67250>,
 <gensim.models.doc2vec.Doc2Vec at 0x486f67350>,
 <gensim.models.doc2vec.Doc2Vec at 0x486f674d0>]

#### (3).Train Doc2vec model, and save the model

In [436]:
alpha, min_alpha, passes = (0.025, 0.001, 10)
alpha_delta = (alpha - min_alpha) / passes

model_save_path = "models2/"

best_error2 = defaultdict(lambda :1.0)
print("START %s" % datetime.datetime.now())

for epoch in range(passes):

    # doc_list就是之前定义的，其每个元素都是一个SentimentDocument
    shuffle(doc_seen)  # shuffling gets best results

    # models_by_name里面有5个模型（3单一 + 2合成）
    for name, train_model in models_by_name2.items():
        # train_model是当前所考察的模型

        # 1.train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha

        # 这是我们之前定义的上下文管理器
        with elapsed_timer() as elapsed:
            train_model.train(doc_seen, total_examples=900000, epochs=epoch)  # 每次训练的epoch数是递增的（不懂这样搞有什么意义）
            duration = '%.1f' % elapsed()  # 这里调用elapsed()，返回的是现在距离进入该上下文时的时间间隔
        # 所有句向量都训练好了（训练了"epoch"个epoch）

        # 2.evaluate
        eval_duration = ''

        # 这是我们之前定义的上下文管理器
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs2, test_doc2)

        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error2[name]:
            best_error2[name] = err
            best_indicator = '*'  # 如果该模型这一轮的训练误差比上一轮低，就打个星号
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))


        # 3. infer
        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs2, test_doc2, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error2[name + '_inferred']:
                best_error2[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

print("END %s" % str(datetime.datetime.now()))

# save the model
count = 0
for model in simple_models2:
    model.save("models2/Doc2Vec_{}.model".format(str(count+1) ))
    count += 1

START 2020-04-19 08:30:09.688344
*0.512800 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 0.0s 0.5s
*0.506000 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 0.0s 3.1s
*0.512800 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 0.0s 0.3s
*0.496667 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 0.0s 1.3s
*0.512800 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 0.0s 0.3s
*0.495333 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 0.0s 1.7s
*0.512933 : 1 passes : dbow+dmm 0.0s 0.5s
*0.502667 : 1 passes : dbow+dmm_inferred 0.0s 2.4s
*0.512933 : 1 passes : dbow+dmc 0.0s 0.4s
*0.480667 : 1 passes : dbow+dmc_inferred 0.0s 3.3s
completed pass 1 at alpha 0.025000
*0.433333 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 26.2s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.266467 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 13.0s 0.5s
*0.262067 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 19.8s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.224667 : 2 passes : dbow+dmm 0.0s 0.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.266333 : 2 passes : dbow+dmc 0.0s 0.9s
completed pass 2 at alpha 0.022600


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.331533 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 42.9s 0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.136000 : 3 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 25.3s 0.5s
*0.185400 : 3 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 37.9s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.128400 : 3 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.134800 : 3 passes : dbow+dmc 0.0s 0.9s
completed pass 3 at alpha 0.020200
*0.264067 : 4 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 61.8s 0.4s
*0.109600 : 4 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 37.5s 0.4s
*0.166467 : 4 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 56.5s 0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.109000 : 4 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.110800 : 4 passes : dbow+dmc 0.0s 1.0s
completed pass 4 at alpha 0.017800
*0.215400 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 81.5s 0.3s
*0.257333 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 81.5s 2.2s
*0.102867 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 49.4s 0.3s
*0.133333 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 49.4s 1.1s
*0.154800 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 74.9s 0.3s
*0.212000 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 74.9s 1.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.103467 : 5 passes : dbow+dmm 0.0s 0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.134000 : 5 passes : dbow+dmm_inferred 0.0s 2.6s
*0.105000 : 5 passes : dbow+dmc 0.0s 0.7s
*0.138000 : 5 passes : dbow+dmc_inferred 0.0s 3.5s
completed pass 5 at alpha 0.015400
*0.189067 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 101.8s 0.2s
*0.100267 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 61.9s 0.3s
*0.149333 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 93.8s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.100733 : 6 passes : dbow+dmm 0.0s 0.8s
*0.101333 : 6 passes : dbow+dmc 0.0s 98.5s
completed pass 6 at alpha 0.013000
*0.178933 : 7 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 143.9s 0.3s
 0.103267 : 7 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 87.2s 0.5s
*0.146200 : 7 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 115.3s 0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.102467 : 7 passes : dbow+dmm 0.0s 0.9s
 0.104000 : 7 passes : dbow+dmc 0.0s 0.7s
completed pass 7 at alpha 0.010600
*0.174733 : 8 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 143.0s 0.2s
 0.102933 : 8 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 86.7s 0.3s
*0.143267 : 8 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 130.4s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.101800 : 8 passes : dbow+dmm 0.0s 0.8s
 0.105267 : 8 passes : dbow+dmc 0.0s 0.7s
completed pass 8 at alpha 0.008200
*0.170733 : 9 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 162.5s 0.2s
 0.105000 : 9 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 99.0s 0.3s
*0.142467 : 9 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 152.2s 0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.103067 : 9 passes : dbow+dmm 0.0s 0.8s
 0.106000 : 9 passes : dbow+dmc 0.0s 0.7s
completed pass 9 at alpha 0.005800
*0.168600 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 182.4s 0.2s
*0.210000 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 182.4s 2.2s
 0.103933 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 110.3s 0.3s
 0.138667 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 110.3s 1.1s
 0.142600 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 168.1s 0.4s
 0.223333 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 168.1s 1.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.103467 : 10 passes : dbow+dmm 0.0s 0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.135333 : 10 passes : dbow+dmm_inferred 0.0s 2.7s
 0.105267 : 10 passes : dbow+dmc 0.0s 0.7s
*0.134667 : 10 passes : dbow+dmc_inferred 0.0s 3.4s
completed pass 10 at alpha 0.003400
END 2020-04-19 09:12:13.070043


In [500]:
for rate, name in sorted((rate, name) for name, rate in best_error2.items()):
    print("%f %s" % (rate, name))

0.100267 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
0.100733 dbow+dmm
0.101333 dbow+dmc
0.133333 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred
0.134000 dbow+dmm_inferred
0.134667 dbow+dmc_inferred
0.142467 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)
0.168600 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
0.210000 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred
0.212000 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred


#### Next, get inferred vector for unknown sentences, and use these inferred paragraph vectors to do sentiment analysis

In [502]:
### we can tune the hyperparameter: infer_steps
for model in simple_models2:
    error_rate, error_count, test_count, predicator = error_rate_for_model(model, 
                                                train_docs2, doc_unseen, infer_steps=3, infer=True, infer_subsample=1) 
    
    print('model:{}, error_rate:{}, error_count:{}, test_count:{}'.format(model, error_rate, error_count, test_count))


model:Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8), error_rate:0.1852, error_count:1852, test_count:10000
model:Doc2Vec(dbow,d100,n5,mc2,s0.001,t8), error_rate:0.1281, error_count:1281, test_count:10000
model:Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8), error_rate:0.2025, error_count:2025, test_count:10000


### Experiment (4): Word vector and word similarity

#### Doc2vec model is based on Word2Vec model. We will evaluate whether the similarity of word vector has relation to the meaning similarity of words

In [498]:
import random
from IPython.display import HTML

word_models = simple_models[:]

# pick a random word with a suitable number of occurences
while True:
    word = random.choice(word_models[0].wv.index2word)
    if word_models[0].wv.vocab[word].count > 10:
        break

similars_per_model = [str(model.most_similar(word, topn=10)).replace('), ','),<br>\n') for model in word_models]
similars_per_model

  if sys.path[0] == '':


["[('countries', 0.7504833340644836),<br>\n('cities', 0.685369610786438),<br>\n('towns', 0.6629876494407654),<br>\n('cultures', 0.6337417364120483),<br>\n('civilizations', 0.6286609172821045),<br>\n('armies', 0.6059571504592896),<br>\n('tribes', 0.5915517807006836),<br>\n('regimes', 0.5871071815490723),<br>\n('communities', 0.5733639001846313),<br>\n('denominations', 0.5725681781768799)]",
 "[('comedy-drama', 0.44841188192367554),<br>\n('brownstone', 0.41120976209640503),<br>\n('commentary', 0.4006381034851074),<br>\n('displaced', 0.394894540309906),<br>\n('maplins', 0.38663017749786377),<br>\n('brainy', 0.3835850656032562),<br>\n('loudspeakers', 0.382948637008667),<br>\n('advices', 0.3815535604953766),<br>\n('opiate', 0.37693917751312256),<br>\n('healer', 0.37368127703666687)]",
 '[(\'countries\', 0.699311375617981),<br>\n(\'governments\', 0.6510030031204224),<br>\n(\'cultures\', 0.6487419605255127),<br>\n(\'organizations\', 0.6240066289901733),<br>\n(\'economies\', 0.6165152788162231

In [499]:
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in word_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
HTML(similar_table)

most similar words for 'nations' (240 occurences)


"Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)","Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)","Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)"
"[('countries', 0.7504833340644836), ('cities', 0.685369610786438), ('towns', 0.6629876494407654), ('cultures', 0.6337417364120483), ('civilizations', 0.6286609172821045), ('armies', 0.6059571504592896), ('tribes', 0.5915517807006836), ('regimes', 0.5871071815490723), ('communities', 0.5733639001846313), ('denominations', 0.5725681781768799)]","[('comedy-drama', 0.44841188192367554), ('brownstone', 0.41120976209640503), ('commentary', 0.4006381034851074), ('displaced', 0.394894540309906), ('maplins', 0.38663017749786377), ('brainy', 0.3835850656032562), ('loudspeakers', 0.382948637008667), ('advices', 0.3815535604953766), ('opiate', 0.37693917751312256), ('healer', 0.37368127703666687)]","[('countries', 0.699311375617981), ('governments', 0.6510030031204224), ('cultures', 0.6487419605255127), ('organizations', 0.6240066289901733), ('economies', 0.6165152788162231), (""states'"", 0.6150784492492676), ('institutions', 0.5912648439407349), ('societies', 0.5903887748718262), ('rulers', 0.5901978015899658), ('populations', 0.5853989124298096)]"


### By repeating running the above 2 cells again and again, we can see the experiment results of different selected words.

### Conclusion:
#### (1) Generally, words with similar meaning have similar word vectors;
#### (2) The word vectors in PV-DM models outperforms in PV-DBOW. That's because when training DBOW, word vectors are not trained with paragrah vectors, because requires only using  paragraph vectors to predict words in context.  