# CS5339 Project

# Part 1. Data Cleaning

In [9]:
import pandas as pd

from tempfile import mkstemp
from shutil import move, copymode
from os import fdopen, remove
import re
from sklearn.linear_model import LogisticRegression

def replace(file_path):
    #Create temp file
    fh, abs_path = mkstemp()
    with fdopen(fh,'w') as new_file:
        with open(file_path) as old_file:
            for line in old_file:
                new_file.write(re.sub("  " , " ", line))

    #Copy the file permissions from the old file to the new file
    copymode(file_path, abs_path)
    #Remove original file
    remove(file_path)
    #Move new file
    move(abs_path, file_path)

replace('aclImdb/test-neg.txt')
replace('aclImdb/train-pos.txt')
replace('aclImdb/test-pos.txt')
replace('aclImdb/test-neg.txt')
replace('aclImdb/train-unsup.txt')

train_neg = pd.read_csv('aclImdb/train-neg.txt', header = None, delimiter = "\n")
train_pos = pd.read_csv('aclImdb/train-pos.txt', header = None, delimiter = "\n")
train_unsup = pd.read_csv('aclImdb/train-unsup.txt', header = None, delimiter = "\n")
test_pos = pd.read_csv('aclImdb/test-pos.txt', header = None, delimiter = "\n")
test_neg = pd.read_csv('aclImdb/test-neg.txt', header = None, delimiter = "\n")


all_train = pd.concat([train_neg, train_pos, train_unsup], ignore_index=True)

all_train.head(10)

Unnamed: 0,0
0,story of a man who has unnatural feelings for ...
1,airport '77 starts as a brand new luxury 747 p...
2,this film lacked something i couldn't put my f...
3,"sorry everyone , , , i know this is suppose..."
4,when i was little my parents took me along to ...
5,""" it appears that many critics find the idea ..."
6,the second attempt by a new york intellectual ...
7,"i don't know who to blame , the timid writers..."
8,this film is mediocre at best . angie harmon ...
9,the film is bad . there is no other way to sa...


In [3]:
type(all_train)

pandas.core.frame.DataFrame

## Load dataset
https://ai.stanford.edu/~amaas/data/sentiment/

In [4]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # will hold all docs in original order
with open('aclImdb/alldata-id.txt') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','test','extra','extra'][line_no//25000]  # 25k train, 25k test, 50k unlabled data
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # for reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

100000 docs: 25000 train-sentiment, 25000 test-sentiment


In [5]:
# Check first document 
doc_list[0]

SentimentDocument(words=['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'teachers', '"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'teachers', '"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '

# Part 2. Model

## two main categories of doc vector models: PV-DM and PV-DBOW

In [6]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
print(cores)
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM/concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

8




Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)


In [8]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

# Like the author mentioned in the paper, we can concatenate different Doc2Vec models
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

print(models_by_name)

OrderedDict([('Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x17b45d7d0>),
             ('Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x17b45d890>),
             ('Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)',
              <gensim.models.doc2vec.Doc2Vec at 0x17b59de90>),
             ('dbow+dmm',
              <gensim.test.test_doc2vec.ConcatenatedDoc2Vec at 0x105b09950>),
             ('dbow+dmc',
              <gensim.test.test_doc2vec.ConcatenatedDoc2Vec at 0x105b09810>)])

## Prediction

In [11]:
from collections import defaultdict
best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved

import numpy as np
import statsmodels.api as sm
from random import sample

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    #print(predictor.summary())
    return predictor

# to train a logistic regressor
def sk_logistic_regressor(y, X ):
    lr = LogisticRegression()
    lr.fit(X, y)
    return lr

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    
#     predictor = logistic_predictor_from_data(train_targets, train_regressors)
    predictor = sk_logistic_regressor(train_targets, train_regressors)  
    # use doccument vector in training set to train logistic regressor

    test_data = test_set
    
    # case 1. for inference, a given sentence is feeded to the model, and a inferred doc vector will be given
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    
    # case 2. for evaluation, we use doc vectors in testing set to examine whether our logistic regressor performs well
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

## Training word vectors and doc vectors

In [13]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 12)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results
    
    for name, train_model in models_by_name.items():
        # train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list,total_examples=100000, epochs=epoch)
            duration = '%.1f' % elapsed()
            
        # evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2020-04-18 13:10:24.846813
*0.506000 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 0.0s 0.3s
*0.501600 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 0.0s 8.1s
*0.506000 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 0.0s 0.3s
*0.504800 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 0.0s 3.3s
*0.506000 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 0.0s 0.3s
*0.490800 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 0.0s 6.3s
*0.506040 : 1 passes : dbow+dmm 0.0s 0.6s
*0.516800 : 1 passes : dbow+dmm_inferred 0.0s 7.7s
*0.506040 : 1 passes : dbow+dmc 0.0s 0.7s
*0.505600 : 1 passes : dbow+dmc_inferred 0.0s 11.8s
completed pass 1 at alpha 0.025000
*0.431960 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 26.9s 0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.263920 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 14.5s 0.5s
*0.265440 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 21.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.223880 : 2 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.262040 : 2 passes : dbow+dmc 0.0s 0.9s
completed pass 2 at alpha 0.023000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.329120 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 47.4s 0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.130840 : 3 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 27.1s 0.6s
*0.193720 : 3 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 41.6s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.125960 : 3 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.132080 : 3 passes : dbow+dmc 0.0s 1.4s
completed pass 3 at alpha 0.021000
*0.252120 : 4 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 65.3s 0.4s
*0.111480 : 4 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 39.3s 0.4s
*0.169320 : 4 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 61.3s 0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.109440 : 4 passes : dbow+dmm 0.0s 0.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.110840 : 4 passes : dbow+dmc 0.0s 0.9s
completed pass 4 at alpha 0.019000
*0.202000 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 86.6s 0.3s
*0.205600 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 86.6s 8.4s
*0.107840 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 52.1s 0.3s
*0.117200 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 52.1s 3.0s
*0.157040 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 81.4s 0.3s
*0.185200 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 81.4s 3.8s
*0.106440 : 5 passes : dbow+dmm 0.0s 0.9s
*0.118400 : 5 passes : dbow+dmm_inferred 0.0s 7.2s
*0.107720 : 5 passes : dbow+dmc 0.0s 0.8s
*0.120400 : 5 passes : dbow+dmc_inferred 0.0s 11.9s
completed pass 5 at alpha 0.017000
*0.181280 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 107.3s 0.3s
*0.105360 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 66.9s 0.4s
*0.151320 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 102.1s 0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.104640 : 6 passes : dbow+dmm 0.0s 1.0s
*0.105480 : 6 passes : dbow+dmc 0.0s 0.8s
completed pass 6 at alpha 0.015000
*0.170160 : 7 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 128.3s 0.2s
*0.105160 : 7 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 76.8s 0.3s
*0.147240 : 7 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 122.1s 0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.104120 : 7 passes : dbow+dmm 0.0s 0.9s
*0.104680 : 7 passes : dbow+dmc 0.0s 0.8s
completed pass 7 at alpha 0.013000
*0.163320 : 8 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 148.4s 0.2s
*0.104600 : 8 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 90.3s 0.3s
*0.143480 : 8 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 141.9s 0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.104480 : 8 passes : dbow+dmm 0.0s 0.9s
*0.104080 : 8 passes : dbow+dmc 0.0s 1.2s
completed pass 8 at alpha 0.011000
*0.161880 : 9 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 169.4s 0.2s
 0.106000 : 9 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 103.1s 0.3s
*0.142200 : 9 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 162.3s 0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.105160 : 9 passes : dbow+dmm 0.0s 1.0s
 0.104200 : 9 passes : dbow+dmc 0.0s 0.7s
completed pass 9 at alpha 0.009000
 0.161920 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) 204.9s 0.3s
*0.186400 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred 204.9s 8.2s
 0.106280 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) 122.3s 0.3s
*0.103600 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred 122.3s 3.6s
*0.141440 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) 188.0s 0.3s
*0.182000 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred 188.0s 3.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 0.105440 : 10 passes : dbow+dmm 0.0s 1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


*0.100400 : 10 passes : dbow+dmm_inferred 0.0s 7.2s
 0.105480 : 10 passes : dbow+dmc 0.0s 1.4s
*0.109200 : 10 passes : dbow+dmc_inferred 0.0s 11.9s
completed pass 10 at alpha 0.007000


KeyboardInterrupt: 

In [14]:
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))

0.100400 dbow+dmm_inferred
0.103600 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)_inferred
0.104080 dbow+dmc
0.104120 dbow+dmm
0.104600 Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
0.109200 dbow+dmc_inferred
0.141440 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)
0.161880 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)
0.182000 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8)_inferred
0.186400 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8)_inferred


#### Conclusion: 
#### First, combined Doc2Vec model outperforms single Doc2Vec model. 
#### Second, different from the conclusion in original paper, PV-DBOW outperforms than PV-DM.

# Part 3. Evaluation

### Experiment (1): Given a sentence in corpus, let's see whether an inferred sentence is closed to precalculated sentence

In [20]:
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)

# have a look at what this randomly picked sentence is
sentence = " ".join( doc_list[doc_id].words )
print( "This sentence is : \n %s" % (sentence) )

for doc 10298...
This sentence is : 
 the magical life of long tack sam is as engaging as watching 90 minutes of disjointed super-8 home movies of people you've never met , except with none of the cute kids or humor that might redeem the home movies . consider it the blair witch project without a story and with no acting . the story relates the filmmaker's progress in documenting the life of her distant grandfather who was a famous vaudeville magician and acrobat . the angle is that , while long tack sam traveled in the same circuit at other vaudeville artists who are globally known , he is unknown - totally forgotten . the filmmaker attempts to flesh out the man and to explore why he is forgotten . having seen the movie , i understand why he is forgotten . his story , at least as related in the movie , is wholly unengaging , and the film does him no favors by droning endlessly trying to make him interesting , when in fact , he is totally undistinguishable from the hundreds of other tw

In [21]:
# from each doc vector model, find the doc vector most similar to the inferred doc vector of given sentence
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8):
 [(10298, 0.9198753237724304), (55042, 0.5310964584350586), (10303, 0.48414239287376404)]
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8):
 [(10298, 0.9822378158569336), (10303, 0.699144184589386), (32998, 0.5950869917869568)]
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8):
 [(10298, 0.8809810280799866), (3404, 0.674802303314209), (32909, 0.6560869216918945)]


### Experiment (2): Given a sentence the model has never seen, find the most similar sentence in corpus

In [23]:
# We try to build a new sentence by substuting some words in given sentences.

word_vocab = simple_models[0].wv.index2word  # the vocabulary containing all words
word_split = alldocs[doc_id].words

num_change = 70  # number of exchange randomly

print("Before change : \n %s" % (" ".join(word_split)) )

for i in range(0, num_change):
    # 1. random choose 2 different indices（对应选中两个不同位置的单词）
    a = np.random.randint(0, len(word_split) )
    b = np.random.randint(0, len(word_vocab) )

    # 2. exchange the corresponding words of these 2 indices
    word_split2[a] = word_vocab[b]

print("\nAfter change : \n %s" % (" ".join(word_split)) )

Before change : 
 " the muppets take manhattan " is different in a lot of ways to every other muppet movie made so far . for one , it remains the only muppet film not owned by disney . as of 2008 , the film still belongs to 20th century fox ( cbs fox at the time of its release ) even though disney owns the rights to the muppets . also , this film has a story line that's very non-linear , and events that are otherwise unpredictable . of course , it's very hard to beat the original " muppet movie " from 1979 , especially since that movie had more memorable songs than " manhattan " does . however , one way in which " the muppets take manhattan " is better than " the muppet movie " is perhaps the surprisingly realistic scenarios . in the first movie , all the muppets really have to do is go to hollywood , walk into an agent's office , and they are immediately given a " rich & famous " contract . in this movie , the muppets learn that they actually have to work for their desired success , a

In [35]:
for model in simple_models:
    inferred_docvec3 = model.infer_vector( word_split )
    print( "%s :\n %s" % (model, model.docvecs.most_similar( [inferred_docvec3], topn=3 )) )

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t8) :
 [(10298, 0.8788644075393677), (55042, 0.4959641098976135), (60613, 0.44720274209976196)]
Doc2Vec(dbow,d100,n5,mc2,s0.001,t8) :
 [(10298, 0.8548401594161987), (32998, 0.5482186675071716), (10303, 0.5395064949989319)]
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t8) :
 [(10298, 0.785269021987915), (3404, 0.5528035163879395), (90676, 0.5512930154800415)]
