In [23]:
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [24]:
c=1
simple_models=[]
for model in range(3): 
    simple_models.append(Doc2Vec.load(str(c)))
    c+=1

In [25]:
from smart_open import smart_open
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
alldocs = []
with smart_open('alldata-id.txt', 'rb', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]  # 25k train, 25k test, 25k extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))



100000 docs: 25000 train-sentiment, 25000 test-sentiment


In [26]:
from random import shuffle
doc_list = alldocs[:]  
shuffle(doc_list)

In [27]:
import numpy as np
import statsmodels.api as sm
from random import sample
    
def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodel logistic predictor on supplied data"""
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, 
                         reinfer_train=False, reinfer_test=False, 
                         infer_steps=None, infer_alpha=None, infer_subsample=0.2):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets = [doc.sentiment for doc in train_set]
    if reinfer_train:
        train_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in train_set]
    else:
        train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if reinfer_test:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [28]:
from collections import defaultdict
error_rates = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [29]:
for model in simple_models: 
    print("\nEvaluating %s" % model)
    err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    print("\n%f %s\n" % (err_rate, model))


Evaluating Doc2Vec(dbow,d100,n5,mc2,t8)


  train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
  test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]



0.146240 Doc2Vec(dbow,d100,n5,mc2,t8)


Evaluating Doc2Vec(dm/m,d100,n5,w10,mc2,t8)

0.211760 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)


Evaluating Doc2Vec(dm/c,d100,n5,w5,mc2,t8)

0.376800 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)



In [30]:
print("Err_rate Model")
for rate, name in sorted((rate, name) for name, rate in error_rates.items()):
    print("%f %s" % (rate, name))

Err_rate Model
0.146240 Doc2Vec(dbow,d100,n5,mc2,t8)
0.211760 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)
0.376800 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)


In [35]:
word_models = simple_models[:]
import random
from IPython.display import HTML
# pick a random word with a suitable number of occurences

word = 'goodbye'
    
# or uncomment below line, to just pick a word from the relevant domain:
#word = 'comedy/drama'
similars_per_model = [str(model.wv.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in word_models]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in word_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.key_to_index[word]))
HTML(similar_table)

most similar words for 'goodbye' (6157 occurences)


"Doc2Vec(dbow,d100,n5,mc2,t8)","Doc2Vec(dm/m,d100,n5,w10,mc2,t8)","Doc2Vec(dm/c,d100,n5,w5,mc2,t8)"
"[('parrish', 0.46560177206993103), ('career-dead', 0.4251338839530945), ('scarp', 0.42053326964378357), ('ojibway', 0.39109230041503906), ('anti-competitive', 0.3903639018535614), (""shaffer's"", 0.3867054581642151), ('splashed', 0.386642724275589), ('rigors', 0.37911680340766907), ('on-point', 0.376544713973999), (""lessons'"", 0.3757546544075012), ('nick@night', 0.3698621690273285), (""'back"", 0.36863091588020325), ('benita', 0.3670118749141693), ('wildman', 0.3651280403137207), ('draco', 0.36431047320365906), ('football', 0.3607681393623352), ('gamely', 0.35533466935157776), (""dench's"", 0.35527360439300537), ('haryanvi', 0.3539436161518097), (""resume's"", 0.3528524339199066)]","[('farewell', 0.7002571225166321), ('good-bye', 0.5677174925804138), ('hello', 0.5641282796859741), ('adieu', 0.5598751306533813), ('beaver', 0.5391549468040466), ('stairway', 0.535651683807373), ('havana', 0.5307672619819641), ('needless', 0.5158049464225769), ('paraphrase', 0.5150619745254517), ('hi', 0.5129165053367615), ('inmate/pilot', 0.5123285055160522), ('trouby', 0.5072649717330933), ('hush', 0.5031053423881531), ('denver', 0.5025613903999329), ('hereafter', 0.5008232593536377), ('yi-che', 0.49571216106414795), ('nahi', 0.49128982424736023), ('reply', 0.4890081584453583), ('cuddle', 0.48873332142829895), ('tuesday', 0.4840763211250305)]","[('goodnight', 0.6841808557510376), ('hello', 0.6766211986541748), ('farewell', 0.5839443802833557), (""'penis'"", 0.5594010353088379), ('good-bye', 0.5414263010025024), ('bye', 0.5398421883583069), ('incoherently', 0.5338912010192871), ('afterward', 0.5278379321098328), (""'yes'"", 0.5205431580543518), ('whaaa', 0.5163118839263916), ('brana', 0.509772002696991), ('belonging', 0.5021269917488098), ('kuch', 0.49723172187805176), (""'bizarre"", 0.49608299136161804), ('afterwards', 0.493028461933136), (""'welcome"", 0.4914415776729584), ('hi', 0.4895091652870178), ('whoa', 0.48633822798728943), ('-couple', 0.4839484691619873), ('lonnrot', 0.48105746507644653)]"


In [34]:
import random
doc_id = random.randint(0, len(test_docs))  # Pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s \n %s' % (model, model.dv.most_similar([inferred_docvec], topn=3),alldocs[doc_id].words))

for doc 7032...
Doc2Vec(dbow,d100,n5,mc2,t8):
 [(7032, 0.9037784934043884), (80985, 0.8294853568077087), (32572, 0.825411856174469)] 
 ['enterprise', ',', 'the', 'latest', 'high', 'budget', 'spin-off', 'to', 'the', 'most', 'successful', 'franchise', 'in', 'film', 'and', 'or', 'television', 'history', 'opens', 'to', 'the', 'tune', 'of', 'a', '90-minute', 'episode', 'called', "'broken", "bow'", '.', 'first', 'we', 'are', 'swept', 'into', 'a', 'massive', 'action', 'sequence', 'with', 'a', 'klingon', 'being', 'chased', 'by', 'some', 'suliban', '(', 'who', 'are', 'the', 'main', 'enemy', 'in', 'the', 'first', 'season', 'of', 'the', 'show', ')', '.', 'from', 'there', 'the', 'televised', 'movie', 'takes', 'us', 'on', 'a', 'journey', 'that', 'seldom', 'gets', 'as', 'good', 'as', 'it', 'is', ',', 'with', 'some', 'of', 'the', 'best', 'character', 'development', ',', 'story', 'and', 'action/visual', 'effects', 'ever', 'seen', 'in', 'such', 'a', 'short', 'amount', 'of', 'time', '.', 'the', 'opening

Doc2Vec(dm/m,d100,n5,w10,mc2,t8):
 [(7032, 0.7789039015769958), (98233, 0.6074587106704712), (62973, 0.5919082760810852)] 
 ['enterprise', ',', 'the', 'latest', 'high', 'budget', 'spin-off', 'to', 'the', 'most', 'successful', 'franchise', 'in', 'film', 'and', 'or', 'television', 'history', 'opens', 'to', 'the', 'tune', 'of', 'a', '90-minute', 'episode', 'called', "'broken", "bow'", '.', 'first', 'we', 'are', 'swept', 'into', 'a', 'massive', 'action', 'sequence', 'with', 'a', 'klingon', 'being', 'chased', 'by', 'some', 'suliban', '(', 'who', 'are', 'the', 'main', 'enemy', 'in', 'the', 'first', 'season', 'of', 'the', 'show', ')', '.', 'from', 'there', 'the', 'televised', 'movie', 'takes', 'us', 'on', 'a', 'journey', 'that', 'seldom', 'gets', 'as', 'good', 'as', 'it', 'is', ',', 'with', 'some', 'of', 'the', 'best', 'character', 'development', ',', 'story', 'and', 'action/visual', 'effects', 'ever', 'seen', 'in', 'such', 'a', 'short', 'amount', 'of', 'time', '.', 'the', 'opening-credits', 

Doc2Vec(dm/c,d100,n5,w5,mc2,t8):
 [(24784, 0.612922728061676), (7032, 0.605194628238678), (41347, 0.5834512114524841)] 
 ['enterprise', ',', 'the', 'latest', 'high', 'budget', 'spin-off', 'to', 'the', 'most', 'successful', 'franchise', 'in', 'film', 'and', 'or', 'television', 'history', 'opens', 'to', 'the', 'tune', 'of', 'a', '90-minute', 'episode', 'called', "'broken", "bow'", '.', 'first', 'we', 'are', 'swept', 'into', 'a', 'massive', 'action', 'sequence', 'with', 'a', 'klingon', 'being', 'chased', 'by', 'some', 'suliban', '(', 'who', 'are', 'the', 'main', 'enemy', 'in', 'the', 'first', 'season', 'of', 'the', 'show', ')', '.', 'from', 'there', 'the', 'televised', 'movie', 'takes', 'us', 'on', 'a', 'journey', 'that', 'seldom', 'gets', 'as', 'good', 'as', 'it', 'is', ',', 'with', 'some', 'of', 'the', 'best', 'character', 'development', ',', 'story', 'and', 'action/visual', 'effects', 'ever', 'seen', 'in', 'such', 'a', 'short', 'amount', 'of', 'time', '.', 'the', 'opening-credits', 'is