In [None]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
from smart_open import smart_open
import re

dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
locale.setlocale(locale.LC_ALL, 'C')
all_lines = []

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    return norm_text

if not os.path.isfile('aclImdb/alldata-id.txt'):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            with smart_open(filename, 'wb') as f:
                f.write(r.content)
        # if error here, try `tar xfz aclImdb_v1.tar.gz` outside notebook, then re-run this cell
        tar = tarfile.open(filename, mode='r')
        tar.extractall()
        tar.close()
    else:
        print("IMDB archive directory already available without download.")

    # Collect & normalize test/train data

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]
all_lines=[]
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    return norm_text
# Collect & normalize test/train data
print("Cleaning up dataset...")
folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
for fol in folders:
    temp = u''
    newline = "\n".encode("utf-8")
    output = fol.replace('/', '-') + '.txt'
    # Is there a better pattern to use?
    txt_files = glob.glob(fol+'/*.txt')
    print(" %s: %i files" % (fol, len(txt_files)))
    with smart_open(output, "wb") as n:
        for i, txt in enumerate(txt_files):
            with smart_open(txt, "rb") as t:
                one_text = t.read().decode("utf-8")
                for c in control_chars:
                    one_text = one_text.replace(c, ' ')
                one_text = normalize_text(one_text)
                all_lines.append(one_text)
                n.write(one_text.encode("utf-8"))
                n.write(newline)

# Save to disk for instant re-use on any future runs
with smart_open('alldata-id.txt', 'wb') as f:
    for idx, line in enumerate(all_lines):
        num_line = u"_*{0} {1}\n".format(idx, line)
        f.write(num_line.encode("utf-8"))

print("Success, alldata-id.txt is available for next steps.")

Cleaning up dataset...
 train/pos: 12500 files




In [1]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from smart_open import smart_open

# this data object class suffices as a `TaggedDocument` (with `words` and `tags`) 
# plus adds other state helpful for our later evaluation/reporting
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []
with smart_open('alldata-id.txt', 'rb', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]  # 25k train, 25k test, 25k extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))



100000 docs: 25000 train-sentiment, 25000 test-sentiment


In [2]:
from random import shuffle
doc_list = alldocs[:]  
shuffle(doc_list)

In [3]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs=3, workers=cores),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs=3, workers=cores, alpha=0.05, comment='alpha=0.05'),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=2, sample=0, 
            epochs=3, workers=cores),
]

for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dbow,d100,n5,mc2,t8) vocabulary scanned & state initialized
Doc2Vec(dm/m,d100,n5,w10,mc2,t8) vocabulary scanned & state initialized
Doc2Vec(dm/c,d100,n5,w5,mc2,t8) vocabulary scanned & state initialized


In [4]:
import numpy as np
import statsmodels.api as sm
from random import sample
    
def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodel logistic predictor on supplied data"""
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, 
                         reinfer_train=False, reinfer_test=False, 
                         infer_steps=None, infer_alpha=None, infer_subsample=0.2):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets = [doc.sentiment for doc in train_set]
    if reinfer_train:
        train_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in train_set]
    else:
        train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if reinfer_test:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [5]:
from collections import defaultdict
error_rates = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [6]:
for model in simple_models: 
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)
    
    print("\nEvaluating %s" % model)
    err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    print("\n%f %s\n" % (err_rate, model))

Training Doc2Vec(dbow,d100,n5,mc2,t8)

Evaluating Doc2Vec(dbow,d100,n5,mc2,t8)


  train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
  test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]



0.146240 Doc2Vec(dbow,d100,n5,mc2,t8)

Training Doc2Vec(dm/m,d100,n5,w10,mc2,t8)

Evaluating Doc2Vec(dm/m,d100,n5,w10,mc2,t8)

0.211760 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)

Training Doc2Vec(dm/c,d100,n5,w5,mc2,t8)

Evaluating Doc2Vec(dm/c,d100,n5,w5,mc2,t8)

0.376800 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)



In [7]:
c=1
for model in simple_models: 
    model.save(str(c))
    c+=1

In [8]:
print("Err_rate Model")
for rate, name in sorted((rate, name) for name, rate in error_rates.items()):
    print("%f %s" % (rate, name))

Err_rate Model
0.146240 Doc2Vec(dbow,d100,n5,mc2,t8)
0.211760 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)
0.376800 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)


In [9]:
import random
doc_id = random.randint(0, len(test_docs))  # Pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s \n %s' % (model, model.dv.most_similar([inferred_docvec], topn=3),alldocs[doc_id].words))

for doc 20157...
Doc2Vec(dbow,d100,n5,mc2,t8):
 [(20157, 0.940758228302002), (71138, 0.8800333142280579), (63987, 0.8795598149299622)] 
 ['this', 'title', 'seems', 'more', 'like', 'a', 'filming', 'exercise', 'than', 'a', 'film', 'that', 'should', 'have', 'been', 'released', 'to', 'be', 'seen', 'by', 'the', 'public', '.', 'for', 'dafoe', 'and', 'his', 'wife', 'it', 'must', 'have', 'been', 'fun', 'working', 'together', 'in', 'a', 'film', 'for', 'the', 'first', 'time', ',', 'without', 'taking', 'into', 'consideration', 'that', 'people', 'might', 'actually', 'watch', 'it', '.', 'i', 'felt', 'like', 'it', 'was', '90mins', 'wasted', 'as', 'i', 'waited', 'anxiously', 'for', 'a', 'plot', 'to', 'develop', ',', 'or', 'even', 'begin', '.', 'try', 'to', 'fit', 'this', 'film', 'into', 'a', 'genre', 'and', 'you', "won't", ',', 'because', 'it', 'lacks', 'a', 'beginning', ',', 'middle', 'or', 'ending', '.', "i've", 'seen', "'arty'", 'movies', 'before', 'and', 'this', "doesn't", 'even', 'come', 'close'

In [15]:
word_models = simple_models[:]
import random
from IPython.display import HTML
# pick a random word with a suitable number of occurences
while True:
    word = random.choice(word_models[0].wv.index_to_key)
    if word_models[0].wv.key_to_index[word] > 10:
        break
# or uncomment below line, to just pick a word from the relevant domain:
#word = 'comedy/drama'
similars_per_model = [str(model.wv.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in word_models]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in word_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.key_to_index[word]))
HTML(similar_table)

most similar words for 'goodbye' (6157 occurences)


"Doc2Vec(dbow,d100,n5,mc2,t8)","Doc2Vec(dm/m,d100,n5,w10,mc2,t8)","Doc2Vec(dm/c,d100,n5,w5,mc2,t8)"
"[('parrish', 0.46560177206993103), ('career-dead', 0.4251338839530945), ('scarp', 0.42053326964378357), ('ojibway', 0.39109230041503906), ('anti-competitive', 0.3903639018535614), (""shaffer's"", 0.3867054581642151), ('splashed', 0.386642724275589), ('rigors', 0.37911680340766907), ('on-point', 0.376544713973999), (""lessons'"", 0.3757546544075012), ('nick@night', 0.3698621690273285), (""'back"", 0.36863091588020325), ('benita', 0.3670118749141693), ('wildman', 0.3651280403137207), ('draco', 0.36431047320365906), ('football', 0.3607681393623352), ('gamely', 0.35533466935157776), (""dench's"", 0.35527360439300537), ('haryanvi', 0.3539436161518097), (""resume's"", 0.3528524339199066)]","[('farewell', 0.7002571225166321), ('good-bye', 0.5677174925804138), ('hello', 0.5641282796859741), ('adieu', 0.5598751306533813), ('beaver', 0.5391549468040466), ('stairway', 0.535651683807373), ('havana', 0.5307672619819641), ('needless', 0.5158049464225769), ('paraphrase', 0.5150619745254517), ('hi', 0.5129165053367615), ('inmate/pilot', 0.5123285055160522), ('trouby', 0.5072649717330933), ('hush', 0.5031053423881531), ('denver', 0.5025613903999329), ('hereafter', 0.5008232593536377), ('yi-che', 0.49571216106414795), ('nahi', 0.49128982424736023), ('reply', 0.4890081584453583), ('cuddle', 0.48873332142829895), ('tuesday', 0.4840763211250305)]","[('goodnight', 0.6841808557510376), ('hello', 0.6766211986541748), ('farewell', 0.5839443802833557), (""'penis'"", 0.5594010353088379), ('good-bye', 0.5414263010025024), ('bye', 0.5398421883583069), ('incoherently', 0.5338912010192871), ('afterward', 0.5278379321098328), (""'yes'"", 0.5205431580543518), ('whaaa', 0.5163118839263916), ('brana', 0.509772002696991), ('belonging', 0.5021269917488098), ('kuch', 0.49723172187805176), (""'bizarre"", 0.49608299136161804), ('afterwards', 0.493028461933136), (""'welcome"", 0.4914415776729584), ('hi', 0.4895091652870178), ('whoa', 0.48633822798728943), ('-couple', 0.4839484691619873), ('lonnrot', 0.48105746507644653)]"


In [11]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[2]])