In [17]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
import smart_open
import time

# from https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [18]:
# Switch every locale category (LC_ALL) of this process to the 'C' locale.
locale.setlocale(locale.LC_ALL, 'C')

'C'

In [28]:
# Characters that are blanked out of every raw review before the corpus is
# split into lines. U+0085 (NEL) counts as a line boundary for
# str.splitlines(), so leaving it in would break one review into several lines.
#
# The major version is compared numerically: the string test
# `sys.version > '3'` is lexicographic and only works by accident.
if sys.version_info[0] >= 3:
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]  # noqa: F821 -- name only exists on Python 2

In [29]:
def normalize_text(text):
    """Lower-case ``text`` and simplify its punctuation for whitespace tokenizing.

    Steps, in order:

    * the text is lower-cased;
    * every HTML line break ``<br />`` becomes a single space;
    * each of ``: " , ( ) ! ? ;`` is replaced by a single space (the symbol
      itself is dropped, it is not kept as a token);
    * every ``.`` gets one space inserted in front of it, so a sentence-final
      full stop separates from the preceding word.

    Returns the normalized string.
    """
    # One translation table instead of eight successive replace() calls.
    to_space = {ord(symbol): u' ' for symbol in u':",()!?;'}
    lowered = text.lower().replace('<br />', ' ')
    return lowered.translate(to_space).replace('.', ' .')

In [30]:
# Root directory of the dataset; the cells below glob the raw review files
# under it and write the normalized corpus files into it.
dirname = '../Miniproject_test/aclImdb'

In [31]:
# Build the normalized corpus files once; if the combined file already exists
# the whole step is skipped and only the (near-zero) elapsed time is printed.
#
# time.clock() was removed in Python 3.8, so time.perf_counter() is used.
start = time.perf_counter()
# Derive the cache path from `dirname` so it cannot drift from the directory
# the files are actually written to below.
alldata_path = os.path.join(dirname, 'alldata-id.txt')
if not os.path.isfile(alldata_path):
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg']
    normalized_parts = []
    for fol in folders:
        # Collect the reviews in a list and join once; repeated `+=` on a
        # growing string is quadratic over tens of thousands of files.
        reviews = []
        output = fol.replace('/', '-') + '.txt'
        txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))  # get all text files
        for txt in txt_files:
            with smart_open.smart_open(txt, "rb") as t:
                t_clean = t.read().decode("utf-8")
            for c in control_chars:
                t_clean = t_clean.replace(c, ' ')
            # One review per line.
            reviews.append(t_clean + u"\n")
        temp_norm = normalize_text(u''.join(reviews))
        # Per-folder file, e.g. train/pos -> train-pos.txt
        with smart_open.smart_open(os.path.join(dirname, output), "wb") as n:
            n.write(temp_norm.encode("utf-8"))
        normalized_parts.append(temp_norm)
    alldata = u''.join(normalized_parts)
    # Combined file: every line is prefixed with a "_*<index>" id token.
    with smart_open.smart_open(alldata_path, 'wb') as f:
        for idx, line in enumerate(alldata.splitlines()):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))
end = time.perf_counter()
print("Total running time: ", end - start)

Total running time:  12.401086000000078


In [32]:
# Fail early if the combined corpus file is missing; the loading cell below reads it.
assert os.path.isfile("../Miniproject_test/aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"

In [33]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

# One record per review: token list, [line number] as the doc tag, the
# 'train'/'test' split it belongs to, and its sentiment label (1.0 / 0.0).
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

# The corpus file lists the folders in the order train/pos, train/neg,
# test/pos, test/neg, so split and sentiment follow from the line number alone.
DOCS_PER_SPLIT = 25000       # 25k train, 25k test
DOCS_PER_SENTIMENT = 12500   # [12.5K pos, 12.5K neg] * 2
SPLIT_NAMES = ['train', 'test']
SENTIMENT_VALUES = [1.0, 0.0, 1.0, 0.0]

alldocs = []  # Will hold all docs in original order

# The path is derived from `dirname` (same directory the corpus was written
# to), and the handle gets its own name instead of rebinding `alldata`.
with open(os.path.join(dirname, 'alldata-id.txt'), encoding='utf-8') as corpus_file:
    for line_no, line in enumerate(corpus_file):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]  # tokens[0] is the "_*<index>" id written by the build step
        tags = [line_no]
        split = SPLIT_NAMES[line_no // DOCS_PER_SPLIT]
        sentiment = SENTIMENT_VALUES[line_no // DOCS_PER_SENTIMENT]
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

50000 docs: 25000 train-sentiment, 25000 test-sentiment


In [34]:
# Sanity check: show the first parsed document (words, tags, split, sentiment).
print(alldocs[0])

SentimentDocument(words=['for', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', '.', 'imagine', 'a', 'movie', 'where', 'joe', 'piscopo', 'is', 'actually', 'funny', 'maureen', 'stapleton', 'is', 'a', 'scene', 'stealer', '.', 'the', 'moroni', 'character', 'is', 'an', 'absolute', 'scream', '.', 'watch', 'for', 'alan', 'the', 'skipper', 'hale', 'jr', '.', 'as', 'a', 'police', 'sgt', '.'], tags=[0], split='train', sentiment=1.0)


In [35]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

# Settings that are identical for all three models.
shared_params = dict(size=100, negative=5, hs=0, min_count=2, workers=cores)

simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **shared_params),
    # PV-DBOW
    Doc2Vec(dm=0, **shared_params),
    # PV-DM w/ average
    Doc2Vec(dm=1, dm_mean=1, window=10, **shared_params),
]

# The vocabulary is built once on the PV-DM/concat model (it requires one
# special NULL word, so it serves as the template) and shared with the others.
template_model = simple_models[0]
template_model.build_vocab(alldocs)
print(template_model)

for model in simple_models[1:]:
    model.reset_from(template_model)
    print(model)

# Keyed by each model's string form, in construction order.
models_by_name = OrderedDict(zip(map(str, simple_models), simple_models))

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)


In [36]:
# ConcatenatedDoc2Vec is imported from gensim's test module.
# NOTE(review): presumably it exposes the wrapped models' doc-vectors joined
# end to end -- confirm against the gensim source.
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
# PV-DBOW (simple_models[1]) paired with PV-DM/mean (simple_models[2])
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
# PV-DBOW (simple_models[1]) paired with PV-DM/concat (simple_models[0])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

In [37]:
# List every registered model, in insertion order.
for registered_model in models_by_name.values():
    print(registered_model)

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)
<gensim.test.test_doc2vec.ConcatenatedDoc2Vec object at 0x10ff0ec18>
<gensim.test.test_doc2vec.ConcatenatedDoc2Vec object at 0x10ff0edd8>


In [10]:
import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer

  from pandas.core import datetools


In [13]:
@contextmanager
def elapsed_timer():
    """Context manager yielding a zero-argument callable that reports seconds elapsed.

    While the ``with`` body runs, the callable returns the time since the
    block was entered. Once the block has exited it keeps returning the total
    duration of the block. The ``try/finally`` guarantees the reading is
    frozen even when the body raises; the exception still propagates.
    """
    start = default_timer()
    elapser = lambda: default_timer() - start
    try:
        # The extra lambda looks `elapser` up at call time, so the callable
        # handed to the caller picks up the rebinding done in `finally`.
        yield lambda: elapser()
    finally:
        end = default_timer()
        elapser = lambda: end - start

def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a logistic regression and return the fitted result.

    train_targets    -- the labels to predict
    train_regressors -- the inputs, i.e. the representational vectors

    The model is ``sm.Logit(train_targets, train_regressors)``, fitted with
    ``disp=0``. Call ``.summary()`` on the returned object to inspect the fit.
    """
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

# test_model is the doc2vec model
def error_rate_for_model(test_model, train_set, test_set,
                         infer=False, infer_steps=3,
                         infer_alpha=0.1, infer_subsample=0.1):
    """Report the sentiment error rate on ``test_set`` for a classifier fit on ``train_set``.

    A logistic regression is fitted on the model's stored doc-vectors
    (``test_model.docvecs``) of the training documents, then used to predict
    the sentiment of the test documents.

    test_model      -- the doc2vec model supplying the vectors
    train_set       -- documents (with ``.tags`` and ``.sentiment``) used for fitting
    test_set        -- documents used for evaluation
    infer           -- if True, test vectors come from ``infer_vector(doc.words, ...)``
                       rather than from the stored doc-vectors
    infer_steps     -- ``steps`` argument passed to ``infer_vector``
    infer_alpha     -- ``alpha`` argument passed to ``infer_vector``
    infer_subsample -- when inferring and < 1.0, only this fraction of
                       ``test_set`` (a random sample) is evaluated

    Returns ``(error_rate, errors, test_count, predictor)``.
    """
    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]])
                                            for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha)
                           for doc in test_data]
    else:
        # Iterate the documents that were passed in. This used to read the
        # global `test_docs`, silently ignoring the `test_set` argument.
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]
    test_regressors = sm.add_constant(test_regressors)  # Adds a column of ones to an array

    # Predict & evaluate: probabilities are rounded to 0/1 and compared to the labels
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [14]:
from collections import defaultdict
# Lowest error rate seen so far, keyed by model name. An unseen name starts
# at 1.0, so the first measured rate for it always counts as an improvement.
best_error = defaultdict(lambda: 1.0)

In [15]:
# Train every model for `passes` passes over the shuffled corpus, evaluating
# sentiment-classification error after each pass and recording the best rate
# per model in `best_error`.
from random import shuffle
import datetime

# Starting learning rate, nominal final learning rate, and number of passes.
alpha, min_alpha, passes = (0.025, 0.001, 6)
# Amount by which alpha is lowered after each pass.
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # Shuffling gets best results

    for name, train_model in models_by_name.items():
        # Train
        duration = 'na'
        # alpha == min_alpha: the learning rate is held fixed within this pass;
        # the decay happens between passes (see `alpha -= alpha_delta` below).
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()

        # Evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        # '*' marks a result that ties or beats the best so far for this model.
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        # Evaluation with inferred vectors runs only on the first pass and on
        # every fifth pass; its best rate is tracked under "<name>_inferred".
        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model,
                                                                                   train_docs, 
                                                                                   test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            # Strict '<' here, unlike the '<=' above: a tie is not marked as a new best.
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            # `duration` is still the training time measured above for this model.
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1,
                                                     name + '_inferred', duration, eval_duration))

    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2018-01-23 16:10:50.121029
*0.412120 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 33.3s 0.7s
*0.362400 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred 33.3s 7.4s
*0.297840 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 8.8s 0.9s
*0.230000 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred 8.8s 3.0s
*0.278080 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 13.2s 0.6s
*0.209200 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred 13.2s 4.0s
*0.254480 : 1 passes : dbow+dmm 0.0s 1.8s
*0.176800 : 1 passes : dbow+dmm_inferred 0.0s 7.8s
*0.288600 : 1 passes : dbow+dmc 0.0s 1.7s
*0.238800 : 1 passes : dbow+dmc_inferred 0.0s 10.7s
Completed pass 1 at alpha 0.025000
*0.367720 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4) 30.6s 0.6s
*0.175000 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,s0.001,t4) 8.7s 0.6s
*0.212320 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4) 13.1s 0.6s
*0.161560 : 2 passes : dbow+dmm 0.0s 1.9s
*0.173920 : 2 passes : dbow+

In [16]:
# Report the best error rate recorded for each model, lowest first
# (ties are ordered by model name).
print("Err rate Model")
ranking = sorted(best_error.items(), key=lambda entry: (entry[1], entry[0]))
for name, rate in ranking:
    print("%f %s" % (rate, name))

Err rate Model
0.120600 dbow+dmm
0.125680 dbow+dmc
0.125880 Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)
0.157200 dbow+dmm_inferred
0.159200 Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)_inferred
0.176120 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)
0.196800 Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)_inferred
0.219600 dbow+dmc_inferred
0.312440 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)
0.362400 Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)_inferred


### More fantastic tutorials
- http://linanqiu.github.io/2015/10/07/word2vec-sentiment/
- https://medium.com/scaleabout/a-gentle-introduction-to-doc2vec-db3e8c0cce5e