In [119]:
from gensim.models import Word2Vec, KeyedVectors, TfidfModel
from gensim.parsing.preprocessing import STOPWORDS
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cosine
import numpy as np

## Using pre-tained word2vec model

Download the pretrained model form [here](https://github.com/RaRe-Technologies/gensim-data/releases/tag/glove-wiki-gigaword-200)

In [120]:
word2vec_model = KeyedVectors.load_word2vec_format("glove-wiki-gigaword-200.gz")

In [3]:
np.shape(word2vec_model.syn0)

  """Entry point for launching an IPython kernel.


(400000, 200)

For doc2vec and sent2vec we'll train the model with [this dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

IMDB Dataset: 100,000 movie-reviews.

## Preprocessing with dataset

In [4]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
import smart_open

dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
locale.setlocale(locale.LC_ALL, 'C')

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text

import time
start = time.clock()

if not os.path.isfile('aclImdb/alldata-id.txt'):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            with open(filename, 'wb') as f:
                f.write(r.content)
        tar = tarfile.open(filename, mode='r')
        tar.extractall()
        tar.close()

    # Concatenate and normalize test/train data
    print("Cleaning up dataset...")
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
    alldata = u''
    for fol in folders:
        temp = u''
        output = fol.replace('/', '-') + '.txt'
        # Is there a better pattern to use?
        txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))
        for txt in txt_files:
            with smart_open.smart_open(txt, "rb") as t:
                t_clean = t.read().decode("utf-8")
                for c in control_chars:
                    t_clean = t_clean.replace(c, ' ')
                temp += t_clean
            temp += "\n"
        temp_norm = normalize_text(temp)
        with smart_open.smart_open(os.path.join(dirname, output), "wb") as n:
            n.write(temp_norm.encode("utf-8"))
        alldata += temp_norm

    with smart_open.smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:
        for idx, line in enumerate(alldata.splitlines()):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))

end = time.clock()
print ("Total running time: ", end-start)

('Total running time: ', 0.001635999999990645)


In [2]:
import os.path
assert os.path.isfile("aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"

In [66]:
# -*- coding: utf-8 -*-
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # Will hold all docs in original order
with open('aclImdb/alldata-id.txt') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]  # 25k train, 25k test, 25k extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

100000 docs: 25000 train-sentiment, 25000 test-sentiment


## Training Doc2Vec

In [3]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

doc2vec_model = Doc2Vec(dm=1, dm_concat=1, vector_size=200, window=5, negative=5, hs=0, min_count=2, workers=cores)
doc2vec_model.build_vocab(alldocs)
models_by_name = OrderedDict((str(model), model) for model in [doc2vec_model])

In [8]:
import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

  from pandas.core import datetools


In [9]:
from collections import defaultdict
best_error = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [None]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # Shuffling gets best results
    
    for name, train_model in models_by_name.items():
        # Train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()
            
        # Evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

In [None]:
# Print best error rates achieved
print("Err rate Model")
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))

In [None]:
doc2vec_model.save('doc2vec_model')

In [4]:
from gensim.models import Doc2Vec
doc2vec_model = Doc2Vec.load('doc2vec_model')

## Training Sent2Vec model

In [29]:
from gensim import utils
import smart_open

all_docs = []
with smart_open.smart_open('aclImdb/alldata-id.txt') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = line.split()
        remove = ['(', ')', '"','?','!']
        tokens = tokens[1:]
        tokens = [word for word in tokens if word not in STOPWORDS]
        tokens = [word for word in tokens if word not in remove]
        tokens = [word for word in tokens if word.isalpha()]
        all_docs.append(tokens)
print(all_docs[0])
temp = u""
with smart_open.smart_open('sent2vec.txt', 'w') as f:
    for review in all_docs:
        for item in review:
            f.write("%s " % item)
        f.write("\n")

['bizarre', 'horror', 'movie', 'filled', 'famous', 'faces', 'stolen', 'cristina', 'raines', 'later', 'flamingo', 'road', 'pretty', 'somewhat', 'unstable', 'model', 'gummy', 'smile', 'slated', 'pay', 'attempted', 'suicides', 'guarding', 'gateway', 'hell', 'scenes', 'raines', 'modeling', 'captured', 'mood', 'music', 'perfect', 'deborah', 'raffin', 'charming', 'pal', 'raines', 'moves', 'creepy', 'brooklyn', 'heights', 'brownstone', 'inhabited', 'blind', 'priest', 'floor', 'things', 'start', 'cooking', 'neighbors', 'including', 'fantastically', 'wicked', 'burgess', 'meredith', 'kinky', 'couple', 'sylvia', 'miles', 'beverly', 'diabolical', 'lot', 'eli', 'wallach', 'great', 'fun', 'wily', 'police', 'detective', 'movie', 'nearly', 'baby', 'exorcist', 'combination', 'based', 'jeffrey', 'konvitz', 'sentinel', 'entertainingly', 'spooky', 'shocks', 'brought', 'director', 'michael', 'winner', 'mounts', 'thoughtfully', 'downbeat', 'ending', 'skill']


In [32]:
! ../sent2vec-master/./fasttext sent2vec -input sent2vec.txt -output my_model -dropoutK 0 -dim 200 -epoch 9 -lr 0.2 -thread 10 -bucket 100000

Read 9M words
Number of words:  50771
Number of labels: 0
Progress: 73.8%  words/sec/thread: 34629  lr: 0.052383  loss: 2.594971  eta: 0h1m 3m m %  words/sec/thread: 12652  lr: 0.199283  loss: 4.122703  eta: 0h11m 0.4%  words/sec/thread: 13513  lr: 0.199201  loss: 4.043160  eta: 0h10m 9m 0.5%  words/sec/thread: 15865  lr: 0.198932  loss: 3.867796  eta: 0h9m 0h7m 0.9%  words/sec/thread: 20135  lr: 0.198257  loss: 3.626870  eta: 0h7m m 1.1%  words/sec/thread: 22442  lr: 0.197702  loss: 3.534271  eta: 0h6m h6m 0.196779  loss: 3.430077  eta: 0h5m 1.6%  words/sec/thread: 25071  lr: 0.196734  loss: 3.428211  eta: 0h5m 1.7%  words/sec/thread: 25485  lr: 0.196520  loss: 3.410576  eta: 0h5m   eta: 0h5m 5m 2.5%  words/sec/thread: 27728  lr: 0.195015  loss: 3.319271  eta: 0h5m 0h5m m 29899  lr: 0.192352  loss: 3.221656  eta: 0h4m %  words/sec/thread: 29973  lr: 0.192191  loss: 3.216617  eta: 0h4m 4.1%  words/sec/thread: 30220  lr: 0.191860  loss: 3.208455  eta: 0h4m 4.9%  words/sec/thread: 30792 

In [83]:
import sent2vec
sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model('my_model.bin')

## Training Classic LSI model

In [8]:
from gensim.corpora import Dictionary, MmCorpus

dictionary = Dictionary(line.split() for line in open('sent2vec.txt'))
print dictionary
corpus = [dictionary.doc2bow(text) for text in sentences]
MmCorpus.serialize('lsi_model.mm', corpus) 

Dictionary(131056 unique tokens: [u'fawn', u'tsukino', u'woode', u'nunnery', u'sonja']...)


In [9]:
from gensim.models import LsiModel
lsi_model = LsiModel(corpus,id2word=dictionary)
lsi_corpus = lsi_model[corpus]

In [13]:
print(lsi_model.print_topics(2))
print(lsi_corpus)

[(0, u'-0.447*"infinity" + -0.447*"war" + -0.447*"eagerly" + -0.447*"waiting" + -0.447*"avengers" + 0.000*"baby" + -0.000*"attempted" + -0.000*"day" + -0.000*"completely" + -0.000*"different"'), (1, u'-0.640*"dog" + -0.396*"sample" + -0.396*"cat" + -0.396*"sentence" + -0.245*"day" + -0.245*"cute" + 0.000*"baby" + -0.000*"skipper" + 0.000*"attempted" + -0.000*"absolute"')]
<gensim.interfaces.TransformedCorpus object at 0x103c44150>


In [11]:
for doc in lsi_corpus:
    print(doc)

[(1, -1.8280155323464498), (3, 0.811393377776851)]
[(2, 1.7320508075688772)]
[(1, -1.1297757309528396), (3, -1.3128620634895272)]
[(0, -2.2360679774997907)]
[(4, -1.4142135623730954)]


## Implementation of mixture methods

In [113]:
def simple_average(sent):
    sents_emd = []
    for s in sent:
        sent_emd = []
        for w in s:
            if w in word2vec_model:
                sent_emd.append(word2vec_model[w])
        sent_emd_ar = np.array(sent_emd)
        sum_ = sent_emd_ar.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    return sents_emd

In [114]:
def tf_idf(sent):
    word_counter = {}
    total_count = 0
    no_of_sentences = 0
    for s in sent:
        for w in s:
            if w in word_counter:
                word_counter[w] = word_counter[w] + 1
            else:
                word_counter[w] = 1
        total_count = total_count + len(s)
        no_of_sentences = no_of_sentences +  1
    sents_emd = []
    for s in sent:
        sent_emd = []
        for word in s:
            tf = word_counter[word]/float(len(s))
            idf = np.log(no_of_sentences/float(1+ word_counter[word]))
            try:
                emd = tf*idf*word2vec_model[word]
                sent_emd.append(emd)
            except:
                continue
        sent_emd = np.array(sent_emd)
        sum_ = sent_emd.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    return sents_emd

Or we could use the TFIDF API from gensim

In [179]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary


def tf_idf_v2(sent):
    dct = Dictionary(sent)
    corpus = [dct.doc2bow(line) for line in sent]
    tf_idf_model = TfidfModel(corpus)
    vector = tf_idf_model[corpus]
    d = {dct.get(id): value for doc in vector for id, value in doc}
    sents_emd = []
    no_of_sent = sum(1 for i in sent)
    for i in xrange(no_of_sent):
        sent_emd = []
        for j in xrange(len(sent[i])):
            word = sent[i][j]
            if word in word2vec_model:
                emd = d[word]*word2vec_model[word]
                sent_emd.append(emd)
        sent_emd_np = np.array(sent_emd)
        sum_ = sent_emd_np.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    
    return sents_emd

In [180]:
def smooth_inverse_frequency(sent, a=0.001):
    word_counter = {}
    sentences = []
    total_count = 0
    no_of_sentences = 0
    for s in sent:
        for w in s:
            if w in word_counter:
                word_counter[w] = word_counter[w] + 1
            else:
                word_counter[w] = 1
        total_count = total_count + len(s)
        no_of_sentences = no_of_sentences + 1
    sents_emd = []
    for s in sent:
        sent_emd = []
        for word in s:
            if word in word2vec_model:
                emd = (a/(a + (word_counter[word]/total_count)))*word2vec_model[word]
                sent_emd.append(emd)
        sum_ = np.array(sent_emd).sum(axis=0)
        sentence_emd = sum_/float(no_of_sentences)
        sents_emd.append(sentence_emd)
    u  = np.array(svds(sents_emd, k=1))
    u = u[2]
    new_sents_emd = []
    for s in sents_emd:
        s = s - u.dot(u.transpose())*s
        new_sents_emd.append(s)
    return new_sents_emd

In [181]:
s1_s = "this is a sample sentence with cat and dog"
s1 = s1_s.lower().split()
s1 = [w for w in s1 if w not in STOPWORDS]
s2_s = "there was a time when computers were very expensive"
s2 = s2_s.lower().split()
s2 = [w for w in s2 if w not in STOPWORDS]
s3_s = "one more day with cute dog"
s3 = s3_s.lower().split()
s3 = [w for w in s3 if w not in STOPWORDS]
s4_s = "eagerly waiting for Avengers Infinity War"
s4 = s4_s.lower().split()
s4 = [w for w in s4 if w not in STOPWORDS]
s5_s = "this is a completely different"
s5 = s5_s.lower().split()
s5 = [w for w in s5 if w not in STOPWORDS]
sentences = [s1, s2, s3, s4, s5]
sentences_s = [s1_s, s2_s, s3_s, s4_s, s5_s,]
print(sentences, sentences_s)

([['sample', 'sentence', 'cat', 'dog'], ['time', 'computers', 'expensive'], ['day', 'cute', 'dog'], ['eagerly', 'waiting', 'avengers', 'infinity', 'war'], ['completely', 'different']], ['this is a sample sentence with cat and dog', 'there was a time when computers were very expensive', 'one more day with cute dog', 'eagerly waiting for Avengers Infinity War', 'this is a completely different'])


In [182]:
sentences_emd1 = smooth_inverse_frequency(sentences)
sentences_emd2 = tf_idf_v2(sentences)
sentences_emd3 = simple_average(sentences)

Benchmarking with cosine distance

In [183]:
d1 = cosine(sentences_emd1[0],sentences_emd1[2])
d2 = cosine(sentences_emd2[0],sentences_emd2[2])
d3 = cosine(sentences_emd3[0],sentences_emd3[2])
print("SIF: {} tfIdf: {} SimAvg: {}".format(d1, d2, d3))
d4 = cosine(sentences_emd1[1],sentences_emd1[3])
d5 = cosine(sentences_emd2[1],sentences_emd2[3])
d6 = cosine(sentences_emd3[1],sentences_emd3[3])
print("SIF: {} tfIdf: {} SimAvg: {}".format(d4, d5, d6))

SIF: 0.308996856213 tfIdf: 0.384991586208 SimAvg: 0.297450304031
SIF: 0.592167794704 tfIdf: 0.569366067648 SimAvg: 0.569366067648


In [184]:
doc_d1 = doc2vec_model.infer_vector(s1)
doc_d2 = doc2vec_model.infer_vector(s3)
print("doc2vec for s1 and s3: {}".format(cosine(doc_d1,doc_d2)))
doc_d3 = doc2vec_model.infer_vector(s1)
doc_d4 = doc2vec_model.infer_vector(s4)
print("doc2vec for s1 and s4: {}".format(cosine(doc_d3,doc_d4)))

doc2vec for s1 and s3: 0.599111407995
doc2vec for s1 and s4: 0.928267449141


In [185]:
embs_sent2vec = sent2vec_model.embed_sentences(sentences_s)
print("sent2vec for s1 and s3 {}".format(cosine(embs_sent2vec[0],embs_sent2vec[2])))
print("sent2vec for s1 and s4 {}".format(cosine(embs_sent2vec[0],embs_sent2vec[3])))

sent2vec for s1 and s3 0.374465703964
sent2vec for s1 and s4 0.9011329934


## Evaluation with SICK 2014

In [223]:
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error as mse

import os


def load_data(loc='./data/'):
    """
    Load the SICK dataset
    """
    trainA, trainB, devA, devB, testA, testB = [],[],[],[],[],[]
    trainS, devS, testS = [],[],[]
    with open(os.path.join(loc, 'SICK_test_annotated.txt'), 'rb') as f:
        for line in f:
            text = line.strip().split('\t')
            testA.append(text[1])
            testB.append(text[2])
            testS.append(text[3])
    testS = [float(s) for s in testS[1:]]

    return [testA[1:], testB[1:]], testS

def evaluate_sick(model, model_name, evaltest=1):
    test, scores = load_data()
    if evaltest:
            print 'Computing test sentence vectors...'
            if model_name == 'sent2vec':
                testA = np.array(model.embed_sentences(test[0]))
                testB = np.array(model.embed_sentences(test[1]))
            elif model_name == 'doc2vec':
                testA = np.array([model.infer_vector(example.split(' ')) for example in test[0]])
                testB = np.array([model.infer_vector(example.split(' ')) for example in test[1]])
            elif model_name == 'word2vec_sif':
                testA = smooth_inverse_frequency([example.split(' ') for example in test[0]])
                testB = smooth_inverse_frequency([example.split(' ') for example in test[1]])
            elif model_name == 'word2vec_tfidf':
                testA = tf_idf_v2([example.split(' ') for example in test[0]])
                testB = tf_idf_v2([example.split(' ') for example in test[1]])
            else:
                testA = simple_average([example.split(' ') for example in test[0]])
                testB = simple_average([example.split(' ') for example in test[1]])

            print 'Computing feature combinations...'
            result = []
            for i in range(len(testA)):
                result.append(5.0*(1 - cosine(testA[i],testB[i])))
#             print result

            print 'Evaluating...'
            pr = pearsonr(result, scores)[0]
            print 'Test Pearson: ' + str(pr)
            sr = spearmanr(result, scores)[0]
            print 'Test Spearman: ' + str(sr)
            se = mse(result, scores)
            print 'Test MSE: ' + str(se)

In [224]:
evaluate_sick(word2vec_model,'word2vec') # simple average

Computing test sentence vectors...
Computing feature combinations...
Evaluating...
Test Pearson: 0.6036488695603748
Test Spearman: 0.517107076620976
Test MSE: 1.886883490561824


In [229]:
evaluate_sick(word2vec_model,'word2vec_sif') # smooth inverse frequency

Computing test sentence vectors...
Computing feature combinations...
Evaluating...
Test Pearson: 0.5968195918608273
Test Spearman: 0.5098569770117501
Test MSE: 1.7127764394605596


In [226]:
evaluate_sick(word2vec_model,'word2vec_tfidf') # tfidf

Computing test sentence vectors...
Computing feature combinations...
Evaluating...
Test Pearson: 0.6386344221803884
Test Spearman: 0.5186580421530436
Test MSE: 0.977400413016066


In [227]:
evaluate_sick(doc2vec_model,'doc2vec')

Computing test sentence vectors...
Computing feature combinations...
Evaluating...
Test Pearson: 0.3509808745592773
Test Spearman: 0.344297777011621
Test MSE: 4.549327606679235


In [228]:
evaluate_sick(sent2vec_model,'sent2vec')

Computing test sentence vectors...
Computing feature combinations...
Evaluating...
Test Pearson: 0.5258587190142471
Test Spearman: 0.4542118616338372
Test MSE: 1.6227105838354625
