In [16]:
import os, glob
from smart_open import smart_open

# Root directory of the review corpus; every path built below is relative to it.
dataDir = 'aclImdb'

# Fail early when the corpus has not been downloaded/unpacked next to the notebook.
if not os.path.isdir(dataDir):
    raise Exception('Download Data')

# Sub-directories of dataDir that genFiles() walks.
folders = ['train', 'test', 'unsup']
# Sub-directory name -> numeric sentiment label stored with each document.
sentiments = {'pos': 1, 'neg': -1, 'unsup': 0}

In [57]:
%%time

from gensim.utils import tokenize as _tokenize
from collections import namedtuple
import re

# Record for one review: token list, tag list, movie id and numeric sentiment.
SentimentDocument = namedtuple(
    'SentimentDocument',
    ['words', 'tags', 'movie_id', 'sentiment'],
)

def read(f):
    """Lazily yield the lines of file ``f`` as UTF-8 decoded strings.

    The file is opened in binary mode through ``smart_open`` and stays open
    until the generator is exhausted or closed.
    """
    with smart_open(f, 'rb') as handle:
        yield from (raw.decode('utf-8') for raw in handle)

def readURL(f):
    """Lazily yield the lines of file ``f`` as stripped UTF-8 strings.

    NOTE(review): the original body was the bare name ``yeild`` (a typo for
    ``yield`` with no value), so the function was not a generator and raised
    NameError as soon as it was called. The intended payload is not visible
    in this notebook; yielding one decoded, whitespace-stripped line per
    input line mirrors ``read`` -- confirm this is what was meant.
    """
    with smart_open(f, 'rb') as handle:
        for raw in handle:
            yield raw.decode('utf-8').strip()
def flatMap(ls, func=lambda x: x):
    """Flatten an iterable of iterables, applying ``func`` to every element.

    Returns a lazy generator; with the default ``func`` the elements are
    yielded unchanged.
    """
    flattened = (element for inner in ls for element in inner)
    return (func(element) for element in flattened)

def tokenize(lines):
    """Turn an iterable of text lines into one flat, lazy stream of tokens.

    Each line is passed to gensim's tokenizer with ``lowercase=True`` and
    ``deacc=True``; the per-line results are chained in line order.
    """
    perLine = (_tokenize(line, lowercase=True, deacc=True) for line in lines)
    return (token for tokens in perLine for token in tokens)

# Captures the title id from a user-comments URL. The dots of the host name
# are escaped so they only match a literal '.', not any character.
urlmatcher = re.compile(r"http://www\.imdb\.com/title/(.*)/usercomments")

def genFiles():
    """Yield ``(sentiment, movie_id, path)`` for every review file found.

    Walks ``dataDir/<folder>/<sentiment-dir>`` for each entry of ``folders``
    and ``sentiments`` (missing directories are skipped) and pairs the i-th
    review file with the i-th line of the matching ``urls_<sentiment>.txt``.

    Review files are ordered by the integer before the first ``_`` in their
    name. A plain lexicographic sort puts ``10_x.txt`` before ``2_x.txt``,
    which pairs files with the wrong URL line whenever the ids differ in
    digit count.
    NOTE(review): this assumes the URL file is ordered by that numeric id, as
    the dataset README describes -- confirm. A cached ``all_data.txt`` built
    with the previous ordering has to be deleted and regenerated.

    Raises:
        ValueError: if a line of a URL file does not match ``urlmatcher``.
    """
    def idOrder(path):
        # Numeric-prefix names sort first by integer value; anything else
        # falls back to name order after them instead of crashing.
        name = os.path.basename(path)
        prefix = name.split('_', 1)[0]
        if prefix.isdigit():
            return (0, int(prefix), name)
        return (1, 0, name)

    for d in folders:
        for s, si in sentiments.items():
            reviewDir = "{}/{}/{}".format(dataDir, d, s)
            if not os.path.isdir(reviewDir):
                continue
            filelist = sorted(glob.glob("{}/*.txt".format(reviewDir)), key=idOrder)
            urlsPath = "{}/{}/urls_{}.txt".format(dataDir, d, s)
            with smart_open(urlsPath, 'r') as urlfile:
                for url, f in zip(urlfile, filelist):
                    found = urlmatcher.match(url)
                    if found is None:
                        raise ValueError(
                            'unexpected URL {!r} in {}'.format(url, urlsPath))
                    yield si, found.group(1), f

# s and list of filenames generator
# fileLists = ((i, sorted(glob.glob("{}/{}/{}/*.txt".format(dataDir, d, s)))) for s, i in sentiments.items() for d in folders)

# s and filename generator
# files = ((s,f) for s, fs in fileLists for f in fs)

# Lazy stream of (sentiment, movie id, token generator), one entry per review.
tokenLists = ((s, url, tokenize(read(f))) for s, url, f in genFiles())

# Build the one-review-per-line cache "<sentiment> <movie id> <tokens...>".
# The data is written to a temporary file and renamed only once complete, so
# an interrupted run cannot leave a truncated all_data.txt that later runs
# would mistake for a finished cache.
if not os.path.isfile('all_data.txt'):
    partialPath = 'all_data.txt.partial'
    with smart_open(partialPath, 'wb') as f:
        for s, url, tl in tokenLists:
            f.write('{} {} {}\n'.format(s, url,' '.join(tl)).encode('utf-8'))
    os.replace(partialPath, 'all_data.txt')
else:
    print('data file exists')

data file exists
CPU times: user 2.51 ms, sys: 143 µs, total: 2.66 ms
Wall time: 393 µs


In [58]:
%%time
from random import shuffle

# docs
# Load every cached review as a SentimentDocument. The tag is the line number
# in all_data.txt, assigned before shuffling, so tags do not depend on the
# shuffle order.
from random import Random

# Fixed seed: the 2000-document subset and the train/test splits taken from
# allDocs further down are then the same on every Restart & Run All.
SHUFFLE_SEED = 0

allDocs = []
with smart_open('all_data.txt', 'rb') as f:
    for i, line in enumerate(f):
        # Line layout: "<sentiment> <movie id> <token> <token> ..."
        l = line.decode('utf-8').split()
        allDocs.append(SentimentDocument(l[2:], [i], l[1], int(l[0])))

# A dedicated Random instance keeps the global random state untouched.
Random(SHUFFLE_SEED).shuffle(allDocs)

CPU times: user 1.89 s, sys: 166 ms, total: 2.06 s
Wall time: 1.85 s


In [59]:
import multiprocessing

# Number of CPUs reported by the OS; passed as `workers=` to every Doc2Vec model.
cpuCount = multiprocessing.cpu_count()

**Models**
Fix the vector size to 100 and `min_count` (the word-frequency cutoff) to 3, as per Pang & Lee.

Each model is identified by an ID string of the form "arch:window_size:epochs" (e.g. "dbow:10:20"), and each trained model is saved to a file named after its ID under `models/`.

In [60]:
from gensim.models.doc2vec import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from collections import OrderedDict

# Architecture names; a '+' joins the components of a concatenated model.
archs = ['dbow', 'dmm', 'dmc', 'dbow+dmm', 'dbow+dmc']
# Window sizes to try (the 'dmc' architecture uses half of this, see createModel).
ws = [10]
# Passed as vector_size= to every Doc2Vec model.
vs = 100
# Passed as min_count= to every Doc2Vec model.
mc = 3
# Passed as sample= to every Doc2Vec model.
s = 0
# Epoch counts to try.
es = [20, 30]

# In-memory cache of models already loaded or built, keyed by ID string.
readModels = OrderedDict()

def getIDstr(arch, w, e):
    """Build the colon-separated model ID ``"<arch>:<window>:<epochs>"``."""
    return ":".join(map(str, (arch, w, e)))

def genIDstrs():
    """Yield the ID string of every configured model.

    Order: window sizes outermost, then epoch counts, then architectures.
    """
    yield from (
        getIDstr(arch, w, e)
        for w in ws
        for e in es
        for arch in archs
    )

def createModel(idstr):
    """Create an untrained Doc2Vec model for a single-architecture ID string.

    ``idstr`` has the form ``"<arch>:<window>:<epochs>"`` where ``arch`` is
    one of ``'dbow'``, ``'dmm'`` or ``'dmc'``; any other architecture raises
    ``Exception('arch does not exist')``.
    """
    parts = idstr.split(':')
    window = int(parts[1])
    epochCount = int(parts[2])
    arch = parts[0]

    # Settings shared by all three architectures.
    shared = dict(vector_size=vs, min_count=mc, sample=s,
                  epochs=epochCount, workers=cpuCount)

    if arch == 'dbow':
        return Doc2Vec(dm=0, window=window, **shared)
    if arch == 'dmm':
        return Doc2Vec(dm=1, window=window, alpha=0.05,
                       comment='alpha=0.05', **shared)
    if arch == 'dmc':
        # The concatenating variant is built with half the window size.
        return Doc2Vec(dm=1, dm_concat=1, window=window // 2, **shared)
    raise Exception('arch does not exist')

def createConcatModel(idstr):
    """Build a ConcatenatedDoc2Vec from a ``"<a>+<b>:<window>:<epochs>"`` ID.

    Every '+'-separated architecture is fetched through ``getModel`` with the
    same window size and epoch count.
    """
    parts = idstr.split(':')
    window, epochCount = int(parts[1]), int(parts[2])
    components = []
    for arch in parts[0].split('+'):
        components.append(getModel(getIDstr(arch, window, epochCount)))
    return ConcatenatedDoc2Vec(components)

def getModel(idstr):
    """Return the model for ``idstr``, loading, composing or training it as needed.

    Lookup order:
      1. the in-memory cache ``readModels``;
      2. a saved model file ``models/<idstr>``;
      3. for IDs containing '+', a concatenation of component models
         (not saved to disk);
      4. otherwise a new model trained on ``allDocs`` and saved to
         ``models/<idstr>``.

    The result is stored in ``readModels`` before it is returned.
    """
    if idstr in readModels:
        return readModels[idstr]
    f = os.path.abspath('models/{}'.format(idstr))
    if os.path.isfile(f):
        print('loading model {} from file'.format(idstr))
        model = Doc2Vec.load(f)
    elif '+' in idstr:
        print('creating concat model {}'.format(idstr))
        model = createConcatModel(idstr)
    else:
        # Create the output directory before training: without it the save
        # below fails only after the whole (expensive) training run.
        os.makedirs(os.path.dirname(f), exist_ok=True)
        print('creating {}'.format(idstr))
        model = createModel(idstr)
        model.build_vocab(allDocs)
        print('training {}'.format(idstr))
        model.train(allDocs, total_examples=len(allDocs), epochs=model.epochs)
        print('saving {}'.format(idstr))
        model.save(f)
    readModels[idstr] = model
    return model

In [61]:
%%time

# Load (or build and train) every configured model, in genIDstrs() order.
models = [getModel(idstr) for idstr in genIDstrs()]

loading model dbow:10:20 from file
loading model dmm:10:20 from file
loading model dmc:10:20 from file
creating concat model dbow+dmm:10:20
creating concat model dbow+dmc:10:20
loading model dbow:10:30 from file
loading model dmm:10:30 from file
loading model dmc:10:30 from file
creating concat model dbow+dmm:10:30
creating concat model dbow+dmc:10:30
CPU times: user 5.48 s, sys: 432 ms, total: 5.91 s
Wall time: 5.38 s


In [35]:
 from itertools import islice

# Cross Validation

def cv(docs):
    """Split ``docs`` into ``(held_out, rest)`` using ten interleaved folds.

    Fold ``i`` holds every tenth element starting at index ``i``. Fold 0 is
    returned as-is; folds 1..9 are concatenated, in fold order, into one list.
    """
    heldOut = docs[0::10]
    rest = []
    for offset in range(1, 10):
        rest.extend(docs[offset::10])
    return heldOut, rest

# Lazy views over the labelled documents, in allDocs order.
posDocs = (doc for doc in allDocs if doc.sentiment == 1)
negDocs = (doc for doc in allDocs if doc.sentiment == -1)

# Working set: the first 1000 positive documents followed by the first 1000
# negative ones.
docs = list(islice(posDocs, 1000)) + list(islice(negDocs, 1000))

# Outer split: hold out one fold; the remainder is split once more below.
initialTest, initialTrain = cv(docs)
# Inner split of the outer training portion.
test, train = cv(initialTrain)

In [36]:
import numpy as np
from scipy import sparse
import sys

# NOTE(review): hard-coded absolute path to a local thundersvm checkout; the
# import below only works on a machine with this exact directory. Consider
# making it configurable.
sys.path.append('/home/jaeyeun/Cambridge/nlp/thundersvm/python')
from thundersvmScikit import SVC
    
def modelToSVM(model, trainSet):
    """Fit a linear-kernel SVC on vectors inferred by ``model``.

    Labels are each document's ``sentiment``; features are
    ``model.infer_vector(doc.words)`` for every document in ``trainSet``.
    Returns the fitted classifier.
    """
    labels = np.array([doc.sentiment for doc in trainSet])
    features = np.array([model.infer_vector(doc.words) for doc in trainSet])
    classifier = SVC(kernel='linear')
    classifier.fit(features, labels)
    return classifier

# Fit one SVM per model ID on initialTrain; the OrderedDict keeps
# genIDstrs() order. `%time` is an IPython line magic reporting the cost.
print('train SVMs')
%time initialSVMs = OrderedDict((name, modelToSVM(getModel(name), initialTrain)) for name in genIDstrs())

train SVMs
CPU times: user 18.8 s, sys: 4.88 s, total: 23.7 s
Wall time: 21.7 s


In [25]:
def classify(svm, model, testSet):
    """Return ``svm.predict`` on the vectors ``model`` infers for ``testSet``."""
    inferred = []
    for doc in testSet:
        inferred.append(model.infer_vector(doc.words))
    return svm.predict(np.array(inferred))

def score(svm, model, testSet):
    """Return ``svm.score`` for ``testSet``.

    Features are the vectors ``model`` infers from each document's words;
    the reference labels are each document's ``sentiment``.
    """
    inferred = []
    for doc in testSet:
        inferred.append(model.infer_vector(doc.words))
    features = np.array(inferred)
    labels = np.array([doc.sentiment for doc in testSet])
    return svm.score(features, labels)

In [37]:
# Report the accuracy of every model/SVM pair on the outer held-out fold.
for idstr in genIDstrs():
    # Header is printed before getModel(), which may itself print load/train messages.
    print("Accuracy of {}".format(idstr))
    model = getModel(idstr)
    svm = initialSVMs[idstr]
    # Value returned by svm.score() on initialTest.
    mean_acc = score(svm, model, initialTest)
    print(mean_acc)

Accuracy of dbow:10:20
0.865
Accuracy of dmm:10:20
0.785
Accuracy of dmc:10:20
0.735
Accuracy of dbow+dmm:10:20
0.875
Accuracy of dbow+dmc:10:20
0.825
Accuracy of dbow:10:30
0.875
Accuracy of dmm:10:30
0.765
Accuracy of dmc:10:30
0.72
Accuracy of dbow+dmm:10:30
0.865
Accuracy of dbow+dmc:10:30
0.845


In [38]:
# the real one
# Fit the chosen configuration on the inner training split and evaluate it
# on the inner test split.
finalModel = getModel('dbow+dmc:10:30')
# modelToSVM takes exactly (model, trainSet); the stray third argument that
# used to be passed here raises TypeError on a fresh kernel.
finalSVM = modelToSVM(finalModel, train)
test_X = np.array([finalModel.infer_vector(doc.words) for doc in test])
# test_y is reused by later cells as the reference labels.
test_y = np.array([doc.sentiment for doc in test])
print(finalSVM.score(test_X, test_y))
finalResults = finalSVM.predict(test_X)

0.8666666666666667


In [39]:
# prev
from trainingset import TrainingSet
from bag import BagOfFrequency
from svm import SVM

# Baseline classifier built from project-local modules, trained on the same
# inner training split as finalSVM.
# NOTE(review): the meaning of the `{1}` and `3` arguments is defined in
# trainingset.py, which is not visible here -- confirm before changing them.
ts = TrainingSet(BagOfFrequency, {1}, 3)
for doc in train:
    ts.add(doc.sentiment, doc.words)
prevSVM = SVM(ts)

In [40]:
# NOTE: this rebinds test_X from the array of inferred vectors to a list of
# raw token lists, which is what the baseline is given as input.
test_X = [doc.words for doc in test]

# Baseline score and predictions on the same inner test split.
print(prevSVM.score(test_X, test_y))
prevResults = prevSVM.classify(test_X)

0.8277777777777777


# Permutation Testing

In [41]:
from sklearn.metrics import cohen_kappa_score as kappa

def yesOrNo(actual, results):
    """Return a 0/1 list marking where ``results`` agrees with ``actual``.

    Element ``i`` is 1 when ``actual[i] == results[i]`` and 0 otherwise.
    Both sequences must have the same length (checked with ``assert``).
    """
    size = len(actual)
    assert size == len(results)
    flags = []
    for idx in range(size):
        flags.append(1 if actual[idx] == results[idx] else 0)
    return flags

# Per-document correctness (1 = correct, 0 = wrong) of each system on the
# inner test split; these paired vectors are the input of permTest.
final = yesOrNo(test_y, finalResults)
prev = yesOrNo(test_y, prevResults)

def permTest(xs, ys, r, seed=None):
    """Monte Carlo paired permutation test on the difference of means.

    The observed statistic is ``|mean(xs) - mean(ys)|``. In each of ``r``
    rounds every pair ``(xs[i], ys[i])`` is swapped independently with
    probability 0.5 and the statistic is recomputed.

    Args:
        xs, ys: paired 1-D numeric sequences of equal, non-zero length.
        r: number of random rounds.
        seed: optional seed for a private random stream, making the result
            reproducible. With ``None`` (default) the global ``np.random``
            state is used, as before.

    Returns:
        ``(k + 1) / (r + 1)`` where ``k`` is the number of rounds whose
        statistic is at least the observed one.

    Raises:
        ValueError: if the inputs are not 1-D, differ in length or are empty
            (``zip`` used to truncate mismatched inputs silently).
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    if xs.ndim != 1 or xs.shape != ys.shape:
        raise ValueError('xs and ys must be 1-D sequences of equal length')
    n = xs.shape[0]
    if n == 0:
        raise ValueError('xs and ys must not be empty')

    rng = np.random if seed is None else np.random.RandomState(seed)
    diff = np.abs(np.mean(xs) - np.mean(ys))
    k = 1
    for _ in range(r):
        # True keeps the pair as is, False swaps it.
        keep = rng.rand(n) >= 0.5
        gx = np.where(keep, xs, ys)
        gy = np.where(keep, ys, xs)
        k += int(diff <= np.abs(np.mean(gx) - np.mean(gy)))
    return k / (r + 1)

def kappaPermTest(xs, ys, actual, r, seed=None):
    """Monte Carlo paired permutation test on the difference of kappa scores.

    The observed statistic is ``|kappa(xs, actual) - kappa(ys, actual)|``.
    In each of ``r`` rounds every pair ``(xs[i], ys[i])`` is swapped
    independently with probability 0.5 and the statistic is recomputed.

    Args:
        xs, ys: paired 1-D sequences of predicted labels.
        actual: reference labels, same length as ``xs`` and ``ys``.
        r: number of random rounds.
        seed: optional seed for a private random stream, making the result
            reproducible. With ``None`` (default) the global ``np.random``
            state is used, as before.

    Returns:
        ``(k + 1) / (r + 1)`` where ``k`` is the number of rounds whose
        statistic is at least the observed one.

    Raises:
        ValueError: if the inputs are not 1-D, differ in length or are empty
            (``zip`` used to truncate mismatched inputs silently).
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    if xs.ndim != 1 or xs.shape != ys.shape:
        raise ValueError('xs and ys must be 1-D sequences of equal length')
    n = xs.shape[0]
    if n == 0:
        raise ValueError('xs and ys must not be empty')
    if len(actual) != n:
        raise ValueError('actual must have the same length as xs and ys')

    rng = np.random if seed is None else np.random.RandomState(seed)
    diff = np.abs(kappa(xs, actual) - kappa(ys, actual))
    k = 1
    for _ in range(r):
        # True keeps the pair as is, False swaps it.
        keep = rng.rand(n) >= 0.5
        gx = np.where(keep, xs, ys)
        gy = np.where(keep, ys, xs)
        k += int(diff <= np.abs(kappa(gx, actual) - kappa(gy, actual)))
    return k / (r + 1)

# p-value for the difference in accuracy between the two systems
# (5000 random rounds over the per-document correctness vectors).
p = permTest(final, prev, 5000)
print(p)
# Same test on the difference in Cohen's kappa against the reference labels.
kp = kappaPermTest(finalResults, prevResults, test_y, 5000)
print(kp)

0.2699460107978404
0.26774645070985803
