In [2]:
import gensim
import os
import collections
import smart_open
import random
import string
from scipy import spatial
from sklearn.linear_model import LogisticRegression

Using TensorFlow backend.


In [23]:
# Locate the dataset root (one level up) and its positive / negative review folders.
base_data_dir = os.path.join("..", "txt_sentoken")
pos_files, neg_files = (
    os.path.join(base_data_dir, polarity) for polarity in ('pos', 'neg')
)

In [24]:
def get_files(directory, label):
    """Read every file in ``directory`` into a labelled review record.

    Args:
        directory: Path of a folder whose files each hold one review.
        label: Value stored under ``"label"`` for every review read.

    Returns:
        A list of ``{"review": text, "label": label}`` dicts, one per regular
        file, in sorted filename order. ``text`` is the whole file content as
        a ``str`` decoded from ISO-8859-1.
    """
    reviews = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # stable from run to run.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # Sub-directories cannot be read as a review.
            continue
        with smart_open.smart_open(path, encoding="iso-8859-1") as f:
            chunks = []
            for line in f:
                # The stream may hand back raw bytes or already-decoded text;
                # normalise to str so non-ASCII Latin-1 characters survive
                # instead of being mangled by a later UTF-8 decode.
                if isinstance(line, (bytes, bytearray)):
                    line = line.decode("iso-8859-1")
                chunks.append(line)
        reviews.append({"review": "".join(chunks), "label": label})
    return reviews

In [64]:
# Load both review sets, then shuffle a copy of each so the position-based
# test/train split is not tied to the order the files were listed in.
random.seed(42)  # fixed seed so repeated runs shuffle the same way
pos_reviews = get_files(pos_files, 1)
neg_reviews = get_files(neg_files, 0)
# Shuffle copies; pos_reviews / neg_reviews keep their original order.
p = list(pos_reviews)
n = list(neg_reviews)
random.shuffle(p)
random.shuffle(n)

In [65]:
def read_corpus(filename, tokens_only=False):
    """Lazily yield one tokenised document per line of ``filename``.

    Args:
        filename: Path of a text file opened with ISO-8859-1 encoding.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise wrap the tokens in a ``TaggedDocument`` whose single tag
            is the zero-based line number.

    Yields:
        A token list or a ``TaggedDocument``, depending on ``tokens_only``.
    """
    with smart_open.smart_open(filename, encoding="iso-8859-1") as handle:
        for line_no, raw_line in enumerate(handle):
            tokens = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # Training data carries the line number as its tag.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens

In [89]:
def test_train_split(docs):
    test = []
    train = []
    labels = {}
    count = 2
    for i, doc_set in enumerate(docs):
        for j, doc in enumerate(doc_set):
            if j < len(doc_set) / 2:
                test.append({"doc": gensim.utils.simple_preprocess(doc_set[j]["review"]), "label": doc_set[j]["label"]})
            else:
                # For training data, add tags
                res = gensim.models.doc2vec.TaggedDocument(
                    gensim.utils.simple_preprocess(doc_set[j]["review"]), [count, doc_set[j]["label"]])
                if doc_set[j]["label"] in labels:
                    labels[doc_set[j]["label"]].append(count)
                else:
                    labels[doc_set[j]["label"]] = [count]
                count += 1
                train.append(res)
    return test, train, labels

In [90]:
# Split both shuffled review sets into test / train halves, then shuffle the
# training documents so positive and negative reviews are interleaved rather
# than appearing as two consecutive runs.
test_corpus, train_corpus, train_labels = test_train_split([p, n])
random.shuffle(train_corpus)

In [147]:
# Configure the Doc2Vec model; vocabulary building and training happen later.
model = gensim.models.doc2vec.Doc2Vec(
    dm=0,          # PV-DBOW training mode (per the gensim Doc2Vec docs)
    dbow_words=1,  # also train word vectors alongside the document vectors
    size=100,      # size is the vector size
    window=5,
    min_count=5,   # min_count is the number of times a word needs to be used
    iter=100,      # iter is the number of training iterations
)

In [148]:
# Build the model's vocabulary from the tagged training documents; this has
# to run before model.train is called.
model.build_vocab(train_corpus)

In [149]:
# Train the model on the tagged training documents. The example count and the
# number of epochs are read back from the model itself; %time reports the
# wall-clock cost of the call.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

Wall time: 4min 1s


46407726

In [150]:
# Spot-check the learned word vectors: nearest neighbours of "good".
model.most_similar("good")

[('bad', 0.6571986675262451),
 ('but', 0.6016497611999512),
 ('pretty', 0.5924533009529114),
 ('really', 0.5748573541641235),
 ('decent', 0.5742709636688232),
 ('very', 0.5679540634155273),
 ('great', 0.5480868816375732),
 ('potent', 0.5196850895881653),
 ('still', 0.5169124603271484),
 ('amazing', 0.5094337463378906)]

In [151]:
# Spot-check the learned word vectors: nearest neighbours of "awful".
model.most_similar("awful")

[('casted', 0.4692346453666687),
 ('shelved', 0.45204877853393555),
 ('wire', 0.4443809688091278),
 ('bad', 0.4373916983604431),
 ('previously', 0.42407336831092834),
 ('made', 0.40651363134384155),
 ('rosanna', 0.40070217847824097),
 ('frustrating', 0.40007495880126953),
 ('encountered', 0.40002942085266113),
 ('originally', 0.3927207291126251)]

In [152]:
# Spot-check the learned word vectors: nearest neighbours of "horror".
model.most_similar("horror")

[('films', 0.5964456796646118),
 ('genre', 0.575140655040741),
 ('thriller', 0.5289715528488159),
 ('slasher', 0.5172624588012695),
 ('movies', 0.5072332620620728),
 ('flicks', 0.505022406578064),
 ('film', 0.5042519569396973),
 ('rocky', 0.5004055500030518),
 ('fi', 0.4986109435558319),
 ('flick', 0.49328815937042236)]

In [153]:
# Assess the model: re-infer a vector for every training document and record
# where that document's own id tag ranks among all stored document vectors
# (rank 0 means the document is most similar to itself).
ranks = []
second_ranks = []
for doc in train_corpus:
    # tags[0] is the document's unique id tag, tags[1] is its label. The id is
    # NOT the document's position in train_corpus: ids start at 2 and the list
    # was shuffled after tagging, so the tag itself must be looked up.
    doc_id = doc.tags[0]
    inferred_vector = model.infer_vector(doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (tag, similarity) pair for later inspection.
    second_ranks.append(sims[1])

In [154]:
# counts what ranks each document was classified as
# collections.Counter(ranks)  # Results vary due to random seeding and very small corpus

In [155]:
# Show the label tag (second tag) of the first training document.
print(train_corpus[0].tags[1])

1


In [156]:
# Fit a logistic-regression classifier on vectors inferred for the training
# documents; the target is each document's label tag.
classifier = LogisticRegression()
x = [model.infer_vector(tagged.words) for tagged in train_corpus]
y = [tagged.tags[1] for tagged in train_corpus]
classifier.fit(x, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [157]:
# Mean accuracy of the classifier on the held-out test reviews.
classifier.score(
    [model.infer_vector(review['doc']) for review in test_corpus],
    [review['label'] for review in test_corpus],
)

0.84699999999999998

In [135]:
# Persist the model to disk.
# NOTE(review): this cell's execution count (135) is lower than that of the
# cells that build and train the model above (147-149), so the saved file may
# hold an earlier model -- re-run this cell after training.
model.save('./imdb-log.d2v')