In [3]:
import gensim
import os
import collections
import smart_open
import random
import string
from scipy import spatial
from sklearn.linear_model import LogisticRegression

Using TensorFlow backend.


In [4]:
# Gets positive and negative review folders
base_data_dir = ".." + os.sep + "txt_sentoken"
pos_files = base_data_dir + os.sep + 'pos'
neg_files = base_data_dir + os.sep + 'neg'

In [5]:
def get_files(directory, label):
    reviews = []
    for filename in os.listdir(directory):
        file = directory + os.sep + filename
        with smart_open.smart_open(file, encoding="iso-8859-1") as f:
            review = bytearray()
            for i, line in enumerate(f):
                review += line
            reviews.append({"review": review, "label": label})
    return reviews

In [6]:
# converts files
pos_reviews = get_files(pos_files, 1)
neg_reviews = get_files(neg_files, 0)
p = list(pos_reviews)
n = list(neg_reviews)
random.shuffle(p)
random.shuffle(n)

In [7]:
def read_corpus(filename, tokens_only=False):
    with smart_open.smart_open(filename, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [8]:
def test_train_split(docs):
    test = []
    train = []
    labels = {}
    count = 2
    for i, doc_set in enumerate(docs):
        for j, doc in enumerate(doc_set):
            if j < len(doc_set) / 2:
                test.append({"doc": gensim.utils.simple_preprocess(doc_set[j]["review"]), "label": doc_set[j]["label"]})
            else:
                # For training data, add tags
                res = gensim.models.doc2vec.TaggedDocument(
                    gensim.utils.simple_preprocess(doc_set[j]["review"]), [count, doc_set[j]["label"]])
                if doc_set[j]["label"] in labels:
                    labels[doc_set[j]["label"]].append(count)
                else:
                    labels[doc_set[j]["label"]] = [count]
                count += 1
                train.append(res)
    return test, train, labels

In [9]:
test_corpus, train_corpus, train_labels = test_train_split([p, n])
random.shuffle(train_corpus)
# print(train_corpus)
# print(test_corpus)

In [10]:
model = gensim.models.doc2vec.Doc2Vec(size=100, min_count=5, iter=100, dm=0, dbow_words=1, window=5) 
# size is vector size
# min_count is the number of times a word needs to be used
#iter is number of training iterations

In [11]:
# gets vocab
model.build_vocab(train_corpus)

In [12]:
#trains model on texts
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

Wall time: 3min 56s


45848237

In [13]:
model.most_similar("musical")

[('score', 0.5738385319709778),
 ('music', 0.5452611446380615),
 ('singers', 0.4798170030117035),
 ('band', 0.4696308672428131),
 ('broadway', 0.4524494409561157),
 ('numbers', 0.4466969668865204),
 ('soundtrack', 0.44653695821762085),
 ('number', 0.4410157799720764),
 ('sequences', 0.44027411937713623),
 ('meatloaf', 0.42949533462524414)]

In [14]:
model.most_similar("jazz")

[('macdowell', 0.44383466243743896),
 ('clarence', 0.4428166151046753),
 ('jenna', 0.4401274025440216),
 ('housewife', 0.43081384897232056),
 ('connick', 0.4238712191581726),
 ('feisty', 0.42261895537376404),
 ('natured', 0.41716793179512024),
 ('maurice', 0.4149838089942932),
 ('contribute', 0.4112398624420166),
 ('catchy', 0.41089701652526855)]

In [15]:
model.most_similar("actress")

[('role', 0.5017963647842407),
 ('baranski', 0.4889695644378662),
 ('forlani', 0.48303040862083435),
 ('actor', 0.4811933636665344),
 ('performance', 0.47210466861724854),
 ('marisa', 0.47180262207984924),
 ('preston', 0.4624374210834503),
 ('natasha', 0.4604119062423706),
 ('played', 0.45575565099716187),
 ('nina', 0.4555533230304718)]

In [16]:
#assess model
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])

In [17]:
# counts what ranks each document was classified as
# collections.Counter(ranks)  # Results vary due to random seeding and very small corpus

In [35]:
correct = 0
k = 5
for doc in test_corpus:
    inferred_vector = model.infer_vector(doc["doc"])
    k_correct = 0
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    j = 0
    for i in range(0, k):
        while sims[j][0] < 2:
            j += 1
        if (sims[j][0]) in train_labels[doc["label"]]:
            k_correct += 1
    if k_correct > k / 2:
        correct += 1
print(correct / len(test_corpus))

0.721


In [34]:
model.save('./imdb-k-nearest.d2v')