In [5]:
import gensim
import os
import collections
import smart_open
import random
import string
from scipy import spatial

Using TensorFlow backend.


In [9]:
# Locate the dataset folder and its positive / negative review sub-folders.
base_data_dir = os.path.join("..", "txt_sentoken")
pos_files, neg_files = (
    os.path.join(base_data_dir, polarity) for polarity in ("pos", "neg")
)

In [10]:
def get_files(directory):
    """Read every regular file in ``directory`` and return their contents.

    Args:
        directory: Path of the folder that holds one review per file.

    Returns:
        A list with one ``bytearray`` per file, ordered by file name.
    """
    reviews = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and any seeded shuffle applied to it afterwards) reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints): they cannot be
        # opened as review files and would abort the whole load.
        if not os.path.isfile(path):
            continue
        with smart_open.smart_open(path, encoding="iso-8859-1") as f:
            # Lines are accumulated into a bytearray, so the handle is
            # expected to yield bytes-like lines.
            review = bytearray()
            for line in f:
                review += line
        reviews.append(review)
    return reviews

In [11]:
# Load both review sets, then shuffle each one in place.
# The random generator is seeded first so that the shuffles (and therefore the
# later test/train split) are repeatable across kernel restarts, provided
# get_files returns the files in the same order each time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
pos_reviews = get_files(pos_files)
neg_reviews = get_files(neg_files)
random.shuffle(pos_reviews)
random.shuffle(neg_reviews)

In [12]:
def read_corpus(filename, tokens_only=False):
    """Lazily yield one pre-processed document per line of ``filename``.

    Every line is tokenised with ``gensim.utils.simple_preprocess``. When
    ``tokens_only`` is true the bare token list is yielded; otherwise the
    tokens are wrapped in a ``TaggedDocument`` whose single tag is the
    zero-based line number.
    """
    with smart_open.smart_open(filename, encoding="iso-8859-1") as handle:
        for line_no, text in enumerate(handle):
            tokens = gensim.utils.simple_preprocess(text)
            if tokens_only:
                yield tokens
                continue
            # Training documents carry their line index as the tag.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])

In [15]:
def test_train_split(docs, labels):
    test = []
    train = []
    count = 0
    for i, doc_set in enumerate(docs):
        for j, doc in enumerate(doc_set):
            if j < len(doc_set) / 2:
                test.append({"doc": gensim.utils.simple_preprocess(doc_set[j]), "label": labels[i]})
            else:
                # For training data, add tags
                res = gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(doc_set[j]), [count, labels[i]])
                count += 1
                train.append(res)
    return test, train

In [16]:
# Split both review sets into held-out and training documents, then shuffle
# each resulting list in place (training list first, as before).
review_sets = [pos_reviews, neg_reviews]
review_labels = ["pos", "neg"]
test_corpus, train_corpus = test_train_split(review_sets, review_labels)
for corpus in (train_corpus, test_corpus):
    random.shuffle(corpus)
# print(train_corpus)
# print(test_corpus)

[{'doc': ['minutes', 'not', 'rated', 'though', 'suspect', 'it', 'would', 'be', 'rated', 'pg', 'for', 'adult', 'themes', 'and', 'language', 'mamoru', 'oshii', 'is', 'name', 'that', 'probably', 'isn', 'very', 'well', 'known', 'to', 'most', 'american', 'audiences', 'but', 'perhaps', 'it', 'should', 'be', 'oshii', 'was', 'the', 'director', 'of', 'last', 'year', 'best', 'known', 'japanese', 'animated', 'film', 'import', 'in', 'the', 'shell_', 'and', 'is', 'known', 'among', 'fans', 'of', 'japanese', 'animation', 'or', 'anime', 'for', 'making', 'films', 'with', 'deep', 'philosophical', 'bent', 'one', 'of', 'these', 'films', 'was', 'the', 'first', 'theatrical', 'venture', 'for', 'the', 'patlabor', 'animated', 'series', 'in', 'its', 'many', 'incarnations', 'graphic', 'novels', 'manga', 'two', 'different', 'runs', 'of', 'made', 'for', 'video', 'episodes', 'original', 'animation', 'videos', 'or', 'oavs', 'for', 'short', 'and', 'television', 'series', 'running', 'to', 'nearly', 'fifty', 'episodes'

In [16]:
# Doc2Vec hyper-parameters:
#   size      - dimensionality of the learned vectors
#   min_count - minimum number of times a word must occur to be kept
#   iter      - number of training iterations
# NOTE(review): newer gensim releases appear to rename size/iter to
# vector_size/epochs - confirm against the installed version before changing.
doc2vec_params = {"size": 100, "min_count": 10, "iter": 200}
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)

In [17]:
# Build the model's vocabulary from the tagged training documents.
# This runs before the training cell, which reads model.corpus_count.
model.build_vocab(train_corpus)

In [18]:
# Train the model on the tagged training documents; %time reports how long it took.
# The example count and the number of epochs are read back from the model itself.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

Wall time: 1min 44s


84975058

In [19]:
# Sanity check: infer a vector for every training document and record where
# the document itself ranks among the most similar stored document vectors.
ranks = []
second_ranks = []
for doc in train_corpus:
    # train_corpus was shuffled after the integer tags were assigned, so a
    # document's position in the list no longer equals its tag. Read the tag
    # from the document instead of using the loop position.
    doc_id = doc.tags[0]
    inferred_vector = model.infer_vector(doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    # Keep the second entry of the similarity list for later inspection.
    second_ranks.append(sims[1])

In [20]:
# counts what ranks each document was classified as
# collections.Counter(ranks)  # Results vary due to random seeding and very small corpus

In [21]:
# Classify each held-out review by whichever label vector ("pos" or "neg") is
# closer, in cosine distance, to the review's inferred vector, then print the
# fraction classified correctly. Exact ties count as misses.
pos_vec = model.docvecs['pos']
neg_vec = model.docvecs['neg']
correct = 0
for doc in test_corpus:
    inferred_vector = model.infer_vector(doc["doc"])
    pos_dist = spatial.distance.cosine(inferred_vector, pos_vec)
    neg_dist = spatial.distance.cosine(inferred_vector, neg_vec)
    hit_pos = pos_dist < neg_dist and doc["label"] == "pos"
    hit_neg = neg_dist < pos_dist and doc["label"] == "neg"
    if hit_pos or hit_neg:
        correct += 1
print(correct / len(test_corpus))

0.79


In [22]:
# Persist the trained model to ./imdb.d2v in the current working directory.
model.save('./imdb.d2v')