In [1]:
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
import gensim
import os
import collections
import smart_open
import random

In [2]:
# Locate the Lee corpus files that ship inside the installed gensim package.
# os.path.join builds the paths portably instead of hand-joining with os.sep.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')  # used for training
lee_test_file = os.path.join(test_data_dir, 'lee.cor')  # used as the test set

In [5]:
# Show the resolved test-data directory (it is derived from gensim's install path).
print(test_data_dir)

/Users/kojin/anaconda/envs/ml/lib/python3.6/site-packages/gensim/test/test_data


In [6]:
def read_corpus(fname, tokens_only=False):
    """Lazily read a corpus file, yielding one preprocessed document per line.

    Args:
        fname: Path of the corpus file; it is opened with ISO-8859-1 decoding.
        tokens_only: When True, yield just the result of
            ``gensim.utils.simple_preprocess`` for each line. When False
            (the default), wrap it in a ``TaggedDocument`` whose single tag
            is the zero-based line number.

    Yields:
        Either the preprocessed tokens of a line or a ``TaggedDocument``
        built from them, depending on ``tokens_only``.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as corpus_file:
        for line_no, raw_line in enumerate(corpus_file):
            tokens = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # Training data needs a tag; the line number serves as the document id.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens

In [7]:
# Materialise both generators so the corpora can be indexed and iterated repeatedly.
# Training documents carry tags; test documents are plain token lists.
train_corpus = list(read_corpus(lee_train_file, tokens_only=False))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [9]:
# Sanity check: training entries are the TaggedDocument objects built by read_corpus.
print(type(train_corpus[0]))

<class 'gensim.models.doc2vec.TaggedDocument'>


In [7]:
# Peek at the first two test documents (read with tokens_only=True, so no tags).
print(test_corpus[:2])

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

In [8]:
# Create an untrained Doc2Vec model: 50-dimensional vectors, min_count=2, and
# 55 iterations (reused as the epoch count when train() is called below).
# Per the gensim docs, min_count drops words whose total frequency is lower.
# NOTE(review): `size` and `iter` are older gensim keyword names; newer releases
# use `vector_size` and `epochs` -- confirm against the installed version.
# NOTE(review): no seed is passed, so results presumably vary between runs.
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)

In [9]:
# Build the model's vocabulary from the tagged training documents.
# Presumably this also sets model.corpus_count, which the training call reads.
model.build_vocab(train_corpus)

In [10]:
# Train on the tagged corpus for model.iter epochs; %time reports CPU and wall time.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 5.31 s, sys: 304 ms, total: 5.61 s
Wall time: 2.67 s


2348318

In [11]:
# Infer a vector for a document given as a list of tokens; the cell displays the result.
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([-0.05206083,  0.03141917, -0.09455411, -0.0615157 ,  0.07723559,
        0.01942795, -0.00992103,  0.01044649,  0.0030084 ,  0.07915336,
       -0.03134976,  0.03893861,  0.01952419, -0.04929464,  0.07685329,
       -0.08989383,  0.01360929, -0.09574849, -0.00496708,  0.06427038,
       -0.15261079,  0.06012472,  0.05371454, -0.06814294,  0.08504394,
       -0.03055946, -0.02440474, -0.12614577, -0.11276575,  0.02076258,
        0.00205871, -0.02376   ,  0.04448971,  0.09966746, -0.01750847,
       -0.01731499, -0.09333503,  0.01385808, -0.1872206 , -0.0399087 ,
       -0.04890673,  0.0279649 ,  0.02592936, -0.01222236, -0.0133103 ,
        0.03480035, -0.01124926, -0.07989611,  0.08215286, -0.06276517], dtype=float32)

In [12]:
# Self-similarity check: re-infer a vector for every training document and see
# where that same document lands in the ranking of all trained document vectors.
ranks = []         # position of each document in its own similarity ranking
second_ranks = []  # the entry at index 1 of each ranking: (doc id, similarity)
for doc_id, doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(doc.words)
    # Rank every trained document vector against the inferred one.
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    ranked_ids = [docid for docid, sim in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [13]:
# Tally the self-similarity ranks; rank 0 means a document ranked first for its own vector.
collections.Counter(ranks)

Counter({0: 290, 1: 10})

In [15]:
# Uses doc_id and sims as they were left by the last iteration of the ranking loop.
# Positions to sample from the ranking: the top, the middle and the bottom entry.
picks = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]

print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in picks:
    # Each sims entry is a (doc id, similarity) pair; the id indexes train_corpus.
    match_words = train_corpus[sims[index][0]].words
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(match_words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [16]:
# Pick a random document from the training corpus.
# random.randrange excludes the upper bound, so the index is always valid;
# random.randint(0, len(...)) includes it and could raise IndexError.
doc_id = random.randrange(len(train_corpus))

# Print the chosen document next to the entry stored for it in second_ranks,
# i.e. the item at index 1 of its similarity ranking.
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]  # a (doc id, similarity) pair, not a bare id
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (265): «the federal government is under fire from unions over new departmental report which recommends australia outsource information technology it to india the document says india has low cost skilled workforce the minister for foreign affairs and trade alexander downer has given his support to the document from his department entitled india new economy old economy the report says sectors like it finance and offer attractive direct investment opportunities it also says australian firms could become more competitive by outsourcing to the indian it sector the community and public sector union wendy caird says the government seems to be encouraging local companies to export jobs to india think that quite alarming obviously labour is great deal cheaper in india and that assisted by the indian government removing labour laws and bankruptcy laws ms caird said the union says while the initiative may create jobs in india it will not help australia rising unemployment»

Similar

In [17]:
# Pick a random document from the test corpus and infer a vector from the model.
# random.randrange excludes the upper bound, so the index is always valid;
# random.randint(0, len(...)) includes it and could raise IndexError.
doc_id = random.randrange(len(test_corpus))
inferred_vector = model.infer_vector(test_corpus[doc_id])
# Rank every trained document vector against the inferred one.
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # Each sims entry is a (doc id, similarity) pair; the id indexes train_corpus.
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (14): «very few women have been appointed to head independent schools thwarting efforts to show women as good leaders according to the victorian independent education union although they make up two thirds of teaching staff women hold only one third of principal positions the union general secretary tony keenan said he believed some women were reluctant to become principals because of the long hours and the nature of the work but in other cases they were shut out of the top position because of perceptions about their ability to lead and provide discipline»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (124, 0.6273884773254395): «the federal opposition wants tougher penalties for ships which spill oil after last week spill which affected phillip island volunteers hope to clean up the last of the oil at phillip island today authorities are still trying to track down the source of the spill which affected fairy penguins shadow environment mi