In [1]:
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
import gensim
import os
import collections
import smart_open
import random

In [2]:
# Set file names for train and test data.
# The corpora live in the test/test_data directory inside the installed gensim
# package; os.path.join assembles the paths with the platform's separator
# instead of concatenating os.sep by hand.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [3]:
# Show the resolved directory that holds the gensim test corpora
print(test_data_dir)

/Users/kojin/anaconda/envs/ml/lib/python3.6/site-packages/gensim/test/test_data


In [4]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one pre-processed document per line of ``fname``.

    The file is opened through ``smart_open`` and decoded as iso-8859-1.
    Every line is tokenised with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file, one document per line.
        tokens_only: When True, yield the bare token list for each line.
            When False (the default), yield a ``TaggedDocument`` whose single
            tag is the zero-based line index, as needed for training.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping that list.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as corpus_file:
        for line_no, raw_line in enumerate(corpus_file):
            tokens = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # Training data: tag each document with its line index
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens

In [5]:
# Materialise the generators as lists so the corpora can be indexed and
# iterated more than once. Training documents are TaggedDocument objects.
train_corpus = list(read_corpus(lee_train_file))
# tokens_only=True: test documents are plain token lists without tags
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [6]:
# Sanity check: read_corpus wraps each line in a TaggedDocument when
# tokens_only is False, so the first training item should be one.
print(type(train_corpus[0]))

<class 'gensim.models.doc2vec.TaggedDocument'>


In [7]:
# Peek at the first two test documents; they are plain token lists because
# the test corpus was read with tokens_only=True.
print(test_corpus[:2])

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

In [8]:
# Create the Doc2Vec model; vocabulary building and training happen in the
# following cells. The `iter` value is reused as the epoch count when train()
# is called (epochs=model.iter).
# NOTE(review): newer gensim releases may spell `size`/`iter` as
# `vector_size`/`epochs` — confirm against the installed version.
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)

In [9]:
# Pass the tagged training corpus to build_vocab before training; the train()
# call below reads model.corpus_count (presumably populated here — verify).
model.build_vocab(train_corpus)

In [10]:
# Train on the tagged corpus and time the call with the %time magic; the
# example count and epoch count are read back from the model itself.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 5.28 s, sys: 254 ms, total: 5.54 s
Wall time: 2.68 s


2348026

In [11]:
# Infer a vector from the trained model for an ad-hoc list of tokens
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([-0.07311641,  0.09990416,  0.04155187, -0.03754788,  0.02057275,
       -0.01194504,  0.0835572 , -0.08197542,  0.00481572,  0.07434512,
        0.0016029 ,  0.03379913,  0.02198007,  0.05052391, -0.0149633 ,
        0.06293601, -0.02347993, -0.08924441,  0.04137841,  0.01395027,
       -0.04538746, -0.07690246, -0.00499757,  0.13392173, -0.06262973,
       -0.11540522, -0.03026734, -0.01575517,  0.03151207,  0.08027396,
        0.0192104 ,  0.09253498, -0.04774636,  0.04662389,  0.03134349,
        0.0464799 , -0.07430699, -0.00939943, -0.03465239,  0.126945  ,
       -0.00210944,  0.01217539, -0.08839434,  0.03851219,  0.01400306,
       -0.04105096,  0.06818597,  0.04657355,  0.01403926, -0.02323933], dtype=float32)

In [12]:
# Self-similarity check: re-infer a vector for every training document and
# record where the document's own id appears in the similarity list computed
# against all stored document vectors.
# `doc_id` and `sims` keep their last-iteration values after the loop; a later
# cell in this notebook reads them.
ranks, second_ranks = [], []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

    # Position of this document's own id within the (id, similarity) pairs
    ranked_ids = [candidate_id for candidate_id, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Keep the entry at position 1 for later inspection
    second_ranks.append(sims[1])

In [13]:
# Count how many training documents landed at each rank. A rank is the
# position of a document's own id in its similarity list, so 0 means the
# document was listed first for its own inferred vector.
collections.Counter(ranks)

Counter({0: 292, 1: 8})

In [14]:
# Report on the last document handled by the ranking loop: `doc_id` and `sims`
# still hold the values from that loop's final iteration.
source_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, source_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Positions in `sims` to display: first, middle and last entry
positions = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in positions:
    match = sims[index]  # (document id, similarity) pair
    match_text = ' '.join(train_corpus[match[0]].words)
    print(u'%s %s: «%s»\n' % (label, match, match_text))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [15]:
# Pick a random document from the train corpus.
# random.randint includes BOTH endpoints, so the upper bound has to be
# len(train_corpus) - 1; using len(train_corpus) can yield an index one past
# the end and raise IndexError on the lookups below.
doc_id = random.randint(0, len(train_corpus) - 1)

# Print the chosen training document next to the entry stored for it in
# second_ranks: the (document id, similarity) pair found at position 1 of its
# similarity list by the ranking loop.
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (233): «three us troops and five members of the afghan opposition were killed by stray us bomb near kandahar in afghanistan the pentagon said the pentagon had earlier confirmed that two us special forces soldiers were killed and others wounded north of kandahar when bomber dropped pound bomb too close to them the was flying in support of opposition forces north of kandahar said pentagon spokeswoman victori clark we have an update since this morning and unfortunately the number of us forces killed is now three rival afghan factions signed an historic power sharing agreement to form post taliban government and set the country on the road to recovery and democracy after two decades of war the accord was sealed after nine days of exhausting negotiations and paves the way for six month interim administration headed by moderate muslim hamid karzai from the dominant pashtun ethnic group the deal gives the northern alliance control of three key portfolios in the member cabinet w

In [17]:
# Show the tokens of the test document selected by doc_id.
# NOTE(review): this cell has execution count 17 but sits above the In [16]
# cell that draws doc_id from the test corpus. On a top-to-bottom run doc_id
# still holds the train-corpus index chosen in the previous cell, which may be
# out of range for test_corpus — confirm the intended cell order.
test_corpus[doc_id]

['the',
 'saudi',
 'interior',
 'ministry',
 'on',
 'sunday',
 'confirmed',
 'it',
 'is',
 'holding',
 'year',
 'old',
 'saudi',
 'man',
 'the',
 'fbi',
 'is',
 'seeking',
 'for',
 'alleged',
 'links',
 'to',
 'the',
 'sept',
 'hijackers',
 'authorities',
 'are',
 'interrogating',
 'saud',
 'abdulaziz',
 'saud',
 'al',
 'rasheed',
 'and',
 'if',
 'it',
 'is',
 'proven',
 'that',
 'he',
 'was',
 'connected',
 'to',
 'terrorism',
 'he',
 'will',
 'be',
 'referred',
 'to',
 'the',
 'sharia',
 'islamic',
 'court',
 'the',
 'official',
 'saudi',
 'press',
 'agency',
 'quoted',
 'an',
 'unidentified',
 'ministry',
 'official',
 'as',
 'saying']

In [18]:
# Display the test corpus.
# NOTE(review): this echoes every token of every test document; a slice such
# as test_corpus[:2] would keep the saved output readable.
test_corpus

[['the',
  'national',
  'executive',
  'of',
  'the',
  'strife',
  'torn',
  'democrats',
  'last',
  'night',
  'appointed',
  'little',
  'known',
  'west',
  'australian',
  'senator',
  'brian',
  'greig',
  'as',
  'interim',
  'leader',
  'shock',
  'move',
  'likely',
  'to',
  'provoke',
  'further',
  'conflict',
  'between',
  'the',
  'party',
  'senators',
  'and',
  'its',
  'organisation',
  'in',
  'move',
  'to',
  'reassert',
  'control',
  'over',
  'the',
  'party',
  'seven',
  'senators',
  'the',
  'national',
  'executive',
  'last',
  'night',
  'rejected',
  'aden',
  'ridgeway',
  'bid',
  'to',
  'become',
  'interim',
  'leader',
  'in',
  'favour',
  'of',
  'senator',
  'greig',
  'supporter',
  'of',
  'deposed',
  'leader',
  'natasha',
  'stott',
  'despoja',
  'and',
  'an',
  'outspoken',
  'gay',
  'rights',
  'activist'],
 ['cash',
  'strapped',
  'financial',
  'services',
  'group',
  'amp',
  'has',
  'shelved',
  'million',
  'plan',
  'to',
 

In [16]:
# Pick a random document from the test corpus and infer a vector from the model.
# random.randint includes BOTH endpoints, so the upper bound has to be
# len(test_corpus) - 1; using len(test_corpus) can yield an index one past the
# end and raise IndexError.
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
# Similarities against every stored training-document vector
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # sims[index] is a (document id, similarity) pair; the id indexes train_corpus
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (27): «the saudi interior ministry on sunday confirmed it is holding year old saudi man the fbi is seeking for alleged links to the sept hijackers authorities are interrogating saud abdulaziz saud al rasheed and if it is proven that he was connected to terrorism he will be referred to the sharia islamic court the official saudi press agency quoted an unidentified ministry official as saying»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (75, 0.629386842250824): «us president george bush has marked the th day of the campaign against terrorism by calling on his allies to freeze the assets of two non us organisations suspected of supporting terrorism one of the groups is based in kashmir the other is alleged to have helped al qaeda develop nuclear weapons president bush says former scientist at pakistan atomic program had established group called utn after assisting osama bin laden network develop nuclear bomb utn claims to serve the hungry 