In [1]:
import gensim
import os
import collections
import smart_open
import random

In [7]:
# Locate the Lee corpus files inside the installed gensim package's
# test/test_data directory. os.path.join inserts the platform separator for
# us, so no manual os.sep concatenation (or f-string wrapper) is needed.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [10]:
def read_corpus(fname, token_only=False):
    """Lazily read a one-document-per-line corpus file.

    Every line of ``fname`` is decoded as ISO-8859-1 and tokenized with
    ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    fname : str
        Location of the corpus file; passed straight to smart_open.
    token_only : bool, optional
        If True, yield the plain token list for each line. If False
        (default), yield a ``TaggedDocument`` whose single tag is the
        zero-based line number.

    Yields
    ------
    list or TaggedDocument
        One item per line of the file, in file order.
    """
    # ``smart_open.smart_open`` is deprecated in favour of ``smart_open.open``
    # and is missing from recent smart_open releases. Prefer the new name and
    # fall back to the old one so older installations keep working.
    opener = getattr(smart_open, "open", None) or smart_open.smart_open
    with opener(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if token_only:
                yield tokens
            else:
                # Tag each document with its line number so it can be looked
                # up by position later.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [28]:
# Materialize both corpora as lists so they can be indexed and iterated more
# than once. token_only is left at its default (False), so every element is a
# TaggedDocument tagged with its line number in its own file.
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file))


In [14]:
# Sanity check: number of documents in the training and test corpora.
len(train_corpus), len(test_corpus)

(300, 50)

In [19]:
# Create an untrained Doc2Vec model: 50-dimensional vectors, min_count=2,
# 40 training epochs.
# NOTE(review): no `seed` is passed, so results may differ from run to run —
# confirm whether reproducibility matters for this notebook.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [20]:
# Build the model's vocabulary from the training corpus; this must run
# before model.train is called on the same corpus.
model.build_vocab(train_corpus)

In [22]:
%%time
# Train on the training corpus, reusing the document count and epoch count
# already stored on the model instead of hard-coding them again.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 2.75 s


In [23]:
# Infer a vector for a new document given as a list of tokens; the result is
# a float32 array with one entry per model dimension (50 here).
model.infer_vector(['only', 'you','can','prevent','forest','fires'])

array([ 0.36365494,  0.19693178,  0.15819734,  0.22401616,  0.07935365,
        0.02680831,  0.2490949 , -0.27364165, -0.4074056 , -0.03416133,
        0.15029038, -0.25972533, -0.42780748,  0.6386398 , -0.25233492,
       -0.05697685, -0.03870874,  0.3275111 , -0.1006775 ,  0.10472111,
        0.56647074,  0.0897837 , -0.08180477, -0.15258615, -0.15435378,
        0.05217075,  0.30950886, -0.21387246, -0.17471673,  0.50945264,
       -0.32427102, -0.0347627 , -0.15499485,  0.2349172 , -0.03282439,
       -0.5570285 , -0.27317727,  0.21489811,  0.34856558,  0.5376766 ,
       -0.07393651, -0.21479756,  0.24452966,  0.49882564,  0.01144324,
        0.06095293,  0.20694113, -0.02479087,  0.18230718,  0.22986288],
      dtype=float32)

In [30]:
# Self-similarity check: re-infer a vector for every training document and
# record where that document's own tag lands in the similarity ranking
# (0 means it was its own nearest neighbour). The runner-up
# (tag, similarity) pair is also kept for later inspection.
# `doc_id` and `sims` intentionally keep their names: the display cell that
# follows reads the values left over from the final iteration.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    ranked_ids = [docid for docid, sim in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])

In [31]:
# Tally the self-similarity ranks: rank 0 means a document's inferred vector
# was closest to its own trained vector.
collections.Counter(ranks)

Counter({0: 291, 1: 9})

In [32]:
# Uses `doc_id` and `sims` as left behind by the last iteration of the
# ranking loop, i.e. this shows the final training document.
query_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, query_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Positions in the similarity ranking to display, best match first.
positions = (
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('MEDIAN', len(sims) // 2),
    ('LEAST', len(sims) - 1),
)
for label, index in positions:
    match = sims[index]  # (document tag, similarity) pair
    match_text = ' '.join(train_corpus[match[0]].words)
    print(u'%s %s: «%s»\n' % (label, match, match_text))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [33]:
# Pick a random training document. No new vector is inferred here; the
# runner-up match recorded by the ranking loop is simply looked up.
last_index = len(train_corpus) - 1
doc_id = random.randint(0, last_index)
train_text = ' '.join(train_corpus[doc_id].words)
print('Train Document ({}): «{}»\n'.format(doc_id, train_text))

# `sim_id` is the whole (document tag, similarity) pair, not just the tag,
# and the pair is what gets printed in the header.
sim_id = second_ranks[doc_id]
similar_text = ' '.join(train_corpus[sim_id[0]].words)
print('Similar Document {}: «{}»\n'.format(sim_id, similar_text))

Train Document (114): «the federal cabinet has today endorsed series of anti terrorism measures at meeting in sydney new legislation will give more power to commonwealth agencies including allowing asio to detain people with information about terrorism for up to hours without legal representation terrorism offences will also be inserted in the criminal code and security agencies will be given the power to access unread emails attorney general daryl williams says the measures should win widespread community support the balancing process that we ve undergone in working out the asio paths believe is fair one and believe that the public will strongly support it as said hope the labor party will get behind it as well he said mr williams has appealed for the opposition to support the new anti terrorism measures we re looking for the labor party to get behind us on all of these proposals and we need the cooperation of the states and territories in relation to some of them mr williams said mr 

In [50]:
# Draw one test document at random and embed it with the trained model
doc_id = random.randint(0, len(test_corpus) - 1)
test_words = test_corpus[doc_id].words
inferred_vector = model.infer_vector(test_words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Show the query text, then the closest, middle and farthest training documents
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
positions = (
    ('MOST', 0),
    ('MEDIAN', len(sims) // 2),
    ('LEAST', len(sims) - 1),
)
for label, index in positions:
    hit = sims[index]  # (training document tag, similarity) pair
    hit_text = ' '.join(train_corpus[hit[0]].words)
    print(u'%s %s: «%s»\n' % (label, hit, hit_text))

Test Document (14): «very few women have been appointed to head independent schools thwarting efforts to show women as good leaders according to the victorian independent education union although they make up two thirds of teaching staff women hold only one third of principal positions the union general secretary tony keenan said he believed some women were reluctant to become principals because of the long hours and the nature of the work but in other cases they were shut out of the top position because of perceptions about their ability to lead and provide discipline»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (4, 0.45552027225494385): «six midwives have been suspended at wollongong hospital south of sydney for inappropriate use of nitrous oxide during work hours on some occasions while women were in labour the illawarra area health service says that following an investigation of unprofessional conduct further four midwives have been relocated to 

In [18]:
# NOTE(review): `te` is not defined in any cell shown here; it has to be
# assigned (the raw text to tokenize) before this cell can run on a fresh
# kernel.
# Tags are passed as a list, matching read_corpus and the TaggedDocument
# API; a bare int is not iterable as a collection of tags.
gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(te), [0])

TaggedDocument(words=['新华社北京', '日电', '记者', '许可', '金砖国家外长会晤', '日在北京钓鱼台国宾馆举行', '外交部长王毅主持会晤', '南非国际关系与合作部长马沙巴内', '巴西外长努内斯', '俄罗斯外长拉夫罗夫', '印度外交国务部长辛格出席', '王毅表示', '金砖合作机制成立十年来', '在五国领导人有力指引下', '各方秉持开放', '包容', '合作', '共赢的金砖精神', '推动金砖合作从无到有', '由浅入深', '取得长足发展', '五国经济总量占全球比重上升', '贸易投资大幅提升', '合作领域全面拓展', '形成全范围', '宽领域', '多层次的合作架构', '在联合国', '二十国集团等国际组织中紧密协作', '维护广大发展中国家团结和利益', '共同应对全球性挑战', '王毅指出', '中国接任金砖国家轮值主席', '中方愿同其他四国一道', '继续筑牢和充实政治安全', '经济金融', '人文交流三大合作支柱', '积极拓展更多新兴领域合作', '推动金砖合作取得更多成果', '开启金砖国家第二个', '金色十年', '五国应深化务实合作', '促进共同发展', '加强全球治理', '共同应对挑战', '开展人文交流', '夯实民意基础', '推进机制建设', '构建更广泛伙伴关系', '王毅强调', '金砖国家领导人第九次会晤将于', '月在厦门举行', '这是一次承前启后', '继往开来的重要会晤', '中方愿同其他四国一道', '积极开展相关筹备工作', '马沙巴内', '努内斯', '拉夫罗夫', '辛格表示', '金砖国家合作强劲有力', '取得重要成就', '五国应继续秉持金砖精神', '推动五国间合作深入发展', '在全球事务中发挥更重要作用', '承诺全力支持', '配合中方筹备好领导人第九次会晤', '确保会晤取得圆满成功', '就金砖合作下一步发展', '各方一致认为', '继续致力于维护国际公平正义', '促进世界和平稳定', '推动政治解决热点问题', '携手应对全球性挑战', '努力构建合作共赢的新型国际关系', '促进世界多极化和国际关系民主化', '为人类社会集体繁荣进步贡献更多', '金砖智慧', '金砖方案', '会晤后', '五国外长共同会见记者'

In [16]:
# Tokenize the raw text in `te` (not defined in any cell shown here).
# As the output shows, unsegmented Chinese text comes back as clause-sized
# chunks rather than individual words.
# NOTE(review): a dedicated word segmenter is presumably needed before
# feeding such text to Doc2Vec — confirm.
gensim.utils.simple_preprocess(te)

['新华社北京',
 '日电',
 '记者',
 '许可',
 '金砖国家外长会晤',
 '日在北京钓鱼台国宾馆举行',
 '外交部长王毅主持会晤',
 '南非国际关系与合作部长马沙巴内',
 '巴西外长努内斯',
 '俄罗斯外长拉夫罗夫',
 '印度外交国务部长辛格出席',
 '王毅表示',
 '金砖合作机制成立十年来',
 '在五国领导人有力指引下',
 '各方秉持开放',
 '包容',
 '合作',
 '共赢的金砖精神',
 '推动金砖合作从无到有',
 '由浅入深',
 '取得长足发展',
 '五国经济总量占全球比重上升',
 '贸易投资大幅提升',
 '合作领域全面拓展',
 '形成全范围',
 '宽领域',
 '多层次的合作架构',
 '在联合国',
 '二十国集团等国际组织中紧密协作',
 '维护广大发展中国家团结和利益',
 '共同应对全球性挑战',
 '王毅指出',
 '中国接任金砖国家轮值主席',
 '中方愿同其他四国一道',
 '继续筑牢和充实政治安全',
 '经济金融',
 '人文交流三大合作支柱',
 '积极拓展更多新兴领域合作',
 '推动金砖合作取得更多成果',
 '开启金砖国家第二个',
 '金色十年',
 '五国应深化务实合作',
 '促进共同发展',
 '加强全球治理',
 '共同应对挑战',
 '开展人文交流',
 '夯实民意基础',
 '推进机制建设',
 '构建更广泛伙伴关系',
 '王毅强调',
 '金砖国家领导人第九次会晤将于',
 '月在厦门举行',
 '这是一次承前启后',
 '继往开来的重要会晤',
 '中方愿同其他四国一道',
 '积极开展相关筹备工作',
 '马沙巴内',
 '努内斯',
 '拉夫罗夫',
 '辛格表示',
 '金砖国家合作强劲有力',
 '取得重要成就',
 '五国应继续秉持金砖精神',
 '推动五国间合作深入发展',
 '在全球事务中发挥更重要作用',
 '承诺全力支持',
 '配合中方筹备好领导人第九次会晤',
 '确保会晤取得圆满成功',
 '就金砖合作下一步发展',
 '各方一致认为',
 '继续致力于维护国际公平正义',
 '促进世界和平稳定',
 '推动政治解决热点问题',
 '携手应对全球性挑战',
 '努力构建合作共赢的新型国际关系',
 '促进世界多极化和国际关系民主化',
