In [1]:
import os, nltk, re, codecs, gensim, collections
from nltk import sent_tokenize, word_tokenize
from random import shuffle
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

In [2]:
directory = os.path.abspath(os.path.join(os.path.dirname("__file__"), 'train'))
filenames = os.listdir(directory)

In [3]:
def read_corpus():

    alldata = []

    for file in filenames:
        f = os.path.join(directory, file)
        with codecs.open(f, 'r', encoding='ISO-8859-1') as t:
            t_clean = t.read()
            slines = t_clean.splitlines()
            for line in slines:
                line = line.lower()
                line = re.sub('^\t', '', line)
                word_tokens = word_tokenize(line)
                if len(word_tokens) != 0 and word_tokens[0].startswith('abstract'):
                    alldata.append(word_tokens)
    
    for i, abstract in enumerate(alldata):
        yield gensim.models.doc2vec.TaggedDocument(abstract, [i])

train_corpus = list(read_corpus())
print(train_corpus[:3])

[TaggedDocument(words=['abstract', ':', 'surgical', 'training', 'is', 'evolving', 'from', 'an', 'observership', 'model', 'towards', 'a', 'new', 'paradigm', 'that', 'includes', 'virtual', 'reality', '(', 'vr', ')', 'simulation', '.', 'in', 'otolaryngology', ',', 'temporal', 'bone', 'dissection', 'has', 'become', 'intimately', 'linked', 'with', 'vr', 'simulation', 'as', 'the', 'complexity', 'of', 'anatomy', 'demands', 'a', 'high', 'level', 'of', 'surgeon', 'aptitude', 'and', 'confidence', '.', 'while', 'an', 'adequate', '3d', 'visualization', 'of', 'the', 'surgical', 'site', 'is', 'available', 'in', 'current', 'simulators', ',', 'the', 'force', 'feedback', 'rendered', 'during', 'haptic', 'interaction', 'does', 'not', 'convey', 'vibrations', '.', 'this', 'lack', 'of', 'vibration', 'rendering', 'limits', 'the', 'simulation', 'fidelity', 'of', 'a', 'surgical', 'drill', 'such', 'as', 'that', 'used', 'in', 'temporal', 'bone', 'dissection', '.', 'in', 'order', 'to', 'develop', 'an', 'immersive

In [4]:
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)

In [5]:
model.build_vocab(train_corpus)

In [6]:
%time model.train(train_corpus, total_examples=model.corpus_count)

CPU times: user 2.65 s, sys: 186 ms, total: 2.84 s
Wall time: 1.35 s


976148

In [7]:
example = 'this is an ideal invention'
e_tokens = word_tokenize(example)
print(model.infer_vector(e_tokens))

[ 0.01329507  0.04116756 -0.00503034  0.05900662 -0.03607457 -0.08635619
 -0.03194524 -0.03875767  0.03668084  0.05477848 -0.02121238 -0.01474856
 -0.0047201  -0.01893769  0.01623149  0.00525685 -0.04530971 -0.08397169
  0.08543845 -0.03915385  0.07119739 -0.07579942  0.07293773  0.01938784
 -0.00485407 -0.00208733 -0.08627415  0.00816097 -0.0759835  -0.04010138
 -0.06425526 -0.08384609  0.01393502 -0.06227853  0.03690941 -0.06471564
 -0.00250818  0.07395116  0.0132476   0.04540789  0.01075058 -0.06535107
  0.00090674 -0.00290885  0.09275433 -0.04156831 -0.02543715 -0.05737796
 -0.00434358 -0.04069665]


In [8]:
patent = '''Anarticulating surgical instrument suited for endoscopic use 
includes a lateral articulation control into a handle portion 
that provides an intuitive visual and tactile indication to the 
clinician as to the amount and direction of articulation of an 
end effector at a distal end of a shaft. Lateral movement of 
a lateral control actuator is converted into a longitudinal 
motion or a rotational motion transferred by the shaft to an articulation mechanism. 
Aversion of a lateral articulation control for a rotationally driven articulation mechanism 
incorporates an articulation backdrive lockout that prevents forces on the end effector from 
causing the selected amount of articulation from being changed.'''
patent_tokens = word_tokenize(patent)
patent_vec = model.infer_vector(patent_tokens)

In [9]:
sims = model.docvecs.most_similar([patent_vec], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document: Patent')
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document: Patent
SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (146, 0.805659294128418): «abstract : this paper presents the development of an underactuated compliant gripper using a biocompatible superelastic alloy namely mtinol this gripper hay two fingers with five phalanges each and can be used as the end effector of an endoscopic instrument optmization procedures are required to obtain the geometry of the transmission mechanism because of its underactuated nature and its underlying complexity a driving mechanism further incorporated m the gripper to distribute actuation to both fingers and accomplish the grasping of asymmetrical objects without requiring supplementary inputs is also discussed finally the results of numerical simulations with different materials and different grasped objects are prevented and discussed [ doi 10.1115/1.3089249 ]»

MEDIAN (66, 0.502617359161377): «abstract : this research develops a robot plan for a system that a

In [10]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])

In [11]:
'''Basically, greater than 95% of the inferred documents are found to be most similar to 
itself and about 5% of the time it is mistakenly most similar to another document. 
the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as 
to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' 
value.'''
collections.Counter(ranks)

Counter({0: 146, 1: 10, 2: 1})

In [12]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (156): «abstract : minimally invasive surgery ( mis ) has become an important field in the health care sector over the last decade . still , there is the need for improving existing instruments and developing new tools providing increased functionality . this work presents innovative solutions and experimental results for a new generation of innovative polymer-based shaft instruments for minimally invasive surgery . the investigated components comprise a new kind of end-effector mechanism and all improved force transmission for actuating the effector .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (156, 0.8287349939346313): «abstract : minimally invasive surgery ( mis ) has become an important field in the health care sector over the last decade . still , there is the need for improving existing instruments and developing new tools providing increased functionality . this work presents innovative solutions and experimental results for a new g