In [18]:
import gensim
import os
import collections
import smart_open
import random
from pprint import pprint

In [2]:
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [3]:
#gensim.utils.simple_preprocess??
gensim.models.doc2vec.TaggedDocument??

In [4]:
train_file = "ontClassTopology.txt"

train_corpus = list(read_corpus(train_file))

In [5]:
train_corpus[:2]

[TaggedDocument(words=['snomed', 'ct', 'concept', 'physical', 'object', 'device', 'clinical', 'equipment', 'and', 'or', 'device', 'biomedical', 'equipment', 'laboratory', 'instruments', 'and', 'equipment', 'culture', 'plate', 'air', 'sampling', 'culture', 'plate', 'air', 'sampling', 'culture', 'plate', 'air', 'sampling', 'culture', 'plate'], tags=[0]),
 TaggedDocument(words=['snomed', 'ct', 'concept', 'clinical', 'finding', 'evaluation', 'finding', 'morphologic', 'finding', 'size', 'finding', 'increased', 'diameter', 'increased', 'diameter', 'increased', 'diameter'], tags=[1])]

In [11]:
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)

In [None]:
cores = multiprocessing.cpu_count()

models = [
    # PV-DBOW 
    Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter =10, workers=cores),
]


In [None]:
models[0].build_vocab(documents)
print(str(models[0]))
models[1].reset_from(models[0])
print(str(models[1]))

In [None]:
for model in models:
    %%time model.train(documents, total_examples=model.corpus_count, epochs=model.iter)

In [None]:
for model in models:
    print(str(model))
    pprint(model.docvecs.most_similar(positive=["Machine learning"], topn=20))

In [None]:
for model in models:
    print(str(model))
    vec = [model.docvecs["Lady Gaga"] - model["american"] + model["japanese"]]
    pprint([m for m in model.docvecs.most_similar(vec, topn=11) if m[0] != "Lady Gaga"])

In [13]:
model.build_vocab(train_corpus)

In [33]:
len(model.wv.vocab)

63753

In [14]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 42min 18s, sys: 1min 30s, total: 43min 49s
Wall time: 19min 29s


1161145570

In [32]:
# inferred_vector = model.infer_vector(['clinical', 'finding', 'evaluation', 'prevent', 'sampling', 'foot'])
inferred_vector = model.infer_vector(['congenital', 'prolong', 'rupture', 'premature', 'membrane', 'lung'])


sims = model.docvecs.most_similar([inferred_vector], topn=10)


for sim in sims:
    print(train_corpus[sim[0]].words, "score: ", sim[1])

['snomed', 'ct', 'concept', 'physical', 'object', 'device', 'clinical', 'equipment', 'and', 'or', 'device', 'biomedical', 'device', 'implant', 'device', 'respiratory', 'system', 'implant', 'respiratory', 'system', 'implant', 'respiratory', 'system', 'implant', 'artificial', 'lung', 'device', 'nasal', 'prosthesis', 'custom', 'nasal', 'prosthesis'] score:  0.5977195501327515
['snomed', 'ct', 'concept', 'clinical', 'finding', 'disease', 'finding', 'by', 'site', 'congenital', 'disease', 'viscus', 'structure', 'finding', 'finding', 'of', 'body', 'region', 'respiratory', 'finding', 'disorder', 'by', 'body', 'site', 'congenital', 'malformation', 'finding', 'of', 'trunk', 'structure', 'lower', 'respiratory', 'tract', 'finding', 'disorder', 'of', 'body', 'system', 'finding', 'of', 'upper', 'trunk', 'disorder', 'of', 'trunk', 'disorder', 'of', 'respiratory', 'system', 'finding', 'of', 'region', 'of', 'thorax', 'disorder', 'of', 'thoracic', 'segment', 'of', 'trunk', 'congenital', 'anomaly', 'of',

In [22]:
print("Total: ",len(train_corpus))


ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    print("Processing: ", doc_id)
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])

Total:  336902
Processing:  0
Processing:  1
Processing:  2
Processing:  3
Processing:  4
Processing:  5
Processing:  6
Processing:  7
Processing:  8
Processing:  9
Processing:  10
Processing:  11
Processing:  12
Processing:  13
Processing:  14
Processing:  15
Processing:  16
Processing:  17
Processing:  18
Processing:  19
Processing:  20
Processing:  21
Processing:  22
Processing:  23
Processing:  24
Processing:  25
Processing:  26
Processing:  27
Processing:  28
Processing:  29
Processing:  30
Processing:  31
Processing:  32
Processing:  33
Processing:  34
Processing:  35
Processing:  36
Processing:  37
Processing:  38
Processing:  39
Processing:  40
Processing:  41
Processing:  42
Processing:  43
Processing:  44
Processing:  45
Processing:  46
Processing:  47
Processing:  48
Processing:  49
Processing:  50
Processing:  51
Processing:  52
Processing:  53
Processing:  54
Processing:  55
Processing:  56
Processing:  57
Processing:  58
Processing:  59
Processing:  60
Processing:  61
Pro

KeyboardInterrupt: 

In [23]:
collections.Counter(ranks) 

Counter({0: 55,
         1: 20,
         2: 10,
         3: 8,
         4: 7,
         5: 4,
         6: 2,
         7: 4,
         8: 3,
         9: 1,
         10: 3,
         12: 1,
         14: 3,
         18: 1,
         20: 1,
         21: 2,
         22: 1,
         28: 1,
         30: 1,
         33: 1,
         41: 1,
         47: 1,
         49: 1,
         69: 1,
         107: 1,
         120: 1,
         135: 1,
         163: 1,
         230: 1,
         293: 1,
         317: 1,
         365: 1,
         372: 1,
         385: 1,
         412: 1,
         515: 1,
         655: 1,
         978: 1,
         1439: 1,
         1829: 1,
         1842: 1,
         2163: 1,
         2353: 1,
         2625: 1,
         2646: 1,
         3723: 1,
         3733: 1,
         4923: 1,
         5034: 1,
         5106: 1,
         5156: 1,
         5692: 1,
         6168: 1,
         6361: 1,
         7484: 1,
         7594: 1,
         9157: 1,
         9586: 1,
         10023: 1,
      