In [1]:
from common import *
from matplotlib import pyplot as plt
import smart_open

In [2]:
# Load gold_mongo.json from DATA_FOLDER; `gold` is later used to pick the test
# documents and is passed to evaluate() together with the predictions.
gold_path = join(DATA_FOLDER, 'gold_mongo.json')
with open(gold_path, 'r') as gold_file:
    gold = json.load(gold_file)

In [None]:
def iter_docs(fnames):
    """Yield ``(key, document)`` pairs from gzip-compressed JSON files.

    Every file is read in full, parsed with ``ujson`` and expected to contain
    a JSON object; its entries are yielded one at a time, file by file.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the gzip-compressed JSON files, processed in the given order.

    Yields
    ------
    tuple
        ``(key, document)`` for each entry of each parsed file.
    """
    for fn in fnames:
        # The whole file is parsed before anything is yielded, so the handle
        # is already closed while the caller consumes the documents.
        with GzipFile(fn, 'rb') as f:
            docs = ujson.loads(f.read())
        for k, doc in docs.items():
            yield k, doc

                
class Documents(object):
    """Re-iterable stream over the documents stored in a folder.

    Each iteration globs ``*.json.gz`` files in ``folder``, reads them through
    ``iter_docs`` and flattens every document by one level (presumably a list
    of sentences, each a list of tokens -- confirm against the data files).

    Parameters
    ----------
    folder : str
        Directory searched for ``*.json.gz`` files.
    tokens_only : bool, optional
        If true, yield the flat token list alone; otherwise yield a
        ``TaggedDocument`` whose single tag is the document key.
    """

    def __init__(self, folder, tokens_only=False):
        self.tokens_only = tokens_only
        self.folder = folder

    def __iter__(self):
        pattern = join(self.folder, '*.json.gz')
        for doc_key, sentences in iter_docs(glob(pattern)):
            # Flatten one nesting level into a single token sequence.
            tokens = []
            for sentence in sentences:
                tokens.extend(sentence)

            if not self.tokens_only:
                # Training mode: attach the document key as its tag.
                yield gensim.models.doc2vec.TaggedDocument(tokens, [doc_key])
            else:
                yield tokens

In [None]:
# Directory that holds the *.json.gz document files read by Documents.
folder = join(DATA_FOLDER, 'documents/')

# Train Doc2Vec on the tagged document stream (Documents yields
# TaggedDocument objects unless tokens_only is set).
# NOTE(review): `cpu_count` is passed without being called -- confirm that
# `common` exports it as an integer and not as the multiprocessing function.
corpus = Documents(folder)
model = Doc2Vec(
    corpus,
    size=200,
    window=15,
    min_count=5,
    dm=1,
    workers=cpu_count,
    iter=10
)

2017-09-30 18:59:42,721 [MainThread  ] [INFO ]  collecting all words and their counts
2017-09-30 18:59:42,793 [MainThread  ] [INFO ]  PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-09-30 18:59:48,312 [MainThread  ] [INFO ]  PROGRESS: at example #10000, processed 5214517 words (945145/s), 113415 word types, 10000 tags
2017-09-30 18:59:54,478 [MainThread  ] [INFO ]  PROGRESS: at example #20000, processed 10791718 words (904626/s), 157599 word types, 20000 tags
2017-09-30 19:00:04,194 [MainThread  ] [INFO ]  PROGRESS: at example #30000, processed 19494360 words (895872/s), 197295 word types, 30000 tags
2017-09-30 19:00:16,325 [MainThread  ] [INFO ]  PROGRESS: at example #40000, processed 30923454 words (942234/s), 249001 word types, 40000 tags
2017-09-30 19:00:29,758 [MainThread  ] [INFO ]  PROGRESS: at example #50000, processed 42448329 words (858070/s), 299432 word types, 50000 tags
2017-09-30 19:00:46,365 [MainThread  ] [INFO ]  PROGRESS: at example #60000,

In [None]:
# Persist the trained model; the file name encodes the hyper-parameters used
# above (size 200, window 15, min_count 5, dm 1, 10 iterations).
# NOTE(review): this relative path does not go through DATA_FOLDER -- confirm
# both point at the same data directory.
model_path = '../data/saved/d2v_model_s200_w15_mc5_dm1_iter10'
model.save(model_path)

In [8]:
# Sanity check: print the words the model considers closest to a common word,
# one "word, similarity" pair per line.
# NOTE(review): newer gensim releases expose this as model.wv.most_similar --
# confirm against the installed version before changing.
neighbours = model.most_similar(u'стол')
for word, similarity in neighbours:
    print(word + ', ' + str(similarity))

нитрозативный, 0.328898489475
мамматон, 0.315056562424
деметоксилировать, 0.310829102993
теносклеропластика, 0.309251964092
пискаревский, 0.30451887846
технологичность, 0.302434027195
ddip, 0.299423635006
перекомпиляция, 0.294496178627
диметоксикоричный, 0.294087588787
mulchandani, 0.292344450951


In [None]:
# Collect the flattened token lists of the documents whose key appears in
# `gold`; these are the documents the model is evaluated on.
test_docs = {}
fnames = glob(join(folder, '*.json.gz'))
for k, doc in iter_docs(fnames):
    # Membership is tested on the dict itself: `k in gold.keys()` builds and
    # linearly scans a list on Python 2, once per document.
    if k in gold:
        # Flatten one nesting level, the same way Documents does for training.
        test_docs[k] = [w for s in doc for w in s]

In [None]:
# For every test document, infer a vector with the trained model and keep the
# tags of its 200 most similar training documents, in the order returned.
preds = {}
for doc_key, tokens in tqdm_notebook(test_docs.items()):
    doc_vector = model.infer_vector(tokens, steps=5)
    nearest = model.docvecs.most_similar([doc_vector], topn=200)
    preds[doc_key] = [tag for tag, similarity in nearest]

In [None]:
# Score the predictions against the gold data.
result = evaluate(preds, gold)

# Draw one histogram per metric in a loop so every metric is plotted the same
# way; each figure is labelled with the metric name and shown on its own.
for metric in ('acc10', 'acc20', 'acc200'):
    ax = result[metric].hist()
    ax.set_xlabel(metric)
    plt.show()