In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install --upgrade gensim
# !{sys.executable} -m pip install nltk
# import nltk
# nltk.download('punkt')
# !{sys.executable} -m pip install beautifulsoup4

In [1]:
import os
import re
import gensim
import multiprocessing
import random
import logging
import numpy as np
import zipfile
import tensorflow as tf

from urllib import request

from pathlib import Path
from os import listdir
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [12]:
SKIP_FILES = ""
NEWLINE = '\n'

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

"""
read training files
"""
def read_files(path):
    print("path: {}...".format(path))
    for root, dirnames, filenames in os.walk(path):
        for dirname in dirnames:
            read_files(os.path.join(root, dirname))
        for filename in filenames:
            if filename not in SKIP_FILES:
                filepath = os.path.join(root, filename)
                if os.path.isfile(filepath):
                    lines = []
                    f = open(filepath, encoding='latin-1')
                    for line in f:
                        lines.append(line)
                    f.close()
                    content = NEWLINE.join(lines)
                    yield filename, content

def download(url, filename):
    """Download a file if not present"""
    if not os.path.exists(filename):
        print("downloading {}...".format(filename))
        filename, _ = request.urlretrieve(url + filename, filename)

        print("extracting {}...".format(filename))
        with zipfile.ZipFile(filename) as f:
            f.extractall()

    return filename

In [13]:
def get_data(url, filename):
    filename = download(url, filename)
    data_path = filename.replace('.zip','')
    documents = []
    print("building documents...")
    for fname, text in read_files(data_path):
        documents.append(clean_str(text).split(' '))
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    print("building documents done")
    return documents

In [14]:
# download data training
documents = get_data(url='https://github.com/kadriansyah/notebook/raw/master/alodokter-doc2vec-article/', filename="data.zip")
print("we have {} documents".format(len(documents)))

downloading data.zip...
extracting data.zip...
building documents...
path: data...
building documents done
we have 7932 documents


In [15]:
documents[0]

TaggedDocument(words=['penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dan', 'cara', 'mengatasinya', 'bayi', 'muntah', 'setelah', 'minum', 'asi', '(', 'air', 'susu', 'ibu', ')', 'adalah', 'keluhan', 'yang', 'sering', 'terjadi', 'sebagian', 'bayi', 'bahkan', 'mengalaminya', 'hampir', 'setiap', 'kali', 'selesai', 'menyusu', 'meski', 'umumnya', 'normal', 'kondisi', 'ini', 'bisa', 'juga', 'disebabkan', 'oleh', 'gangguan', 'berbahaya', 'yang', 'harus', 'diwaspadai', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dikenal', 'dengan', 'istilah', 'gumoh', 'gumoh', 'dikatakan', 'normal', 'apabila', 'tidak', 'menyebabkan', 'bayi', 'rewel', 'atau', 'sesak', 'napas', 'meskipun', 'dapat', 'dicegah', 'kondisi', 'tersebut', 'tidak', 'memerlukan', 'penanganan', 'khusus', 'dan', 'normal', 'terjadi', 'penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'gumoh', 'disebabkan', 'oleh', 'asi', 'atau', 'susu', 'yang', 'ditelan', 'bayi', 'kembali', 'ke', 'kerongkongan', 'karena', 'otot', 'di', 'sal

In [16]:
def evaluate(model, documents, steps):
    percentiles = np.zeros(steps)
    for step in range(steps):
        docid = np.random.randint(model.docvecs.count)
        inferred_vector = model.infer_vector(documents[docid][0])
        similars = model.docvecs.most_similar(positive=[inferred_vector], topn=10)
        for idx,simdoc in enumerate(similars):
            if simdoc[0] == docid:
                print("found similar document with id {} in position {} with similarity score {}".format(simdoc[0], idx, simdoc[1]))
                percentiles[step] = ((len(similars) - idx) / len(similars)) * 100
                break
    return np.mean(percentiles)

def train(documents=documents, model_name="model/alodokter-articles-doc2vec.model", max_epochs=50, patience=3):
    best_mean_percentiles = 0
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Doc2Vec(dm=1, vector_size=300, window=2, alpha=0.1, min_alpha=0.0001, min_count=5, epochs=1, workers=5)
    model.build_vocab(documents)
    for epoch in range(max_epochs):
        print('training epoch {:d} ...'.format(epoch))
        model.train(documents, total_examples=model.corpus_count,epochs=model.epochs)
        mean_percentiles = evaluate(model,documents,10)
        print('mean percentiles: {:.2f}'.format(mean_percentiles))
        
        if mean_percentiles < best_mean_percentiles:
            print("current mean_percentiles: {:.2f}, best: {:.2f}".format(mean_percentiles, best_mean_percentiles))
            patience = patience-1
        else:
            best_mean_percentiles = mean_percentiles
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(mean_percentiles))
            model.save(model_name)
            patience = patience+1
        
        if patience == 0:
            print("early stop...")
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(best_mean_percentiles))
            break
    
    return model

In [17]:
model = train(documents)

2020-02-23 08:41:29,648 : INFO : collecting all words and their counts
2020-02-23 08:41:29,649 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-02-23 08:41:30,310 : INFO : collected 54008 word types and 7932 unique tags from a corpus of 7932 examples and 4866436 words
2020-02-23 08:41:30,311 : INFO : Loading a fresh vocabulary
2020-02-23 08:41:30,343 : INFO : effective_min_count=5 retains 15409 unique words (28% of original 54008, drops 38599)
2020-02-23 08:41:30,343 : INFO : effective_min_count=5 leaves 4811064 word corpus (98% of original 4866436, drops 55372)
2020-02-23 08:41:30,383 : INFO : deleting the raw counts dictionary of 54008 items
2020-02-23 08:41:30,385 : INFO : sample=0.001 downsamples 51 most-common words
2020-02-23 08:41:30,386 : INFO : downsampling leaves estimated 4105293 word corpus (85.3% of prior 4811064)
2020-02-23 08:41:30,422 : INFO : estimated required memory for 15409 words and 300 dimensions: 54204500 bytes
2020-02-23 08:4

training epoch 0 ...


2020-02-23 08:41:35,088 : INFO : EPOCH 1 - PROGRESS: at 37.10% examples, 1515933 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:41:36,092 : INFO : EPOCH 1 - PROGRESS: at 75.08% examples, 1532878 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:36,818 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:36,821 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:36,822 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:36,825 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:36,827 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:36,827 : INFO : EPOCH - 1 : training on 4866436 raw words (4113463 effective words) took 2.7s, 1495929 effective words/s
2020-02-23 08:41:36,828 : INFO : training on a 4866436 raw words (4113463 effective words) took 2.8s, 1493561 effective words/s
2020-02-23 08:41:36,830 : INFO : precomputing L2-n

found similar document with id 7771 in position 0 with similarity score 0.618470311164856
found similar document with id 4449 in position 0 with similarity score 0.6838314533233643
found similar document with id 7650 in position 0 with similarity score 0.6442806124687195
found similar document with id 4448 in position 0 with similarity score 0.7068660259246826
found similar document with id 960 in position 0 with similarity score 0.6356767416000366
found similar document with id 4337 in position 0 with similarity score 0.6365523338317871
found similar document with id 3311 in position 0 with similarity score 0.6094967126846313
found similar document with id 7759 in position 0 with similarity score 0.5914686322212219
mean percentiles: 80.00


2020-02-23 08:41:37,407 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 08:41:37,408 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 1 ...


2020-02-23 08:41:38,414 : INFO : EPOCH 1 - PROGRESS: at 32.22% examples, 1322448 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:39,416 : INFO : EPOCH 1 - PROGRESS: at 66.75% examples, 1373973 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:41:40,311 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:40,313 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:40,321 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:40,322 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:40,325 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:40,325 : INFO : EPOCH - 1 : training on 4866436 raw words (4113476 effective words) took 2.9s, 1412168 effective words/s
2020-02-23 08:41:40,326 : INFO : training on a 4866436 raw words (4113476 effective words) took 2.9s, 1410178 effective words/s
2020-02-23 08:41:40,359 : INFO : saving Doc2Vec ob

found similar document with id 380 in position 1 with similarity score 0.5233646035194397
found similar document with id 4849 in position 0 with similarity score 0.7199378609657288
found similar document with id 2750 in position 0 with similarity score 0.6436284780502319
found similar document with id 2805 in position 0 with similarity score 0.7999988198280334
found similar document with id 2684 in position 0 with similarity score 0.7287818193435669
found similar document with id 2196 in position 0 with similarity score 0.6625823974609375
found similar document with id 710 in position 8 with similarity score 0.41550952196121216
found similar document with id 664 in position 0 with similarity score 0.629470705986023
found similar document with id 7719 in position 0 with similarity score 0.6686818599700928
found similar document with id 4874 in position 0 with similarity score 0.6538910269737244
mean percentiles: 91.00


2020-02-23 08:41:40,780 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 08:41:40,781 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 2 ...


2020-02-23 08:41:41,789 : INFO : EPOCH 1 - PROGRESS: at 32.03% examples, 1311347 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:41:42,797 : INFO : EPOCH 1 - PROGRESS: at 66.16% examples, 1356705 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:43,720 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:43,721 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:43,721 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:43,722 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:43,723 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:43,724 : INFO : EPOCH - 1 : training on 4866436 raw words (4113264 effective words) took 2.9s, 1399904 effective words/s
2020-02-23 08:41:43,724 : INFO : training on a 4866436 raw words (4113264 effective words) took 2.9s, 1397843 effective words/s
2020-02-23 08:41:43,753 : INFO : saving Doc2Vec ob

found similar document with id 2343 in position 0 with similarity score 0.6491297483444214
found similar document with id 6264 in position 0 with similarity score 0.6432464122772217
found similar document with id 5264 in position 0 with similarity score 0.7168463468551636
found similar document with id 315 in position 0 with similarity score 0.5046045780181885
found similar document with id 4661 in position 0 with similarity score 0.6944633722305298
found similar document with id 6456 in position 0 with similarity score 0.7050161361694336
found similar document with id 4388 in position 0 with similarity score 0.6985473036766052
found similar document with id 6606 in position 0 with similarity score 0.7297171354293823
found similar document with id 6508 in position 0 with similarity score 0.7628588676452637
found similar document with id 4372 in position 0 with similarity score 0.6826696395874023
mean percentiles: 100.00


2020-02-23 08:41:44,164 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 08:41:44,165 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 3 ...


2020-02-23 08:41:45,171 : INFO : EPOCH 1 - PROGRESS: at 33.55% examples, 1379413 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:41:46,173 : INFO : EPOCH 1 - PROGRESS: at 69.59% examples, 1431320 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:46,954 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:46,960 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:46,962 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:46,963 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:46,968 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:46,969 : INFO : EPOCH - 1 : training on 4866436 raw words (4112892 effective words) took 2.8s, 1469352 effective words/s
2020-02-23 08:41:46,969 : INFO : training on a 4866436 raw words (4112892 effective words) took 2.8s, 1467204 effective words/s
2020-02-23 08:41:47,000 : INFO : saving Doc2Vec ob

found similar document with id 2353 in position 0 with similarity score 0.5704185366630554
found similar document with id 7312 in position 0 with similarity score 0.6147781610488892
found similar document with id 4404 in position 0 with similarity score 0.6751293540000916
found similar document with id 6989 in position 0 with similarity score 0.630764365196228
found similar document with id 6862 in position 0 with similarity score 0.6600054502487183
found similar document with id 2847 in position 0 with similarity score 0.4859165847301483
found similar document with id 5461 in position 0 with similarity score 0.6169723272323608
found similar document with id 7024 in position 0 with similarity score 0.6728838682174683
found similar document with id 4956 in position 0 with similarity score 0.6004559993743896
found similar document with id 1795 in position 0 with similarity score 0.6219866871833801
mean percentiles: 100.00


2020-02-23 08:41:47,417 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 08:41:47,419 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 4 ...


2020-02-23 08:41:48,424 : INFO : EPOCH 1 - PROGRESS: at 33.93% examples, 1393621 words/s, in_qsize 10, out_qsize 1
2020-02-23 08:41:49,439 : INFO : EPOCH 1 - PROGRESS: at 69.77% examples, 1424979 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:41:50,274 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:50,276 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:50,277 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:50,278 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:50,284 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:50,285 : INFO : EPOCH - 1 : training on 4866436 raw words (4112857 effective words) took 2.9s, 1436213 effective words/s
2020-02-23 08:41:50,285 : INFO : training on a 4866436 raw words (4112857 effective words) took 2.9s, 1435016 effective words/s
2020-02-23 08:41:50,313 : INFO : training model w

found similar document with id 4416 in position 0 with similarity score 0.539395809173584
found similar document with id 4446 in position 0 with similarity score 0.4877135753631592
found similar document with id 7717 in position 0 with similarity score 0.5399603843688965
found similar document with id 2272 in position 1 with similarity score 0.49606576561927795
found similar document with id 2121 in position 0 with similarity score 0.4883211851119995
found similar document with id 2137 in position 0 with similarity score 0.610095739364624
found similar document with id 2303 in position 0 with similarity score 0.4863532781600952
found similar document with id 2613 in position 0 with similarity score 0.5358152389526367
found similar document with id 7425 in position 0 with similarity score 0.5046092867851257
mean percentiles: 89.00
current mean_percentiles: 89.00, best: 100.00
training epoch 5 ...


2020-02-23 08:41:51,325 : INFO : EPOCH 1 - PROGRESS: at 35.26% examples, 1441089 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:52,327 : INFO : EPOCH 1 - PROGRESS: at 71.60% examples, 1466438 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:53,031 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:53,040 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:53,041 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:53,042 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:53,049 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:53,050 : INFO : EPOCH - 1 : training on 4866436 raw words (4112984 effective words) took 2.7s, 1504796 effective words/s
2020-02-23 08:41:53,050 : INFO : training on a 4866436 raw words (4112984 effective words) took 2.7s, 1502751 effective words/s
2020-02-23 08:41:53,075 : INFO : training model wit

found similar document with id 5169 in position 0 with similarity score 0.6398381590843201
found similar document with id 4191 in position 0 with similarity score 0.6157698035240173
found similar document with id 6475 in position 0 with similarity score 0.6017863154411316
found similar document with id 4073 in position 0 with similarity score 0.6039165258407593
found similar document with id 7805 in position 0 with similarity score 0.6157862544059753
found similar document with id 2025 in position 0 with similarity score 0.6671191453933716
found similar document with id 5544 in position 0 with similarity score 0.5649850368499756
found similar document with id 5414 in position 0 with similarity score 0.5075386762619019
mean percentiles: 80.00
current mean_percentiles: 80.00, best: 100.00
training epoch 6 ...


2020-02-23 08:41:54,085 : INFO : EPOCH 1 - PROGRESS: at 36.28% examples, 1482031 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:55,091 : INFO : EPOCH 1 - PROGRESS: at 75.44% examples, 1542107 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:55,698 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:55,701 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:55,702 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:55,703 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:55,705 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:55,706 : INFO : EPOCH - 1 : training on 4866436 raw words (4112831 effective words) took 2.6s, 1565129 effective words/s
2020-02-23 08:41:55,706 : INFO : training on a 4866436 raw words (4112831 effective words) took 2.6s, 1563089 effective words/s
2020-02-23 08:41:55,731 : INFO : training model wit

found similar document with id 5987 in position 0 with similarity score 0.6235440969467163
found similar document with id 7579 in position 0 with similarity score 0.5886069536209106
found similar document with id 5081 in position 0 with similarity score 0.5626044273376465
found similar document with id 6315 in position 0 with similarity score 0.5298286080360413
found similar document with id 4279 in position 0 with similarity score 0.5991606116294861
found similar document with id 535 in position 0 with similarity score 0.42960047721862793
found similar document with id 7027 in position 0 with similarity score 0.549343466758728
found similar document with id 5610 in position 0 with similarity score 0.569473385810852
found similar document with id 6295 in position 0 with similarity score 0.5916352868080139
mean percentiles: 90.00
current mean_percentiles: 90.00, best: 100.00
training epoch 7 ...


2020-02-23 08:41:56,735 : INFO : EPOCH 1 - PROGRESS: at 37.90% examples, 1562272 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:57,737 : INFO : EPOCH 1 - PROGRESS: at 77.05% examples, 1582388 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:41:58,310 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:41:58,318 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:41:58,321 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:41:58,322 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:41:58,323 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:41:58,324 : INFO : EPOCH - 1 : training on 4866436 raw words (4113355 effective words) took 2.6s, 1588614 effective words/s
2020-02-23 08:41:58,324 : INFO : training on a 4866436 raw words (4113355 effective words) took 2.6s, 1586686 effective words/s
2020-02-23 08:41:58,346 : INFO : training model wit

found similar document with id 991 in position 0 with similarity score 0.5198101997375488
found similar document with id 3045 in position 0 with similarity score 0.6133900880813599
found similar document with id 7699 in position 0 with similarity score 0.5554860830307007
found similar document with id 5714 in position 0 with similarity score 0.5412174463272095
found similar document with id 6919 in position 0 with similarity score 0.520093560218811
found similar document with id 5699 in position 0 with similarity score 0.5821018218994141
found similar document with id 2839 in position 0 with similarity score 0.5411393046379089
found similar document with id 3437 in position 0 with similarity score 0.47233492136001587
found similar document with id 6512 in position 0 with similarity score 0.5540144443511963
mean percentiles: 90.00
current mean_percentiles: 90.00, best: 100.00
training epoch 8 ...


2020-02-23 08:41:59,354 : INFO : EPOCH 1 - PROGRESS: at 37.90% examples, 1557353 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:42:00,356 : INFO : EPOCH 1 - PROGRESS: at 73.39% examples, 1506569 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:42:01,094 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:42:01,099 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:42:01,101 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:42:01,107 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:42:01,108 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:42:01,108 : INFO : EPOCH - 1 : training on 4866436 raw words (4112987 effective words) took 2.8s, 1491274 effective words/s
2020-02-23 08:42:01,108 : INFO : training on a 4866436 raw words (4112987 effective words) took 2.8s, 1489491 effective words/s
2020-02-23 08:42:01,140 : INFO : training model wi

found similar document with id 7512 in position 0 with similarity score 0.5821084976196289
found similar document with id 3810 in position 0 with similarity score 0.5245790481567383
found similar document with id 5674 in position 0 with similarity score 0.582007884979248
found similar document with id 1349 in position 7 with similarity score 0.4493425786495209
found similar document with id 1311 in position 4 with similarity score 0.4602513909339905
found similar document with id 3572 in position 0 with similarity score 0.4742148518562317
found similar document with id 2951 in position 0 with similarity score 0.4879973530769348
found similar document with id 6860 in position 0 with similarity score 0.4633292257785797
mean percentiles: 69.00
current mean_percentiles: 69.00, best: 100.00
training epoch 9 ...


2020-02-23 08:42:02,147 : INFO : EPOCH 1 - PROGRESS: at 36.54% examples, 1502694 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:42:03,149 : INFO : EPOCH 1 - PROGRESS: at 73.84% examples, 1515827 words/s, in_qsize 10, out_qsize 1
2020-02-23 08:42:03,782 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:42:03,789 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:42:03,790 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:42:03,791 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:42:03,794 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:42:03,795 : INFO : EPOCH - 1 : training on 4866436 raw words (4113005 effective words) took 2.7s, 1551364 effective words/s
2020-02-23 08:42:03,795 : INFO : training on a 4866436 raw words (4113005 effective words) took 2.7s, 1549290 effective words/s
2020-02-23 08:42:03,817 : INFO : training model w

found similar document with id 402 in position 1 with similarity score 0.39155665040016174
found similar document with id 4305 in position 0 with similarity score 0.4275767207145691
found similar document with id 4799 in position 0 with similarity score 0.4248434901237488
found similar document with id 775 in position 4 with similarity score 0.3866235017776489
found similar document with id 2293 in position 9 with similarity score 0.3794844448566437
found similar document with id 6129 in position 0 with similarity score 0.5287708640098572
found similar document with id 694 in position 7 with similarity score 0.3622402250766754
found similar document with id 3377 in position 0 with similarity score 0.46711117029190063
found similar document with id 6086 in position 0 with similarity score 0.4845796227455139
mean percentiles: 69.00
current mean_percentiles: 69.00, best: 100.00
training epoch 10 ...


2020-02-23 08:42:04,827 : INFO : EPOCH 1 - PROGRESS: at 35.70% examples, 1461237 words/s, in_qsize 10, out_qsize 1
2020-02-23 08:42:05,829 : INFO : EPOCH 1 - PROGRESS: at 75.47% examples, 1546078 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:42:06,444 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:42:06,445 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:42:06,450 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:42:06,454 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:42:06,456 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:42:06,456 : INFO : EPOCH - 1 : training on 4866436 raw words (4113866 effective words) took 2.6s, 1561320 effective words/s
2020-02-23 08:42:06,456 : INFO : training on a 4866436 raw words (4113866 effective words) took 2.6s, 1559173 effective words/s


found similar document with id 557 in position 4 with similarity score 0.4253980815410614
found similar document with id 6400 in position 0 with similarity score 0.4955461621284485
found similar document with id 5756 in position 0 with similarity score 0.47075992822647095
found similar document with id 4463 in position 2 with similarity score 0.4597526490688324
found similar document with id 1244 in position 3 with similarity score 0.3155973255634308
found similar document with id 467 in position 3 with similarity score 0.44359251856803894
found similar document with id 1760 in position 0 with similarity score 0.5016111731529236
found similar document with id 4002 in position 3 with similarity score 0.44518810510635376
mean percentiles: 65.00
current mean_percentiles: 65.00, best: 100.00
early stop...
