In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install --upgrade gensim
# !{sys.executable} -m pip install nltk
# !{sys.executable} -m pip install beautifulsoup4
# import nltk
# nltk.download('punkt')

In [1]:
import os
import re
import gensim
import multiprocessing
import random
import logging
import numpy as np
import zipfile

from urllib import request

from pathlib import Path
from os import listdir
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
SKIP_FILES = ""
NEWLINE = '\n'

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

"""
read training files
"""
def read_files(path):
    print("path: {}...".format(path))
    for root, dirnames, filenames in os.walk(path):
        for dirname in dirnames:
            read_files(os.path.join(root, dirname))
        for filename in filenames:
            if filename not in SKIP_FILES:
                filepath = os.path.join(root, filename)
                if os.path.isfile(filepath):
                    lines = []
                    f = open(filepath, encoding='latin-1')
                    for line in f:
                        lines.append(line)
                    f.close()
                    content = NEWLINE.join(lines)
                    yield filename, content

def download(url, filename):
    """Download a file if not present"""
    if not os.path.exists(filename):
        print("downloading {}...".format(filename))
        filename, _ = request.urlretrieve(url + filename, filename)

        print("extracting {}...".format(filename))
        with zipfile.ZipFile(filename) as f:
            f.extractall()
            print("extracting {} done".format(filename))

    """directory data"""
    data_path = filename.replace('.zip','')
    if not os.path.exists(data_path):
        print("extracting {}...".format(filename))
        with zipfile.ZipFile(filename) as f:
            f.extractall()
            print("extracting {} done".format(filename))

    """directory model for saving model while training"""
    if not os.path.exists('model'):
        os.mkdir('model')
        print("directory model created...")

    return data_path

In [3]:
def get_data(url, filename):
    data_path = download(url, filename)
    documents = []
    print("building documents...")
    for fname, text in read_files(data_path):
        documents.append(clean_str(text).split(' '))
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    print("building documents done")
    return documents

In [4]:
# download data training
documents = get_data(url='https://github.com/kadriansyah/notebook/raw/master/alodokter-doc2vec-article/', filename="data.zip")
print("we have {} documents".format(len(documents)))

downloading data.zip...
extracting data.zip...
directory model created...
building documents...
path: data...
building documents done
we have 7932 documents


In [5]:
documents[0]

TaggedDocument(words=['penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dan', 'cara', 'mengatasinya', 'bayi', 'muntah', 'setelah', 'minum', 'asi', '(', 'air', 'susu', 'ibu', ')', 'adalah', 'keluhan', 'yang', 'sering', 'terjadi', 'sebagian', 'bayi', 'bahkan', 'mengalaminya', 'hampir', 'setiap', 'kali', 'selesai', 'menyusu', 'meski', 'umumnya', 'normal', 'kondisi', 'ini', 'bisa', 'juga', 'disebabkan', 'oleh', 'gangguan', 'berbahaya', 'yang', 'harus', 'diwaspadai', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dikenal', 'dengan', 'istilah', 'gumoh', 'gumoh', 'dikatakan', 'normal', 'apabila', 'tidak', 'menyebabkan', 'bayi', 'rewel', 'atau', 'sesak', 'napas', 'meskipun', 'dapat', 'dicegah', 'kondisi', 'tersebut', 'tidak', 'memerlukan', 'penanganan', 'khusus', 'dan', 'normal', 'terjadi', 'penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'gumoh', 'disebabkan', 'oleh', 'asi', 'atau', 'susu', 'yang', 'ditelan', 'bayi', 'kembali', 'ke', 'kerongkongan', 'karena', 'otot', 'di', 'sal

In [6]:
def evaluate(model, documents, steps):
    percentiles = np.zeros(steps)
    for step in range(steps):
        docid = np.random.randint(model.docvecs.count)
        inferred_vector = model.infer_vector(documents[docid][0])
        similars = model.docvecs.most_similar(positive=[inferred_vector], topn=10)
        for idx,simdoc in enumerate(similars):
            if simdoc[0] == docid:
                print("found similar document with id {} in position {} with similarity score {}".format(simdoc[0], idx, simdoc[1]))
                percentiles[step] = ((len(similars) - idx) / len(similars)) * 100
                break
    return np.mean(percentiles)

def train(documents=documents, model_name="model/alodokter-articles-doc2vec.model", max_epochs=50, patience=3):
    best_mean_percentiles = 0
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Doc2Vec(dm=1, vector_size=300, window=2, alpha=0.1, min_alpha=0.0001, min_count=5, epochs=1, workers=5)
    model.build_vocab(documents)
    for epoch in range(max_epochs):
        print('training epoch {:d} ...'.format(epoch))
        model.train(documents, total_examples=model.corpus_count,epochs=model.epochs)
        mean_percentiles = evaluate(model,documents,10)
        print('mean percentiles: {:.2f}'.format(mean_percentiles))
        
        if mean_percentiles < best_mean_percentiles:
            print("current mean_percentiles: {:.2f}, best: {:.2f}".format(mean_percentiles, best_mean_percentiles))
            patience = patience-1
        else:
            best_mean_percentiles = mean_percentiles
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(mean_percentiles))
            model.save(model_name)
            patience = patience+1
        
        if patience == 0:
            print("early stop...")
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(best_mean_percentiles))
            break
    
    return model

In [7]:
model = train(documents)

2020-02-23 09:59:53,541 : INFO : collecting all words and their counts
2020-02-23 09:59:53,542 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-02-23 09:59:54,211 : INFO : collected 54008 word types and 7932 unique tags from a corpus of 7932 examples and 4866436 words
2020-02-23 09:59:54,212 : INFO : Loading a fresh vocabulary
2020-02-23 09:59:54,248 : INFO : effective_min_count=5 retains 15409 unique words (28% of original 54008, drops 38599)
2020-02-23 09:59:54,248 : INFO : effective_min_count=5 leaves 4811064 word corpus (98% of original 4866436, drops 55372)
2020-02-23 09:59:54,290 : INFO : deleting the raw counts dictionary of 54008 items
2020-02-23 09:59:54,292 : INFO : sample=0.001 downsamples 51 most-common words
2020-02-23 09:59:54,293 : INFO : downsampling leaves estimated 4105293 word corpus (85.3% of prior 4811064)
2020-02-23 09:59:54,333 : INFO : estimated required memory for 15409 words and 300 dimensions: 54204500 bytes
2020-02-23 09:5

training epoch 0 ...


2020-02-23 09:59:58,943 : INFO : EPOCH 1 - PROGRESS: at 36.91% examples, 1515329 words/s, in_qsize 9, out_qsize 0
2020-02-23 09:59:59,947 : INFO : EPOCH 1 - PROGRESS: at 75.26% examples, 1540463 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:00,621 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:00,629 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:00,631 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:00,631 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:00,632 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:00,633 : INFO : EPOCH - 1 : training on 4866436 raw words (4113427 effective words) took 2.7s, 1525990 effective words/s
2020-02-23 10:00:00,634 : INFO : training on a 4866436 raw words (4113427 effective words) took 2.7s, 1524820 effective words/s
2020-02-23 10:00:00,636 : INFO : precomputing L2-no

found similar document with id 3700 in position 0 with similarity score 0.6231238842010498
found similar document with id 7568 in position 0 with similarity score 0.7852587103843689
found similar document with id 5502 in position 0 with similarity score 0.7272913455963135
found similar document with id 4699 in position 0 with similarity score 0.750701367855072
found similar document with id 4810 in position 0 with similarity score 0.7558729648590088
found similar document with id 4295 in position 0 with similarity score 0.8224493265151978
found similar document with id 6448 in position 0 with similarity score 0.8298599720001221
found similar document with id 2632 in position 0 with similarity score 0.7019071578979492
found similar document with id 3084 in position 0 with similarity score 0.585481584072113
mean percentiles: 90.00


2020-02-23 10:00:01,124 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 10:00:01,125 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 1 ...


2020-02-23 10:00:02,141 : INFO : EPOCH 1 - PROGRESS: at 26.63% examples, 1074402 words/s, in_qsize 8, out_qsize 1
2020-02-23 10:00:03,142 : INFO : EPOCH 1 - PROGRESS: at 60.33% examples, 1234665 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:04,142 : INFO : EPOCH 1 - PROGRESS: at 96.75% examples, 1323094 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:04,211 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:04,214 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:04,222 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:04,229 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:04,233 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:04,233 : INFO : EPOCH - 1 : training on 4866436 raw words (4113436 effective words) took 3.1s, 1325287 effective words/s
2020-02-23 10:00:04,234 : INFO : training on a 4866436 raw words

found similar document with id 571 in position 1 with similarity score 0.5982423424720764
found similar document with id 7040 in position 0 with similarity score 0.6924117207527161
found similar document with id 1551 in position 0 with similarity score 0.5424140095710754
found similar document with id 6438 in position 0 with similarity score 0.669685959815979
found similar document with id 373 in position 9 with similarity score 0.4449630379676819
found similar document with id 5437 in position 0 with similarity score 0.6495814323425293
found similar document with id 4804 in position 0 with similarity score 0.6243957877159119
found similar document with id 2464 in position 0 with similarity score 0.637941837310791
found similar document with id 3942 in position 0 with similarity score 0.6084734797477722
found similar document with id 6217 in position 0 with similarity score 0.7411275506019592
mean percentiles: 90.00


2020-02-23 10:00:04,660 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 10:00:04,661 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 2 ...


2020-02-23 10:00:05,673 : INFO : EPOCH 1 - PROGRESS: at 36.91% examples, 1510696 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:06,677 : INFO : EPOCH 1 - PROGRESS: at 74.89% examples, 1530604 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:07,439 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:07,441 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:07,443 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:07,445 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:07,450 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:07,451 : INFO : EPOCH - 1 : training on 4866436 raw words (4113873 effective words) took 2.8s, 1476558 effective words/s
2020-02-23 10:00:07,451 : INFO : training on a 4866436 raw words (4113873 effective words) took 2.8s, 1474992 effective words/s
2020-02-23 10:00:07,479 : INFO : training model wi

found similar document with id 6266 in position 0 with similarity score 0.4655887484550476
found similar document with id 4497 in position 0 with similarity score 0.592694878578186
found similar document with id 2337 in position 0 with similarity score 0.5482645630836487
found similar document with id 2086 in position 0 with similarity score 0.6456488370895386
found similar document with id 3489 in position 0 with similarity score 0.6533195376396179
found similar document with id 4807 in position 0 with similarity score 0.670346736907959
found similar document with id 5934 in position 0 with similarity score 0.6296235918998718
found similar document with id 1630 in position 2 with similarity score 0.526195764541626
found similar document with id 4401 in position 0 with similarity score 0.5927433371543884
mean percentiles: 88.00
current mean_percentiles: 88.00, best: 90.00
training epoch 3 ...


2020-02-23 10:00:08,483 : INFO : EPOCH 1 - PROGRESS: at 30.08% examples, 1232685 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:09,485 : INFO : EPOCH 1 - PROGRESS: at 63.64% examples, 1310117 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:10,420 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:10,424 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:10,427 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:10,432 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:10,433 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:10,433 : INFO : EPOCH - 1 : training on 4866436 raw words (4112803 effective words) took 3.0s, 1393835 effective words/s
2020-02-23 10:00:10,434 : INFO : training on a 4866436 raw words (4112803 effective words) took 3.0s, 1392241 effective words/s
2020-02-23 10:00:10,465 : INFO : saving Doc2Vec ob

found similar document with id 6927 in position 0 with similarity score 0.5845781564712524
found similar document with id 4407 in position 0 with similarity score 0.7064756155014038
found similar document with id 6178 in position 0 with similarity score 0.611943781375885
found similar document with id 678 in position 3 with similarity score 0.5245034694671631
found similar document with id 3654 in position 0 with similarity score 0.6501209735870361
found similar document with id 7185 in position 0 with similarity score 0.5646973848342896
found similar document with id 3661 in position 0 with similarity score 0.5748719573020935
found similar document with id 5724 in position 0 with similarity score 0.6617432832717896
found similar document with id 2752 in position 0 with similarity score 0.6111876368522644
found similar document with id 3916 in position 0 with similarity score 0.7313289642333984
mean percentiles: 97.00


2020-02-23 10:00:10,874 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 10:00:10,875 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 4 ...


2020-02-23 10:00:11,890 : INFO : EPOCH 1 - PROGRESS: at 37.10% examples, 1513927 words/s, in_qsize 10, out_qsize 1
2020-02-23 10:00:12,899 : INFO : EPOCH 1 - PROGRESS: at 75.08% examples, 1528522 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:13,525 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:13,528 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:13,529 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:13,530 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:13,535 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:13,536 : INFO : EPOCH - 1 : training on 4866436 raw words (4113198 effective words) took 2.7s, 1548181 effective words/s
2020-02-23 10:00:13,536 : INFO : training on a 4866436 raw words (4113198 effective words) took 2.7s, 1545907 effective words/s
2020-02-23 10:00:13,562 : INFO : training model w

found similar document with id 7296 in position 5 with similarity score 0.4947265684604645
found similar document with id 3299 in position 0 with similarity score 0.5944305658340454
found similar document with id 7384 in position 0 with similarity score 0.563948392868042
found similar document with id 2174 in position 0 with similarity score 0.5458066463470459
found similar document with id 7795 in position 0 with similarity score 0.6145461797714233
found similar document with id 6517 in position 0 with similarity score 0.6841855049133301
found similar document with id 3347 in position 0 with similarity score 0.5656232833862305
found similar document with id 4263 in position 0 with similarity score 0.5948383212089539
found similar document with id 733 in position 2 with similarity score 0.534643292427063
mean percentiles: 83.00
current mean_percentiles: 83.00, best: 97.00
training epoch 5 ...


2020-02-23 10:00:14,565 : INFO : EPOCH 1 - PROGRESS: at 37.10% examples, 1531295 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:15,567 : INFO : EPOCH 1 - PROGRESS: at 74.26% examples, 1525621 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:16,257 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:16,261 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:16,265 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:16,268 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:16,273 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:16,274 : INFO : EPOCH - 1 : training on 4866436 raw words (4114013 effective words) took 2.7s, 1518458 effective words/s
2020-02-23 10:00:16,274 : INFO : training on a 4866436 raw words (4114013 effective words) took 2.7s, 1516778 effective words/s
2020-02-23 10:00:16,300 : INFO : training model w

found similar document with id 4536 in position 0 with similarity score 0.5941367745399475
found similar document with id 1575 in position 0 with similarity score 0.5020698308944702
found similar document with id 4938 in position 0 with similarity score 0.5367515683174133
found similar document with id 1693 in position 0 with similarity score 0.5017469525337219
found similar document with id 6706 in position 0 with similarity score 0.5239243507385254
found similar document with id 5156 in position 0 with similarity score 0.5190706849098206
found similar document with id 5807 in position 0 with similarity score 0.6640538573265076
found similar document with id 5658 in position 0 with similarity score 0.6732567548751831
found similar document with id 7867 in position 9 with similarity score 0.49959293007850647
mean percentiles: 81.00
current mean_percentiles: 81.00, best: 97.00
training epoch 6 ...


2020-02-23 10:00:17,310 : INFO : EPOCH 1 - PROGRESS: at 35.70% examples, 1460172 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:18,311 : INFO : EPOCH 1 - PROGRESS: at 73.01% examples, 1497440 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:19,020 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:19,025 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:19,029 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:19,032 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:19,037 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:19,037 : INFO : EPOCH - 1 : training on 4866436 raw words (4113792 effective words) took 2.7s, 1504702 effective words/s
2020-02-23 10:00:19,038 : INFO : training on a 4866436 raw words (4113792 effective words) took 2.7s, 1503053 effective words/s
2020-02-23 10:00:19,066 : INFO : saving Doc2Vec obj

found similar document with id 1086 in position 0 with similarity score 0.5050361156463623
found similar document with id 7821 in position 0 with similarity score 0.4977290630340576
found similar document with id 4293 in position 0 with similarity score 0.5938915014266968
found similar document with id 1289 in position 0 with similarity score 0.4444548189640045
found similar document with id 5550 in position 0 with similarity score 0.5768442153930664
found similar document with id 3930 in position 0 with similarity score 0.5669462084770203
found similar document with id 5120 in position 0 with similarity score 0.540306806564331
found similar document with id 5317 in position 0 with similarity score 0.6424815654754639
found similar document with id 1765 in position 1 with similarity score 0.49373659491539
found similar document with id 4473 in position 0 with similarity score 0.5734233856201172
mean percentiles: 99.00


2020-02-23 10:00:19,470 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 10:00:19,471 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 7 ...


2020-02-23 10:00:20,482 : INFO : EPOCH 1 - PROGRESS: at 36.91% examples, 1511032 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:21,484 : INFO : EPOCH 1 - PROGRESS: at 72.44% examples, 1483106 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:22,245 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:22,246 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:22,252 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:22,254 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:22,257 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:22,258 : INFO : EPOCH - 1 : training on 4866436 raw words (4112647 effective words) took 2.8s, 1476996 effective words/s
2020-02-23 10:00:22,258 : INFO : training on a 4866436 raw words (4112647 effective words) took 2.8s, 1475870 effective words/s
2020-02-23 10:00:22,282 : INFO : training model wit

found similar document with id 2386 in position 0 with similarity score 0.6560759544372559
found similar document with id 2935 in position 0 with similarity score 0.5385875701904297
found similar document with id 5445 in position 6 with similarity score 0.4988081455230713
found similar document with id 2887 in position 0 with similarity score 0.5284514427185059
found similar document with id 4150 in position 0 with similarity score 0.565554141998291
found similar document with id 962 in position 0 with similarity score 0.5409508347511292
found similar document with id 4267 in position 0 with similarity score 0.5429056882858276
found similar document with id 5097 in position 0 with similarity score 0.6230177879333496
mean percentiles: 74.00
current mean_percentiles: 74.00, best: 99.00
training epoch 8 ...


2020-02-23 10:00:23,287 : INFO : EPOCH 1 - PROGRESS: at 35.26% examples, 1450329 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:24,293 : INFO : EPOCH 1 - PROGRESS: at 73.63% examples, 1509079 words/s, in_qsize 10, out_qsize 1
2020-02-23 10:00:24,980 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:24,984 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:24,987 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:24,989 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:24,990 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:24,991 : INFO : EPOCH - 1 : training on 4866436 raw words (4113100 effective words) took 2.7s, 1520113 effective words/s
2020-02-23 10:00:24,991 : INFO : training on a 4866436 raw words (4113100 effective words) took 2.7s, 1518216 effective words/s
2020-02-23 10:00:25,016 : INFO : saving Doc2Vec o

found similar document with id 6952 in position 0 with similarity score 0.5690685510635376
found similar document with id 5084 in position 0 with similarity score 0.6109544038772583
found similar document with id 3548 in position 0 with similarity score 0.5299829840660095
found similar document with id 4701 in position 0 with similarity score 0.5516369342803955
found similar document with id 583 in position 0 with similarity score 0.44746965169906616
found similar document with id 4207 in position 0 with similarity score 0.53471839427948
found similar document with id 7704 in position 0 with similarity score 0.5501301884651184
found similar document with id 7734 in position 0 with similarity score 0.6207619905471802
found similar document with id 4241 in position 0 with similarity score 0.609315037727356
found similar document with id 6880 in position 0 with similarity score 0.5950736999511719
mean percentiles: 100.00


2020-02-23 10:00:25,414 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 10:00:25,415 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 9 ...


2020-02-23 10:00:26,423 : INFO : EPOCH 1 - PROGRESS: at 37.71% examples, 1551746 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:27,428 : INFO : EPOCH 1 - PROGRESS: at 72.63% examples, 1488935 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:28,219 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:28,223 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:28,225 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:28,226 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:28,228 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:28,228 : INFO : EPOCH - 1 : training on 4866436 raw words (4113059 effective words) took 2.8s, 1464859 effective words/s
2020-02-23 10:00:28,229 : INFO : training on a 4866436 raw words (4113059 effective words) took 2.8s, 1462053 effective words/s
2020-02-23 10:00:28,252 : INFO : training model wi

found similar document with id 7623 in position 1 with similarity score 0.5045701861381531
found similar document with id 5692 in position 0 with similarity score 0.4925502836704254
found similar document with id 3873 in position 0 with similarity score 0.6064233779907227
found similar document with id 2735 in position 1 with similarity score 0.44647854566574097
found similar document with id 2619 in position 0 with similarity score 0.45451104640960693
found similar document with id 4347 in position 0 with similarity score 0.464932382106781
found similar document with id 3388 in position 0 with similarity score 0.4294683337211609
found similar document with id 4167 in position 0 with similarity score 0.5130352973937988
mean percentiles: 78.00
current mean_percentiles: 78.00, best: 100.00
training epoch 10 ...


2020-02-23 10:00:29,274 : INFO : EPOCH 1 - PROGRESS: at 36.91% examples, 1496369 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:30,274 : INFO : EPOCH 1 - PROGRESS: at 70.35% examples, 1436538 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:31,015 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:31,018 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:31,019 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:31,020 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:31,028 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:31,029 : INFO : EPOCH - 1 : training on 4866436 raw words (4113468 effective words) took 2.8s, 1483231 effective words/s
2020-02-23 10:00:31,029 : INFO : training on a 4866436 raw words (4113468 effective words) took 2.8s, 1481350 effective words/s
2020-02-23 10:00:31,053 : INFO : training model wi

found similar document with id 4119 in position 0 with similarity score 0.4817700982093811
found similar document with id 2976 in position 3 with similarity score 0.4866436719894409
found similar document with id 6770 in position 0 with similarity score 0.5098720788955688
found similar document with id 7777 in position 0 with similarity score 0.5361309051513672
found similar document with id 4517 in position 1 with similarity score 0.5081150531768799
found similar document with id 4165 in position 0 with similarity score 0.5586714148521423
found similar document with id 5589 in position 0 with similarity score 0.5203511118888855
found similar document with id 1063 in position 0 with similarity score 0.5195136070251465
mean percentiles: 76.00
current mean_percentiles: 76.00, best: 100.00
training epoch 11 ...


2020-02-23 10:00:32,061 : INFO : EPOCH 1 - PROGRESS: at 37.32% examples, 1532816 words/s, in_qsize 10, out_qsize 0
2020-02-23 10:00:33,077 : INFO : EPOCH 1 - PROGRESS: at 76.07% examples, 1548092 words/s, in_qsize 10, out_qsize 1
2020-02-23 10:00:33,662 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:33,667 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:33,668 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:33,671 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:33,672 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:33,672 : INFO : EPOCH - 1 : training on 4866436 raw words (4113151 effective words) took 2.6s, 1572415 effective words/s
2020-02-23 10:00:33,673 : INFO : training on a 4866436 raw words (4113151 effective words) took 2.6s, 1570482 effective words/s
2020-02-23 10:00:33,698 : INFO : training model w

found similar document with id 6504 in position 0 with similarity score 0.43613314628601074
found similar document with id 7434 in position 0 with similarity score 0.5340348482131958
found similar document with id 531 in position 8 with similarity score 0.48238855600357056
found similar document with id 7359 in position 1 with similarity score 0.3738921582698822
found similar document with id 7758 in position 5 with similarity score 0.37554943561553955
found similar document with id 6593 in position 0 with similarity score 0.3600296080112457
mean percentiles: 46.00
current mean_percentiles: 46.00, best: 100.00
training epoch 12 ...


2020-02-23 10:00:34,704 : INFO : EPOCH 1 - PROGRESS: at 36.28% examples, 1486485 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:35,715 : INFO : EPOCH 1 - PROGRESS: at 74.47% examples, 1521212 words/s, in_qsize 9, out_qsize 0
2020-02-23 10:00:36,375 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 10:00:36,380 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 10:00:36,382 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 10:00:36,382 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 10:00:36,384 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 10:00:36,385 : INFO : EPOCH - 1 : training on 4866436 raw words (4114068 effective words) took 2.7s, 1532675 effective words/s
2020-02-23 10:00:36,385 : INFO : training on a 4866436 raw words (4114068 effective words) took 2.7s, 1530995 effective words/s


found similar document with id 5518 in position 0 with similarity score 0.33522993326187134
found similar document with id 3107 in position 3 with similarity score 0.45306405425071716
found similar document with id 6105 in position 1 with similarity score 0.4211587607860565
mean percentiles: 26.00
current mean_percentiles: 26.00, best: 100.00
early stop...
