In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install --upgrade gensim
# !{sys.executable} -m pip install nltk
# import nltk
# nltk.download('punkt')
# !{sys.executable} -m pip install beautifulsoup4

In [1]:
import os
import re
import gensim
import multiprocessing
import random
import logging
import numpy as np

from pathlib import Path
from os import listdir
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
CORPUS_PATH  = 'data/'
SKIP_FILES = ""
NEWLINE = '\n'

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

"""
read training files
"""
def read_files(path=CORPUS_PATH):
#     print("processing path: {} ...".format(path))
    for root, dirnames, filenames in os.walk(path):
        for dirname in dirnames:
            read_files(os.path.join(root, dirname))
        for filename in filenames:
            if filename not in SKIP_FILES:
                filepath = os.path.join(root, filename)
                if os.path.isfile(filepath):
                    lines = []
                    f = open(filepath, encoding='latin-1')
                    for line in f:
                        lines.append(line)
                    f.close()
                    content = NEWLINE.join(lines)
                    yield filename, content

In [3]:
def get_data():
    documents = []
    for filename, text in read_files():
#         print("==== filename is {} ====\n{}\n\n".format(filename, text))
        documents.append(clean_str(text).split(' '))
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    return documents

In [6]:
documents = get_data()
print("we have {} documents".format(len(documents)))

we have 7932 documents


In [14]:
documents[0]

TaggedDocument(words=['penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dan', 'cara', 'mengatasinya', 'bayi', 'muntah', 'setelah', 'minum', 'asi', '(', 'air', 'susu', 'ibu', ')', 'adalah', 'keluhan', 'yang', 'sering', 'terjadi', 'sebagian', 'bayi', 'bahkan', 'mengalaminya', 'hampir', 'setiap', 'kali', 'selesai', 'menyusu', 'meski', 'umumnya', 'normal', 'kondisi', 'ini', 'bisa', 'juga', 'disebabkan', 'oleh', 'gangguan', 'berbahaya', 'yang', 'harus', 'diwaspadai', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dikenal', 'dengan', 'istilah', 'gumoh', 'gumoh', 'dikatakan', 'normal', 'apabila', 'tidak', 'menyebabkan', 'bayi', 'rewel', 'atau', 'sesak', 'napas', 'meskipun', 'dapat', 'dicegah', 'kondisi', 'tersebut', 'tidak', 'memerlukan', 'penanganan', 'khusus', 'dan', 'normal', 'terjadi', 'penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'gumoh', 'disebabkan', 'oleh', 'asi', 'atau', 'susu', 'yang', 'ditelan', 'bayi', 'kembali', 'ke', 'kerongkongan', 'karena', 'otot', 'di', 'sal

In [134]:
def evaluate(model, documents, steps):
    percentiles = np.zeros(steps)
    for step in range(steps):
        docid = np.random.randint(model.docvecs.count)
        inferred_vector = model.infer_vector(documents[docid][0])
        similars = model.docvecs.most_similar(positive=[inferred_vector], topn=10)
        for idx,simdoc in enumerate(similars):
            if simdoc[0] == docid:
                print("found similar document with id {} in position {} with similarity score {}".format(simdoc[0], idx, simdoc[1]))
                percentiles[step] = ((len(similars) - idx) / len(similars)) * 100
                break
    return np.mean(percentiles)

def train(documents=documents, model_name="model/alodokter-articles-doc2vec.model", max_epochs=50, patience=3):
    best_mean_percentiles = 0
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Doc2Vec(dm=1, vector_size=300, window=2, alpha=0.1, min_alpha=0.0001, min_count=5, epochs=1, workers=5)
    model.build_vocab(documents)
    for epoch in range(max_epochs):
        print('training epoch {:d} ...'.format(epoch))
        model.train(documents, total_examples=model.corpus_count,epochs=model.epochs)
        mean_percentiles = evaluate(model,documents,10)
        print('mean percentiles: {:.2f}'.format(mean_percentiles))
        
        if mean_percentiles < best_mean_percentiles:
            print("current mean_percentiles: {:.2f}, best: {:.2f}".format(mean_percentiles, best_mean_percentiles))
            patience = patience-1
        else:
            best_mean_percentiles = mean_percentiles
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(mean_percentiles))
            model.save(model_name)
            patience = patience+1
        
        if patience == 0:
            print("early stop...")
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(best_mean_percentiles))
            break
    
    return model

In [135]:
model = train(documents)

2020-02-22 22:30:25,178 : INFO : collecting all words and their counts
2020-02-22 22:30:25,179 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-02-22 22:30:25,885 : INFO : collected 54008 word types and 7932 unique tags from a corpus of 7932 examples and 4866436 words
2020-02-22 22:30:25,886 : INFO : Loading a fresh vocabulary
2020-02-22 22:30:26,042 : INFO : effective_min_count=5 retains 15409 unique words (28% of original 54008, drops 38599)
2020-02-22 22:30:26,043 : INFO : effective_min_count=5 leaves 4811064 word corpus (98% of original 4866436, drops 55372)
2020-02-22 22:30:26,084 : INFO : deleting the raw counts dictionary of 54008 items
2020-02-22 22:30:26,085 : INFO : sample=0.001 downsamples 51 most-common words
2020-02-22 22:30:26,086 : INFO : downsampling leaves estimated 4105293 word corpus (85.3% of prior 4811064)
2020-02-22 22:30:26,134 : INFO : estimated required memory for 15409 words and 300 dimensions: 54204500 bytes
2020-02-22 22:3

training epoch 0 ...


2020-02-22 22:30:31,134 : INFO : EPOCH 1 - PROGRESS: at 18.23% examples, 730934 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:32,136 : INFO : EPOCH 1 - PROGRESS: at 48.74% examples, 993015 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:33,139 : INFO : EPOCH 1 - PROGRESS: at 78.64% examples, 1070627 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:34,142 : INFO : EPOCH 1 - PROGRESS: at 95.57% examples, 978026 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:34,328 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-22 22:30:34,335 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-22 22:30:34,338 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-22 22:30:34,340 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-22 22:30:34,343 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-22 22:30:34,344 : INFO : EPOCH - 1 : training on 4866436 raw words (4112247 effec

found similar document with id 5299 in position 0 with similarity score 0.7587615847587585
found similar document with id 2364 in position 0 with similarity score 0.6658836603164673
found similar document with id 1721 in position 0 with similarity score 0.7171398401260376
found similar document with id 998 in position 2 with similarity score 0.6046286821365356
found similar document with id 3040 in position 0 with similarity score 0.5846401453018188
found similar document with id 4962 in position 0 with similarity score 0.7138915061950684
found similar document with id 3248 in position 0 with similarity score 0.607184648513794
found similar document with id 4415 in position 0 with similarity score 0.7327130436897278
found similar document with id 3125 in position 0 with similarity score 0.6402595043182373
mean percentiles: 88.00


2020-02-22 22:30:34,888 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-22 22:30:34,889 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 1 ...


2020-02-22 22:30:35,894 : INFO : EPOCH 1 - PROGRESS: at 25.62% examples, 1043574 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:36,900 : INFO : EPOCH 1 - PROGRESS: at 52.76% examples, 1083348 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:37,901 : INFO : EPOCH 1 - PROGRESS: at 79.22% examples, 1083757 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:38,601 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-22 22:30:38,605 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-22 22:30:38,606 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-22 22:30:38,611 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-22 22:30:38,614 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-22 22:30:38,614 : INFO : EPOCH - 1 : training on 4866436 raw words (4113669 effective words) took 3.7s, 1105142 effective words/s
2020-02-22 22:30:38,615 : INFO : training on a 4866436 raw word

found similar document with id 7786 in position 0 with similarity score 0.6994785070419312
found similar document with id 6907 in position 0 with similarity score 0.6400388479232788
found similar document with id 657 in position 0 with similarity score 0.5217722654342651
found similar document with id 4520 in position 0 with similarity score 0.6477442979812622
found similar document with id 3507 in position 0 with similarity score 0.5979304909706116
found similar document with id 4141 in position 0 with similarity score 0.5762260556221008
found similar document with id 4727 in position 0 with similarity score 0.6891166567802429
found similar document with id 1610 in position 0 with similarity score 0.5776258707046509
found similar document with id 6647 in position 0 with similarity score 0.6328952312469482
found similar document with id 2749 in position 0 with similarity score 0.584449291229248
mean percentiles: 100.00


2020-02-22 22:30:39,091 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-22 22:30:39,092 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 2 ...


2020-02-22 22:30:40,100 : INFO : EPOCH 1 - PROGRESS: at 18.62% examples, 756298 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:41,111 : INFO : EPOCH 1 - PROGRESS: at 45.92% examples, 938576 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:42,120 : INFO : EPOCH 1 - PROGRESS: at 75.26% examples, 1023762 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:42,906 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-22 22:30:42,911 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-22 22:30:42,915 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-22 22:30:42,918 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-22 22:30:42,923 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-22 22:30:42,924 : INFO : EPOCH - 1 : training on 4866436 raw words (4113325 effective words) took 3.8s, 1074630 effective words/s
2020-02-22 22:30:42,924 : INFO : training on a 4866436 raw words (4

found similar document with id 7427 in position 0 with similarity score 0.5726195573806763
found similar document with id 4845 in position 0 with similarity score 0.6867046356201172
found similar document with id 1296 in position 0 with similarity score 0.6159794926643372
found similar document with id 4833 in position 0 with similarity score 0.6139371395111084
found similar document with id 1253 in position 6 with similarity score 0.4956313669681549
found similar document with id 3999 in position 0 with similarity score 0.5518233776092529
found similar document with id 2582 in position 2 with similarity score 0.4904721975326538
found similar document with id 2419 in position 0 with similarity score 0.6069518327713013
mean percentiles: 72.00
current mean_percentiles: 72.00, best: 100.00
training epoch 3 ...


2020-02-22 22:30:43,973 : INFO : EPOCH 1 - PROGRESS: at 26.01% examples, 1056570 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:44,982 : INFO : EPOCH 1 - PROGRESS: at 53.59% examples, 1095970 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:45,992 : INFO : EPOCH 1 - PROGRESS: at 84.34% examples, 1147609 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:46,504 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-22 22:30:46,506 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-22 22:30:46,508 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-22 22:30:46,513 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-22 22:30:46,516 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-22 22:30:46,516 : INFO : EPOCH - 1 : training on 4866436 raw words (4113563 effective words) took 3.5s, 1158935 effective words/s
2020-02-22 22:30:46,517 : INFO : training on a 4866436 raw word

found similar document with id 6760 in position 0 with similarity score 0.5724490284919739
found similar document with id 3453 in position 0 with similarity score 0.6574223041534424
found similar document with id 3717 in position 0 with similarity score 0.5568020343780518
found similar document with id 4058 in position 0 with similarity score 0.6083765029907227
found similar document with id 3818 in position 0 with similarity score 0.5052177906036377
found similar document with id 1277 in position 1 with similarity score 0.605597198009491
found similar document with id 4629 in position 0 with similarity score 0.6412221789360046
found similar document with id 1826 in position 0 with similarity score 0.5820738077163696
mean percentiles: 79.00
current mean_percentiles: 79.00, best: 100.00
training epoch 4 ...


2020-02-22 22:30:47,571 : INFO : EPOCH 1 - PROGRESS: at 29.02% examples, 1179192 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:48,579 : INFO : EPOCH 1 - PROGRESS: at 58.94% examples, 1201717 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:49,581 : INFO : EPOCH 1 - PROGRESS: at 88.05% examples, 1200258 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:49,968 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-22 22:30:49,970 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-22 22:30:49,971 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-22 22:30:49,974 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-22 22:30:49,975 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-22 22:30:49,975 : INFO : EPOCH - 1 : training on 4866436 raw words (4113122 effective words) took 3.4s, 1204145 effective words/s
2020-02-22 22:30:49,976 : INFO : training on a 4866436 raw words 

found similar document with id 2410 in position 0 with similarity score 0.5771746635437012
found similar document with id 1239 in position 0 with similarity score 0.43497687578201294
found similar document with id 6359 in position 0 with similarity score 0.58217453956604
found similar document with id 762 in position 3 with similarity score 0.4244115948677063
found similar document with id 5770 in position 0 with similarity score 0.6495475769042969
found similar document with id 1742 in position 0 with similarity score 0.6210180521011353
found similar document with id 924 in position 1 with similarity score 0.4558679461479187
found similar document with id 2059 in position 0 with similarity score 0.599189281463623
found similar document with id 6859 in position 2 with similarity score 0.4884864389896393
found similar document with id 6311 in position 0 with similarity score 0.6011608242988586
mean percentiles: 94.00
current mean_percentiles: 94.00, best: 100.00
training epoch 5 ...


2020-02-22 22:30:51,028 : INFO : EPOCH 1 - PROGRESS: at 27.61% examples, 1110314 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:52,032 : INFO : EPOCH 1 - PROGRESS: at 55.86% examples, 1141542 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:53,041 : INFO : EPOCH 1 - PROGRESS: at 84.19% examples, 1143734 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:53,588 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-22 22:30:53,593 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-22 22:30:53,600 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-22 22:30:53,601 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-22 22:30:53,604 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-22 22:30:53,605 : INFO : EPOCH - 1 : training on 4866436 raw words (4112854 effective words) took 3.6s, 1144930 effective words/s
2020-02-22 22:30:53,606 : INFO : training on a 4866436 raw words 

found similar document with id 5843 in position 0 with similarity score 0.5966777801513672
found similar document with id 3121 in position 1 with similarity score 0.595870852470398
found similar document with id 2542 in position 0 with similarity score 0.5717759728431702
found similar document with id 7703 in position 0 with similarity score 0.5209951996803284
found similar document with id 4261 in position 0 with similarity score 0.6220839023590088
found similar document with id 6729 in position 1 with similarity score 0.5184998512268066
found similar document with id 7777 in position 0 with similarity score 0.659661054611206
found similar document with id 2651 in position 0 with similarity score 0.6376208066940308
found similar document with id 3554 in position 0 with similarity score 0.5351495146751404
found similar document with id 2133 in position 0 with similarity score 0.6039854288101196
mean percentiles: 98.00
current mean_percentiles: 98.00, best: 100.00
training epoch 6 ...


2020-02-22 22:30:54,647 : INFO : EPOCH 1 - PROGRESS: at 25.82% examples, 1052883 words/s, in_qsize 9, out_qsize 0
2020-02-22 22:30:55,661 : INFO : EPOCH 1 - PROGRESS: at 53.78% examples, 1100056 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:56,662 : INFO : EPOCH 1 - PROGRESS: at 80.50% examples, 1097166 words/s, in_qsize 10, out_qsize 0
2020-02-22 22:30:57,378 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-22 22:30:57,382 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-22 22:30:57,386 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-22 22:30:57,389 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-22 22:30:57,393 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-22 22:30:57,394 : INFO : EPOCH - 1 : training on 4866436 raw words (4113017 effective words) took 3.7s, 1097558 effective words/s
2020-02-22 22:30:57,394 : INFO : training on a 4866436 raw word

found similar document with id 1345 in position 0 with similarity score 0.44535577297210693
found similar document with id 1906 in position 0 with similarity score 0.5315725803375244
found similar document with id 7199 in position 0 with similarity score 0.6234700679779053
found similar document with id 991 in position 1 with similarity score 0.44617241621017456
found similar document with id 6787 in position 0 with similarity score 0.5314539074897766
found similar document with id 3314 in position 0 with similarity score 0.5584055185317993
found similar document with id 3920 in position 0 with similarity score 0.5845468044281006
found similar document with id 4687 in position 0 with similarity score 0.64349365234375
found similar document with id 7504 in position 0 with similarity score 0.5365829467773438
found similar document with id 5070 in position 0 with similarity score 0.625667929649353
mean percentiles: 99.00
current mean_percentiles: 99.00, best: 100.00
early stop...
