In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install --upgrade gensim
# !{sys.executable} -m pip install nltk
# !{sys.executable} -m pip install beautifulsoup4
# import nltk
# nltk.download('punkt')

In [1]:
import os
import re
import gensim
import multiprocessing
import random
import logging
import numpy as np
import zipfile
import tensorflow as tf

from urllib import request

from pathlib import Path
from os import listdir
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
SKIP_FILES = ""
NEWLINE = '\n'

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

"""
read training files
"""
def read_files(path):
    print("path: {}...".format(path))
    for root, dirnames, filenames in os.walk(path):
        for dirname in dirnames:
            read_files(os.path.join(root, dirname))
        for filename in filenames:
            if filename not in SKIP_FILES:
                filepath = os.path.join(root, filename)
                if os.path.isfile(filepath):
                    lines = []
                    f = open(filepath, encoding='latin-1')
                    for line in f:
                        lines.append(line)
                    f.close()
                    content = NEWLINE.join(lines)
                    yield filename, content

def download(url, filename):
    """Download a file if not present"""
    if not os.path.exists(filename):
        print("downloading {}...".format(filename))
        filename, _ = request.urlretrieve(url + filename, filename)

        print("extracting {}...".format(filename))
        with zipfile.ZipFile(filename) as f:
            f.extractall()

    """directory model for saving model while training"""
    if not os.path.exists('model'):
        os.mkdir('model')
        print("directory model created...")

    return filename

In [3]:
def get_data(url, filename):
    filename = download(url, filename)
    data_path = filename.replace('.zip','')
    documents = []
    print("building documents...")
    for fname, text in read_files(data_path):
        documents.append(clean_str(text).split(' '))
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    print("building documents done")
    return documents

In [None]:
# download data training
documents = get_data(url='https://github.com/kadriansyah/notebook/raw/master/alodokter-doc2vec-article/', filename="data.zip")
print("we have {} documents".format(len(documents)))

downloading data.zip...


In [5]:
documents[0]

TaggedDocument(words=['penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dan', 'cara', 'mengatasinya', 'bayi', 'muntah', 'setelah', 'minum', 'asi', '(', 'air', 'susu', 'ibu', ')', 'adalah', 'keluhan', 'yang', 'sering', 'terjadi', 'sebagian', 'bayi', 'bahkan', 'mengalaminya', 'hampir', 'setiap', 'kali', 'selesai', 'menyusu', 'meski', 'umumnya', 'normal', 'kondisi', 'ini', 'bisa', 'juga', 'disebabkan', 'oleh', 'gangguan', 'berbahaya', 'yang', 'harus', 'diwaspadai', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dikenal', 'dengan', 'istilah', 'gumoh', 'gumoh', 'dikatakan', 'normal', 'apabila', 'tidak', 'menyebabkan', 'bayi', 'rewel', 'atau', 'sesak', 'napas', 'meskipun', 'dapat', 'dicegah', 'kondisi', 'tersebut', 'tidak', 'memerlukan', 'penanganan', 'khusus', 'dan', 'normal', 'terjadi', 'penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'gumoh', 'disebabkan', 'oleh', 'asi', 'atau', 'susu', 'yang', 'ditelan', 'bayi', 'kembali', 'ke', 'kerongkongan', 'karena', 'otot', 'di', 'sal

In [6]:
def evaluate(model, documents, steps):
    percentiles = np.zeros(steps)
    for step in range(steps):
        docid = np.random.randint(model.docvecs.count)
        inferred_vector = model.infer_vector(documents[docid][0])
        similars = model.docvecs.most_similar(positive=[inferred_vector], topn=10)
        for idx,simdoc in enumerate(similars):
            if simdoc[0] == docid:
                print("found similar document with id {} in position {} with similarity score {}".format(simdoc[0], idx, simdoc[1]))
                percentiles[step] = ((len(similars) - idx) / len(similars)) * 100
                break
    return np.mean(percentiles)

def train(documents=documents, model_name="model/alodokter-articles-doc2vec.model", max_epochs=50, patience=3):
    best_mean_percentiles = 0
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Doc2Vec(dm=1, vector_size=300, window=2, alpha=0.1, min_alpha=0.0001, min_count=5, epochs=1, workers=5)
    model.build_vocab(documents)
    for epoch in range(max_epochs):
        print('training epoch {:d} ...'.format(epoch))
        model.train(documents, total_examples=model.corpus_count,epochs=model.epochs)
        mean_percentiles = evaluate(model,documents,10)
        print('mean percentiles: {:.2f}'.format(mean_percentiles))
        
        if mean_percentiles < best_mean_percentiles:
            print("current mean_percentiles: {:.2f}, best: {:.2f}".format(mean_percentiles, best_mean_percentiles))
            patience = patience-1
        else:
            best_mean_percentiles = mean_percentiles
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(mean_percentiles))
            model.save(model_name)
            patience = patience+1
        
        if patience == 0:
            print("early stop...")
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(best_mean_percentiles))
            break
    
    return model

In [7]:
model = train(documents)

2020-02-23 08:46:41,928 : INFO : collecting all words and their counts
2020-02-23 08:46:41,928 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-02-23 08:46:42,580 : INFO : collected 54008 word types and 7932 unique tags from a corpus of 7932 examples and 4866436 words
2020-02-23 08:46:42,581 : INFO : Loading a fresh vocabulary
2020-02-23 08:46:42,613 : INFO : effective_min_count=5 retains 15409 unique words (28% of original 54008, drops 38599)
2020-02-23 08:46:42,614 : INFO : effective_min_count=5 leaves 4811064 word corpus (98% of original 4866436, drops 55372)
2020-02-23 08:46:42,654 : INFO : deleting the raw counts dictionary of 54008 items
2020-02-23 08:46:42,656 : INFO : sample=0.001 downsamples 51 most-common words
2020-02-23 08:46:42,657 : INFO : downsampling leaves estimated 4105293 word corpus (85.3% of prior 4811064)
2020-02-23 08:46:42,698 : INFO : estimated required memory for 15409 words and 300 dimensions: 54204500 bytes
2020-02-23 08:4

training epoch 0 ...


2020-02-23 08:46:47,375 : INFO : EPOCH 1 - PROGRESS: at 36.91% examples, 1508902 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:46:48,379 : INFO : EPOCH 1 - PROGRESS: at 75.47% examples, 1541982 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:46:49,022 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:46:49,024 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:46:49,025 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:46:49,027 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:46:49,032 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:46:49,032 : INFO : EPOCH - 1 : training on 4866436 raw words (4113442 effective words) took 2.7s, 1542559 effective words/s
2020-02-23 08:46:49,033 : INFO : training on a 4866436 raw words (4113442 effective words) took 2.7s, 1540515 effective words/s
2020-02-23 08:46:49,034 : INFO : precomputing L2-n

found similar document with id 3538 in position 0 with similarity score 0.6707550883293152
found similar document with id 5152 in position 0 with similarity score 0.7327303290367126
found similar document with id 7281 in position 0 with similarity score 0.7627062201499939
found similar document with id 4481 in position 0 with similarity score 0.7637534141540527
found similar document with id 2770 in position 0 with similarity score 0.7537136077880859
found similar document with id 6782 in position 0 with similarity score 0.7837415337562561
found similar document with id 2088 in position 0 with similarity score 0.613426923751831
found similar document with id 5165 in position 0 with similarity score 0.7269462943077087
found similar document with id 5539 in position 0 with similarity score 0.7787278890609741
found similar document with id 3822 in position 0 with similarity score 0.5798944234848022
mean percentiles: 100.00


2020-02-23 08:46:49,536 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 08:46:49,537 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 1 ...


2020-02-23 08:46:50,549 : INFO : EPOCH 1 - PROGRESS: at 31.42% examples, 1279618 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:46:51,561 : INFO : EPOCH 1 - PROGRESS: at 66.75% examples, 1362452 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:46:52,409 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:46:52,410 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:46:52,411 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:46:52,412 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:46:52,415 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:46:52,415 : INFO : EPOCH - 1 : training on 4866436 raw words (4113779 effective words) took 2.9s, 1430908 effective words/s
2020-02-23 08:46:52,416 : INFO : training on a 4866436 raw words (4113779 effective words) took 2.9s, 1429390 effective words/s
2020-02-23 08:46:52,447 : INFO : training model wi

found similar document with id 1532 in position 0 with similarity score 0.6240188479423523
found similar document with id 4786 in position 0 with similarity score 0.6457080841064453
found similar document with id 4638 in position 0 with similarity score 0.6878639459609985
found similar document with id 431 in position 3 with similarity score 0.5665192604064941
found similar document with id 2177 in position 0 with similarity score 0.6415995359420776
found similar document with id 2228 in position 1 with similarity score 0.6698856353759766
found similar document with id 6718 in position 0 with similarity score 0.7500876784324646
found similar document with id 3492 in position 0 with similarity score 0.667715311050415
found similar document with id 4413 in position 0 with similarity score 0.721293568611145
found similar document with id 1888 in position 0 with similarity score 0.696130096912384
mean percentiles: 96.00
current mean_percentiles: 96.00, best: 100.00
training epoch 2 ...


2020-02-23 08:46:53,451 : INFO : EPOCH 1 - PROGRESS: at 32.63% examples, 1339976 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:46:54,457 : INFO : EPOCH 1 - PROGRESS: at 68.76% examples, 1412554 words/s, in_qsize 9, out_qsize 1
2020-02-23 08:46:55,259 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:46:55,261 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:46:55,262 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:46:55,263 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:46:55,267 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:46:55,268 : INFO : EPOCH - 1 : training on 4866436 raw words (4113356 effective words) took 2.8s, 1460006 effective words/s
2020-02-23 08:46:55,268 : INFO : training on a 4866436 raw words (4113356 effective words) took 2.8s, 1458161 effective words/s
2020-02-23 08:46:55,300 : INFO : training model wit

found similar document with id 1782 in position 0 with similarity score 0.6951780319213867
found similar document with id 3722 in position 0 with similarity score 0.5354634523391724
found similar document with id 4457 in position 0 with similarity score 0.6111356616020203
found similar document with id 1609 in position 0 with similarity score 0.5046414136886597
found similar document with id 6076 in position 0 with similarity score 0.6949859857559204
found similar document with id 7870 in position 0 with similarity score 0.5444523692131042
found similar document with id 4978 in position 0 with similarity score 0.668070912361145
found similar document with id 4951 in position 0 with similarity score 0.708315372467041
found similar document with id 3603 in position 0 with similarity score 0.6080686450004578
mean percentiles: 90.00
current mean_percentiles: 90.00, best: 100.00
training epoch 3 ...


2020-02-23 08:46:56,313 : INFO : EPOCH 1 - PROGRESS: at 34.30% examples, 1401116 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:46:57,314 : INFO : EPOCH 1 - PROGRESS: at 70.13% examples, 1439043 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:46:58,076 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:46:58,078 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:46:58,079 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:46:58,081 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:46:58,083 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:46:58,084 : INFO : EPOCH - 1 : training on 4866436 raw words (4113458 effective words) took 2.8s, 1479933 effective words/s
2020-02-23 08:46:58,084 : INFO : training on a 4866436 raw words (4113458 effective words) took 2.8s, 1477936 effective words/s
2020-02-23 08:46:58,112 : INFO : training model wi

found similar document with id 2021 in position 0 with similarity score 0.5519502758979797
found similar document with id 7898 in position 0 with similarity score 0.6792017221450806
found similar document with id 5719 in position 0 with similarity score 0.5279531478881836
found similar document with id 1691 in position 1 with similarity score 0.5611472129821777
found similar document with id 1470 in position 0 with similarity score 0.4522853493690491
found similar document with id 7235 in position 0 with similarity score 0.585452675819397
found similar document with id 1369 in position 0 with similarity score 0.5770550966262817
found similar document with id 5705 in position 0 with similarity score 0.664800763130188
found similar document with id 7664 in position 0 with similarity score 0.5632209777832031
found similar document with id 3482 in position 1 with similarity score 0.5628969669342041
mean percentiles: 98.00
current mean_percentiles: 98.00, best: 100.00
training epoch 4 ...


2020-02-23 08:46:59,118 : INFO : EPOCH 1 - PROGRESS: at 36.17% examples, 1482893 words/s, in_qsize 9, out_qsize 0
2020-02-23 08:47:00,118 : INFO : EPOCH 1 - PROGRESS: at 74.04% examples, 1522234 words/s, in_qsize 10, out_qsize 0
2020-02-23 08:47:00,809 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 08:47:00,812 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 08:47:00,813 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 08:47:00,816 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 08:47:00,822 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 08:47:00,822 : INFO : EPOCH - 1 : training on 4866436 raw words (4114260 effective words) took 2.7s, 1519993 effective words/s
2020-02-23 08:47:00,823 : INFO : training on a 4866436 raw words (4114260 effective words) took 2.7s, 1517779 effective words/s


found similar document with id 5933 in position 0 with similarity score 0.4468348026275635
found similar document with id 2150 in position 0 with similarity score 0.5843720436096191
found similar document with id 3330 in position 0 with similarity score 0.6252533197402954
found similar document with id 6725 in position 0 with similarity score 0.6015385389328003
found similar document with id 7817 in position 1 with similarity score 0.5528010129928589
found similar document with id 5162 in position 0 with similarity score 0.6628072261810303
found similar document with id 5580 in position 0 with similarity score 0.5757412314414978
found similar document with id 1551 in position 2 with similarity score 0.5404307842254639
found similar document with id 3883 in position 0 with similarity score 0.5876838564872742
found similar document with id 5585 in position 0 with similarity score 0.5757378339767456
mean percentiles: 97.00
current mean_percentiles: 97.00, best: 100.00
early stop...
