In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install --upgrade gensim
# !{sys.executable} -m pip install nltk
# !{sys.executable} -m pip install beautifulsoup4
# import nltk
# nltk.download('punkt')

In [1]:
import os
import re
import gensim
import multiprocessing
import random
import logging
import numpy as np
import zipfile
import tensorflow as tf

from urllib import request

from pathlib import Path
from os import listdir
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [6]:
SKIP_FILES = ""
NEWLINE = '\n'

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

"""
read training files
"""
def read_files(path):
    print("path: {}...".format(path))
    for root, dirnames, filenames in os.walk(path):
        for dirname in dirnames:
            read_files(os.path.join(root, dirname))
        for filename in filenames:
            if filename not in SKIP_FILES:
                filepath = os.path.join(root, filename)
                if os.path.isfile(filepath):
                    lines = []
                    f = open(filepath, encoding='latin-1')
                    for line in f:
                        lines.append(line)
                    f.close()
                    content = NEWLINE.join(lines)
                    yield filename, content

def download(url, filename):
    """Download a file if not present"""
    if not os.path.exists(filename):
        print("downloading {}...".format(filename))
        filename, _ = request.urlretrieve(url + filename, filename)

        print("extracting {}...".format(filename))
        with zipfile.ZipFile(filename) as f:
            f.extractall()

    """directory data"""
    data_path = filename.replace('.zip','')
    if not os.path.exists(data_path):
        print("extracting {}...".format(filename))
        with zipfile.ZipFile(filename) as f:
            f.extractall()

    """directory model for saving model while training"""
    if not os.path.exists('model'):
        os.mkdir('model')
        print("directory model created...")

    return filename

In [7]:
def get_data(url, filename):
    filename = download(url, filename)
    data_path = filename.replace('.zip','')
    documents = []
    print("building documents...")
    for fname, text in read_files(data_path):
        documents.append(clean_str(text).split(' '))
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    print("building documents done")
    return documents

In [8]:
# download data training
documents = get_data(url='https://github.com/kadriansyah/notebook/raw/master/alodokter-doc2vec-article/', filename="data.zip")
print("we have {} documents".format(len(documents)))

extracting data.zip...
building documents...
path: data...
building documents done
we have 7932 documents


In [9]:
documents[0]

TaggedDocument(words=['penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dan', 'cara', 'mengatasinya', 'bayi', 'muntah', 'setelah', 'minum', 'asi', '(', 'air', 'susu', 'ibu', ')', 'adalah', 'keluhan', 'yang', 'sering', 'terjadi', 'sebagian', 'bayi', 'bahkan', 'mengalaminya', 'hampir', 'setiap', 'kali', 'selesai', 'menyusu', 'meski', 'umumnya', 'normal', 'kondisi', 'ini', 'bisa', 'juga', 'disebabkan', 'oleh', 'gangguan', 'berbahaya', 'yang', 'harus', 'diwaspadai', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'dikenal', 'dengan', 'istilah', 'gumoh', 'gumoh', 'dikatakan', 'normal', 'apabila', 'tidak', 'menyebabkan', 'bayi', 'rewel', 'atau', 'sesak', 'napas', 'meskipun', 'dapat', 'dicegah', 'kondisi', 'tersebut', 'tidak', 'memerlukan', 'penanganan', 'khusus', 'dan', 'normal', 'terjadi', 'penyebab', 'bayi', 'muntah', 'setelah', 'minum', 'asi', 'gumoh', 'disebabkan', 'oleh', 'asi', 'atau', 'susu', 'yang', 'ditelan', 'bayi', 'kembali', 'ke', 'kerongkongan', 'karena', 'otot', 'di', 'sal

In [10]:
def evaluate(model, documents, steps):
    percentiles = np.zeros(steps)
    for step in range(steps):
        docid = np.random.randint(model.docvecs.count)
        inferred_vector = model.infer_vector(documents[docid][0])
        similars = model.docvecs.most_similar(positive=[inferred_vector], topn=10)
        for idx,simdoc in enumerate(similars):
            if simdoc[0] == docid:
                print("found similar document with id {} in position {} with similarity score {}".format(simdoc[0], idx, simdoc[1]))
                percentiles[step] = ((len(similars) - idx) / len(similars)) * 100
                break
    return np.mean(percentiles)

def train(documents=documents, model_name="model/alodokter-articles-doc2vec.model", max_epochs=50, patience=3):
    best_mean_percentiles = 0
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Doc2Vec(dm=1, vector_size=300, window=2, alpha=0.1, min_alpha=0.0001, min_count=5, epochs=1, workers=5)
    model.build_vocab(documents)
    for epoch in range(max_epochs):
        print('training epoch {:d} ...'.format(epoch))
        model.train(documents, total_examples=model.corpus_count,epochs=model.epochs)
        mean_percentiles = evaluate(model,documents,10)
        print('mean percentiles: {:.2f}'.format(mean_percentiles))
        
        if mean_percentiles < best_mean_percentiles:
            print("current mean_percentiles: {:.2f}, best: {:.2f}".format(mean_percentiles, best_mean_percentiles))
            patience = patience-1
        else:
            best_mean_percentiles = mean_percentiles
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(mean_percentiles))
            model.save(model_name)
            patience = patience+1
        
        if patience == 0:
            print("early stop...")
            print("========== Saving best model with mean_percentiles: {:.2f} ==========".format(best_mean_percentiles))
            break
    
    return model

In [11]:
model = train(documents)

2020-02-23 09:52:51,811 : INFO : collecting all words and their counts
2020-02-23 09:52:51,812 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-02-23 09:52:52,469 : INFO : collected 54008 word types and 7932 unique tags from a corpus of 7932 examples and 4866436 words
2020-02-23 09:52:52,470 : INFO : Loading a fresh vocabulary
2020-02-23 09:52:52,505 : INFO : effective_min_count=5 retains 15409 unique words (28% of original 54008, drops 38599)
2020-02-23 09:52:52,505 : INFO : effective_min_count=5 leaves 4811064 word corpus (98% of original 4866436, drops 55372)
2020-02-23 09:52:52,549 : INFO : deleting the raw counts dictionary of 54008 items
2020-02-23 09:52:52,551 : INFO : sample=0.001 downsamples 51 most-common words
2020-02-23 09:52:52,551 : INFO : downsampling leaves estimated 4105293 word corpus (85.3% of prior 4811064)
2020-02-23 09:52:52,595 : INFO : estimated required memory for 15409 words and 300 dimensions: 54204500 bytes
2020-02-23 09:5

training epoch 0 ...


2020-02-23 09:52:57,234 : INFO : EPOCH 1 - PROGRESS: at 37.10% examples, 1520374 words/s, in_qsize 10, out_qsize 1
2020-02-23 09:52:58,238 : INFO : EPOCH 1 - PROGRESS: at 75.86% examples, 1551284 words/s, in_qsize 9, out_qsize 0
2020-02-23 09:52:58,863 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 09:52:58,866 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 09:52:58,870 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 09:52:58,872 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 09:52:58,873 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 09:52:58,873 : INFO : EPOCH - 1 : training on 4866436 raw words (4113376 effective words) took 2.6s, 1554339 effective words/s
2020-02-23 09:52:58,874 : INFO : training on a 4866436 raw words (4113376 effective words) took 2.6s, 1552905 effective words/s
2020-02-23 09:52:58,875 : INFO : precomputing L2-n

found similar document with id 3220 in position 0 with similarity score 0.6797736883163452
found similar document with id 5805 in position 0 with similarity score 0.7531086206436157
found similar document with id 4936 in position 0 with similarity score 0.6532344222068787
found similar document with id 2180 in position 0 with similarity score 0.695888876914978
found similar document with id 3914 in position 0 with similarity score 0.6866844296455383
found similar document with id 5740 in position 0 with similarity score 0.755409836769104
found similar document with id 6122 in position 0 with similarity score 0.6206434965133667
found similar document with id 2444 in position 0 with similarity score 0.7208614945411682
found similar document with id 3624 in position 5 with similarity score 0.5664368867874146
found similar document with id 5689 in position 0 with similarity score 0.7234717011451721
mean percentiles: 95.00


2020-02-23 09:52:59,347 : INFO : saved model/alodokter-articles-doc2vec.model
2020-02-23 09:52:59,349 : INFO : training model with 5 workers on 15409 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2


training epoch 1 ...


2020-02-23 09:53:00,352 : INFO : EPOCH 1 - PROGRESS: at 37.10% examples, 1530072 words/s, in_qsize 9, out_qsize 0
2020-02-23 09:53:01,365 : INFO : EPOCH 1 - PROGRESS: at 74.47% examples, 1521657 words/s, in_qsize 9, out_qsize 0
2020-02-23 09:53:02,311 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 09:53:02,313 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 09:53:02,316 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 09:53:02,317 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 09:53:02,319 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 09:53:02,319 : INFO : EPOCH - 1 : training on 4866436 raw words (4113077 effective words) took 3.0s, 1385887 effective words/s
2020-02-23 09:53:02,320 : INFO : training on a 4866436 raw words (4113077 effective words) took 3.0s, 1384554 effective words/s
2020-02-23 09:53:02,349 : INFO : training model wit

found similar document with id 3893 in position 0 with similarity score 0.7209415435791016
found similar document with id 4659 in position 0 with similarity score 0.5547820329666138
found similar document with id 5367 in position 0 with similarity score 0.5608341693878174
found similar document with id 5493 in position 0 with similarity score 0.5272362232208252
found similar document with id 6636 in position 0 with similarity score 0.6744707226753235
found similar document with id 2982 in position 0 with similarity score 0.6794629096984863
found similar document with id 6973 in position 0 with similarity score 0.5008974671363831
found similar document with id 6066 in position 0 with similarity score 0.693284809589386
found similar document with id 5500 in position 0 with similarity score 0.672776460647583
mean percentiles: 90.00
current mean_percentiles: 90.00, best: 95.00
training epoch 2 ...


2020-02-23 09:53:03,369 : INFO : EPOCH 1 - PROGRESS: at 22.39% examples, 898993 words/s, in_qsize 10, out_qsize 0
2020-02-23 09:53:04,371 : INFO : EPOCH 1 - PROGRESS: at 56.69% examples, 1158463 words/s, in_qsize 9, out_qsize 0
2020-02-23 09:53:05,375 : INFO : EPOCH 1 - PROGRESS: at 95.01% examples, 1294459 words/s, in_qsize 10, out_qsize 0
2020-02-23 09:53:05,495 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 09:53:05,506 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 09:53:05,511 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 09:53:05,514 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 09:53:05,515 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 09:53:05,515 : INFO : EPOCH - 1 : training on 4866436 raw words (4113602 effective words) took 3.2s, 1300716 effective words/s
2020-02-23 09:53:05,516 : INFO : training on a 4866436 raw words

found similar document with id 3081 in position 0 with similarity score 0.6022744178771973
found similar document with id 1444 in position 0 with similarity score 0.5619080662727356
found similar document with id 7296 in position 2 with similarity score 0.5168426036834717
found similar document with id 3862 in position 0 with similarity score 0.6275359392166138
found similar document with id 5078 in position 0 with similarity score 0.6552468538284302
found similar document with id 4145 in position 0 with similarity score 0.7030065655708313
found similar document with id 1332 in position 1 with similarity score 0.4750661253929138
found similar document with id 5062 in position 0 with similarity score 0.6510938405990601
mean percentiles: 77.00
current mean_percentiles: 77.00, best: 95.00
training epoch 3 ...


2020-02-23 09:53:06,552 : INFO : EPOCH 1 - PROGRESS: at 36.17% examples, 1485612 words/s, in_qsize 10, out_qsize 0
2020-02-23 09:53:07,558 : INFO : EPOCH 1 - PROGRESS: at 64.79% examples, 1332341 words/s, in_qsize 10, out_qsize 0
2020-02-23 09:53:08,560 : INFO : EPOCH 1 - PROGRESS: at 99.03% examples, 1354957 words/s, in_qsize 5, out_qsize 0
2020-02-23 09:53:08,566 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 09:53:08,567 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 09:53:08,574 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 09:53:08,575 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 09:53:08,584 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 09:53:08,585 : INFO : EPOCH - 1 : training on 4866436 raw words (4113119 effective words) took 3.0s, 1356201 effective words/s
2020-02-23 09:53:08,586 : INFO : training on a 4866436 raw word

found similar document with id 7032 in position 0 with similarity score 0.5312426686286926
found similar document with id 7145 in position 0 with similarity score 0.5309252738952637
found similar document with id 871 in position 0 with similarity score 0.5175334811210632
found similar document with id 5977 in position 2 with similarity score 0.5449908375740051
found similar document with id 704 in position 7 with similarity score 0.5392845869064331
found similar document with id 7649 in position 0 with similarity score 0.5665024518966675
found similar document with id 2926 in position 0 with similarity score 0.5165708661079407
found similar document with id 3268 in position 0 with similarity score 0.6013530492782593
found similar document with id 6692 in position 0 with similarity score 0.6598948240280151
mean percentiles: 81.00
current mean_percentiles: 81.00, best: 95.00
training epoch 4 ...


2020-02-23 09:53:09,632 : INFO : EPOCH 1 - PROGRESS: at 33.55% examples, 1372494 words/s, in_qsize 9, out_qsize 0
2020-02-23 09:53:10,641 : INFO : EPOCH 1 - PROGRESS: at 70.13% examples, 1434771 words/s, in_qsize 9, out_qsize 0
2020-02-23 09:53:11,438 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-02-23 09:53:11,440 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-02-23 09:53:11,442 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-23 09:53:11,443 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-23 09:53:11,452 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-02-23 09:53:11,453 : INFO : EPOCH - 1 : training on 4866436 raw words (4112814 effective words) took 2.8s, 1455009 effective words/s
2020-02-23 09:53:11,453 : INFO : training on a 4866436 raw words (4112814 effective words) took 2.8s, 1453446 effective words/s


found similar document with id 5085 in position 0 with similarity score 0.5799938440322876
found similar document with id 7348 in position 0 with similarity score 0.5145840644836426
found similar document with id 6292 in position 0 with similarity score 0.5893575549125671
found similar document with id 2687 in position 0 with similarity score 0.5952913761138916
found similar document with id 5110 in position 0 with similarity score 0.6060895919799805
found similar document with id 7198 in position 0 with similarity score 0.618311882019043
found similar document with id 6849 in position 0 with similarity score 0.5817482471466064
found similar document with id 7487 in position 0 with similarity score 0.5786994695663452
mean percentiles: 80.00
current mean_percentiles: 80.00, best: 95.00
early stop...
