##### source: https://github.com/adventuresinML/adventures-in-ml-code/blob/master/gensim_word2vec.py
##### source: https://towardsdatascience.com/word2vec-for-phrases-learning-embeddings-for-more-than-one-word
##### source: https://radimrehurek.com/gensim/models/phrases.html
##### source: http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/
##### source: https://towardsdatascience.com/word-embedding-with-word2vec-and-fasttext

In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install --upgrade gensim

In [None]:
import logging
import collections
import os
import zipfile

import numpy as np
import gensim
import tensorflow as tf

from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser

def build_phrases(sentences, model_name='phrases.model'):
    phrases = Phrases(sentences, min_count=5, threshold=7, progress_per=1000)
    phrases_model = Phraser(phrases)
    phrases_model.save(model_name)
    return phrases_model

def sentences_to_bigrams(phrases_model, sentences):
    bigrams_sentences = []
    for sentence in sentences:
        phrases_sentence = phrases_model[sentence]
        bigrams_sentences.append(phrases_sentence)
    return bigrams_sentences

def get_data(filename="questions.dat"):
    sentences = []
    dataset = tf.data.TextLineDataset(filename)
    dataset = dataset.enumerate() 
    for element in dataset.as_numpy_iterator():
        text = element[1].decode("utf-8")
        sentences.append(text.split(' '))
    return sentences

In [None]:
def train(model_name="alodokter-word2vec-fasttext.model"):
    sentences = get_data()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = FastText(sentences, size=100, window=5, min_count=5, workers=4, sg=1)
    model.save(model_name)
    return model

def train_bigrams(model_name="alodokter-word2vec-fasttext-bigram.model"):
    sentences = get_data()
    phrases_model = build_phrases(sentences)
    sentences = sentences_to_bigrams(phrases_model, sentences)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = FastText(sentences, size=300, window=5, min_count=1, workers=4, sg=1)
    model.save(model_name)
    return model

##### Training with Unigram

In [None]:
training = False
if training:
    model = train()
else:
    model = FastText.load("alodokter-word2vec-fasttext.model")

word = 'bab'
print("is word '{}' in vocabulary? {}".format(word, word in model.wv.vocab))
model.wv.most_similar(positive=[word], topn=20)

##### Training with Bigram

In [None]:
training = False
if training:
    model = train_bigrams()
else:
    model = FastText.load("alodokter-word2vec-fasttext-bigram.model")

phrase = 'bibir vagina'
print("is phrase '{}' in vocabulary? {}".format(phrase, phrase in model.wv.vocab))
model.wv.most_similar(positive=[phrase], topn=20)

##### Training with Bigram & Testing

In [35]:
training = False
if training:
    model = train_bigrams()
else:
    model = FastText.load("alodokter-word2vec-fasttext-bigram.model")

2020-02-20 08:45:55,388 : INFO : collecting all words and their counts
2020-02-20 08:45:55,388 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-02-20 08:45:55,417 : INFO : PROGRESS: at sentence #1000, processed 25595 words and 12843 word types
2020-02-20 08:45:55,445 : INFO : PROGRESS: at sentence #2000, processed 48378 words and 22461 word types
2020-02-20 08:45:55,475 : INFO : PROGRESS: at sentence #3000, processed 74442 words and 33061 word types
2020-02-20 08:45:55,507 : INFO : PROGRESS: at sentence #4000, processed 102098 words and 43026 word types
2020-02-20 08:45:55,537 : INFO : PROGRESS: at sentence #5000, processed 127148 words and 51309 word types
2020-02-20 08:45:55,573 : INFO : PROGRESS: at sentence #6000, processed 153949 words and 59631 word types
2020-02-20 08:45:55,607 : INFO : PROGRESS: at sentence #7000, processed 179820 words and 67503 word types
2020-02-20 08:45:55,641 : INFO : PROGRESS: at sentence #8000, processed 209236 words and 76213 w

2020-02-20 08:45:58,332 : INFO : PROGRESS: at sentence #76000, processed 2118750 words and 384123 word types
2020-02-20 08:45:58,365 : INFO : PROGRESS: at sentence #77000, processed 2144765 words and 387407 word types
2020-02-20 08:45:58,397 : INFO : PROGRESS: at sentence #78000, processed 2169578 words and 390215 word types
2020-02-20 08:45:58,433 : INFO : PROGRESS: at sentence #79000, processed 2196981 words and 393299 word types
2020-02-20 08:45:58,469 : INFO : PROGRESS: at sentence #80000, processed 2224387 words and 396690 word types
2020-02-20 08:45:58,506 : INFO : PROGRESS: at sentence #81000, processed 2251949 words and 399406 word types
2020-02-20 08:45:58,537 : INFO : PROGRESS: at sentence #82000, processed 2274216 words and 402830 word types
2020-02-20 08:45:58,579 : INFO : PROGRESS: at sentence #83000, processed 2304960 words and 406461 word types
2020-02-20 08:45:58,619 : INFO : PROGRESS: at sentence #84000, processed 2334498 words and 409905 word types
2020-02-20 08:45:58

2020-02-20 08:46:00,924 : INFO : PROGRESS: at sentence #151000, processed 4203761 words and 580245 word types
2020-02-20 08:46:00,959 : INFO : PROGRESS: at sentence #152000, processed 4231191 words and 582352 word types
2020-02-20 08:46:00,997 : INFO : PROGRESS: at sentence #153000, processed 4261177 words and 584663 word types
2020-02-20 08:46:01,030 : INFO : PROGRESS: at sentence #154000, processed 4287642 words and 586958 word types
2020-02-20 08:46:01,061 : INFO : PROGRESS: at sentence #155000, processed 4313124 words and 590435 word types
2020-02-20 08:46:01,099 : INFO : PROGRESS: at sentence #156000, processed 4344002 words and 593068 word types
2020-02-20 08:46:01,130 : INFO : PROGRESS: at sentence #157000, processed 4370448 words and 595305 word types
2020-02-20 08:46:01,164 : INFO : PROGRESS: at sentence #158000, processed 4399964 words and 597678 word types
2020-02-20 08:46:01,198 : INFO : PROGRESS: at sentence #159000, processed 4428273 words and 600346 word types
2020-02-20

2020-02-20 08:46:03,509 : INFO : PROGRESS: at sentence #226000, processed 6295179 words and 750395 word types
2020-02-20 08:46:03,545 : INFO : PROGRESS: at sentence #227000, processed 6324056 words and 752745 word types
2020-02-20 08:46:03,581 : INFO : PROGRESS: at sentence #228000, processed 6352469 words and 755741 word types
2020-02-20 08:46:03,611 : INFO : PROGRESS: at sentence #229000, processed 6376653 words and 758771 word types
2020-02-20 08:46:03,640 : INFO : PROGRESS: at sentence #230000, processed 6399706 words and 760874 word types
2020-02-20 08:46:03,673 : INFO : PROGRESS: at sentence #231000, processed 6425726 words and 763524 word types
2020-02-20 08:46:03,710 : INFO : PROGRESS: at sentence #232000, processed 6455428 words and 765835 word types
2020-02-20 08:46:03,741 : INFO : PROGRESS: at sentence #233000, processed 6479787 words and 767824 word types
2020-02-20 08:46:03,774 : INFO : PROGRESS: at sentence #234000, processed 6506430 words and 769974 word types
2020-02-20

2020-02-20 08:46:06,146 : INFO : PROGRESS: at sentence #301000, processed 8395399 words and 914594 word types
2020-02-20 08:46:06,181 : INFO : PROGRESS: at sentence #302000, processed 8424729 words and 916964 word types
2020-02-20 08:46:06,216 : INFO : PROGRESS: at sentence #303000, processed 8454414 words and 918570 word types
2020-02-20 08:46:06,247 : INFO : PROGRESS: at sentence #304000, processed 8479594 words and 921138 word types
2020-02-20 08:46:06,282 : INFO : PROGRESS: at sentence #305000, processed 8509235 words and 923505 word types
2020-02-20 08:46:06,313 : INFO : PROGRESS: at sentence #306000, processed 8536368 words and 925433 word types
2020-02-20 08:46:06,347 : INFO : PROGRESS: at sentence #307000, processed 8564075 words and 927241 word types
2020-02-20 08:46:06,381 : INFO : PROGRESS: at sentence #308000, processed 8592572 words and 929154 word types
2020-02-20 08:46:06,417 : INFO : PROGRESS: at sentence #309000, processed 8622976 words and 930977 word types
2020-02-20

2020-02-20 08:46:08,943 : INFO : PROGRESS: at sentence #376000, processed 10595209 words and 1052018 word types
2020-02-20 08:46:08,987 : INFO : PROGRESS: at sentence #377000, processed 10629037 words and 1053977 word types
2020-02-20 08:46:09,025 : INFO : PROGRESS: at sentence #378000, processed 10658669 words and 1055583 word types
2020-02-20 08:46:09,067 : INFO : PROGRESS: at sentence #379000, processed 10690758 words and 1057166 word types
2020-02-20 08:46:09,108 : INFO : PROGRESS: at sentence #380000, processed 10722707 words and 1058809 word types
2020-02-20 08:46:09,145 : INFO : PROGRESS: at sentence #381000, processed 10751924 words and 1060077 word types
2020-02-20 08:46:09,187 : INFO : PROGRESS: at sentence #382000, processed 10779732 words and 1061281 word types
2020-02-20 08:46:09,243 : INFO : PROGRESS: at sentence #383000, processed 10810604 words and 1062760 word types
2020-02-20 08:46:09,291 : INFO : PROGRESS: at sentence #384000, processed 10842685 words and 1064172 wor

2020-02-20 08:47:30,541 : INFO : EPOCH 1 - PROGRESS: at 2.24% examples, 153421 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:47:31,542 : INFO : EPOCH 1 - PROGRESS: at 4.65% examples, 166184 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:47:32,558 : INFO : EPOCH 1 - PROGRESS: at 7.09% examples, 172018 words/s, in_qsize 8, out_qsize 0
2020-02-20 08:47:33,583 : INFO : EPOCH 1 - PROGRESS: at 9.52% examples, 172757 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:47:34,606 : INFO : EPOCH 1 - PROGRESS: at 11.85% examples, 173224 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:47:35,638 : INFO : EPOCH 1 - PROGRESS: at 14.25% examples, 173349 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:47:36,660 : INFO : EPOCH 1 - PROGRESS: at 16.79% examples, 175703 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:47:37,684 : INFO : EPOCH 1 - PROGRESS: at 18.94% examples, 173785 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:47:38,689 : INFO : EPOCH 1 - PROGRESS: at 21.37% examples, 174357 words/s, in_qsize 7,

2020-02-20 08:48:41,918 : INFO : EPOCH 2 - PROGRESS: at 62.54% examples, 167403 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:42,919 : INFO : EPOCH 2 - PROGRESS: at 64.93% examples, 167549 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:43,934 : INFO : EPOCH 2 - PROGRESS: at 67.19% examples, 167350 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:44,942 : INFO : EPOCH 2 - PROGRESS: at 69.45% examples, 167701 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:45,949 : INFO : EPOCH 2 - PROGRESS: at 71.94% examples, 168236 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:46,955 : INFO : EPOCH 2 - PROGRESS: at 74.10% examples, 168297 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:47,957 : INFO : EPOCH 2 - PROGRESS: at 76.30% examples, 168363 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:48,959 : INFO : EPOCH 2 - PROGRESS: at 78.53% examples, 168468 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:48:49,995 : INFO : EPOCH 2 - PROGRESS: at 80.79% examples, 168759 words/s, in_qsiz

2020-02-20 08:49:49,585 : INFO : EPOCH 4 - PROGRESS: at 8.70% examples, 155236 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:50,618 : INFO : EPOCH 4 - PROGRESS: at 10.96% examples, 157489 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:51,702 : INFO : EPOCH 4 - PROGRESS: at 13.28% examples, 158676 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:52,705 : INFO : EPOCH 4 - PROGRESS: at 15.55% examples, 159389 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:53,771 : INFO : EPOCH 4 - PROGRESS: at 17.68% examples, 158749 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:54,793 : INFO : EPOCH 4 - PROGRESS: at 19.86% examples, 158958 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:55,853 : INFO : EPOCH 4 - PROGRESS: at 22.17% examples, 159321 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:56,878 : INFO : EPOCH 4 - PROGRESS: at 24.33% examples, 160106 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:49:57,880 : INFO : EPOCH 4 - PROGRESS: at 26.45% examples, 159860 words/s, in_qsize

2020-02-20 08:51:01,277 : INFO : EPOCH 5 - PROGRESS: at 60.26% examples, 168389 words/s, in_qsize 8, out_qsize 0
2020-02-20 08:51:02,346 : INFO : EPOCH 5 - PROGRESS: at 62.64% examples, 168111 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:51:03,361 : INFO : EPOCH 5 - PROGRESS: at 65.03% examples, 168168 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:51:04,380 : INFO : EPOCH 5 - PROGRESS: at 67.38% examples, 168183 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:51:05,383 : INFO : EPOCH 5 - PROGRESS: at 69.64% examples, 168555 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:51:06,418 : INFO : EPOCH 5 - PROGRESS: at 72.13% examples, 168930 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:51:07,426 : INFO : EPOCH 5 - PROGRESS: at 74.37% examples, 169176 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:51:08,488 : INFO : EPOCH 5 - PROGRESS: at 76.57% examples, 168923 words/s, in_qsize 7, out_qsize 0
2020-02-20 08:51:09,555 : INFO : EPOCH 5 - PROGRESS: at 78.61% examples, 168259 words/s, in_qsiz

In [36]:
import pprint
pp = pprint.PrettyPrinter(indent=2)

import json
from itertools import groupby

def get_data_synonym(filename="synonym.json"):
    with open(filename) as f:
        data = json.load(f)
        return data

json = get_data_synonym()
data = tuple(json.items())

dictionary = {}
for key, group in groupby(data, lambda x: x[1]):
    for item in group:
        if not key in dictionary:
            dictionary[key] = []
        else:
            dictionary[key].append(item[0])

In [37]:
pp.pprint(dictionary['keluar darah'])

[ 'keluar darah marah tua',
  'darah masih tetap keluar',
  'keluar darah lumayan banyak seperti haid',
  'darah keluar seperti haid',
  'ada darah keluar ya tiba darah malah darah keluar kayak mens',
  'keluar darah berwarna merah terang',
  'seperti haid keluar darah',
  'keluar hanya darah',
  'keluar kayak darah pekat',
  'darah yang keluar campur dengan darah hitam',
  'mengeluarkan darah seperti mens',
  'darah masih keluar seperti menstruasi',
  'keluar darah bulat',
  'darahnya sedikit keluar',
  'keluar setetes darah',
  'darah yang keluar pun lebih banyak',
  'keluarin darah',
  'keluar darah sedikit sedikit flek',
  'keliar darah',
  'darah keluar terus',
  'darah nya itu keluar nya kan',
  'darahnya keluar',
  'darahnya keluar lagi',
  'darah masih keluar',
  'keluar kayak darah mateng',
  'keluar darah berwarna merah hambar',
  'keluar darah berwarna merah',
  'keluar darah merah cair',
  'keluar pendarahan',
  'pendarahan keluar darah',
  'keluar lagi darah sedikit sediki

In [39]:
phrase = 'darah setetes keluar'
if phrase in json:
    print("found '{}' in synonym dict: '{}'".format(phrase, json[phrase]))
else:
    print("find similarity...")
    print("is phrase '{}' in vocabulary? {}\n".format(phrase, phrase in model.wv.vocab))
    sims = model.wv.most_similar(positive=[phrase], topn=100)
    for item in sims:
        text = item[0].replace('_',' ')
        if text in json:
            print("{} -> {}".format(text, json[text]))

2020-02-20 08:53:05,479 : INFO : precomputing L2-norms of word weight vectors
2020-02-20 08:53:05,557 : INFO : precomputing L2-norms of ngram weight vectors


find similarity...
is phrase 'darah setetes keluar' in vocabulary? False

setetes -> setetes
setetes setetes -> setetes setetes
keluar -> keluar
keluar flex -> bercak
darah segar -> darah merah segar
keluarnya -> keluarnya
keluarnya sedikit -> sedikit
bercak darah -> bercak darah
darahnya -> berdarah
darah -> darah
lendir kecoklatan -> lendir coklat
netes -> menetes
mengeluarkandarah -> keluar darah
lendir bening -> lendir bening
menetes -> urin menetes
ketetesan -> ketetesan
keluarin diluar -> keluarin diluar
keluarnya dikit -> keluar sedikit
keluar lendiran -> keluar lendir
mengeluarkan darah -> berdarah
bercampur lendir -> berlendir
merah segar -> merah
darahku -> darah
keluarkan diluar -> keluarkan diluar
bercampur darah -> campur darah
lendir kental -> lendir kental
flek kecokelatan -> flek coklat
kecokelatan -> coklat
keluar lendir -> berlendir
kecokelatan cokelatan -> kecoklatan
keluarin ingus -> membuang ingus
keluar gumpalan -> gumpalan
mengeluarkan lendir -> berlendir
tetesan