##### source: https://github.com/adventuresinML/adventures-in-ml-code/blob/master/gensim_word2vec.py
##### source: https://towardsdatascience.com/word2vec-for-phrases-learning-embeddings-for-more-than-one-word
##### source: https://radimrehurek.com/gensim/models/phrases.html
##### source: http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/
##### source: https://towardsdatascience.com/word-embedding-with-word2vec-and-fasttext

In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install --upgrade gensim

In [None]:
import logging
import collections
import os
import zipfile

import numpy as np
import gensim
import tensorflow as tf

from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser

def build_phrases(sentences, model_name='phrases.model'):
    phrases = Phrases(sentences, min_count=5, threshold=7, progress_per=1000)
    phrases_model = Phraser(phrases)
    phrases_model.save(model_name)
    return phrases_model

def sentences_to_bigrams(phrases_model, sentences):
    bigrams_sentences = []
    for sentence in sentences:
        phrases_sentence = phrases_model[sentence]
        bigrams_sentences.append(phrases_sentence)
    return bigrams_sentences

def get_data(filename="questions.dat"):
    sentences = []
    dataset = tf.data.TextLineDataset(filename)
    dataset = dataset.enumerate() 
    for element in dataset.as_numpy_iterator():
        text = element[1].decode("utf-8")
        sentences.append(text.split(' '))
    return sentences

In [None]:
def train(model_name="alodokter-word2vec-fasttext.model"):
    sentences = get_data()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = FastText(sentences, size=100, window=5, min_count=5, workers=4, sg=1)
    model.save(model_name)
    return model

def train_bigrams(model_name="alodokter-word2vec-fasttext-bigram.model"):
    sentences = get_data()
    phrases_model = build_phrases(sentences)
    sentences = sentences_to_bigrams(phrases_model, sentences)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = FastText(sentences, size=300, window=5, min_count=1, workers=4, sg=1)
    model.save(model_name)
    return model

##### Training with Unigram

In [None]:
training = False
if training:
    model = train()
else:
    model = FastText.load("alodokter-word2vec-fasttext.model")

word = 'bab'
print("is word '{}' in vocabulary? {}".format(word, word in model.wv.vocab))
model.wv.most_similar(positive=[word], topn=20)

##### Training with Bigram

In [None]:
training = True
if training:
    model = train_bigrams()
else:
    model = FastText.load("alodokter-word2vec-fasttext-bigram.model")

phrase = 'bibir vagina'
print("is phrase '{}' in vocabulary? {}".format(phrase, phrase in model.wv.vocab))
model.wv.most_similar(positive=[phrase], topn=20)

##### Training with Bigram & Testing

In [11]:
training = False
if training:
    model = train_bigrams()
else:
    model = FastText.load("alodokter-word2vec-fasttext-bigram.model")

2020-02-20 08:28:44,510 : INFO : loading FastText object from alodokter-word2vec-fasttext-bigram.model
2020-02-20 08:28:44,565 : INFO : loading wv recursively from alodokter-word2vec-fasttext-bigram.model.wv.* with mmap=None
2020-02-20 08:28:44,566 : INFO : loading vectors from alodokter-word2vec-fasttext-bigram.model.wv.vectors.npy with mmap=None
2020-02-20 08:28:44,597 : INFO : loading vectors_vocab from alodokter-word2vec-fasttext-bigram.model.wv.vectors_vocab.npy with mmap=None
2020-02-20 08:28:44,631 : INFO : loading vectors_ngrams from alodokter-word2vec-fasttext-bigram.model.wv.vectors_ngrams.npy with mmap=None
2020-02-20 08:28:46,079 : INFO : setting ignored attribute vectors_norm to None
2020-02-20 08:28:46,079 : INFO : setting ignored attribute vectors_vocab_norm to None
2020-02-20 08:28:46,080 : INFO : setting ignored attribute vectors_ngrams_norm to None
2020-02-20 08:28:46,080 : INFO : setting ignored attribute buckets_word to None
2020-02-20 08:28:46,081 : INFO : loading 

In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=2)

import json
from itertools import groupby

def get_data_synonym(filename="synonym.json"):
    with open(filename) as f:
        data = json.load(f)
        return data

json = get_data_synonym()
data = tuple(json.items())

dictionary = {}
for key, group in groupby(data, lambda x: x[1]):
    for item in group:
        if not key in dictionary:
            dictionary[key] = []
        else:
            dictionary[key].append(item[0])

In [17]:
pp.pprint(dictionary['keluar darah'])

[ 'keluar darah marah tua',
  'darah masih tetap keluar',
  'keluar darah lumayan banyak seperti haid',
  'darah keluar seperti haid',
  'ada darah keluar ya tiba darah malah darah keluar kayak mens',
  'keluar darah berwarna merah terang',
  'seperti haid keluar darah',
  'keluar hanya darah',
  'keluar kayak darah pekat',
  'darah yang keluar campur dengan darah hitam',
  'mengeluarkan darah seperti mens',
  'darah masih keluar seperti menstruasi',
  'keluar darah bulat',
  'darahnya sedikit keluar',
  'keluar setetes darah',
  'darah yang keluar pun lebih banyak',
  'keluarin darah',
  'keluar darah sedikit sedikit flek',
  'keliar darah',
  'darah keluar terus',
  'darah nya itu keluar nya kan',
  'darahnya keluar',
  'darahnya keluar lagi',
  'darah masih keluar',
  'keluar kayak darah mateng',
  'keluar darah berwarna merah hambar',
  'keluar darah berwarna merah',
  'keluar darah merah cair',
  'keluar pendarahan',
  'pendarahan keluar darah',
  'keluar lagi darah sedikit sediki

In [34]:
phrase = 'pups saya ada darah'
if phrase in json:
    print("found '{}' in synonym dict: '{}'".format(phrase, json[phrase]))
else:
    print("find similarity...")
    print("is phrase '{}' in vocabulary? {}\n".format(phrase, phrase in model.wv.vocab))
    sims = model.wv.most_similar(positive=[phrase], topn=100)
    for item in sims:
        text = item[0].replace('_',' ')
        if text in json:
            print("{} -> {}".format(text, json[text]))

find similarity...
is phrase 'pups saya ada darah' in vocabulary? False

skthbnlmbvvhhjknbxnmzlzkzjzhvvbbnjkjhhhhhjdjssjsj -> skthbnlmbvvhhjknbxnmzlzkzjzhvvbbnjkjhhhhhjdjssjsj
tapiskrangudahnaikitunaikpasdi -> tapiskrangudahnaikitunaikpasdi
แดตแดบแต€แดตแดน -> แดตแดบแต€แดตแดน
nzjsn -> nzjsn
kggrn -> keguguran
farah -> farah
