##### source: https://github.com/adventuresinML/adventures-in-ml-code/blob/master/gensim_word2vec.py
##### source: https://towardsdatascience.com/word2vec-for-phrases-learning-embeddings-for-more-than-one-word
##### source: https://radimrehurek.com/gensim/models/phrases.html
##### source: http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/
##### source: https://towardsdatascience.com/word-embedding-with-word2vec-and-fasttext

In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn
# !{sys.executable} -m pip install --upgrade gensim

In [38]:
import logging
import collections
import os
import zipfile

import numpy as np
import gensim
import tensorflow as tf

from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser

def build_phrases(sentences, model_name='phrases.model'):
    """Fit a gensim phrase detector on ``sentences`` and save it to disk.

    Args:
        sentences: Corpus handed straight to gensim's ``Phrases``
            (presumably an iterable of token lists, as produced by
            ``get_data`` -- confirm against callers).
        model_name: Path the resulting ``Phraser`` is saved under.

    Returns:
        The ``Phraser`` wrapping the fitted ``Phrases`` model.
    """
    detector = Phrases(
        sentences,
        min_count=5,
        threshold=7,
        progress_per=1000,
    )
    # Wrap the fitted detector in a Phraser; this is the object that gets
    # persisted and handed back to the caller.
    frozen_detector = Phraser(detector)
    frozen_detector.save(model_name)
    return frozen_detector

def sentences_to_bigrams(phrases_model, sentences):
    """Run every sentence through ``phrases_model`` and collect the results.

    Args:
        phrases_model: Object indexed with a sentence
            (``phrases_model[sentence]``) to obtain its transformed form.
        sentences: Iterable of sentences to transform.

    Returns:
        A list holding ``phrases_model[sentence]`` for each sentence, in
        input order.
    """
    return [phrases_model[tokens] for tokens in sentences]

def get_data(filename="questions.dat"):
    """Read a text file line by line and split each line on single spaces.

    Args:
        filename: Passed unchanged to ``tf.data.TextLineDataset``; every
            line it yields is treated as one sentence.

    Returns:
        A list with one entry per line, each entry being the list of
        strings produced by ``line.split(' ')`` after UTF-8 decoding.
    """
    lines = tf.data.TextLineDataset(filename)
    # Iterate the dataset directly. The previous ``enumerate()`` stage
    # attached an index that was never read, so it only added overhead.
    # NOTE: the split is on a single space (not arbitrary whitespace), so
    # consecutive spaces or empty lines produce empty-string tokens.
    return [
        raw_line.decode("utf-8").split(' ')
        for raw_line in lines.as_numpy_iterator()
    ]

In [39]:
def train(model_name="alodokter-word2vec-fasttext.model"):
    """Train a FastText model on the corpus from ``get_data`` and save it.

    Args:
        model_name: Path the trained model is saved under.

    Returns:
        The trained ``FastText`` model.
    """
    corpus = get_data()
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )
    # Training hyperparameters, gathered in one place. ``sg=1`` selects
    # skip-gram according to the gensim documentation.
    hyperparams = dict(size=100, window=5, min_count=5, workers=4, sg=1)
    fasttext_model = FastText(corpus, **hyperparams)
    fasttext_model.save(model_name)
    return fasttext_model

def train_bigrams(model_name="alodokter-word2vec-fasttext-bigram.model"):
    """Train a FastText model on the phrase-merged corpus and save it.

    The corpus from ``get_data`` is used to fit a phrase detector
    (``build_phrases``), every sentence is rewritten through that detector
    (``sentences_to_bigrams``), and FastText is trained on the result.

    Args:
        model_name: Path the trained model is saved under.

    Returns:
        The trained ``FastText`` model.
    """
    sentences = get_data()
    # Configure logging *before* phrase detection. Previously this ran
    # only after ``build_phrases``, so on a fresh interpreter the progress
    # messages requested there (``progress_per=1000``) were never shown.
    # ``basicConfig`` does nothing if the root logger already has handlers.
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )
    phrases_model = build_phrases(sentences)
    sentences = sentences_to_bigrams(phrases_model, sentences)
    model = FastText(sentences, size=100, window=5, min_count=5, workers=4, sg=1)
    model.save(model_name)
    return model

In [None]:
# Toggle: True retrains the unigram model, False reloads the saved one.
training = True
model = train() if training else FastText.load("alodokter-word2vec-fasttext.model")
# Show the nearest neighbours of the query word.
model.wv.most_similar(positive=['kemaluan'])

# test = 'prostat'
# print(test in model.wv.vocab)
# model.wv.most_similar(positive=[test])

In [67]:
# Toggle: True retrains the bigram model, False reloads the saved one.
training = False
model = (
    train_bigrams()
    if training
    else FastText.load("alodokter-word2vec-fasttext-bigram.model")
)
# model.wv.most_similar(positive=['kemaluan'])

# Query with a multi-word phrase: report whether it is a vocabulary entry,
# then ask for its nearest neighbours regardless.
phrase = 'babnya ada berlendir'
print(
    "is '{}' Phrase in Vocabulary? {}".format(
        phrase,
        phrase in model.wv.vocab,
    )
)
model.wv.most_similar(positive=[phrase])

2020-02-19 19:01:34,519 : INFO : loading FastText object from alodokter-word2vec-fasttext-bigram.model
2020-02-19 19:01:34,842 : INFO : loading wv recursively from alodokter-word2vec-fasttext-bigram.model.wv.* with mmap=None
2020-02-19 19:01:34,842 : INFO : loading vectors_ngrams from alodokter-word2vec-fasttext-bigram.model.wv.vectors_ngrams.npy with mmap=None
2020-02-19 19:01:35,233 : INFO : setting ignored attribute vectors_norm to None
2020-02-19 19:01:35,234 : INFO : setting ignored attribute vectors_vocab_norm to None
2020-02-19 19:01:35,234 : INFO : setting ignored attribute vectors_ngrams_norm to None
2020-02-19 19:01:35,234 : INFO : setting ignored attribute buckets_word to None
2020-02-19 19:01:35,235 : INFO : loading vocabulary recursively from alodokter-word2vec-fasttext-bigram.model.vocabulary.* with mmap=None
2020-02-19 19:01:35,235 : INFO : loading trainables recursively from alodokter-word2vec-fasttext-bigram.model.trainables.* with mmap=None
2020-02-19 19:01:35,236 : I

is 'babnya ada berlendir' Phrase in Vocabulary? False


[('feses_berlendir', 0.9052314758300781),
 ('hijau_berlendir', 0.9005783796310425),
 ('berlendir', 0.9004101753234863),
 ('fesesnya_berlendir', 0.8864094018936157),
 ('babnya_berlendir', 0.883073091506958),
 ('pupnya_berlendir', 0.8699895143508911),
 ('fesesnya_cair', 0.8115257620811462),
 ('fesesnya_berwarna', 0.8106538653373718),
 ('warna_fesesnya', 0.8097831010818481),
 ('fases', 0.8090940713882446)]