Modified code from: Alfan Farizki Wicaksono, S.T., M.Sc., Ph.D.

In [12]:
import torch
import torch.nn.functional as fun
import numpy as np

from google_play_scraper import Sort, reviews
import csv

from nltk.tokenize import wordpunct_tokenize

def clean_word(sentence):
    sentence = wordpunct_tokenize(sentence)
    cleaned = []
    for word in sentence:
        if(word.isalpha()):
            cleaned.append(word.lower())
    return cleaned

def get_corpus_from_csv():
    corpus = []
    with open("dfyutup-cleanedfixversion3.csv") as file_buffer:
        csv_reader = list(csv.reader(file_buffer))
        n = len(csv_reader)
        for i in range(1, n):
            corpus.append(clean_word(csv_reader[i][2]))
    return corpus

    
def get_corpus_google_play(cnt = 100):
    result, _ = reviews(
        'com.apps.MyXL',
        lang='id', 
        country='id', 
        sort=Sort.MOST_RELEVANT, 
        count=cnt, 
        filter_score_with=None
    )
    return [clean_word(konten["content"]) for konten in result]

# print(get_corpus_google_play())
# print(get_corpus_from_csv())

corpus = get_corpus_from_csv()
# print(corpus)

In [13]:
import torch
import torch.nn.functional as fun
import numpy as np


def create_vocab(corpus, limit_occ = 50):
    """
    membuat vocabulary, dengan 2 info penting: 
        1. word_id -> mapping kata ke id (int)
        2. id_word -> sebaliknya, mapping dari id ke word
    """
    word_id = {}
    id_word = []

    occurence = dict()
    i = 0
    for sentence in corpus:
        for word in sentence:
            occurence.setdefault(word, 0)
            occurence[word] += 1
            
    for k, v in occurence.items():
        if(v < limit_occ): continue
        print(k)
        word_id[k] = i
        id_word.append(k)
        i += 1
                
    id_word = {id: word for (word, id) in word_id.items()}
    return word_id, id_word


word_id, id_word = create_vocab(corpus)
vocab_size = len(word_id)

print(word_id)


rick
lu
anak
ya
itu
bang
gw
sub
kontol
apa
di
mu
muka
lo
anjing
gue
nonton
youtube
suka
gaya
haha
wkwkwkw
buat
haters
semua
sok
video
erickolim
kalo
gk
ngapain
ditonton
eriko
juga
dia
klo
gak
cuma
doang
kayak
memek
asu
babi
kecil
v
tapi
ngakak
ericko
lim
walaupun
toxic
mantap
dah
rik
tuh
aja
gua
konten
adsense
masih
iya
banget
anjeng
bikin
vidio
ada
yg
kali
dari
goblok
ni
bgt
kek
org
subscribe
trus
komentar
lagi
biar
hater
kagak
terus
harus
semangat
aku
tau
terkenal
bukan
hal
yang
dan
tetap
eric
jadi
youtuber
lebih
sukses
bisa
erick
fans
karna
kalau
jangan
bau
karya
sabar
ngomong
usah
i
love
you
auto
subs
like
bilang
sama
orang
jembut
otak
bg
para
tu
banyak
bro
menghibur
anti
si
mah
k
ajg
wkwk
nya
tetep
salah
ganteng
penting
berkarya
comment
intro
emang
channel
tanpa
bakal
ini
hahaha
heters
tolol
hiburan
mendidik
is
the
duit
btw
d
jgn
tpi
kan
bocah
njing
satu
mati
kapan
komen
bangsat
eh
subscriber
atau
lain
sangat
karena
bego
punya
mending
kyk
gini
kasar
sumpah
gila
pada
koment
lah
eri

In [6]:
# dataset generator
def iter_dataset(corpus, context_window=1):
    for sentence in corpus:
        words = sentence
        for i in range(len(words)):
            center_word = words[i]
            # cari konteks ke kiri
            j = i - 1
            while (j >= i - context_window) and (j >= 0):
                context_word = words[j]
                yield (center_word, context_word)
                j -= 1
            # cari konteks ke kanan
            j = i + 1
            while (j <= i + context_window) and (j < len(words)):
                context_word = words[j]
                yield (center_word, context_word)
                j += 1


def get_dataset(corpus):
    # membuat dataset
    X = []
    Y = []
    for center, context in iter_dataset(corpus):
        try:
            if(center not in word_id.keys() or context not in word_id.keys()):
                continue
            X.append(word_id[center])
            Y.append(word_id[context])
        except:
            pass

    X_ctr = torch.tensor(X)
    Y_ctx = torch.tensor(Y)
    return X_ctr, Y_ctx


def get_input_tensor(tensor):
    return fun.one_hot(tensor, num_classes=vocab_size)


center_list, context_list = get_dataset(corpus)
# get_input_tensor(center_list[2])


In [7]:
# Softmax
def softmax(x):
    maxes = torch.max(x, 1, keepdim=True)[0]
    x_exp = torch.exp(x - maxes)
    x_exp_sum = torch.sum(x_exp, 1, keepdim=True)
    return x_exp / x_exp_sum

# Categorical Cross Entropy Loss


def CCEloss(Y_pred, Y_true):
    m = Y_pred.size()[0]
    return -(1 / m) * torch.sum(Y_true * torch.log(Y_pred))


In [8]:
tmp_c = None
tmp_w = None

In [22]:

def train(X_ctr, Y_ctx, EPOCHS = 200, load = False):
    global tmp_w
    global tmp_c

    EMBEDDING_DIMS = 10

    initrange = 1

    # trick: di awal, mode gradient jangan diaktifkan dahulu, karena ingin
    # memodifikasi nilai di C dan W secara in-place
    C = torch.rand(vocab_size, EMBEDDING_DIMS, requires_grad = False)
    W = torch.rand(EMBEDDING_DIMS, vocab_size, requires_grad = False)

    # agar nilai awal parameter di C dan W berkisar di antara -initrange hingga +initrange
    C = -2 * initrange * C + initrange
    W = -2 * initrange * W + initrange

    # setelah operasi in-place modification, baru kita set mode gradient
    C.requires_grad = True
    W.requires_grad = True
    if(load):
        C = tmp_c
        W = tmp_w

    print(f'C shape is: {C.shape}, W shape is: {W.shape}')
    print(C)
    print(W)

    LEARNING_RATE = 0.2
    LR_DECAY = 0.99

    for i in range(EPOCHS):
        X = get_input_tensor(X_ctr).float()
        Y = get_input_tensor(Y_ctx).float()

        h = X.mm(C)
        Y_pred = softmax(h.mm(W))

        loss = CCEloss(Y_pred, Y)
        loss.backward()

        # update C dan W
        with torch.no_grad():
            C -= LEARNING_RATE * C.grad
            W -= LEARNING_RATE * W.grad
            
            C.grad.zero_()
            W.grad.zero_()
        tmp_w = W
        tmp_c = C
        if i % 10 == 0:
            # kita coba lihat progress cosine similarity
            # antara embedding "lo" dengan "lu"
            vector_1 = C[word_id["lo"]]
            vector_2 = C[word_id["lu"]]
            sim = fun.cosine_similarity(vector_1, vector_2, dim = 0)
            print(f'Epoch {i}, loss = {loss}, sim(lo, lu) = {sim}')
    return C, W


In [27]:
embedding_C, context_W = train(center_list, context_list, 500, False)

C shape is: torch.Size([407, 10]), W shape is: torch.Size([10, 407])
tensor([[ 0.7699,  0.0689,  0.0686,  ...,  0.1680, -0.8254,  0.0960],
        [-0.1037,  0.8447,  0.5698,  ...,  0.9096,  0.2174, -0.7846],
        [-0.7678, -0.9936, -0.3147,  ..., -0.8590,  0.3445, -0.2095],
        ...,
        [-0.0073, -0.4168, -0.1123,  ...,  0.3423, -0.5214,  0.5765],
        [ 0.3406, -0.9154,  0.9959,  ...,  0.3365,  0.4900, -0.7793],
        [ 0.7294,  0.0520,  0.5655,  ..., -0.0241,  0.1011,  0.8905]],
       requires_grad=True)
tensor([[-0.7078, -0.3604,  0.6459,  ..., -0.6949,  0.2670, -0.3616],
        [-0.7790,  0.5272,  0.4276,  ..., -0.8605, -0.7306, -0.6701],
        [-0.8267,  0.8954, -0.2884,  ...,  0.4384, -0.0119,  0.5845],
        ...,
        [-0.5367, -0.4530, -0.4226,  ...,  0.5493,  0.1827,  0.6038],
        [-0.6155, -0.9396,  0.1070,  ...,  0.1873, -0.7378, -0.3369],
        [-0.8514,  0.1236, -0.6284,  ...,  0.7436,  0.3376, -0.7323]],
       requires_grad=True)
Epoch 0, 

KeyboardInterrupt: 

In [37]:
embedding_C, context_W = train(center_list, context_list, 1000, True)

C shape is: torch.Size([407, 10]), W shape is: torch.Size([10, 407])
tensor([[ 0.5541,  0.4224,  0.2032,  ...,  0.3763, -0.9508, -0.0233],
        [ 0.0225,  0.4562,  0.2719,  ...,  0.6570, -0.4622, -0.8025],
        [-0.2941, -0.5128, -0.0827,  ..., -0.4481,  0.2397, -0.3369],
        ...,
        [-0.0154, -0.3424, -0.0734,  ...,  0.3606, -0.4895,  0.5099],
        [ 0.8068, -2.2320,  1.3647,  ...,  0.6656, -0.1872, -0.6056],
        [ 0.7607, -0.0101,  0.5153,  ..., -0.0170,  0.1398,  0.7353]],
       requires_grad=True)
tensor([[-0.4861,  0.7008,  0.5579,  ..., -0.6535,  0.6670, -0.2758],
        [-0.4886,  1.7876,  0.3512,  ..., -0.7703, -2.1920, -0.5970],
        [-0.5341,  0.8421, -0.2261,  ...,  0.2875,  0.9685,  0.2870],
        ...,
        [-0.1760,  0.4762, -0.5302,  ...,  0.4241,  0.4319,  0.4583],
        [-0.6933, -2.3615,  0.1322,  ...,  0.2927, -0.2280, -0.2128],
        [-0.8266,  0.3524, -0.6373,  ...,  0.6685, -0.2907, -0.5295]],
       requires_grad=True)
Epoch 0, 

In [4]:
import pickle

# obj0, obj1, obj2 are created here...
tmp_w = []
tmp_c = []
def save(idx = 0):
    # Saving the objects:
    global tmp_w, tmp_c
    with open(f'skipgram-objs-{idx}.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump([tmp_w, tmp_c], f)

def load(idx = 0):
    # Getting back the objects:
    global tmp_w, tmp_c
    save(999)
    with open(f'skipgram-objs-{idx}.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
        tmp_w, tmp_c = pickle.load(f)

In [16]:
load(1)

In [17]:
def get_diff(word_1, word_2):
    global tmp_c
    vector_1 = tmp_c[word_id[word_1]]
    vector_2 = tmp_c[word_id[word_2]]
    print(fun.cosine_similarity(vector_1, vector_2, dim = 0))



def show_vector(word_1):
    global tmp_c
    vector_1 = tmp_c[word_id[word_1]].detach().numpy()
    print(vector_1)


def predict(kata, K = 3):
    global tmp_c
    global tmp_w
    print()
    print(f"Predicting kata setelah \"{kata}\"")
    C = tmp_c
    W = tmp_w
    X = get_input_tensor(torch.tensor([word_id[kata]])).float()
    h = X.mm(C)
    Y_pred = softmax(h.mm(W))[0]
    prediction = sorted([(Y_pred[i], i) for i in range(len(Y_pred))], reverse=True)
    # print(prediction)
    for i in range(K):
        print(id_word[prediction[i][1]])
    # print(prediction)
    # print(Y_pred)


In [19]:
show_vector("gua")
show_vector("gue")
show_vector("aku")
show_vector("gw")
get_diff("gua", "gw")
get_diff("gue", "gw")

[-0.2722139   0.18811591  0.39483726  0.1276671   0.38488406  0.25605953
  0.19266635  0.71376663 -0.7646854  -0.3206997 ]
[-0.46989498 -0.12896392  0.4453655   0.14686778  0.50263804  0.1988348
  0.02273537  0.7050972  -0.6474231  -0.41661516]
[-0.03588893  0.11914188 -0.25533378  0.13690066  0.6920997   0.5876573
 -0.07938295  0.41481638  0.24626774  0.62607116]
[-0.30892617  0.2908435   0.6487435   0.08201316  0.34246147  0.01942619
  0.3103443   0.7628867  -0.6283434  -0.15487556]
tensor(0.9444, grad_fn=<SumBackward1>)
tensor(0.8743, grad_fn=<SumBackward1>)


In [20]:
get_diff("ngakak", "wkwk")
get_diff("lu", "bang")
get_diff("mantap", "keren")
get_diff("gua", "lu") # ??
get_diff("gua", "gw")
get_diff("gue", "gw")


tensor(0.4105, grad_fn=<SumBackward1>)
tensor(0.6363, grad_fn=<SumBackward1>)
tensor(0.3320, grad_fn=<SumBackward1>)
tensor(0.8303, grad_fn=<SumBackward1>)
tensor(0.9444, grad_fn=<SumBackward1>)
tensor(0.8743, grad_fn=<SumBackward1>)


In [56]:
predict("tapi")
predict("hiya")
predict("bang")
predict("haters")
predict("gw")


Predicting kata setelah "tapi"
boong
lu
gua

Predicting kata setelah "hiya"
hiya
ah
a

Predicting kata setelah "bang"
lu
gw
lo

Predicting kata setelah "haters"
lu
haters
gua

Predicting kata setelah "gw"
suka
bang
lu
