In [1]:
import torch
import torch.nn.functional as fun
import numpy as np

from google_play_scraper import Sort, reviews
import csv

from nltk.tokenize import wordpunct_tokenize

def clean_word(sentence):
    """Tokenize a piece of text and return its alphabetic tokens in lowercase.

    The text is split with ``wordpunct_tokenize``; any token that is not
    purely alphabetic (per ``str.isalpha``) is dropped, and the remaining
    tokens are lowercased.

    Args:
        sentence: the raw text to tokenize.

    Returns:
        A list of lowercase alphabetic tokens, in their original order.
    """
    tokens = wordpunct_tokenize(sentence)
    return [token.lower() for token in tokens if token.isalpha()]

def get_corpus_from_csv(path="dfyutup-cleanedfixversion3.csv", text_column=2):
    """Read a CSV file and return one cleaned token list per data row.

    The first row of the file is skipped (presumably a header row -- confirm
    against the data file). For every remaining row, the field at
    ``text_column`` is passed through ``clean_word``.

    Args:
        path: location of the CSV file to read.
        text_column: zero-based index of the column holding the text.

    Returns:
        A list of token lists, one per row that has a ``text_column`` field.
    """
    corpus = []
    # The csv module expects files opened with newline="" so that quoted
    # fields containing line breaks are parsed correctly.
    with open(path, newline="") as file_buffer:
        rows = csv.reader(file_buffer)
        # Skip the first row without loading the whole file into memory.
        next(rows, None)
        for row in rows:
            try:
                text = row[text_column]
            except IndexError:
                # Blank or truncated row: nothing to tokenize, skip it
                # instead of aborting the whole load.
                continue
            corpus.append(clean_word(text))
    return corpus

    
def get_corpus_google_play(cnt = 100, app_id = 'com.apps.MyXL', lang = 'id', country = 'id'):
    """Fetch Google Play reviews for an app and return them as token lists.

    Args:
        cnt: number of reviews to request.
        app_id: Google Play package name of the app whose reviews are fetched.
        lang: language code passed to the scraper.
        country: country code passed to the scraper.

    Returns:
        A list with one token list (see ``clean_word``) per fetched review.
    """
    result, _ = reviews(
        app_id,
        lang=lang,
        country=country,
        sort=Sort.MOST_RELEVANT,
        count=cnt,
        filter_score_with=None
    )
    # NOTE(review): a review may carry no text (content of None) -- confirm
    # against the scraper's behaviour. Treat that as empty text so it yields
    # an empty token list instead of crashing the tokenizer.
    return [clean_word(konten["content"] or "") for konten in result]

# print(get_corpus_google_play())
# print(get_corpus_from_csv())

# Build the working corpus (one token list per CSV row) from the local CSV file.
corpus = get_corpus_from_csv()
# print(corpus)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import torch
import torch.nn.functional as fun
import numpy as np


def create_vocab(corpus, limit_occ = 6, verbose = True):
    """Build the vocabulary lookups for a tokenized corpus.

    Words are counted across all sentences; a word is kept only if it occurs
    at least ``limit_occ`` times. Kept words receive consecutive integer ids
    starting at 0, in order of first appearance in the corpus.

    Args:
        corpus: iterable of sentences, each an iterable of word strings.
        limit_occ: minimum number of occurrences for a word to be kept.
        verbose: when True (the default), print every kept word as it is
            added to the vocabulary. Pass False to silence this output.

    Returns:
        A tuple ``(word_id, id_word)``:
            word_id: dict mapping each kept word to its integer id.
            id_word: dict with the inverse mapping, id to word.
    """
    occurrence = {}
    for sentence in corpus:
        for word in sentence:
            occurrence[word] = occurrence.get(word, 0) + 1

    word_id = {}
    for word, count in occurrence.items():
        if count < limit_occ:
            continue
        if verbose:
            print(word)
        # The next free id is simply the current vocabulary size.
        word_id[word] = len(word_id)

    id_word = {index: word for word, index in word_id.items()}
    return word_id, id_word


# Build both vocabulary lookups from the corpus, using create_vocab's default
# occurrence threshold.
word_id, id_word = create_vocab(corpus)
# Number of distinct words kept in the vocabulary.
vocab_size = len(word_id)

# Show the complete word -> id mapping.
print(word_id)


rick
lu
anak
hasil
kondom
ya
kakak
itu
gg
bang
slow
gw
sub
kontol
apa
di
tangan
mu
muka
lo
anjing
gue
rugi
nonton
youtube
suka
gaya
haha
hibur
wkwkwkw
mantapp
buat
haters
semua
sok
sokan
ngerti
video
erickolim
kalo
gk
ngapain
ditonton
semudah
eriko
juga
maksa
dia
klo
yaudah
gak
subcribe
cuma
doang
kalah
am
atta
kayak
memek
asu
babi
kecil
collab
ma
v
tapi
ngakak
ericko
lim
walaupun
toxic
mantap
dah
rik
tuh
bng
lanjut
aja
gua
suport
konten
adsense
masih
iya
banget
anjeng
bikin
vidio
ada
faedahnya
yg
pantes
bahasa
nyesel
kali
emak
dari
harusnya
goblok
ni
bgt
buka
sifat
asli
kek
org
subscribe
trus
keluar
ngaca
anjin
bngt
sma
luu
hahahahah
balas
komentar
lagi
biar
hater
ka
kagak
terus
pokonya
harus
semangat
aku
tau
terkenal
bukan
suatu
hal
yang
mudah
dan
gampang
butuh
proses
pokoknya
tetap
eric
jadi
youtuber
lebih
sukses
bisa
erick
kimi
hime
fans
karna
kalau
jangan
hina
hmm
ahh
bau
betul
lanjutin
karya
sabar
ngomong
kotor
mulu
usah
dibaca
komennya
i
love
you
auto
subs
like
bilang
sama
orang

In [7]:
# dataset generator
def iter_dataset(corpus, context_window=1):
    """Yield ``(center_word, context_word)`` pairs for every word in the corpus.

    For each position in each sentence, the neighbours to the left are
    yielded first (nearest first, moving outward), followed by the
    neighbours to the right (nearest first). At most ``context_window``
    neighbours are taken on each side, and never past the sentence edges.

    Args:
        corpus: iterable of sentences, each an indexable sequence of words.
        context_window: maximum number of neighbours taken on each side.

    Yields:
        Tuples ``(center_word, context_word)``.
    """
    for words in corpus:
        last_index = len(words) - 1
        for position, center_word in enumerate(words):
            # Left context: walk from position - 1 down to the window edge
            # (or the start of the sentence).
            leftmost = max(position - context_window, 0)
            for j in range(position - 1, leftmost - 1, -1):
                yield (center_word, words[j])
            # Right context: walk from position + 1 up to the window edge
            # (or the end of the sentence).
            rightmost = min(position + context_window, last_index)
            for j in range(position + 1, rightmost + 1):
                yield (center_word, words[j])


def get_dataset(corpus):
    """Convert the corpus into aligned tensors of center and context word ids.

    Every ``(center, context)`` pair produced by ``iter_dataset`` is mapped to
    ids through the module-level ``word_id`` vocabulary. A pair is kept only
    when BOTH words are in the vocabulary, so the two returned tensors always
    have the same length and stay index-aligned.

    Args:
        corpus: iterable of tokenized sentences, as accepted by
            ``iter_dataset``.

    Returns:
        A tuple ``(X_ctr, Y_ctx)`` of 1-D ``torch.long`` tensors, where
        ``X_ctr[k]`` is the center-word id and ``Y_ctx[k]`` the matching
        context-word id.
    """
    X = []
    Y = []
    for center, context in iter_dataset(corpus):
        center_id = word_id.get(center)
        context_id = word_id.get(context)
        # Resolve both ids before appending anything: appending the center id
        # first and then failing on the context lookup would leave X one
        # element longer than Y and shift every later pair out of alignment.
        if center_id is None or context_id is None:
            continue
        X.append(center_id)
        Y.append(context_id)

    # An explicit integer dtype keeps an empty result usable as index input
    # (torch.tensor([]) alone would default to a float tensor).
    X_ctr = torch.tensor(X, dtype=torch.long)
    Y_ctx = torch.tensor(Y, dtype=torch.long)
    return X_ctr, Y_ctx


def get_input_tensor(tensor):
    """One-hot encode a tensor of word ids.

    Args:
        tensor: integer tensor of word ids.

    Returns:
        The one-hot encoding of ``tensor`` with ``vocab_size`` classes (the
        module-level vocabulary size), added as a trailing dimension.
    """
    one_hot = fun.one_hot(tensor, num_classes=vocab_size)
    return one_hot


# Materialize the aligned center / context id tensors for the whole corpus.
center_list, context_list = get_dataset(corpus)
# get_input_tensor(center_list[2])


In [None]:
# Softmax
def softmax(x):
    """Row-wise softmax of a 2-D tensor.

    The maximum of each row is subtracted before exponentiating, which keeps
    ``exp`` from overflowing on large scores without changing the result.

    Args:
        x: tensor whose dimension 1 holds the scores to normalize.

    Returns:
        A tensor of the same shape as ``x`` whose entries along dimension 1
        sum to 1.
    """
    row_max = x.max(dim=1, keepdim=True).values
    exponentials = (x - row_max).exp()
    return exponentials / exponentials.sum(dim=1, keepdim=True)

# Categorical Cross Entropy Loss


def CCEloss(Y_pred, Y_true, eps = 1e-12):
    """Categorical cross-entropy loss, averaged over the first dimension.

    Computes ``-(1 / m) * sum(Y_true * log(Y_pred))`` where ``m`` is the size
    of the first dimension of ``Y_pred``.

    Args:
        Y_pred: tensor of predicted probabilities.
        Y_true: tensor of target weights, multiplied element-wise with
            ``log(Y_pred)`` (so it must be broadcastable against ``Y_pred``).
        eps: lower bound applied to ``Y_pred`` before taking the logarithm.

    Returns:
        A scalar tensor holding the loss.
    """
    m = Y_pred.size()[0]
    # A probability that underflowed to exactly 0 would give log(0) = -inf,
    # and 0 * -inf = NaN would then poison the whole sum (and the gradients).
    # Clamping keeps the logarithm finite.
    safe_pred = torch.clamp(Y_pred, min=eps)
    return -(1 / m) * torch.sum(Y_true * torch.log(safe_pred))


In [None]:
def train(X_ctr, Y_ctx, embedding_dims = 5, epochs = 200, learning_rate = 0.2,
          lr_decay = 1.0, probe_words = ("utk", "untuk"), log_every = 10):
    """Train the two-matrix embedding model with full-batch gradient descent.

    The model is ``softmax(one_hot(X_ctr) @ C @ W)``, trained to predict the
    one-hot encoded ``Y_ctx`` under ``CCEloss``. It relies on the module-level
    ``vocab_size`` and ``word_id`` and on ``get_input_tensor``, ``softmax``
    and ``CCEloss``.

    Args:
        X_ctr: tensor of center-word ids.
        Y_ctx: tensor of context-word ids, aligned with ``X_ctr``.
        embedding_dims: width of the embedding.
        epochs: number of full-batch update steps.
        learning_rate: initial step size.
        lr_decay: factor the step size is multiplied by after every epoch.
            The default of 1.0 keeps the step size constant.
        probe_words: pair of words whose cosine similarity is reported while
            training. If either word is missing from the vocabulary, only the
            loss is reported.
        log_every: report progress every this many epochs.

    Returns:
        A tuple ``(C, W)``: ``C`` has shape ``(vocab_size, embedding_dims)``
        and ``W`` has shape ``(embedding_dims, vocab_size)``.
    """
    initrange = 0.5 / embedding_dims

    # Draw the parameters without gradient tracking, rescale them from
    # [0, 1) into the range -initrange .. +initrange, and only then turn
    # gradient tracking on so that the rescaling is not part of the graph.
    C = -2 * initrange * torch.rand(vocab_size, embedding_dims, requires_grad = False) + initrange
    W = -2 * initrange * torch.rand(embedding_dims, vocab_size, requires_grad = False) + initrange
    C.requires_grad = True
    W.requires_grad = True

    print(f'C shape is: {C.shape}, W shape is: {W.shape}')
    print(C)
    print(W)

    # The one-hot encodings do not change between epochs, so build them once.
    X = get_input_tensor(X_ctr).float()
    Y = get_input_tensor(Y_ctx).float()

    # Only report the probe similarity when both words actually made it into
    # the vocabulary; otherwise the lookup would raise KeyError mid-training.
    word_a, word_b = probe_words
    can_probe = word_a in word_id and word_b in word_id
    if not can_probe:
        print(f'Probe words {word_a!r} / {word_b!r} are not both in the vocabulary; reporting loss only')

    step_size = learning_rate
    for i in range(epochs):
        h = X.mm(C)
        Y_pred = softmax(h.mm(W))

        loss = CCEloss(Y_pred, Y)
        loss.backward()

        # Update C and W in place, outside of gradient tracking, then clear
        # the accumulated gradients for the next epoch.
        with torch.no_grad():
            C -= step_size * C.grad
            W -= step_size * W.grad

            C.grad.zero_()
            W.grad.zero_()

        step_size *= lr_decay

        if i % log_every == 0:
            if can_probe:
                # Track how the cosine similarity between the two probe
                # embeddings evolves during training.
                vector_a = C[word_id[word_a]]
                vector_b = C[word_id[word_b]]
                sim = fun.cosine_similarity(vector_a, vector_b, dim = 0)
                print(f'Epoch {i}, loss = {loss}, sim({word_a}, {word_b}) = {sim}')
            else:
                print(f'Epoch {i}, loss = {loss}')
    return C, W
# Train on the full pair dataset and keep both matrices returned by train().
embedding_C, context_W = train(center_list, context_list)


In [None]:
def get_diff(word_1, word_2):
    """Print the cosine similarity between the embeddings of two words.

    Both words are looked up in the module-level ``word_id`` mapping, and
    their rows are taken from the module-level ``embedding_C`` matrix.

    Args:
        word_1: first word; must be a key of ``word_id``.
        word_2: second word; must be a key of ``word_id``.

    Raises:
        KeyError: if either word is not in ``word_id``.
    """
    first_row, second_row = word_id[word_1], word_id[word_2]
    similarity = fun.cosine_similarity(
        embedding_C[first_row], embedding_C[second_row], dim = 0
    )
    print(similarity)

# Print the cosine similarity between the learned embeddings of two vocabulary words.
get_diff("kek", "kayak")