In [28]:
import torch
import torch.nn.functional as fun
import numpy as np
from google_play_scraper import Sort, reviews
import csv
from tqdm import tqdm

from nltk.tokenize import wordpunct_tokenize

def clean_word(sentence):
    sentence = wordpunct_tokenize(sentence)
    cleaned = []
    for word in sentence:
        if(word.isalpha()):
            cleaned.append(word.lower())
    return cleaned

def get_corpus_from_csv():
    corpus = []
    with open("dfyutup-cleanedfixversion3.csv") as file_buffer:
        csv_reader = list(csv.reader(file_buffer))
        n = len(csv_reader)
        for i in range(1, n):
            corpus.append(clean_word(csv_reader[i][2]))
    return corpus

    
def get_corpus_google_play(cnt = 2000):
    result, _ = reviews(
        'com.miHoYo.GenshinImpact',
        lang='id', 
        country='id', 
        sort=Sort.MOST_RELEVANT, 
        count=cnt, 
        filter_score_with=None
    )
    return [clean_word(konten["content"]) for konten in result]

# corpus = []
# corpus = get_corpus_google_play()
# print(corpus)
# print(get_corpus_from_csv())

# corpus = get_corpus_from_csv()
# print(corpus)

In [7]:
import pickle

# obj0, obj1, obj2 are created here...

def save_corpus(idx = 0):
    # Saving the objects:
    global corpus
    with open(f'corpus-objs-{idx}.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump([corpus], f)

def load_corpus(idx = 0):
    # Getting back the objects:
    global corpus
    save_corpus(999)
    with open(f'corpus-objs-{idx}.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
       [corpus] = pickle.load(f)

In [29]:
load_corpus(1)

In [150]:
print(corpus[30:50])

[['sungguh', 'kadang', 'aku', 'gagal', 'focus', 'apa', 'aku', 'ini', 'bermain', 'game', 'atau', 'nonton', 'drama', 'semua', 'karna', 'background', 'music', 'nya', 'yg', 'super', 'duper', 'enak', 'di', 'telinga', 'blm', 'lg', 'pengisi', 'suara', 'jepang', 'yg', 'khas', 'bikin', 'betah', 'mantengin', 'story', 'sudah', 'hampir', 'gak', 'sia', 'sia', 'aku', 'download', 'bermain', 'game', 'ini', 'puas', 'dalam', 'segala', 'hal', 'meski', 'terkadang', 'sulit', 'tp', 'aku', 'menikmati', 'sangat', 'suka', 'genshin', 'impact'], ['game', 'nya', 'memang', 'sudah', 'bagus', 'dari', 'segi', 'mana', 'pun', 'tapi', 'setelah', 'update', 'jadi', 'ngelag', 'banget', 'padahal', 'sebelumnya', 'ga', 'separah', 'ini', 'kalau', 'bisa', 'game', 'nya', 'lebih', 'di', 'optimalkan', 'lagi', 'atau', 'tambahkan', 'fitur', 'render', 'distance', 'karena', 'memang', 'untuk', 'render', 'map', 'nya', 'bisa', 'di', 'bilang', 'terlalu', 'jauh', 'mungkin', 'itu', 'penyebab', 'lag', 'nya', 'jadi', 'mohon', 'untuk', 'pihak'

In [30]:
import torch
import torch.nn.functional as fun
import numpy as np


def create_vocab(corpus, limit_occ = 20):
    """
    membuat vocabulary, dengan 2 info penting: 
        1. word_id -> mapping kata ke id (int)
        2. id_word -> sebaliknya, mapping dari id ke word
    """
    word_id = {}
    id_word = []

    occurence = dict()
    i = 0
    for sentence in corpus:
        for word in sentence:
            occurence.setdefault(word, 0)
            occurence[word] += 1
            
    for k, v in occurence.items():
        if(v < limit_occ): continue
        print(k)
        word_id[k] = i
        id_word.append(k)
        i += 1
                
    id_word = {id: word for (word, id) in word_id.items()}
    return word_id, id_word

print(len(corpus))
word_id, id_word = create_vocab(corpus)
vocab_size = len(word_id)

print(word_id)


2000
grafis
yang
memanjakan
mata
quest
dan
event
sangat
banyak
map
open
world
luas
untuk
atau
lagi
bosen
char
jadi
cocok
buat
para
selain
itu
ada
fitur
pc
mobile
saat
masih
bisa
main
akun
di
kita
masalah
ketika
saya
sedang
story
waktu
skip
dialog
pas
domain
tiba
tidak
bahkan
sudah
kali
tetap
mau
jalan
juga
lumayan
bagus
jujur
menurut
aku
genshin
impact
game
dari
segi
nya
yah
walaupun
beberapa
bikin
mungkin
harus
soalnya
player
pasti
bosan
dengan
konten
kayak
gitu
terus
aja
gue
the
character
tapi
gak
suka
ini
gamenya
terlalu
besar
puzzle
agak
susah
kalau
karakter
kadang
membuat
banget
grafik
ga
pun
cuma
kekurangan
yaitu
berat
ganti
device
seru
gameplay
menarik
tolong
tambahkan
dapat
teka
teki
unik
serta
petualangan
keren
pengalaman
bermain
low
paling
yg
pernah
mainin
android
kurang
terkadang
sama
misi
bgt
belum
kalo
jaringan
error
udah
lama
awal
karna
server
lah
ya
jangan
satu
ad
gacha
selalu
ampas
d
pity
orang
lain
aj
udh
resin
artefak
jelek
up
gk
sih
mainkan
sampe
hp
setiap
dapet
mala

In [31]:
# dataset generator
def iter_dataset(corpus, context_window=2):
    for sentence in corpus:
        words = sentence
        for i in range(context_window, len(words)):
            kiri = i - context_window
            kanan = i + context_window
            if(kiri < 0 or kanan >= len(words)):
                continue
            center = words[i]
            context = words[kiri:i] + words[i + 1:kanan + 1]
            # print(kiri, kanan, center, context)
            yield center, context

def data_to_tensor(center, context):
    if(center not in word_id.keys()):
        return 0, -1, []
    can = 1
    for word in context:
        if(word not in word_id.keys()):
            can = 0
            break
    if(not can):
        return 0, -1, []
    return\
    1,\
    fun.one_hot(torch.tensor([word_id[center]]), num_classes=vocab_size).float(),\
    fun.one_hot(torch.tensor([word_id[word] for word in context]), num_classes=vocab_size).float()


def get_dataset(corpus, context_window = 2):
    # membuat dataset
    X = torch.tensor([])
    Y = torch.tensor([])
    averager = torch.tensor(np.full((1, context_window * 2), 1/(context_window * 2))).float()
    for center, context in tqdm(list(iter_dataset(corpus, context_window))):
        # print(center, context)
        can, tmp_center, tmp_context = data_to_tensor(center, context)
        if(can == 0):
            continue
        # print(tmp_context)
        averaged = averager.matmul(tmp_context)
        X = torch.cat((X, averaged), dim = 0)
        Y = torch.cat((Y, tmp_center), dim = 0)
        # break
    print(X.shape, Y.shape)
    return X, Y

context_list, center_list = get_dataset(corpus)


100%|██████████| 62396/62396 [04:29<00:00, 231.12it/s] 

torch.Size([18462, 530]) torch.Size([18462, 530])





In [35]:
import pickle

# obj0, obj1, obj2 are created here...

def save_training(idx = 0):
    # Saving the objects:
    global center_list, context_list
    with open(f'training-objs-{idx}.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump([center_list, context_list], f)

def load_training(idx = 0):
    # Getting back the objects:
    global center_list, context_list
    save_training(999)
    with open(f'training-objs-{idx}.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
       center_list, context_list = pickle.load(f)

In [39]:
save_training(1)

In [40]:
# Softmax
def softmax(x):
    maxes = torch.max(x, 1, keepdim=True)[0]
    x_exp = torch.exp(x - maxes)
    x_exp_sum = torch.sum(x_exp, 1, keepdim=True)
    return x_exp / x_exp_sum

# Categorical Cross Entropy Loss


def CCEloss(Y_pred, Y_true):
    m = Y_pred.size()[0]
    return -(1 / m) * torch.sum(Y_true * torch.log(Y_pred))
    
tmp_c = None
tmp_w = None

In [84]:

def train(training_X, training_Y, EPOCHS = 200, load = False):
    global tmp_w
    global tmp_c

    EMBEDDING_DIMS = 10

    initrange = 1

    # trick: di awal, mode gradient jangan diaktifkan dahulu, karena ingin
    # memodifikasi nilai di C dan W secara in-place
    C = torch.rand(vocab_size, EMBEDDING_DIMS, requires_grad = False)
    W = torch.rand(EMBEDDING_DIMS, vocab_size, requires_grad = False)

    # agar nilai awal parameter di C dan W berkisar di antara -initrange hingga +initrange
    C = -2 * initrange * C + initrange
    W = -2 * initrange * W + initrange

    # setelah operasi in-place modification, baru kita set mode gradient
    C.requires_grad = True
    W.requires_grad = True
    if(load):
        C = tmp_c
        W = tmp_w

    print(f'C shape is: {C.shape}, W shape is: {W.shape}')
    print(C)
    print(W)

    LEARNING_RATE = 0.2
    LR_DECAY = 0.99

    for i in range(EPOCHS):
        X = training_X
        Y = training_Y

        h = X.mm(C)
        Y_pred = softmax(h.mm(W))

        loss = CCEloss(Y_pred, Y)
        loss.backward()

        # update C dan W
        with torch.no_grad():
            C -= LEARNING_RATE * C.grad
            W -= LEARNING_RATE * W.grad
            
            C.grad.zero_()
            W.grad.zero_()
        tmp_w = W
        tmp_c = C
        if i % 10 == 0:
            w_1 = "gue"
            w_2 = "gua"
            vector_1 = C[word_id[w_1]]
            vector_2 = C[word_id[w_2]]
            sim = fun.cosine_similarity(vector_1, vector_2, dim = 0)
            print(f'Epoch {i}, loss = {loss}, sim({w_1}, {w_2}) = {sim}')
    return C, W


In [130]:

embedding_C, context_W = train(center_list, context_list, 5000, True)

C shape is: torch.Size([530, 10]), W shape is: torch.Size([10, 530])
tensor([[ 0.7268,  0.5934, -0.0656,  ..., -0.3056,  0.5803, -0.7210],
        [-0.0373,  0.2939,  0.4222,  ...,  0.4172,  0.5024, -0.2676],
        [-0.6025, -0.5130,  0.2942,  ..., -0.6959,  0.8843,  0.0881],
        ...,
        [ 0.7391, -0.1085, -0.2995,  ...,  0.7642,  1.1039,  0.0315],
        [ 0.3047,  0.5723, -0.5667,  ...,  0.7041,  0.6507, -1.2359],
        [-0.2610, -0.5947, -0.6398,  ..., -0.3687,  0.8473, -0.6169]],
       requires_grad=True)
tensor([[-0.0511,  0.2966,  0.5018,  ...,  0.1763,  0.0339, -0.0749],
        [-0.2182, -1.1369,  0.3581,  ..., -0.4726,  0.5563,  0.3661],
        [-0.3232,  0.7368,  0.0552,  ...,  0.2643, -0.6273,  0.2487],
        ...,
        [-0.5547,  0.8188, -0.3788,  ...,  0.1908,  0.3431,  0.1879],
        [-0.3138,  1.7500, -0.6104,  ..., -0.9719, -0.4650, -0.3186],
        [ 0.1853, -1.7752, -0.4349,  ...,  1.0249,  0.3740,  0.9460]],
       requires_grad=True)
Epoch 0, 

In [45]:
import pickle

# obj0, obj1, obj2 are created here...
tmp_w = []
tmp_c = []
def save(idx = 0):
    # Saving the objects:
    global tmp_w, tmp_c
    with open(f'cbow-objs-{idx}.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump([tmp_w, tmp_c], f)

def load(idx = 0):
    # Getting back the objects:
    global tmp_w, tmp_c
    save(999)
    with open(f'cbow-objs-{idx}.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
        tmp_w, tmp_c = pickle.load(f)

In [131]:
save(1)

In [183]:
def get_diff(word_1, word_2):
    global tmp_c
    vector_1 = tmp_c[word_id[word_1]]
    vector_2 = tmp_c[word_id[word_2]]
    print(fun.cosine_similarity(vector_1, vector_2, dim = 0))

def show_vector(word_1):
    global tmp_c
    vector_1 = tmp_c[word_id[word_1]].detach().numpy()
    print(vector_1)


def predict(context, K = 30, context_window = 2):
    global tmp_c
    global tmp_w
    print()
    print(f"Predicting kata di antara \"{context[:context_window]}\"  \"{context[context_window:]}\"")
    C = tmp_c
    W = tmp_w
    averager = torch.tensor(np.full((1, context_window * 2), 1/(context_window * 2))).float()
    tmp_context = fun.one_hot(torch.tensor([word_id[word] for word in context]), num_classes=vocab_size).float()

    averaged = averager.matmul(tmp_context)
    h = averaged.mm(C)
    Y_pred = softmax(h.mm(W))[0]
    prediction = sorted([(Y_pred[i], i) for i in range(len(Y_pred))], reverse=True)
    # print(prediction)
    printed = 0
    stop_words = {"di", "saya", "nya", "tapi", "dan", "yang"}
    for pred in prediction:
        if(printed == K): break
        print(id_word[pred[1]], pred[0])
    # print(prediction)
    # print(Y_pred)


In [184]:
predict(["developer", "segera", "tolong", "tambahin"], 5)


Predicting kata di antara "['developer', 'segera']"  "['tolong', 'tambahin']"


TypeError: list indices must be integers or slices, not tuple

In [182]:
predict(["ngelag", "hp", "memori", "pelit"], 10)


Predicting kata di antara "['ngelag', 'hp']"  "['memori', 'pelit']"
saya tensor(0.0229, grad_fn=<SelectBackward0>)
di tensor(0.0186, grad_fn=<SelectBackward0>)
nya tensor(0.0163, grad_fn=<SelectBackward0>)
tapi tensor(0.0158, grad_fn=<SelectBackward0>)
dan tensor(0.0145, grad_fn=<SelectBackward0>)
yang tensor(0.0144, grad_fn=<SelectBackward0>)
game tensor(0.0142, grad_fn=<SelectBackward0>)
untuk tensor(0.0136, grad_fn=<SelectBackward0>)
ini tensor(0.0126, grad_fn=<SelectBackward0>)
main tensor(0.0105, grad_fn=<SelectBackward0>)


In [176]:
predict(["grafik", "game", "dan", "selalu"], 3)


Predicting kata di antara "['grafik', 'game']"  "['dan', 'selalu']"
bagus tensor(0.0403, grad_fn=<SelectBackward0>)
nya tensor(0.0366, grad_fn=<SelectBackward0>)
dan tensor(0.0252, grad_fn=<SelectBackward0>)


In [177]:
show_vector("gua")
show_vector("gue")
show_vector("aku")
show_vector("gw")
get_diff("gua", "gw")
get_diff("gue", "gw")
get_diff("yang", "yg")

[-0.64153534 -0.14231764  0.20323235 -0.10890751  0.54430175 -0.24244943
 -0.4718347  -0.20876306  0.43952826 -0.9823814 ]
[-0.49375433 -0.32716674  0.01924404  0.39315605 -0.2416808  -0.29318908
 -0.54311585  0.11475743 -0.15989536 -1.4036239 ]
[-0.47334203  0.20788756  0.1150723   0.1599345   0.22111885  0.3101654
  0.04160032  0.3467      0.33623174 -0.8152762 ]
[ 0.6716652  -0.28088072 -0.08984442  0.09001283  0.10036802  0.19118062
 -0.33434376  0.6658514   0.07748628 -0.716639  ]
tensor(0.1781, grad_fn=<SumBackward1>)
tensor(0.4348, grad_fn=<SumBackward1>)
tensor(0.9563, grad_fn=<SumBackward1>)
