In [13]:
import torch
import torch.nn.functional as fun
import numpy as np

from google_play_scraper import Sort, reviews
import csv

from nltk.tokenize import wordpunct_tokenize

def clean_word(sentence):
    sentence = wordpunct_tokenize(sentence)
    cleaned = []
    for word in sentence:
        if(word.isalpha()):
            cleaned.append(word.lower())
    return cleaned

def get_corpus_from_csv():
    corpus = []
    with open("dfyutup-cleanedfixversion3.csv") as file_buffer:
        csv_reader = list(csv.reader(file_buffer))
        n = len(csv_reader)
        for i in range(1, n):
            corpus.append(clean_word(csv_reader[i][2]))
    return corpus

    
def get_corpus_google_play(cnt = 2000):
    result, _ = reviews(
        'com.miHoYo.GenshinImpact',
        lang='id', 
        country='id', 
        sort=Sort.MOST_RELEVANT, 
        count=cnt, 
        filter_score_with=None
    )
    return [clean_word(konten["content"]) for konten in result]

corpus = get_corpus_google_play()
# print(corpus)
# print(get_corpus_from_csv())

# corpus = get_corpus_from_csv()
# print(corpus)

In [14]:
import pickle

# obj0, obj1, obj2 are created here...

def save_corpus(idx = 0):
    # Saving the objects:
    global corpus
    with open(f'corpus-objs-{idx}.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump([corpus], f)

def load_corpus(idx = 0):
    # Getting back the objects:
    global corpus
    save_corpus(999)
    with open(f'corpus-objs-{idx}.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
        corpus = pickle.load(f)

In [15]:
save_corpus(1)

In [18]:
import torch
import torch.nn.functional as fun
import numpy as np


def create_vocab(corpus, limit_occ = 20):
    """
    membuat vocabulary, dengan 2 info penting: 
        1. word_id -> mapping kata ke id (int)
        2. id_word -> sebaliknya, mapping dari id ke word
    """
    word_id = {}
    id_word = []

    occurence = dict()
    i = 0
    for sentence in corpus:
        for word in sentence:
            occurence.setdefault(word, 0)
            occurence[word] += 1
            
    for k, v in occurence.items():
        if(v < limit_occ): continue
        print(k)
        word_id[k] = i
        id_word.append(k)
        i += 1
                
    id_word = {id: word for (word, id) in word_id.items()}
    return word_id, id_word

print(len(corpus))
word_id, id_word = create_vocab(corpus)
vocab_size = len(word_id)

print(word_id)


2000
grafis
yang
memanjakan
mata
quest
dan
event
sangat
banyak
map
open
world
luas
untuk
atau
lagi
bosen
char
jadi
cocok
buat
para
selain
itu
ada
fitur
pc
mobile
saat
masih
bisa
main
akun
di
kita
masalah
ketika
saya
sedang
story
waktu
skip
dialog
pas
domain
tiba
tidak
bahkan
sudah
kali
tetap
mau
jalan
juga
lumayan
bagus
jujur
menurut
aku
genshin
impact
game
dari
segi
nya
yah
walaupun
beberapa
bikin
mungkin
harus
soalnya
player
pasti
bosan
dengan
konten
kayak
gitu
terus
aja
gue
the
character
tapi
gak
suka
ini
gamenya
terlalu
besar
puzzle
agak
susah
kalau
karakter
kadang
membuat
banget
grafik
ga
pun
cuma
kekurangan
yaitu
berat
ganti
device
seru
gameplay
menarik
tolong
tambahkan
dapat
teka
teki
unik
serta
petualangan
keren
pengalaman
bermain
low
paling
yg
pernah
mainin
android
kurang
terkadang
sama
misi
bgt
belum
kalo
jaringan
error
udah
lama
awal
karna
server
lah
ya
jangan
satu
ad
gacha
selalu
ampas
d
pity
orang
lain
aj
udh
resin
artefak
jelek
up
gk
sih
mainkan
sampe
hp
setiap
dapet
mala

In [22]:
# dataset generator
def iter_dataset(corpus, context_window=2):
    for sentence in corpus:
        words = sentence
        for i in range(context_window, len(words)):
            kiri = i - context_window
            kanan = i + context_window
            if(kiri < 0 or kanan >= len(words)):
                continue
            center = words[context_window]
            context = words[kiri:context_window] + words[context_window + 1:kanan + 1]
            yield center, context

def data_to_tensor(center, context):
    if(center not in word_id.keys()):
        return 0, -1, []
    can = 1
    for word in context:
        if(word not in word_id.keys()):
            can = 0
            break
    if(not can):
        return 0, -1, []
    return 1,fun.one_hot([word_id[center]]), fun.one_hot([word_id[word] for word in context])


def get_dataset(corpus, context_window = 2):
    # membuat dataset
    X = []
    Y = []
    averager = np.full((1, context_window * 2), 1/(context_window * 2))
    for center, context in iter_dataset(corpus, context_window):
        can, tmp_center, tmp_context = data_to_tensor(center, context)
        if(can == 0):
            continue
        X.append(tmp_center)
        averaged = np.matmul(averager, tmp_context)
        Y.append(averaged)
    X_ctr = torch.tensor(X)
    Y_ctx = torch.tensor(Y)
    return X_ctr, Y_ctx

center_list, context_list = get_dataset(corpus)


TypeError: one_hot(): argument 'input' (position 1) must be Tensor, not list