# <center>Implementasi CBOW Dengan PyTorch</center>
<center>Kelas Pengolahan Bahasa Manusia 2017/2018</center>
<center>Lintang Adyuta Sutawika</center>

Tutorial ini akan membahas mengenai implementasi model Continuous Bag-of-Words dengan menggunakan library PyTorch (https://pytorch.org/). PyTorch memiliki keunggulan berupa sifatnya yang <i>pythonic</i>, artinya variabel dan model PyTorch dapat dengan mudah berintegrasi dengan library Python umum lainnya seperti NumPy atau SciPy.

# <center>I. Model Word2Vec</center>
<img src="Images/CBOW_Skip-gram.png" style="width: 800px;">

In [None]:
from collections import defaultdict
import time
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim

# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(1)

# Whether PyTorch reports a usable CUDA device; later cells use this flag to
# decide if the model is moved to the GPU.
CUDA = torch.cuda.is_available()
print("CUDA: %s" % CUDA)

# Functions to read in the corpus.
# Vocabulary maps that grow on first lookup: an unseen key is assigned the
# next free integer id (the current size of the map).
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
# Looked up while w2i is still empty, so the unknown-word token gets id 0.
UNK = w2i["<unk>"]


def read_dataset(filename):
    """Yield one ``(word_ids, tag_id)`` pair per non-empty line of a corpus file.

    Each line must have the form ``"<tag> ||| <word> <word> ..."``. The line is
    lower-cased and stripped before parsing; words and the tag are converted
    to integer ids through the module-level ``w2i`` and ``t2i`` maps.

    Args:
        filename: path to a UTF-8 encoded text file.

    Yields:
        A tuple ``(list_of_word_ids, tag_id)``.

    Raises:
        ValueError: if a non-empty line does not contain exactly one
            ``" ||| "`` separator.
    """
    # An explicit encoding keeps decoding independent of the platform locale.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.lower().strip()
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpack below.
            if not line:
                continue
            tag, words = line.split(" ||| ")
            yield ([w2i[x] for x in words.split(" ")], t2i[tag])


# Read the dataset.
# The training split is read first, while w2i still assigns a fresh id to
# every previously unseen word.
train = list(read_dataset("dataset/classes/train.txt"))
# Freeze the word ids: rebuild w2i with the same entries but a default of UNK,
# so any word first looked up after this point maps to the unknown-word id.
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("dataset/classes/dev.txt"))
# NOTE(review): a defaultdict still inserts a missing key on lookup, so after
# reading dev, len(w2i) also counts dev-only words (all mapped to UNK).
nwords = len(w2i)
ntags = len(t2i)

<b>1. Tampilkan array berisi index angka untuk setiap kata yang ada di kalimat "star wars the force awakens was awesome":</b>

In [None]:
#koding di sini

In [None]:
# Batch construction
def preprocess(batch):
    """Right-pad every sequence in a batch to the length of the longest one.

    Each element of ``batch`` must expose ``.tag`` and ``.words`` attributes,
    where ``.words`` is a list.

    NOTE(review): ``PAD`` is not defined in the visible part of this file, and
    ``read_dataset`` yields plain tuples rather than objects with
    ``.tag``/``.words`` -- confirm where this helper is meant to be used.

    Returns:
        ``(seqs, tags)``: the padded word lists and the tags, both in batch
        order.
    """
    labels = [item.tag for item in batch]
    sentences = [item.words for item in batch]

    # Length every sentence is extended to.
    longest = max(len(sentence) for sentence in sentences)

    padded = []
    for sentence in sentences:
        missing = longest - len(sentence)
        padded.append(sentence + [PAD] * missing)

    return padded, labels

In [None]:
# Bag-of-Words model
class CBOW(nn.Module):
    """Bag-of-words classifier: summed word embeddings followed by one linear layer.

    Args:
        vocab_size: number of entries in the embedding table.
        embedding_dim: size of each embedding vector.
        output_dim: number of scores produced by the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.linear = nn.Linear(
            in_features=embedding_dim, out_features=output_dim
        )

    def forward(self, inputs):
        """Return unnormalised scores for ``inputs``.

        ``inputs`` holds word indices; callers in this file pass a tensor of
        shape ``(1, sentence_length)``. The embeddings are summed over
        dimension 1 before the linear layer is applied.
        """
        summed = self.embeddings(inputs).sum(dim=1)
        return self.linear(summed)

In [None]:
# One embedding row per vocabulary entry, 64-dimensional embeddings,
# one output score per tag.
model = CBOW(vocab_size=nwords, embedding_dim=64, output_dim=ntags)

# Move the model to the GPU when one is available
# (nn.Module.cuda() returns the module itself).
if CUDA:
    model = model.cuda()

print(model)

In [None]:
def evaluate(model, data):
    """Evaluate a model on a data set.

    Args:
        model: an ``nn.Module`` mapping a ``(1, sentence_length)`` tensor of
            word indices to a ``(1, num_tags)`` tensor of scores.
        data: a sized collection of ``(word_ids, tag_id)`` pairs.

    Returns:
        A tuple ``(correct, total, accuracy)`` where ``correct`` is a float
        count of correctly predicted examples. For empty ``data`` the result
        is ``(0.0, 0, 0.0)`` instead of a division by zero.
    """
    total = len(data)
    if total == 0:
        return 0.0, 0, 0.0

    # Build inputs on the model's own device so this also works after
    # model.cuda(); fall back to the CPU for a parameter-free module.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0.0

    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for words, tag in data:
            lookup_tensor = torch.tensor([words], dtype=torch.long, device=device)
            scores = model(lookup_tensor)
            # argmax over the tag dimension; .item() also works for CUDA tensors.
            predict = scores.argmax(dim=1).item()

            if predict == tag:
                correct += 1

    return correct, total, correct / total

In [None]:
# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.001)

# The loss module is stateless, so build it once instead of once per sentence.
loss = nn.CrossEntropyLoss()

# Create inputs on the model's device so the loop also works after model.cuda().
device = next(model.parameters()).device

for ITER in range(100):

    # Visit the training sentences in a new random order every epoch.
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()

    for words, tag in train:

        # forward pass on a batch containing a single sentence
        lookup_tensor = torch.tensor([words], dtype=torch.long, device=device)
        target = torch.tensor([tag], dtype=torch.long, device=device)
        scores = model(lookup_tensor)
        output = loss(scores, target)
        # .item() extracts the Python float; indexing the 0-dim loss tensor
        # with [0] raises IndexError on PyTorch >= 0.5.
        train_loss += output.item()

        # backward pass
        model.zero_grad()
        output.backward()

        # update weights
        optimizer.step()

    print("iter %r: train loss/sent=%.4f, time=%.2fs" % 
          (ITER, train_loss/len(train), time.time()-start))

    # evaluate
    _, _, acc = evaluate(model, dev)
    print("iter %r: test acc=%.4f" % (ITER, acc))

<b>2. Deeper CBOW. Tambahkan hidden layer, coba bandingkan.</b>

In [None]:
class Deeper_CBOW(nn.Module):
    """Exercise skeleton for a CBOW variant with an additional hidden layer.

    As written, the constructor registers no layers and ``forward`` returns
    ``None``; both are left to be filled in.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim):
        # Only the base-class initialisation is done; the arguments are unused so far.
        nn.Module.__init__(self)

    def forward(self, inputs):
        """Placeholder forward pass: not implemented yet, so it returns ``None``."""
        return None

In [None]:
#koding di sini