In [33]:
import torch
import pandas as pd
import numpy as np

from collections import Counter
from torch import nn, optim
from torch.utils.data import DataLoader

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self, sequence_length, documents, labels,
    ):
        self.sequence_length = sequence_length
        self.words = self.load_words(documents)
        self.uniq_words = self.get_uniq_words()

        # id vocab mulai dari 1, bukan 0; 0 untuk [PAD]
        self.index_to_word = {(index + 1): word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: (index + 1) for index, word in enumerate(self.uniq_words)}
        self.index_to_word[0] = "[PAD]"
        self.word_to_index["[PAD]"] = 0

        self.labels = labels
        self.docs = []
        for doc in documents:
            # print(self.to_ids(doc))
            self.docs.append(self.to_ids(doc))

    def to_ids(self, doc):
        doc = [self.word_to_index[w] for w in self.tokenize(doc)]
        if len(doc) >= self.sequence_length:
            doc = doc[:self.sequence_length]
        else:
            doc += [0] * (self.sequence_length - len(doc))
        return doc

    def tokenize(self, text):
        return text.split(' ')

    def load_words(self, documents):
        text = ""
        for doc in documents:
          text += doc + " "
        return self.tokenize(text)

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, index):
        return (
            torch.tensor(self.docs[index]),
            torch.tensor(self.labels[index]),
        )

In [35]:
class LSTMNet(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):

        super(LSTMNet,self).__init__()

        # Embedding layer
        # padding_idx (int, optional) – If specified, the entries at padding_idx do
        # not contribute to the gradient; therefore, the embedding vector at
        # padding_idx is not updated during training, i.e. it remains as a fixed “pad”.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # LSTM layer process the vector sequences
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Dense layer to predict
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Prediction activation function
        self.sigmoid = nn.Sigmoid()


    def forward(self, text):
        embedded = self.embedding(text)
        # print(embedded)
        output, (hidden_state, cell_state) = self.lstm(embedded)
        output = torch.mean(output, dim=1)
        output = self.fc(output)
        output = self.sigmoid(output)
        return output

In [36]:
def train(dataset, model, batch_size, max_epochs=400):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    print(len(dataset.index_to_word))
    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            # print(f"x {x}, y{y}")
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

In [37]:
documents = ["buku bagus rapih cerdas dan menarik",
             "rumah rapih cantik dan bersih",
             "hotel kotor berisik dan bau",
             "kantin jorok kotor mahal dan panas"]
labels = [[1.], [1.], [0.], [0.]]

dataset = Dataset(8, documents, labels)
print(dataset.index_to_word)
model = LSTMNet(len(dataset.index_to_word), 16, 16, 1)
train(dataset, model, 2, max_epochs=200)

{1: 'dan', 2: 'rapih', 3: 'kotor', 4: 'buku', 5: 'bagus', 6: 'cerdas', 7: 'menarik', 8: 'rumah', 9: 'cantik', 10: 'bersih', 11: 'hotel', 12: 'berisik', 13: 'bau', 14: 'kantin', 15: 'jorok', 16: 'mahal', 17: 'panas', 18: '', 0: '[PAD]'}
19
{'epoch': 0, 'batch': 0, 'loss': 0.7406687140464783}
{'epoch': 0, 'batch': 1, 'loss': 0.650598406791687}
{'epoch': 1, 'batch': 0, 'loss': 0.7360941171646118}
{'epoch': 1, 'batch': 1, 'loss': 0.6503865718841553}
{'epoch': 2, 'batch': 0, 'loss': 0.7334392666816711}
{'epoch': 2, 'batch': 1, 'loss': 0.6498233675956726}
{'epoch': 3, 'batch': 0, 'loss': 0.7311673164367676}
{'epoch': 3, 'batch': 1, 'loss': 0.6491103172302246}
{'epoch': 4, 'batch': 0, 'loss': 0.7290561199188232}
{'epoch': 4, 'batch': 1, 'loss': 0.6483135223388672}
{'epoch': 5, 'batch': 0, 'loss': 0.7270290851593018}
{'epoch': 5, 'batch': 1, 'loss': 0.6474621295928955}
{'epoch': 6, 'batch': 0, 'loss': 0.7250498533248901}
{'epoch': 6, 'batch': 1, 'loss': 0.6465710401535034}
{'epoch': 7, 'batch'

{'epoch': 32, 'batch': 1, 'loss': 0.611229658126831}
{'epoch': 33, 'batch': 0, 'loss': 0.659563422203064}
{'epoch': 33, 'batch': 1, 'loss': 0.6091500520706177}
{'epoch': 34, 'batch': 0, 'loss': 0.6560918688774109}
{'epoch': 34, 'batch': 1, 'loss': 0.6069943904876709}
{'epoch': 35, 'batch': 0, 'loss': 0.6525152921676636}
{'epoch': 35, 'batch': 1, 'loss': 0.6047602295875549}
{'epoch': 36, 'batch': 0, 'loss': 0.6488320827484131}
{'epoch': 36, 'batch': 1, 'loss': 0.602445125579834}
{'epoch': 37, 'batch': 0, 'loss': 0.6450409293174744}
{'epoch': 37, 'batch': 1, 'loss': 0.6000468730926514}
{'epoch': 38, 'batch': 0, 'loss': 0.6411407589912415}
{'epoch': 38, 'batch': 1, 'loss': 0.5975630879402161}
{'epoch': 39, 'batch': 0, 'loss': 0.6371310353279114}
{'epoch': 39, 'batch': 1, 'loss': 0.5949915647506714}
{'epoch': 40, 'batch': 0, 'loss': 0.6330113410949707}
{'epoch': 40, 'batch': 1, 'loss': 0.5923303365707397}
{'epoch': 41, 'batch': 0, 'loss': 0.6287820339202881}
{'epoch': 41, 'batch': 1, 'loss

In [39]:
# prediction
model.eval()

with torch.no_grad():
  sent = "buku bagus rapih cerdas dan menarik"
  sent = torch.tensor([dataset.to_ids(sent)])
  print(model(sent))

tensor([[0.8865]])
