In [1]:
!pip install pytorch-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


In [2]:
!wget "https://is3.cloudhost.id/s3.kaenova.my.id/IMDB%20Dataset.zip"
!unzip "/content/IMDB Dataset.zip"

--2023-02-03 08:27:00--  https://is3.cloudhost.id/s3.kaenova.my.id/IMDB%20Dataset.zip
Resolving is3.cloudhost.id (is3.cloudhost.id)... 103.63.24.211, 103.63.24.210
Connecting to is3.cloudhost.id (is3.cloudhost.id)|103.63.24.211|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26962657 (26M) [application/octet-stream]
Saving to: ‘IMDB Dataset.zip’


2023-02-03 08:27:01 (58.0 MB/s) - ‘IMDB Dataset.zip’ saved [26962657/26962657]

Archive:  /content/IMDB Dataset.zip
  inflating: IMDB Dataset.csv        


In [3]:
import pickle
import bz2
import pandas as pd

## Compressing RAW IMDB Dataset File (only used once)

In [4]:
def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file named ``<title>.pbz2``.

    Args:
        title: Output path without extension; ``'.pbz2'`` is appended.
        data: Any picklable object.
    """
    target_path = title + '.pbz2'
    with bz2.open(target_path, 'wb') as archive:
        pickle.dump(data, archive)
        
def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Args:
        file: Path (or file object) of the ``.pbz2`` archive to read.

    Returns:
        The unpickled Python object.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only call this on archives you created yourself or otherwise trust.
    # The context manager guarantees the handle is closed, even on error.
    with bz2.BZ2File(file, 'rb') as f:
        return pickle.load(f)

In [5]:
# Load the raw CSV once; later cells work on copies of this frame.
raw_csv_path = "/content/IMDB Dataset.csv"
df_orig = pd.read_csv(raw_csv_path)

# Training and testing the model

In [19]:
import torch
import math
from tqdm import tqdm

import numpy as np
import torch.nn.functional as F
import modules.lstm_cnn as mod

In [7]:
# Work on a deep copy so the frame read from disk stays untouched.
df = df_orig.copy(deep=True)
# Preview the first rows (head() shows five by default).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# Raw review strings, in dataframe order.
text = df['review'].tolist()

# Two-way mapping between the sentiment strings and integer class ids.
label2id = {"positive": 1, "negative": 0}
id2label = dict(zip(label2id.values(), label2id.keys()))

# Integer class id for every review; an unknown sentiment raises KeyError.
labels = [label2id[sentiment] for sentiment in df['sentiment']]

In [9]:
# Build the tokenizer/encoder from the full list of review strings, splitting
# each string on runs of whitespace (str.split with no arguments).
# NOTE(review): StaticTokenizerEncoderWraper lives in the project module
# `modules.lstm_cnn`; presumably it builds its vocabulary from `text` here —
# confirm against that module.
tokenizer = mod.StaticTokenizerEncoderWraper(text, tokenize=lambda s: s.split())

In [10]:
# Batching function
# https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
from itertools import islice

def batcher(iterable, batch_size):
    """Yield successive lists of up to ``batch_size`` items from ``iterable``.

    The final list may be shorter than ``batch_size``; nothing is yielded for
    an empty input.
    """
    source = iter(iterable)
    while True:
        chunk = list(islice(source, batch_size))
        if not chunk:
            return
        yield chunk


In [20]:
# Training configuration.
batch_size = 512
epochs = 10
# Use the GPU when one is attached, otherwise fall back to the CPU so the cell
# does not crash on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of mini-batches per epoch; only used to size the progress bar.
num_mini_batch = math.ceil(len(text) / batch_size)

# NOTE(review): the meaning of the positional 256 is defined by mod.LSTM_CNN
# (project module) — confirm there.
model = mod.LSTM_CNN(256, text_vectorizer=tokenizer, device=device).to(device)
optim = torch.optim.Adam(model.parameters())

for i in range(epochs):
    print(f"EPOCH {i + 1} / {epochs}")
    # Walk texts and labels in lock-step; both lists have the same length, so
    # the two batch streams stay aligned and end together.
    paired_batches = zip(batcher(text, batch_size), batcher(labels, batch_size))
    epoch_loss = []
    with tqdm(total=num_mini_batch) as pbar:
        for mini_batch_text, mini_batch_labels in paired_batches:
            # Clear gradients accumulated by the previous step.
            model.zero_grad()
            target = torch.tensor(mini_batch_labels, dtype=torch.int64, device=device)

            # The model consumes the raw strings directly.
            # Assumes it returns (batch, num_classes) logits — TODO confirm.
            pred = model(mini_batch_text)
            loss = F.cross_entropy(pred, target)
            loss.backward()
            optim.step()

            # Show the running mean loss of the current epoch.
            epoch_loss.append(loss.item())
            avg_loss = sum(epoch_loss) / len(epoch_loss)
            pbar.set_description(f"Avg loss {avg_loss}")
            pbar.update(1)


EPOCH 1 / 10


Avg loss 0.6158621332475117: 100%|██████████| 98/98 [02:26<00:00,  1.49s/it]


EPOCH 2 / 10


Avg loss 0.38624478070711604: 100%|██████████| 98/98 [02:24<00:00,  1.48s/it]


EPOCH 3 / 10


Avg loss 0.25701883541686193: 100%|██████████| 98/98 [02:24<00:00,  1.47s/it]


EPOCH 4 / 10


Avg loss 0.18840720618561824: 100%|██████████| 98/98 [02:24<00:00,  1.48s/it]


EPOCH 5 / 10


Avg loss 0.13073319759295912: 100%|██████████| 98/98 [02:25<00:00,  1.48s/it]


EPOCH 6 / 10


Avg loss 0.07905368696974248: 100%|██████████| 98/98 [02:24<00:00,  1.48s/it]


EPOCH 7 / 10


Avg loss 0.05009155920991788: 100%|██████████| 98/98 [02:24<00:00,  1.48s/it]


EPOCH 8 / 10


Avg loss 0.024669784748432587: 100%|██████████| 98/98 [02:24<00:00,  1.48s/it]


EPOCH 9 / 10


Avg loss 0.01631760454560839: 100%|██████████| 98/98 [02:24<00:00,  1.48s/it]


EPOCH 10 / 10


Avg loss 0.004813332003731356: 100%|██████████| 98/98 [02:25<00:00,  1.48s/it]


In [41]:
# Smoke-test the trained model on a few hand-written reviews.
test_input = ["This movie suck", "I love this movie so much", "Meh", "This movie kinda suck, but there's some part that i love"]
# Switch to evaluation mode so train-only behaviour (e.g. dropout, if the model
# uses any) is disabled. Assumes `model` is a torch.nn.Module, as its
# .to()/.parameters() usage suggests.
model.eval()
with torch.no_grad():
    # Class probabilities along dim 1, one row per input sentence.
    pred = F.softmax(model(test_input), dim=1)
    # No .detach() needed: tensors created under no_grad carry no graph.
    pred_np = pred.argmax(dim=1).cpu().numpy()
    for sentence, label_id in zip(test_input, pred_np):
        print(f"'{sentence}' : {id2label[label_id]}")

'This movie suck' : negative
'I love this movie so much' : positive
'Meh' : negative
'This movie kinda suck, but there's some part that i love' : positive
