In [1]:
!pip install pytorch-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


In [2]:
!wget "https://is3.cloudhost.id/s3.kaenova.my.id/IMDB%20Dataset.zip"
!unzip "/content/IMDB Dataset.zip"

--2023-02-03 08:27:00--  https://is3.cloudhost.id/s3.kaenova.my.id/IMDB%20Dataset.zip
Resolving is3.cloudhost.id (is3.cloudhost.id)... 103.63.24.211, 103.63.24.210
Connecting to is3.cloudhost.id (is3.cloudhost.id)|103.63.24.211|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26962657 (26M) [application/octet-stream]
Saving to: ‘IMDB Dataset.zip’


2023-02-03 08:27:01 (58.0 MB/s) - ‘IMDB Dataset.zip’ saved [26962657/26962657]

Archive:  /content/IMDB Dataset.zip
  inflating: IMDB Dataset.csv        


In [1]:
import pickle
import bz2
import pandas as pd

## Compressing the raw IMDB dataset file (only needs to be run once)

In [2]:
def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file named ``title + '.pbz2'``.

    Args:
        title: Destination path without extension; ``'.pbz2'`` is appended.
        data: Object to serialise with ``pickle``.
    """
    target_path = title + '.pbz2'
    archive = bz2.BZ2File(target_path, 'w')
    # The context manager closes (and thereby flushes) the compressed stream.
    with archive:
        pickle.dump(data, archive)
        
def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Args:
        file: Path (or binary file object) of the bz2-compressed pickle.

    Returns:
        The unpickled object.

    Note:
        ``pickle.load`` can execute arbitrary code while loading, so only use
        this on files from a trusted source.
    """
    # Open through a context manager so the handle is closed deterministically,
    # including when unpickling raises.
    with bz2.BZ2File(file, 'rb') as f:
        return pickle.load(f)

In [3]:
# Load the IMDB reviews frame from the bz2-compressed pickle kept in the repo.
# (When working from the freshly downloaded archive instead, the same data can be
#  read with pd.read_csv("/content/IMDB Dataset.csv").)
imdb_pickle_path = "../../data/0. External Data (not used in research)/IMDB Dataset Pandas DataFrame.pbz2"
df_orig = decompress_pickle(imdb_pickle_path)

# Testing model training

In [33]:

import torch
import math
from tqdm import tqdm

import numpy as np
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy

# Custom handmade library
import kaelib.processor.preprocessing_func as prep_func
from kaelib.processor import TextProcessingPipeline,NDETCStemmerWraper
from kaelib.model import StaticTokenizerEncoderWraper, LSTM_CNN

In [14]:
# Work on an independent copy so df_orig stays untouched by later cells.
df = df_orig.copy(deep=True)
# Preview the first five rows.
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [41]:
# Train/test split.
# The split is positional with no shuffling: the first `train_size` rows are used
# for training and the remaining rows are held out.

test_ratio = 0.001
if not 0 <= test_ratio <= 1:
    raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio}")

# Derive the boundary from a rounded test-set size. The previous float
# floor-division, len(df) // (1 / (1 - test_ratio)), is sensitive to
# floating-point rounding near the boundary and divides by zero when
# test_ratio == 1.
train_size = len(df) - round(len(df) * test_ratio)

label2id = {"positive": 1, "negative": 0}
id2label = {v: k for k, v in label2id.items()}

# Materialise each column once, then cut both at the same boundary.
all_texts = df["review"].tolist()
# Plain dict indexing raises KeyError on any sentiment value missing from label2id.
all_labels = [label2id[sentiment] for sentiment in df["sentiment"]]

text_train, text_test = all_texts[:train_size], all_texts[train_size:]
labels_train, labels_test = all_labels[:train_size], all_labels[train_size:]

# Texts and labels must stay aligned, and no row may be lost by the split.
assert len(text_train) == len(labels_train)
assert len(text_test) == len(labels_test)
assert len(text_train) + len(text_test) == len(df)


In [42]:
# Text clean-up applied to both splits.
# NOTE(review): the steps are presumably applied in list order — confirm against
# kaelib's TextProcessingPipeline.
pipeline = TextProcessingPipeline(
    [
        prep_func.lowercasing,
        prep_func.remove_html_tags,
        prep_func.remove_url,
        prep_func.remove_punctuation,
    ]
)

# Training corpus is processed first, then the held-out corpus.
X_train, X_test = pipeline.process_corpus(text_train), pipeline.process_corpus(text_test)

# Shallow copies, so the label lists from the split cell are not aliased.
y_train, y_test = list(labels_train), list(labels_test)

In [43]:
def _split_on_whitespace(text):
    """Tokenize a string by splitting it on runs of whitespace."""
    return text.split()


# Build the tokenizer/encoder from the preprocessed training corpus only.
tokenizer = StaticTokenizerEncoderWraper(X_train, tokenize=_split_on_whitespace)

In [44]:
# Batching function
# https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
from itertools import islice

def batcher(iterable, batch_size):
    """Yield consecutive lists of at most ``batch_size`` items from ``iterable``.

    Every chunk holds exactly ``batch_size`` items except possibly the last,
    which holds whatever remains. Nothing is yielded for an empty iterable.

    Args:
        iterable: Any iterable to split into chunks.
        batch_size: Maximum number of items per yielded list.

    Yields:
        list: The next chunk of items, in their original order.
    """
    source = iter(iterable)
    while True:
        chunk = list(islice(source, batch_size))
        # An empty chunk means the underlying iterator is exhausted.
        if not chunk:
            return
        yield chunk


In [46]:
# --- Training configuration -------------------------------------------------
batch_size = 125
epochs = 10
device = "cpu"
# Number of optimisation steps per epoch; only used to size the progress bar.
num_mini_batch = math.ceil(len(X_train) / batch_size)

model = LSTM_CNN(256, text_vectorizer=tokenizer, device=device).to(device)
optim = torch.optim.Adam(model.parameters())

# Two-class accuracy. Moved to `device` so the metric keeps working if `device`
# is later switched away from "cpu".
metrics = MulticlassAccuracy(2).to(device)

# The held-out targets never change, so the tensor is built once up front.
target_test = torch.tensor(y_test, dtype=torch.int64, device=device)

# --- Training loop ----------------------------------------------------------
# Every optimisation step is followed by a forward pass over the whole held-out
# set; the progress bar shows the running means of those per-step values within
# the current epoch.
for i in range(epochs):
    epoch_loss = []
    epoch_loss_test = []
    epoch_metrics_test = []

    # zip keeps texts and labels in lockstep and stops when the data runs out,
    # instead of calling next() on two generators a precomputed number of times.
    mini_batches = zip(batcher(X_train, batch_size), batcher(y_train, batch_size))

    with tqdm(total=num_mini_batch) as pbar:
        pbar.set_description(f"EPOCH {i + 1} / {epochs}")

        for mini_batch_text, mini_batch_labels in mini_batches:
            # NOTE(review): train()/eval() assume LSTM_CNN is a torch.nn.Module
            # (it is already used via .to(), .parameters() and .zero_grad()) — confirm.
            model.train()
            model.zero_grad()

            # Prepare data
            target = torch.tensor(mini_batch_labels, dtype=torch.int64, device=device)

            # Forward
            pred = model(mini_batch_text)
            loss = F.cross_entropy(pred, target)

            # Backprop
            loss.backward()
            optim.step()

            # Held-out forward pass in eval mode, so train-only layer behaviour
            # (e.g. dropout, if the model has any) does not distort the numbers.
            model.eval()
            with torch.no_grad():
                pred_test = model(X_test)
                loss_test = F.cross_entropy(pred_test, target_test)
                metrics_test = metrics(pred_test, target_test)

            # Metrics and logging
            epoch_loss.append(loss.item())
            avg_loss = sum(epoch_loss) / len(epoch_loss)

            epoch_loss_test.append(loss_test.item())
            avg_loss_test = sum(epoch_loss_test) / len(epoch_loss_test)
            epoch_metrics_test.append(metrics_test.item())
            avg_metric_test = sum(epoch_metrics_test) / len(epoch_metrics_test)

            pbar.set_postfix(
                {
                    "loss": f"{avg_loss:.4f}",
                    "loss_test": f"{avg_loss_test:.4f}",
                    "metric_test": f"{avg_metric_test:.4f}",
                }
            )
            pbar.update(1)


EPOCH 1 / 10:   0%|          | 1/400 [00:08<53:37,  8.06s/it, loss=0.6961, loss_test=0.6867, metric_test=0.3971]

In [12]:
# Quick qualitative check on a few hand-written reviews.
test_input = ["This movie suck", "I love this movie so much", "Meh", "This movie kinda suck, but there's some part that i love"]

# Predict in eval mode so train-only layer behaviour does not affect the output.
# NOTE(review): assumes the model is a torch.nn.Module — confirm against kaelib.
model.eval()
with torch.no_grad():
    pred = model(test_input)
    # Softmax does not change the argmax; it is kept so `pred` holds class
    # probabilities for inspection after the cell has run.
    pred = F.softmax(pred, dim=1)
    pred_np = pred.argmax(dim=1).detach().cpu().numpy()

# Pair each input with its predicted class id and print the readable label.
for text, label_id in zip(test_input, pred_np):
    print(f"'{text}' : {id2label[int(label_id)]}")

'This movie suck' : positive
'I love this movie so much' : positive
'Meh' : positive
'This movie kinda suck, but there's some part that i love' : positive
