In [8]:
# Prepare the Google Colab environment and build the handmade library.
# Clone the project repository and make it the working directory for the
# shell commands that follow.
!git clone https://github.com/kaenova/Headline_Detection.git
%cd "/content/Headline_Detection"

# Run the repository's `lib` make target (presumably what makes the `kaelib`
# package imported further down available; confirm in the Makefile).
!make lib

# Download the zipped IMDB dataset into the working directory and extract it.
!wget "https://is3.cloudhost.id/s3.kaenova.my.id/IMDB%20Dataset.zip"
!unzip "/content/Headline_Detection/IMDB Dataset.zip"

--2023-02-04 08:47:27--  https://is3.cloudhost.id/s3.kaenova.my.id/IMDB%20Dataset.zip
Resolving is3.cloudhost.id (is3.cloudhost.id)... 103.63.24.210, 103.63.24.211
Connecting to is3.cloudhost.id (is3.cloudhost.id)|103.63.24.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26962657 (26M) [application/octet-stream]
Saving to: ‘IMDB Dataset.zip’


2023-02-04 08:52:15 (71.0 KB/s) - Connection closed at byte 20920424. Retrying.

--2023-02-04 08:52:16--  (try: 2)  https://is3.cloudhost.id/s3.kaenova.my.id/IMDB%20Dataset.zip
Connecting to is3.cloudhost.id (is3.cloudhost.id)|103.63.24.210|:443... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 26962657 (26M), 6042233 (5.8M) remaining [application/octet-stream]
Saving to: ‘IMDB Dataset.zip’

IMDB Dataset.zip    100%[+++++++++++++++====>]  25.71M  1.25MB/s    in 9.0s    

2023-02-04 08:52:26 (655 KB/s) - ‘IMDB Dataset.zip’ saved [26962657/26962657]

Archive:  /content/Headline_Detection

In [9]:
# # Reset Google Colab Environment
# %cd ..
# !rm -fr Headline_Detection

In [10]:
import pickle
import bz2
import pandas as pd

## Compressing RAW IMDB Dataset File (only used once)

In [11]:
def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file.

    Args:
        title: Output path without extension; ``'.pbz2'`` is appended to it.
        data: Any picklable Python object.
    """
    archive_path = title + '.pbz2'
    # "wb" opens the same write-mode BZ2File as mode 'w' does.
    with bz2.open(archive_path, 'wb') as sink:
        pickle.dump(data, sink)
        
def decompress_pickle(file):
    """Load a Python object from a bz2-compressed pickle file.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.

    Returns:
        The object that was stored in the pickle.

    Note:
        Unpickling can execute arbitrary code; only load files from a
        trusted source.
    """
    # The context manager closes the compressed stream even if unpickling
    # raises, instead of leaving the handle open.
    with bz2.BZ2File(file, 'rb') as f:
        return pickle.load(f)

In [13]:
# Load the raw IMDB reviews into `df_orig`. Exactly one of the two lines below
# should be active, depending on where the notebook runs.
df_orig = pd.read_csv("/content/Headline_Detection/IMDB Dataset.csv") # Colab: read the CSV from the cloned repository folder; comment this out when running locally
# df_orig = decompress_pickle("../../data/0. External Data (not used in research)/IMDB Dataset Pandas DataFrame.pbz2") # Local: uncomment to load the compressed pickle instead of the CSV

# Training and testing the model

In [14]:
import torch
import math
from tqdm import tqdm

import numpy as np
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy

# Custom handmade library
import kaelib.processor.preprocessing_func as prep_func
from kaelib.processor import TextProcessingPipeline,NDETCStemmerWraper
from kaelib.model import StaticTokenizerEncoderWraper, LSTM_CNN

In [44]:
# Work on a copy so the loaded `df_orig` frame stays untouched and this cell
# can be re-run without reloading the data.
df = df_orig.copy()
# Preview the first five rows.
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [45]:
# Train test split
#
# The split is positional (no shuffling): the first `train_size` rows are used
# for training and the remaining rows are held out for testing.

test_ratio = 0.001
# Size the held-out part with integer arithmetic. The previous float
# floor-division, `len(df) // (1 / (1 - test_ratio))`, depends on how
# 1 / (1 - test_ratio) rounds in binary floating point and can therefore land
# one row below the intended size.
train_size = len(df) - round(len(df) * test_ratio)

label2id = {"positive": 1, "negative": 0}
id2label = {v: k for k, v in label2id.items()}

# Convert each column to a plain Python list once, then slice it.
all_texts = df["review"].values.tolist()
text_train = all_texts[:train_size]
text_test = all_texts[train_size:]

# An unknown sentiment string raises KeyError here rather than being
# silently mapped to a wrong class.
all_labels = df["sentiment"].apply(lambda x: label2id[x]).values.tolist()
labels_train = all_labels[:train_size]
labels_test = all_labels[train_size:]


In [46]:
# Prep text
# Build the cleaning pipeline from kaelib's preprocessing functions. Judging
# by their names they lowercase the text and strip HTML tags, URLs and
# punctuation; whether they run in list order is an assumption to confirm
# against `TextProcessingPipeline`.
pipeline = TextProcessingPipeline([
    prep_func.lowercasing,
    prep_func.remove_html_tags,
    prep_func.remove_url,
    prep_func.remove_punctuation
])

# Clean the train and test texts with the same pipeline so both share one
# text format.
X_train = pipeline.process_corpus(text_train)
X_test = pipeline.process_corpus(text_test)

# Shallow copies, so the label lists used for training are independent of
# `labels_train` / `labels_test`.
y_train = labels_train[:]
y_test = labels_test[:]

In [47]:
# Build the tokenizer/encoder from the training texts only (the test texts are
# not passed in). Tokens are produced by whitespace splitting via `str.split()`.
# NOTE(review): presumably this builds the vocabulary from `X_train`; confirm
# in kaelib's `StaticTokenizerEncoderWraper`.
tokenizer = StaticTokenizerEncoderWraper(X_train, tokenize=lambda s: s.split())

In [48]:
# Batching function
# https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
from itertools import islice

def batcher(iterable, batch_size):
    """Yield consecutive lists of at most ``batch_size`` items from ``iterable``.

    The final list may be shorter than ``batch_size``. Nothing is yielded for
    an empty iterable (or when ``batch_size`` is 0).
    """
    source = iter(iterable)
    while True:
        chunk = list(islice(source, batch_size))
        if not chunk:
            # The source is exhausted: end the generator.
            return
        yield chunk


In [50]:
# Hyperparameters
batch_size = 125
epochs = 10
# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 1e-2

# Number of optimisation steps per epoch; the last mini-batch may be smaller.
num_mini_batch = math.ceil(len(X_train) / batch_size)

# NOTE(review): the meaning of the positional 256 is defined by kaelib's
# `LSTM_CNN` (presumably a layer/embedding size); confirm there.
model = LSTM_CNN(256, text_vectorizer=tokenizer, device=device).to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)

# Accuracy over the two sentiment classes.
metrics = MulticlassAccuracy(2).to(device)

# The held-out labels never change, so build their tensor once up front.
target_test = torch.tensor(y_test, dtype=torch.int64, device=device)

for i in range(epochs):
    # Fresh generators each epoch; texts and labels are batched in the same
    # order with the same size, so zipping them keeps each pair aligned.
    batch_generator_text = batcher(X_train, batch_size)
    batch_generator_label = batcher(y_train, batch_size)

    # Per-step values collected to report running averages for this epoch.
    epoch_loss = []
    epoch_loss_test = []
    epoch_metrics_test = []

    with tqdm(total=num_mini_batch) as pbar:
        pbar.set_description(f"EPOCH {i + 1} / {epochs}")

        for mini_batch_text, mini_batch_labels in zip(
            batch_generator_text, batch_generator_label
        ):
            # Switch back to training mode: the evaluation at the end of the
            # previous step put the model in eval mode.
            model.train()
            model.zero_grad()
            # Prepare data
            target = torch.tensor(mini_batch_labels, dtype=torch.int64, device=device)

            # Forward
            pred = model(mini_batch_text)
            loss = F.cross_entropy(pred, target)
            # Backprop
            loss.backward()
            optim.step()

            # Forward test: evaluate on the whole held-out set after every
            # step. Eval mode disables train-only behaviour (e.g. dropout),
            # so the reported test numbers are not perturbed by it.
            model.eval()
            with torch.no_grad():
                pred_test = model(X_test)
                loss_test = F.cross_entropy(pred_test, target_test)
                metrics_test = metrics(pred_test, target_test)

            # Metrics and logging: running means over the steps seen so far
            # in this epoch.
            epoch_loss.append(loss.item())
            avg_loss = sum(epoch_loss) / len(epoch_loss)

            epoch_loss_test.append(loss_test.item())
            avg_loss_test = sum(epoch_loss_test) / len(epoch_loss_test)
            epoch_metrics_test.append(metrics_test.item())
            avg_metric_test = sum(epoch_metrics_test) / len(epoch_metrics_test)

            pbar.set_postfix(
                {
                    "loss": f"{avg_loss:.4f}",
                    "loss_test": f"{avg_loss_test:.4f}",
                    "metric_test": f"{avg_metric_test:.4f}",
                }
            )
            pbar.update(1)


EPOCH 1 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.41it/s, loss=0.7203, loss_test=0.6962, metric_test=0.5721]
EPOCH 2 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.42it/s, loss=0.3876, loss_test=0.2594, metric_test=0.9177]
EPOCH 3 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.42it/s, loss=0.2049, loss_test=0.1965, metric_test=0.9450]
EPOCH 4 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.42it/s, loss=0.1300, loss_test=0.2414, metric_test=0.9399]
EPOCH 5 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.41it/s, loss=0.1488, loss_test=0.3974, metric_test=0.9304]
EPOCH 6 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.41it/s, loss=0.2266, loss_test=0.3161, metric_test=0.9396]
EPOCH 7 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.42it/s, loss=0.0530, loss_test=0.2080, metric_test=0.9442]
EPOCH 8 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.42it/s, loss=0.0296, loss_test=0.1908, metric_test=0.9568]
EPOCH 9 / 10: 100%|██████████| 400/400 [02:45<00:00,  2.42it/s, loss=0.0281, los

In [51]:
# Quick smoke test of the trained model on a few hand-written reviews.
test_input = ["This movie suck", "I love this movie so much", "Meh", "This movie kinda suck, but there's some part that i love"]

# Inference must run in eval mode so train-only behaviour (e.g. dropout) is
# switched off.
model.eval()
with torch.no_grad():
    # Clean the raw sentences with the same pipeline used on the training
    # corpus. Otherwise capitalised or punctuated tokens would not match the
    # text format the tokenizer and model were built on.
    pred = model(pipeline.process_corpus(test_input))
    pred = F.softmax(pred, dim=1)
    pred_np = pred.argmax(dim=1).cpu().numpy()
    # Print the original (uncleaned) sentence next to its predicted label.
    for text, label_id in zip(test_input, pred_np):
        print(f"'{text}' : {id2label[int(label_id)]}")

'This movie suck' : negative
'I love this movie so much' : negative
'Meh' : negative
'This movie kinda suck, but there's some part that i love' : negative
