In [1]:
import pickle
import bz2
import pandas as pd

## Compressing RAW IMDB Dataset File (only used once)

In [2]:
def compressed_pickle(title, data):
    with bz2.BZ2File(title + '.pbz2','w') as f: 
        pickle.dump(data, f)
        
def decompress_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = pickle.load(data)
    return data

In [3]:
## Compress csv file
# df = pd.read_csv("../../data/0. External Data (not used in research)/IMDB Dataset.csv")
# compressed_pickle("IMDB Dataset Pandas DataFrame", df)

In [4]:
df_orig = decompress_pickle("../../data/0. External Data (not used in research)/IMDB Dataset Pandas DataFrame.pbz2")
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


# Testing training model

In [5]:
import torch
import math

import numpy as np
import torch.nn.functional as F
import modules.lstm_cnn as mod

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
df = df_orig.copy()
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
text = df['review'].values.tolist()

label2id = {
    "positive" : 1,
    "negative" : 0
}
id2label = {v: k for k, v in label2id.items()}
labels = df['sentiment'].apply(lambda x: label2id[x]).values.tolist()

In [8]:
tokenizer = mod.StaticTokenizerEncoderWraper(text, tokenize=lambda s: s.split())

In [9]:
# Batching function
# https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
from itertools import islice

def batcher(iterable, batch_size):
    iterator = iter(iterable)
    while batch := list(islice(iterator, batch_size)):
        yield batch


In [10]:
batch_size = 64
epochs = 10

batch_generator_text = batcher(text, batch_size)
batch_generator_label = batcher(labels, batch_size)
num_mini_batch = math.ceil(len(text) / batch_size)

model = mod.LSTM_CNN(256, text_vectorizer=tokenizer)

optim = torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9)

for i in range(epochs):
    print(f"EPOCH {i + 1} / {epochs}")
    for j in range(num_mini_batch):
        model.zero_grad()
        mini_batch_text = next(batch_generator_text)
        mini_batch_labels = next(batch_generator_label)
        target = torch.tensor(mini_batch_labels, dtype=torch.int64)
        
        pred = model(mini_batch_text)
        loss = F.cross_entropy(pred, target)
        loss.backward()
        optim.step()
        print(loss.detach())

EPOCH 1 / 10
tensor(0.6941)
tensor(0.6951)
tensor(0.6907)
tensor(0.6944)
tensor(0.7015)
tensor(0.7026)
tensor(0.6920)
tensor(0.6950)
tensor(0.6806)


KeyboardInterrupt: 