In [10]:
import os

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
import pandas as pd
import numpy as np
from utils import clean_sentences, show_history_graph
from trainer import Trainer

# Detect whether the notebook runs on Google Colab and, if so, mount Google
# Drive so the dataset path used below resolves.
try:
    # `drive` has to be imported explicitly; importing only `google.colab`
    # does not bring the name `drive` into scope.
    from google.colab import drive
except ImportError:
    # Not running on Colab: fall back to the local dataset path.
    IN_COLAB = False
else:
    IN_COLAB = True
    # Kept outside the try body so a failing mount surfaces as an error
    # instead of being silently reported as "not in Colab".
    drive.mount('/content/drive')


In [11]:
# Pick the CSV location: Google Drive when on Colab, the local folder otherwise.
imdb_path = (
    "/content/drive/MyDrive/Colab Notebooks/dataset/imdb_simple/IMDB Dataset.csv"
    if IN_COLAB
    else "dataset/IMDB Dataset.csv"
)

imdb_df = pd.read_csv(imdb_path)
print(len(imdb_df))

# Encode the string labels as integers: positive -> 1, negative -> 0.
label_to_id = {'positive': 1, 'negative': 0}
imdb_sentiment = imdb_df['sentiment'].map(label_to_id)
print(imdb_sentiment.value_counts())

# Raw review texts (x) and integer labels (y) as NumPy arrays.
x = imdb_df['review'].to_numpy()
y = imdb_sentiment.to_numpy()
print(len(x), len(y))
print(x.shape, y.shape)
print(x[0])
print(y[0])

50000
sentiment
1    25000
0    25000
Name: count, dtype: int64
50000 50000
(50000,) (50000,)
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agree

In [12]:
# clean sentences: run every raw review through the project helper
# `clean_sentences` and keep the results as a plain Python list.
x = list(map(clean_sentences, x))

In [13]:
# word-level one-hot-tokenize
#
# Pass 1: count lower-cased, whitespace-separated words over all reviews and
# record every review's word count (used for the length statistics below).
word_cnt = {}
len_list = []
for entry in x:
    words = [word.lower() for word in entry.split()]
    len_list.append(len(words))
    for word in words:
        word_cnt[word] = word_cnt.get(word, 0) + 1

print("max sentence length:", np.max(len_list))
print("mean sentence length:", np.mean(len_list))
print("median sentence length:", np.median(len_list))

print("word max_size:", len(word_cnt))

# Keep only the most frequent words. Ids 0 and 1 are reserved for padding and
# out-of-vocabulary words; real words get ids 2, 3, ... in descending frequency
# order (ties keep first-seen order because the sort is stable).
vocab_limit = 10000
top_10000_word = sorted(word_cnt.items(), key=lambda item: item[1], reverse=True)[:vocab_limit]
vocab = {'<PAD>':0, '<UNK>':1}
for word, _count in top_10000_word:
    vocab[word] = len(vocab)

# Show a short preview only: printing the full mapping would flood the cell
# output with ~10k entries.
print(list(vocab.items())[:20])


# make word to integer token and add pad or truncate
#
# Each review is cut to its first `max_len` words, every word is replaced by its
# vocabulary id (unknown words map to the '<UNK>' id), and shorter reviews are
# right-padded with 0 up to `max_len`.
max_len = 100
unk_id = vocab['<UNK>']
x_word_token = []
len_list = []
for entry in x:
    token_ids = [vocab.get(word.lower(), unk_id) for word in entry.split()[:max_len]]
    # `[0] * n` is empty for n == 0, so full-length reviews are left untouched.
    token_ids.extend([0] * (max_len - len(token_ids)))
    x_word_token.append(token_ids)

print("vocab size:", len(vocab))

max sentence length: 2791
mean sentence length: 274.30138
median sentence length: 206.0
word max_size: 104883
vocab size: 10002


In [14]:
# Turn the padded id lists and the labels into int64 tensors.
x_word_token = np.array(x_word_token)
x_word_token_tensors = torch.tensor(x_word_token, dtype=torch.long)
y_tensors = torch.tensor(y, dtype=torch.long)

# One shape per line: inputs first, then labels.
print(x_word_token_tensors.shape, y_tensors.shape, sep="\n")

torch.Size([50000, 100])
torch.Size([50000])


In [15]:
# Hyper-parameters read by the data loaders and the models defined below.
config = dict(
    batch_size=256,
    vocab_size=len(vocab),
    embedding_dim=200,
    hidden_size=128,
    num_layers=1,
    dropout=0,
)

In [16]:
# Wrap the tensors in a dataset and split it 70% / 15% / 15% into
# train / validation / test subsets.
imdb_dataset = TensorDataset(x_word_token_tensors, y_tensors)

# A dedicated, seeded generator makes the split identical on every run, so the
# reported validation/test numbers refer to the same samples each time.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    imdb_dataset, [0.7, 0.15, 0.15], generator=split_generator
)
print(f"train dataset:{len(train_dataset)}, val dataset:{len(val_dataset)}, test dataset:{len(test_dataset)}")

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

train dataset:35000, val dataset:7500, test dataset:7500


In [17]:
class SimpleRNN(nn.Module):
    """Sequence classifier: embedding -> ``nn.RNN`` -> linear head.

    Args:
        config: mapping that provides ``vocab_size``, ``embedding_dim``,
            ``hidden_size``, ``num_layers`` and ``dropout``.
    """

    def __init__(self, config, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.config = config
        self.embedding = nn.Embedding(
            num_embeddings=config['vocab_size'],
            embedding_dim=config['embedding_dim']
            )
        self.rnn = nn.RNN(
            input_size=config['embedding_dim'],
            hidden_size=config['hidden_size'],
            num_layers=config['num_layers'],
            batch_first=True,
            dropout=config['dropout']
            )
        self.fnn = nn.Linear(
            in_features=config['hidden_size'],
            out_features=1
            )

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1); no activation is applied here.
        """
        out = self.embedding(x)
        output, _ = self.rnn(out)
        # output: (batch, seq_len, hidden_size)
        # Indexing the time axis already yields (batch, hidden_size). A bare
        # .squeeze() must not be used here: it would also remove the batch axis
        # for a batch of one and change the output shape to (1,).
        last_output = output[:, -1, :]
        # NOTE(review): the last position is used as-is; for inputs that are
        # right-padded this position may be a padding step - confirm intended.
        out = self.fnn(last_output)
        return out

# Shape check: push one batch-sized slice of training inputs through a freshly
# built model and print the input and output shapes.
model = SimpleRNN(config)
sample_batch = train_dataset[:config['batch_size']]
sample_input = sample_batch[0]  # element 0 holds the token ids, element 1 the labels
print(sample_input.shape)
test = model(sample_input)
print(test.shape)

torch.Size([256, 100])
torch.Size([256, 1])


In [18]:
# Training settings for the nn.RNN-based model. `embedding_dim` and
# `hidden_size` are (re)set here, so the model below is built with these values.
config.update({
    'device': 'cuda' if torch.cuda.is_available() else "cpu",
    'epoch': 100,
    'learning_rate': 1e-4,
    'embedding_dim': 64,
    'hidden_size': 128,
})
model = SimpleRNN(config)
trainer = Trainer(config, model, train_dataloader, val_dataloader)
history, last_ckpt_path = trainer.train()

start training : lr=0.0001


 63%|██████▎   | 63/100 [00:19<00:11,  3.17it/s, train_loss=0.33925, val_loss=0.47402, train_accu=0.86 val_accu=0.78]

Early stopping at epoch 63
output/SimpleRNN_ep_58_loss_0.4726.pt





In [37]:
class RNNLayer(nn.Module):
    """Hand-written single-layer tanh recurrent layer.

    For every time step ``t``::

        h_t = tanh(input_linear(x_t) + hidden_linear(h_{t-1}))
        o_t = output_linear(h_t)

    Args:
        config: mapping; ``hidden_size`` (default 128) and ``embedding_dim``
            (default 64) are read from it when present.
    """

    def __init__(self,config, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.hidden_dim = config['hidden_size'] if 'hidden_size' in config else 128
        self.embedding_dim = config['embedding_dim'] if 'embedding_dim' in config else 64

        self.input_linear = nn.Linear(in_features=self.embedding_dim, out_features=self.hidden_dim)
        self.hidden_linear = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)
        self.output_linear = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)
        self.tanh = nn.Tanh()

    def forward(self, x, h_0=None):
        """Run the recurrence over the time axis of ``x``.

        Args:
            x: tensor of shape (batch, seq_len, embedding_dim).
            h_0: optional initial hidden state of shape (batch, hidden_dim);
                zeros are used when omitted.

        Returns:
            ``(output, h_state)``, both stacked along a new leading time axis,
            i.e. each of shape (seq_len, batch, hidden_dim).
        """
        if h_0 is None:
            # new_zeros inherits dtype and device from x, so the layer also
            # works after .double()/.half() and avoids a CPU -> device copy.
            h = x.new_zeros((x.shape[0], self.hidden_dim))
        else:
            h = h_0

        output = []
        h_state = []
        # unbind(dim=1) yields one (batch, embedding_dim) slice per time step.
        for step_input in x.unbind(dim=1):
            h = self.tanh(self.input_linear(step_input) + self.hidden_linear(h))
            output.append(self.output_linear(h))
            h_state.append(h)

        output = torch.stack(output, dim=0)
        h_state = torch.stack(h_state, dim=0)

        return output, h_state
    
class RNN(nn.Module):
    """Sequence classifier: embedding -> custom ``RNNLayer`` -> linear head.

    Args:
        config: mapping that provides ``vocab_size``, ``embedding_dim`` and
            ``hidden_size``; it is also passed on to ``RNNLayer``.
    """

    def __init__(self, config, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(
            num_embeddings=config['vocab_size'],
            embedding_dim=config['embedding_dim']
            )
        self.rnn = RNNLayer(config)
        self.fnn = nn.Linear(
            in_features=config['hidden_size'],
            out_features=1
            )

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1); no activation is applied here.
        """
        out = self.embedding(x)
        output, _ = self.rnn(out)

        # output: (seq_len, batch, hidden_size) - time is the leading axis.
        # output[-1] is already (batch, hidden_size). A bare .squeeze() must not
        # be used here: it would also remove the batch axis for a batch of one.
        last_output = output[-1]
        out = self.fnn(last_output)
        return out



# Shape check for the hand-written RNN: run one batch-sized slice of training
# inputs through a fresh model and print the input and output shapes.
model = RNN(config)
sample_batch = train_dataset[:config['batch_size']]
sample_input = sample_batch[0]  # element 0 holds the token ids, element 1 the labels
print(sample_input.shape)
test = model(sample_input)
print(test.shape)

torch.Size([256, 100])
torch.Size([256, 1])


In [38]:
# Training settings for the hand-written RNN; the same values as used for the
# nn.RNN-based model, written into the shared config before building the model.
config.update({
    'device': 'cuda' if torch.cuda.is_available() else "cpu",
    'epoch': 100,
    'learning_rate': 1e-4,
    'embedding_dim': 64,
    'hidden_size': 128,
})
model = RNN(config)
trainer = Trainer(config, model, train_dataloader, val_dataloader)
history, last_ckpt_path = trainer.train()

start training : lr=0.0001


 50%|█████     | 50/100 [02:06<02:06,  2.52s/it, train_loss=0.38230, val_loss=0.49500, train_accu=0.84 val_accu=0.78]

Early stopping at epoch 50
output/RNN_ep_45_loss_0.4901.pt



