In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Load the IMDB review CSV and pull out the two columns used below:
# the review text and its sentiment label.
raw_data = pd.read_csv('IMDBDataset.csv')
reviews, labels = raw_data['review'], raw_data['sentiment']
print(labels)

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object


In [None]:
# Display the sentiment labels (still the raw 'positive'/'negative' strings here)
labels

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [None]:
# Encode sentiment as integers: 'positive' -> 1, 'negative' -> 0.
# The encoded Series is derived from raw_data instead of mutating the column
# in place, so re-running this cell always yields the same result. Any value
# other than the two expected strings maps to NaN and makes astype('int64')
# raise, instead of leaving stray strings among the labels.
labels = raw_data['sentiment'].map({'positive': 1, 'negative': 0}).astype('int64')

In [None]:
# Substrings to strip from reviews, applied in this order by preprocessing().
patterns = [
  '<br />', '--',                                          # HTML line break, double dash
  '.', ',', '!', '?', ')', '(', ';', ':', '*', '~', '_',   # punctuation
  "'", '"',                                                # quote characters
]
# Everything becomes a single space except the two trailing quote characters,
# which are removed outright.
replacements = [' '] * (len(patterns) - 2) + ['', '']

In [None]:
def preprocessing(reviews, patterns, replacements):
  """Lower-case every review and apply the given substring substitutions.

  Args:
    reviews: iterable of review strings (a list or a pandas Series). Values
      are visited in iteration order, so a Series with a non-default index
      is handled as well.
    patterns: substrings to look for, applied in the order given.
    replacements: replacement text for each pattern, paired by position with
      `patterns`. Pairing uses zip(), so surplus entries on either side are
      ignored.

  Returns:
    A new list with one cleaned string per input review.
  """
  cleaned = []
  # Iterate over the values themselves rather than reviews[i]: positional
  # lookups on a Series are label-based and fail when the index is not 0..n-1.
  for raw_review in reviews:
    review = raw_review.lower()
    for pattern, replacement in zip(patterns, replacements):
      review = review.replace(pattern, replacement)
    cleaned.append(review)
  return cleaned

In [None]:
# Replace the raw review column with its cleaned, lower-cased list form.
reviews = preprocessing(reviews=reviews, patterns=patterns, replacements=replacements)

In [None]:
# The first num_train reviews are used for training, the next num_val for validation.
num_train, num_val = 35000, 15000
# Maximum number of review tokens kept per example (before <SOS>/<EOS> are added).
longest_num_tokens = 250

In [None]:
def indexing_tokens():
  """Build the token -> integer id vocabulary from the training reviews.

  Reads the module-level `reviews` and `num_train`. Ids 0-3 are reserved for
  the special tokens; every other whitespace-separated token seen in the
  first `num_train` reviews gets the next free id in order of first
  appearance.

  Returns:
    dict mapping token string to integer id.
  """
  vocabulary = {'<SOS>':0, '<EOS>':1, '<PAD>':2, '<UNK>':3}
  for position in range(num_train):
    for word in reviews[position].split():
      # The next free id always equals the current vocabulary size.
      vocabulary.setdefault(word, len(vocabulary))
  return vocabulary


In [None]:
def _encode_tokens(tokens, indices, longest_line_tokens):
  """Convert one token list into a fixed-length row of ids.

  The row layout is <SOS>, up to `longest_line_tokens` token ids, <PAD> filler
  up to that length, then <EOS>, i.e. always `longest_line_tokens + 2` entries.
  Tokens missing from `indices` are mapped to the <UNK> id.
  """
  row = [
    indices[token] if token in indices else indices['<UNK>']
    for token in tokens[:longest_line_tokens]
  ]
  row.extend([indices['<PAD>']] * (longest_line_tokens - len(row)))
  # <EOS> goes after the padding, so it is always the final position of the row.
  return [indices['<SOS>'], *row, indices['<EOS>']]


def get_data(indices, longest_line_tokens, mode='train'):
  """Encode one split of the reviews as id rows together with their labels.

  Reads the module-level `reviews`, `labels`, `num_train` and `num_val`.

  Args:
    indices: token -> id mapping; must contain '<SOS>', '<EOS>', '<PAD>' and
      (when unknown tokens can occur) '<UNK>'.
    longest_line_tokens: non-negative number of review tokens kept per
      example; longer reviews are truncated, shorter ones padded.
    mode: 'train' selects reviews [0, num_train); any other value selects the
      validation range [num_train, num_train + num_val).

  Returns:
    (data, Y): `data` is a list of id rows, each of length
    `longest_line_tokens + 2`; `Y` is the list of matching labels.
  """
  if mode == 'train':
    positions = range(num_train)
  else:
    positions = range(num_train, num_train + num_val)

  data = []
  Y = []
  for i in positions:
    data.append(_encode_tokens(reviews[i].split(), indices, longest_line_tokens))
    Y.append(labels[i])
  return data, Y

In [None]:
# Build the vocabulary from the training reviews, then encode both splits with it.
indices = indexing_tokens()
training_data, training_labels = get_data(indices, longest_num_tokens, mode='train')
val_data, val_labels = get_data(indices, longest_num_tokens, mode='val')

In [None]:
# Sanity-check the split sizes and the vocabulary size.
print(f'Number of training: {len(training_data)}')
print(f'Number of validation: {len(val_data)}')
print(f'Length of corpus: {len(indices)}')

Number of training: 35000
Number of validation: 15000
Length of corpus: 122545


In [None]:
# Create tensors of train & val
train_tensor = torch.tensor(training_data)
train_labels_tensor = torch.tensor(training_labels)
val_tensor = torch.tensor(val_data)
# Validation labels must come from the validation split, not the training labels.
val_labels_tensor = torch.tensor(val_labels)

In [None]:
# Each row holds the kept review tokens plus the <SOS> and <EOS> markers.
print('Train Tensor:', train_tensor.size())
print('Val Tensor:', val_tensor.size())

Train Tensor: torch.Size([35000, 252])
Val Tensor: torch.Size([15000, 252])


In [None]:
# Model / training hyper-parameters.
# Derived values are computed from the data so they cannot drift out of sync
# with the vocabulary or the sequence length chosen earlier in the notebook.
vocab_size = len(indices)               # one embedding row per vocabulary entry, special tokens included
embedding_dim = 300
hidden_dim = 256
sequence_len = longest_num_tokens + 2   # kept review tokens plus <SOS> and <EOS>
output_dim = 2                          # two sentiment classes (labels 0 and 1)
print_every = 400                       # train() evaluates every `print_every` iterations
batch_size = 32

In [None]:
class MyModel(nn.Module):
  """Sentiment classifier: embedding -> single-layer LSTM -> linear layer."""

  def __init__(self, vocab_size, embedding_dims, hidden_dim, output_dim):
    """
    Args:
      vocab_size: number of rows in the embedding table (distinct token ids).
      embedding_dims: size of each token embedding, also the LSTM input size.
      hidden_dim: size of the LSTM hidden state.
      output_dim: number of scores produced per example.
    """
    super().__init__()
    # Layers are sized from the constructor argument, not from a module-level
    # global, so the class works with whatever dimension the caller passes.
    self.embedding_layer = nn.Embedding(vocab_size, embedding_dims)
    self.lstm = nn.LSTM(embedding_dims, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Return class scores of shape (N, output_dim) for token ids `x` of shape (N, T)."""
    # (N, T) -> (N, T, embedding_dims)
    embedded_data = self.embedding_layer(x)
    # batch_first=True, so the LSTM output is (N, T, hidden_dim).
    output, _ = self.lstm(embedded_data)
    # Classify from the hidden state at the final time step.
    out = output[:, -1, :]
    return self.fc(out)

In [None]:
# Instantiate the classifier and move it to the device selected at the top of
# the notebook. Using .to(device) rather than .cuda() keeps the CPU fallback
# working on machines without a GPU.
model = MyModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model = model.to(device)

In [None]:
# Inputs and labels get separate loaders with the same batch size and no
# shuffling, so batch i of one lines up with batch i of the other when zipped.
mini_trains, mini_train_labels = (
  DataLoader(source, batch_size=batch_size) for source in (train_tensor, training_labels)
)

mini_vals, mini_val_labels = (
  DataLoader(source, batch_size=batch_size) for source in (val_tensor, val_labels)
)

In [None]:
# Peek at the first batch of inputs ...
iterator = iter(mini_trains)
print(next(iterator).size())

# ... and at the first batch of labels.
iterator = iter(mini_train_labels)
print(next(iterator).size())

torch.Size([32, 252])
torch.Size([32])


In [None]:
# Evaluate Procedure
def evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device):
  """Measure and print classification accuracy over the given batches.

  Args:
    model: module mapping an input batch to per-class scores of shape (N, C).
    epoch: zero-based epoch number; printed as epoch + 1.
    mini_vals: iterable of input batches.
    mini_val_labels: iterable of label batches, paired by position with
      `mini_vals`.
    device: device the batches are moved to before the forward pass.

  Returns:
    The accuracy as a float (correct / evaluated samples), or NaN when no
    batch was evaluated. The model is left in eval mode.
  """
  model.eval()
  correct = 0
  total = 0
  with torch.no_grad():
    for x, y in zip(mini_vals, mini_val_labels):
      x = x.to(device)
      y = y.to(device)
      predictions = model(x).argmax(dim=1)
      correct += predictions.eq(y).sum().item()
      # Count what was actually evaluated instead of relying on a global dataset size.
      total += y.numel()
  accuracy = correct / total if total else float('nan')
  print(f'Epoch[{epoch+1}] Acc: {accuracy}')
  return accuracy

In [None]:
# Training Procedure
def train(num_epoch, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer):
  """Run `num_epoch` passes over the paired training batches.

  Every batch gets one optimisation step. On every iteration whose index
  within the epoch is a multiple of the module-level `print_every`
  (including the first), accuracy on the validation batches is reported via
  evaluate_predictor().
  """
  for epoch in range(num_epoch):
    paired_batches = zip(mini_trains, mini_train_labels)
    for step, (inputs, targets) in enumerate(paired_batches):
      # Evaluation may have switched the model to eval mode, so re-enable
      # training mode before every step.
      model.train()
      inputs, targets = inputs.to(device), targets.to(device)
      loss = loss_function(model(inputs), targets)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if step % print_every == 0:
        evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device)

In [None]:
# Cross-entropy over the two class scores, optimised with Adam.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train for five epochs, reporting validation accuracy along the way.
train(
  num_epoch=5,
  model=model,
  mini_trains=mini_trains,
  mini_train_labels=mini_train_labels,
  mini_vals=mini_vals,
  mini_val_labels=mini_val_labels,
  device=device,
  loss_function=loss_function,
  optimizer=optimizer,
)

Epoch[1] Acc: 0.5026
Epoch[1] Acc: 0.5093333333333333
Epoch[1] Acc: 0.5006666666666667
Epoch[2] Acc: 0.5282
Epoch[2] Acc: 0.7037333333333333
Epoch[2] Acc: 0.7168
Epoch[3] Acc: 0.7915333333333333
Epoch[3] Acc: 0.8125333333333333
Epoch[3] Acc: 0.845
Epoch[4] Acc: 0.8436
Epoch[4] Acc: 0.8526666666666667
Epoch[4] Acc: 0.8652
Epoch[5] Acc: 0.8636666666666667
Epoch[5] Acc: 0.8643333333333333
Epoch[5] Acc: 0.8685333333333334
