In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/SC201L17'
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/Colab Notebooks/SC201L17


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd

In [4]:
# Seed for same output
torch.manual_seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
# Reading in our file
raw_data = pd.read_csv('IMDBDataset.csv')


In [7]:
# Get data & labels
reviews = raw_data.review
labels = raw_data.sentiment

In [8]:
# Replace 'positive' with 1; 'negative' with 0
labels.replace({'positive':1, 'negative':0}, inplace = True)

In [9]:
reviews.head(5)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [10]:
patterns = ['<br />', '--', '.', ',', '!', '?', ')', '(', ';', ':', '*', '~', '_', "'", '"']
replacements = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '']

In [11]:
def preprocessing(reviews, patterns, replacements):
  lst = []
  for i in range(len(reviews)):
    review = reviews[i].lower()
    for pattern, replacement in zip(patterns, replacements):
      review = review.replace(pattern, replacement)

    lst.append(review)

  return lst


In [12]:
reviews = preprocessing(reviews, patterns, replacements)

In [13]:
num_train = 35000
num_val = 15000
longest_num_tokens = 250

In [14]:
def indexing_tokens():
  #UNK = unknown
  indices = {'<SOS>':0, '<EOS>':1, '<PAD>':2, '<UNK>':3}
  counter = 4

  for i in range(num_train):
    tokens = reviews[i].split()
    for token in tokens:
      if token not in indices:
        indices[token] = counter
        counter += 1

  return indices


In [15]:
def get_data(indices, longest_line_tokens, mode='train'):
    data = []
    Y = []
    if mode == 'train':
      for i in range(num_train):
        one_train_data = []
        y, tokens = labels[i], reviews[i].split()

        #tokenizing
        for token in tokens:
          one_train_data.append(indices[token])
          # max seq = longest_line_tokens
          if len(one_train_data) == longest_line_tokens:
            break
        #if review len < longest_num_token
        while len(one_train_data) < longest_num_tokens:
          one_train_data.append(indices['<PAD>'])

        one_train_data.insert(indices['<SOS>'],0)
        one_train_data.append(indices['<EOS>'])
        data.append(one_train_data)
        Y.append(y)
    else:
      for i in range(num_train, num_train+num_val):
        one_val_data = []
        y, tokens = labels[i], reviews[i].split()

        for token in tokens:
          if token not in indices:
            one_val_data.append(indices['<UNK>'])
          else:
            one_val_data.append(indices[token])

          if len(one_val_data) == longest_line_tokens:
            break

        while len(one_val_data) < longest_line_tokens:
          one_val_data.append(indices['<PAD>'])

        one_val_data.insert(indices['<SOS>'],0)
        one_val_data.append(indices['<EOS>'])
        data.append(one_val_data)
        Y.append(y)
    return data, Y

In [17]:
# Loading Training Data & Val Data
indices = indexing_tokens()
training_data, training_labels = get_data(indices, longest_num_tokens)
val_data, val_labels = get_data(indices, longest_num_tokens, mode='val')

In [18]:
print('Number of training:', len(training_data))
print('Number of validation:', len(val_data))
print('Length of corpus:', len(indices))

Number of training: 35000
Number of validation: 15000
Length of corpus: 122545


In [19]:
# Create tensors of train & val
train_tensor = torch.tensor(training_data)
train_labels_tensor = torch.tensor(training_labels)
val_tensor = torch.tensor(val_data)
val_labels_tensor = torch.tensor(training_labels)

In [20]:
print('Train Tensor:', train_tensor.shape)
print('Val Tensor:', val_tensor.shape)

Train Tensor: torch.Size([35000, 252])
Val Tensor: torch.Size([15000, 252])


In [21]:
vocab_size = 122545
# embedding_dim = 將每個"字" 轉換成 300 個 features
embedding_dim = 300
hidden_dim = 256 # can be 64, 128, 256 ...
sequence_len = 252
output_dim = 2 # postive or negative
print_every = 400
batch_size = 32

In [38]:
class MyModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hideen_dim, output_dim):
    super().__init__() #讓其可用 forwarding from nn
    self.embedding_layer = nn.Embedding(vocab_size, embedding_dim) #字典裡面有 122545個字, 每個字要轉換成300個features
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)

    # 通常输入的第一个维度都是batch_size，比如torch.nn.Linear的输入(batch_size,in_features)，torch.nn.Conv2d的输入（batch_size, C, H, W）。而RNN的输入却是(seq_len, batch_size, input_size)，batch_size位于第二维度！虽然你可以将batch_size和序列长度seq_len对换位置，此时只需要令batch_first=True。
    self.fc = nn.Linear(hideen_dim, output_dim)

  def forward(self, x):
    # N x 252
    embedded_data = self.embedding_layer(x)

    # N x 300 x 252
    output, (h_n, c_n) = self.lstm(embedded_data)
    out = output [:, -1, :]
    out = nn.functional.dropout(out)
    out = self.fc(out)
    return out

In [39]:
model = MyModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model = model.cuda()

In [40]:
mini_trains = DataLoader(train_tensor, batch_size=batch_size)
mini_train_labels = DataLoader(training_labels, batch_size=batch_size)

mini_vals = DataLoader(val_tensor, batch_size=batch_size)
mini_val_labels = DataLoader(val_labels, batch_size=batch_size)

In [41]:
iterator = iter(mini_trains)
print(next(iterator).shape)

iterator = iter(mini_train_labels)
print(next(iterator).shape)

torch.Size([32, 252])
torch.Size([32])


In [42]:
# Training Procedure
def train(num_epoch, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer):
  for epoch in range(num_epoch):
    num_iters = 0
    for x, y in zip(mini_trains, mini_train_labels):
      model.train()
      x = x.to(device)
      y = y.to(device)
      scores = model(x)
      loss = loss_function(scores, y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if num_iters % print_every == 0:
        evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device)
      num_iters += 1

In [43]:
# Evaluate Procedure
def evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device):
  model.eval()
  with torch.no_grad():
    acc_count = 0
    for x, y in zip(mini_vals, mini_val_labels):
      x=x.to(device)
      y=y.to(device)
      scores=model(x)
      predictions=scores.max(1)[1]
      acc = predictions.eq(y).sum().item()
      acc_count += acc
    print(f'Epoch[{epoch+1}] Acc: {acc_count/len(val_data)}')

In [44]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [45]:
# Start training
train(5, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer)

Epoch[1] Acc: 0.5012666666666666
Epoch[1] Acc: 0.5044
Epoch[1] Acc: 0.501
Epoch[2] Acc: 0.5158
Epoch[2] Acc: 0.5256666666666666
Epoch[2] Acc: 0.5015333333333334
Epoch[3] Acc: 0.5231333333333333
Epoch[3] Acc: 0.6616666666666666
Epoch[3] Acc: 0.7973333333333333
Epoch[4] Acc: 0.817
Epoch[4] Acc: 0.8591333333333333
Epoch[4] Acc: 0.8638666666666667
Epoch[5] Acc: 0.8699333333333333
Epoch[5] Acc: 0.8717333333333334
Epoch[5] Acc: 0.8730666666666667
