<a href="https://colab.research.google.com/github/hpzhang94/289g-project/blob/main/BiLSTM_TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import time

from google.colab import drive
# Mount Google Drive so the dataset paths under /content/drive resolve.
drive.mount('/content/drive')

# Shared tokenizer used for both vocabulary building and sentence encoding.
tokenizer = get_tokenizer("basic_english")

# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA instead of failing on the first .to(device).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def getR8(data_dir='/content/drive/MyDrive/Colab Files/R8-lstm'):
  """Load the R8 train/test splits.

  Args:
    data_dir: directory containing `train.txt` and `test.txt`, each a
      tab-separated file with one `label<TAB>sentence` record per line.
      Defaults to the Google Drive location used by this notebook.

  Returns:
    (train, test): two DataFrames with columns ['label', 'sentence'].
  """
  columns = ['label', 'sentence']
  r8train = pandas.read_csv(f'{data_dir}/train.txt', sep='\t', names=columns)
  r8test = pandas.read_csv(f'{data_dir}/test.txt', sep='\t', names=columns)
  return r8train, r8test

def getMovieReview(data_dir='/content/drive/MyDrive/Colab Files/MovieReview'):
  """Load the Movie Review train/test splits.

  Sentences and labels live in separate line-aligned files
  (`text_<split>.txt` and `label_<split>.txt`); they are joined column-wise.

  Args:
    data_dir: directory holding the four text/label files. Defaults to the
      Google Drive location used by this notebook.

  Returns:
    (train, test): two DataFrames with columns ['sentence', 'label'].
  """
  def _load_split(split):
    # Read one split's sentences and labels and place them side by side.
    text = pandas.read_csv(f'{data_dir}/text_{split}.txt', sep='\t', names=['sentence'])
    labels = pandas.read_csv(f'{data_dir}/label_{split}.txt', sep='\t', names=['label'])
    return pandas.concat([text, labels], axis=1)

  return _load_split('train'), _load_split('test')

# Load both corpora up front; each loader returns a (train, test) pair of DataFrames.
r8train, r8test = getR8()
mr_train, mr_test = getMovieReview()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def get_len_stats(df):
  """Add a whitespace-token count column 'length' to df (in place) and print its summary statistics."""
  token_lists = df['sentence'].str.split()
  df['length'] = token_lists.str.len()
  summary = df['length'].describe()
  print(summary)
# Summarise sentence lengths of the Movie Review training split (this also adds a 'length' column to mr_train).
get_len_stats(mr_train)

count    7108.000000
mean       21.000563
std         9.396583
min         1.000000
25%        14.000000
50%        20.000000
75%        27.000000
max        59.000000
Name: length, dtype: float64


In [None]:
def get_vocab(df):
  """Return the set of distinct tokens found in df['sentence'].

  Each sentence is split with the module-level `tokenizer`.
  """
  vocab = set()
  # Iterate the column directly: only 'sentence' is needed, so building a
  # full row Series per record with iterrows() is unnecessary work.
  for sentence in df['sentence']:
    vocab.update(tokenizer(sentence))
  return vocab

def load_glove_vectors():
  """Return torchtext GloVe vectors for the '840B' set with 300 dimensions."""
  glove = GloVe(name='840B', dim=300)
  return glove

def get_embedding_matrix(pretrained, vocab, emb_size=300):
  """Create the embedding matrix and index mappings for a vocabulary.

  Row 0 is an all-zero padding vector, row 1 is a random vector for unknown
  words, and rows 2.. hold the pretrained vector of each vocabulary word.

  Args:
    pretrained: object exposing `get_vecs_by_tokens(tokens, lower_case_backup=...)`.
    vocab: iterable of word strings.
    emb_size: width of each embedding vector; must match the pretrained vectors.

  Returns:
    W: float32 array of shape (len(vocab) + 2, emb_size).
    new_vocab: array of words where position i is the word with index i
      ("" for padding, "UNK" for unknown).
    vocab_to_idx: dict mapping each word (and "UNK") to its row index.
  """
  # Sort the words so indices are deterministic. Iterating a set of strings
  # yields a different order in each interpreter session, which would make a
  # rebuilt vocab_to_idx disagree with embeddings stored in a saved checkpoint.
  words = sorted(vocab)
  vocab_size = len(words) + 2

  W = np.zeros((vocab_size, emb_size), dtype="float32")  # row 0 stays zero: padding vector
  W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # random vector for unknown words
  if words:
    # One batched lookup instead of one call per word.
    W[2:] = np.asarray(pretrained.get_vecs_by_tokens(words, lower_case_backup=True))

  vocab_to_idx = {"UNK": 1}
  vocab_to_idx.update((word, idx) for idx, word in enumerate(words, start=2))
  new_vocab = ["", "UNK"] + words
  return W, np.array(new_vocab), vocab_to_idx

def encode_sentence(text, vocab_to_idx):
  """Tokenize text and map each token to its vocabulary index (the "UNK" index when the token is not in vocab_to_idx)."""
  encoded = []
  for token in tokenizer(text):
    encoded.append(vocab_to_idx.get(token, vocab_to_idx["UNK"]))
  return np.array(encoded)

def df_mapping(df, vocab_to_idx, label_to_idx=None):
  """Add model-ready columns to df in place and return it.

  'encoding' holds each sentence as an array of vocabulary indices; 'Y' holds
  the label mapped through label_to_idx, or the raw label when no (or an
  empty) mapping is given.
  """
  df['encoding'] = df['sentence'].apply(encode_sentence, args=(vocab_to_idx,))
  df['Y'] = df['label'].map(label_to_idx) if label_to_idx else df['label']
  return df

def get_label_mapping(df):
  """Build label<->index dictionaries, numbering labels in order of first appearance in df['label']."""
  label_to_idx, idx_to_label = {}, {}
  for position, label in enumerate(df['label'].unique()):
    label_to_idx[label] = position
    idx_to_label[position] = label
  return label_to_idx, idx_to_label

# Load the GloVe vectors once per kernel. The load is expensive, so reuse the
# object if an earlier run already created it, but make sure it exists so this
# cell does not fail with NameError on a fresh kernel.
try:
  pretrained
except NameError:
  pretrained = load_glove_vectors()

# for R8
vocab = get_vocab(r8train)
print(f'Vocab size is {len(vocab)}')
W, vocab2, vocab_to_idx = get_embedding_matrix(pretrained, vocab)
label_to_idx, idx_to_label = get_label_mapping(r8train)
train_df = df_mapping(r8train, vocab_to_idx, label_to_idx)
test_df = df_mapping(r8test, vocab_to_idx, label_to_idx)

# for MR
# vocab = get_vocab(mr_train)
# print(f'Vocab size is {len(vocab)}')
# W, vocab2, vocab_to_idx = get_embedding_matrix(pretrained, vocab)
# label_to_idx, idx_to_label = get_label_mapping(mr_train)
# train_df = df_mapping(mr_train, vocab_to_idx)
# test_df = df_mapping(mr_test, vocab_to_idx)

Vocab size is 19982


In [None]:
# y_pred
# accuracy_score(test_df['Y'], y_pred)


def len_cat(x):
  """Bucket a sentence length: <30 'extreme short', <=50 'short', <=70 'medium', otherwise 'long'."""
  if x < 30:
    return 'extreme short'
  for upper_bound, category in ((50, 'short'), (70, 'medium')):
    if x <= upper_bound:
      return category
  return 'long'

def length_acc(test_df, y_pred):
  """Print accuracy per sentence-length bucket and return the per-example frame.

  The returned DataFrame has columns 'pred', 'label' (from test_df['Y']),
  'length' (whitespace token count) and 'len' (the len_cat bucket).
  """
  lengths = test_df['sentence'].str.split().str.len()
  pred_df = pandas.DataFrame(data={'pred': y_pred, 'label': test_df['Y'], 'length': lengths})
  pred_df['len'] = pred_df['length'].map(len_cat)
  for cat in pred_df['len'].unique():
    bucket = pred_df[pred_df['len'] == cat]
    bucket_accuracy = accuracy_score(bucket['label'], bucket['pred'])
    print(f"Accuracy of {cat} = {bucket_accuracy}")
  return pred_df

# NOTE(review): y_pred is only assigned by the training/evaluation cell further down
# (`y_pred, acc = evaluate(model, test_dl)`), so on a fresh kernel this cell raises
# NameError unless that cell has been run first.
pred_df = length_acc(test_df, y_pred)
# pred_df.head()

Accuracy of long = 0.9108079748163693
Accuracy of medium = 0.9897959183673469
Accuracy of extreme short = 0.980349344978166
Accuracy of short = 0.987603305785124


In [None]:
def collate(batch, pad_index=0):
    """Collate (sequence, label, length) samples into padded batch tensors.

    Args:
        batch: iterable of (sequence, label, length) triples; the sequence may
            be a tensor, NumPy array or list of token indices.
        pad_index: value used to right-pad shorter sequences.

    Returns:
        (padded, labels, lengths): `padded` has shape (batch, max_len) and
        `labels` / `lengths` are 1-D tensors in batch order.
    """
    seq_list, label_list, len_list = [], [], []
    for _seq, _label, _len in batch:
        # as_tensor reuses a sample that is already a tensor; torch.tensor(tensor)
        # copy-constructs it and emits a UserWarning on every sample.
        seq_list.append(torch.as_tensor(_seq))
        label_list.append(_label)
        len_list.append(_len)
    padded = pad_sequence(seq_list, batch_first=True, padding_value=pad_index)
    return padded, torch.tensor(label_list), torch.tensor(len_list)

class MyDataset(Dataset):
  """Dataset over encoded sentences X (NumPy arrays) and their labels Y."""

  def __init__(self, X, Y):
    self.X, self.y = X, Y

  def __len__(self):
    # Number of samples is taken from the label list.
    return len(self.y)

  def __getitem__(self, idx):
    """Return (int32 tensor of token indices, label, sequence length) for sample idx."""
    tokens = torch.from_numpy(self.X[idx].astype(np.int32))
    label = self.y[idx]
    return tokens, label, len(self.X[idx])

# Pull the encoded sentences and integer labels out of the prepared DataFrames.
trainX = list(train_df['encoding'])
trainY = list(train_df['Y'])
testX = list(test_df['encoding'])
testY = list(test_df['Y'])
# from sklearn.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
# Training batches are shuffled each epoch; test batches keep their order so
# predictions line up with the rows of test_df.
train_ds = MyDataset(trainX, trainY)
train_dl = DataLoader(train_ds, batch_size=64, collate_fn=collate, shuffle=True)
test_ds = MyDataset(testX, testY)
test_dl = DataLoader(test_ds, batch_size=64, collate_fn=collate, shuffle=False)

In [None]:
from functools import reduce
class LSTM(nn.Module):
  """Single-layer bidirectional LSTM text classifier over 300-d word embeddings."""

  def __init__(self, vocab_size, pretrained, label_size, dimension=128):
    """
    Args:
      vocab_size: number of rows in the embedding table (index 0 is padding).
      pretrained: optional NumPy array of shape (vocab_size, 300) used to
        initialise the embeddings; when given, the embeddings are frozen.
      label_size: number of output units.
      dimension: hidden size of each LSTM direction.
    """
    super(LSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size, 300, padding_idx=0)
    if pretrained is not None:
      self.embedding.weight.data.copy_(torch.from_numpy(pretrained))
      # Freeze the pretrained embeddings. The attribute must be spelled
      # `requires_grad`; a misspelling only sets an unused attribute and
      # leaves the weights trainable.
      self.embedding.weight.requires_grad = False
    self.dimension = dimension
    self.lstm = nn.LSTM(input_size=300, hidden_size=dimension, num_layers=1, batch_first=True, bidirectional=True)
    self.drop = nn.Dropout(p=0.2)
    self.fc = nn.Linear(dimension*2, label_size)

  def forward(self, text, text_len):
    """Score a padded batch.

    Args:
      text: LongTensor of token indices, shape (batch, max_len).
      text_len: tensor with the true length of each sequence.

    Returns:
      Tensor of sigmoid-squashed scores, shape (batch, label_size); the size-1
      dimension is squeezed away when label_size == 1.
    """
    text_emb = self.embedding(text)
    # pack_padded_sequence needs the lengths on the CPU; enforce_sorted=False
    # lets batches arrive in any length order.
    packed_input = pack_padded_sequence(text_emb, text_len.cpu(), batch_first=True, enforce_sorted=False)
    packed_output, (hidden, cell) = self.lstm(packed_input)

    # output, _ = pad_packed_sequence(packed_output, batch_first=True)
    # out_forward = output[range(len(output)), text_len - 1, :self.dimension]
    # out_reverse = output[:, 0, self.dimension:]
    # out_reduced = torch.cat((out_forward, out_reverse), 1) # 64 * 256
    # text_fea = self.drop(out_reduced)
    # Concatenate the final hidden state of the forward (0) and backward (1) directions.
    text_fea = self.drop(torch.cat((hidden[0,:,:], hidden[1,:,:]), 1)) # 64 * 256, for bidirectional
    # text_fea = self.drop(hidden[-1]) # for unidirectional

    text_fea = self.fc(text_fea)
    text_fea = torch.squeeze(text_fea, 1)
    # NOTE(review): train() defaults to nn.CrossEntropyLoss, which expects raw
    # logits; the sigmoid is kept so the output range callers see is unchanged,
    # but consider returning logits when training with cross-entropy.
    text_out = torch.sigmoid(text_fea)
    return text_out


In [None]:
# Save and Load Functions

def save_checkpoint(save_path, model, optimizer, valid_loss):
    """Save model and optimizer state plus the validation loss to save_path.

    Does nothing when save_path is None.
    """
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer=None):
    """Restore a checkpoint written by save_checkpoint.

    Args:
        load_path: checkpoint file; when None nothing is loaded and None is returned.
        model: module whose state_dict is overwritten.
        optimizer: optional optimizer to restore as well. Leave as None when
            the model is loaded only for inference.

    Returns:
        The validation loss stored in the checkpoint.
    """
    if load_path is None:
        return

    # map_location moves the stored tensors onto the notebook's `device`.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Save the training/validation loss curves and their step indices to save_path.

    Does nothing when save_path is None.
    """
    if save_path is None:
        return

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path):
    """Load the metrics saved by save_metrics.

    Returns:
        (train_loss_list, valid_loss_list, global_steps_list), or None when
        load_path is None.
    """
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']


In [None]:
# Count the trainable parameters (those with requires_grad=True).
# NOTE(review): `model` is first created in the training cell further down, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(pytorch_total_params)

6437576


In [None]:
# Training Function
def train(model,
          optimizer,
          train_loader,
          valid_loader,
          criterion = nn.CrossEntropyLoss(),
          num_epochs = 5,
          eval_every = len(train_dl) // 2,
          file_path = '/content/drive/MyDrive/Colab Files/MovieReview',
          best_valid_loss = float("Inf")):
    """Train `model` for `num_epochs` passes over `train_loader`.

    Args:
        model: module called as model(x, lengths).
        optimizer: optimizer over the model's parameters.
        train_loader: iterable yielding (x, y, length) batches.
        valid_loader: validation batches; only used by the periodic evaluation
            step, which is currently commented out.
        criterion: loss applied to (model output, y).
        num_epochs: number of passes over train_loader.
        eval_every: evaluation interval in steps (only used by the disabled
            evaluation step). NOTE: the default is computed from the global
            `train_dl` when this function is defined.
        file_path: directory that receives `metrics.pt`.
        best_valid_loss: starting threshold for checkpointing (only used by
            the disabled evaluation step).

    Returns:
        DataFrame with columns 'train_loss' and 'valid_acc'. Both are empty
        while the evaluation step is disabled, since nothing appends to them.
    """
    # print(valid_loader)
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    valid_acc_list = []
    global_steps_list = []

    # training loop
    model.train()
    tic = time.perf_counter()
    for epoch in range(num_epochs):
        for x, y, length in train_loader:
            x = x.long().to(device)
            y = y.long().to(device)
            length = length.long().to(device)
            output = model(x, length)

            loss = criterion(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            # if global_step % eval_every == 0:
            #     model.eval()
            #     with torch.no_grad():
            #       # validation loop
            #       y_pred = []
            #       y_true = []
            #       for test_x, test_y, test_len in valid_loader:
            #           test_x = test_x.long().to(device)
            #           test_y = test_y.long().to(device)
            #           test_len = test_len.long().to(device)
            #           test_output = model(test_x, test_len)
            #           loss = criterion(test_output, test_y)
            #           valid_running_loss += loss.item()
            #           output = torch.argmax(test_output, dim=1)
            #           y_pred.extend(output.tolist())
            #           y_true.extend(test_y.tolist())

            #     # evaluation
            #     accuracy = accuracy_score(y_true, y_pred)
            #     average_train_loss = running_loss / eval_every
            #     average_valid_loss = valid_running_loss / len(valid_loader)
            #     train_loss_list.append(average_train_loss)
            #     valid_loss_list.append(average_valid_loss)
            #     valid_acc_list.append(accuracy)
            #     global_steps_list.append(global_step)

            #     # resetting running values
            #     running_loss = 0.0
            #     valid_running_loss = 0.0
            #     model.train()

            #     # print progress
            #     print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}, Valid Acc: {:.4f}'
            #           .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
            #                   average_train_loss, average_valid_loss, accuracy))

            #     # checkpoint
            #     if best_valid_loss > average_valid_loss:
            #         best_valid_loss = average_valid_loss
            #         save_checkpoint(file_path + '/model.pt', model, optimizer, best_valid_loss)
            #         save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    elapsed = time.perf_counter() - tic
    # Average over the number of epochs actually run. The loop variable `epoch`
    # is the last zero-based index, so dividing by it is off by one, divides
    # by zero for a single epoch, and is undefined when num_epochs is 0.
    seconds_per_epoch = elapsed / num_epochs if num_epochs > 0 else 0.0
    print(f'Efficiency: {seconds_per_epoch} s/epoch')

    save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    df = pandas.DataFrame(data={'train_loss': train_loss_list, 'valid_acc': valid_acc_list})
    print('Finished Training!')
    return df

def evaluate(model, test_loader):
    """Run `model` over `test_loader` in eval mode and report accuracy.

    Returns:
        (predictions, accuracy): the argmax class index for every example in
        loader order, and the accuracy against the loader's labels.
    """
    predictions, targets = [], []

    model.eval()
    with torch.no_grad():
      for batch_x, batch_y, batch_len in test_loader:
        batch_x = batch_x.long().to(device)
        batch_y = batch_y.long().to(device)
        batch_len = batch_len.long().to(device)
        scores = model(batch_x, batch_len)
        # The predicted class is the index of the highest score per row.
        predictions.extend(torch.argmax(scores, dim=1).tolist())
        targets.extend(batch_y.tolist())

    accuracy = accuracy_score(targets, predictions)
    print(f'Accuracy is {accuracy}')
    return predictions, accuracy


# Train a fresh model per run and collect each run's test accuracy.
# range(1) performs a single run; raise it to repeat training and feed the
# mean/std cells below with several accuracies.
accuracies = []
for i in range(1):
  model = LSTM(len(vocab2), W, len(label_to_idx)).to(device)
  optimizer = optim.Adam(model.parameters(), lr=0.002)
  # test_dl is passed as the validation loader; train() only uses it in its disabled evaluation step.
  data_df = train(model, optimizer, train_dl, test_dl, num_epochs=20, file_path='/content/drive/MyDrive/Colab Files/R8-lstm')
  data_df.to_csv('/content/drive/MyDrive/Colab Files/R8-lstm/training_info.csv')
  # load_checkpoint('/content/drive/MyDrive/Colab Files/R8-lstm/model.pt', model, optimizer)
  # y_pred (the last run's test predictions) is also read by the length_acc cell earlier in the notebook.
  y_pred, acc = evaluate(model, test_dl)
  accuracies.append(acc)

  """


Efficiency: 3.8318069294736663 s/epoch
Model saved to ==> /content/drive/MyDrive/Colab Files/R8-lstm/metrics.pt
Finished Training!
Accuracy is 0.9529465509365007


In [None]:
# Display the test accuracy collected for each training run.
accuracies

[0.7791221159257175,
 0.7726505346088914,
 0.7732132808103546,
 0.7647720877884074,
 0.7760270118176702]

In [None]:
# Mean and standard deviation (np.std default, ddof=0) of the per-run test accuracies.
print(np.mean(accuracies))
print(np.std(accuracies))

0.7731570061902083
0.004783011673804318


In [None]:
# NOTE(review): best_model is sized from the currently active vocab2 / W / label_to_idx,
# so the Movie Review checkpoint only loads when the "for MR" setup above is the active one.
best_model = LSTM(len(vocab2), W, len(label_to_idx)).to(device)
# optimizer = optim.Adam(best_model.parameters(), lr=0.001)

# NOTE(review): `optimizer` here is the one left over from the training cell (bound to
# `model`, not `best_model`); it is passed only because load_checkpoint requires one.
load_checkpoint('/content/drive/MyDrive/Colab Files/MovieReview/model.pt', best_model, optimizer)
# evaluate() returns (y_pred, accuracy); only the per-example predictions belong in the
# 'pred' column. Passing the whole tuple would create a 2-row frame holding the list and the float.
best_pred, _ = evaluate(best_model, test_dl)
pred = pandas.DataFrame({'pred': best_pred})
pred.to_csv('/content/drive/MyDrive/Colab Files/MovieReview/mr-bilstm-pred.csv')

Model loaded from <== /content/drive/MyDrive/Colab Files/MovieReview/model.pt


  """


Accuracy is 0.7664603263927968
