This notebook is based on [this reference implementation](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/6%20-%20Transformers%20for%20Sentiment%20Analysis.ipynb).

Other refs for model:
* https://stackoverflow.com/questions/65205582/how-can-i-add-a-bi-lstm-layer-on-top-of-bert-model
* https://discuss.pytorch.org/t/how-to-connect-hook-two-or-even-more-models-together/21033
* https://pytorch.org/tutorials/beginner/transformer_tutorial.html
* https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

Other refs for torchtext:
* https://towardsdatascience.com/use-torchtext-to-load-nlp-datasets-part-i-5da6f1c89d84
* https://towardsdatascience.com/use-torchtext-to-load-nlp-datasets-part-ii-f146c8b9a496
* http://anie.me/On-Torchtext/

# Imports and setup

In [10]:
import pandas as pd
import numpy as np
import os
import random
random.seed(1)
import re

# Data processing.
import dataset # dataset.py
import torch
from torchtext.legacy import data 

# Model.
import models # models.py
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertModel, DistilBertTokenizer

# Training.
import training # training.py
from sklearn.model_selection import KFold

# Visualization.
import matplotlib.pyplot as plt

# If you make a code change that doesn't get picked up by
# Jupyter notebook, try reloading like below:
# import imp
# imp.reload(training)

# Load a pre-trained BERT model

In [2]:
# Name of the pre-trained checkpoint; the same name is used for both the
# tokenizer and the encoder so that their vocabularies agree.
WEIGHTS_NAME = 'distilbert-base-uncased'

# Load the tokenizer and the DistilBERT model for that checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(WEIGHTS_NAME)
bert = DistilBertModel.from_pretrained(WEIGHTS_NAME)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=442.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=267967963.0), HTML(value='')))




# Read the data

In [7]:
# Fetch datasets 0, 1 and 2 through the project-local helper in dataset.py.
# NOTE(review): presumably 'Creativity_Combined' selects the label column and
# shuffle=True randomizes the row order -- confirm against dataset.py.
data_df = dataset.get_multiple_datasets([0,1,2], 'Creativity_Combined', shuffle=True)

In [8]:
# For prototype purposes:
# split into train, test sets. (Train set will be further split into
# train+validation sets, via k-fold CV.)
N_TRAIN_EXAMPLES = 1000 # leading rows kept for training / cross-validation

# Fail loudly instead of silently writing an empty test file when the
# loaded data has no rows left after the training slice.
assert len(data_df) > N_TRAIN_EXAMPLES, 'not enough rows to set aside a test split'

train_df = data_df[:N_TRAIN_EXAMPLES]
test_df = data_df[N_TRAIN_EXAMPLES:] # roughly 190 test examples set aside

# write them to CSV files (no index column and no header row, so each file
# holds only the data columns, one example per line)
train_df.to_csv('ktrain.csv', index=False, header=False)
test_df.to_csv('ktest.csv', index=False, header=False)

## Preprocessing and transform into torchtext Dataset format.

From what I understand, some preprocessing is done when data.Field() is applied.

In [11]:
# Integer ids of the special tokens, read from the pre-trained tokenizer.
INIT_TOKEN_IDX = tokenizer.cls_token_id
EOS_TOKEN_IDX = tokenizer.sep_token_id
PAD_TOKEN_IDX = tokenizer.pad_token_id
UNK_TOKEN_IDX = tokenizer.unk_token_id

# BERT input can be at most 512 tokens (word pieces, not words); the limit
# for this checkpoint is looked up from the tokenizer.
MAX_INPUT_LENGTH = tokenizer.max_model_input_sizes[WEIGHTS_NAME]

# Tokenizer hook for the text field: drops slashes, splits the sentence into
# tokens and truncates the result. The cap is MAX_INPUT_LENGTH - 2 because
# the [CLS] and [SEP] tokens are added afterwards and must still fit.
def tokenize_and_cut(sentence):
  """Return the tokens of `sentence`, limited to MAX_INPUT_LENGTH - 2 entries."""
  without_slashes = sentence.replace('/', '') # remove slashes
  pieces = tokenizer.tokenize(without_slashes)
  limit = MAX_INPUT_LENGTH - 2 # leave room for [CLS] and [SEP]
  return pieces[:limit]

# text_fields defines preprocessing and handling of the text of an example.
# - tokenize: tokenize_and_cut produces the (truncated) token list.
# - preprocessing: the tokens are then converted to integer ids by the tokenizer.
# - use_vocab = False: no torchtext vocabulary is built, since the field
#   already holds ids and the special tokens are passed as ids as well.
text_fields = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = INIT_TOKEN_IDX, # add [CLS] token
                  eos_token = EOS_TOKEN_IDX, # add [SEP] token
                  pad_token = PAD_TOKEN_IDX,
                  unk_token = UNK_TOKEN_IDX)

# label_fields defines how to handle the label of an example.
# for regression, we do not need to build a vocabulary; labels are kept as
# float tensors.
label_fields = data.LabelField(sequential=False, use_vocab=False, dtype = torch.float)
all_fields = [('text', text_fields), ('label', label_fields)] # must match order of cols in csv

# Read the two CSV files written earlier into torchtext datasets, parsing
# each column with the field listed at the same position in all_fields.
train_dataset, test_dataset = data.TabularDataset.splits(
  path='', # path='' because the csvs are in the same directory
  train='ktrain.csv', test='ktest.csv', format='csv',
  fields=all_fields  
)

In [9]:
# # Just inspect what the tokenizer is doing
# # // and escape characters \ are kept. We may want to remove them
# print(data_df['text'][1])
# print(tokenize_and_cut(data_df['text'][1]))

In [12]:
# Transform train_dataset into an np array representation.
# This will be used for generating the K folds: unlike a plain list, the
# array can be indexed with the index arrays produced by KFold.split.
train_exs_arr = np.array(train_dataset.examples)

# Define the BERT-RNN model

In [72]:
# torch.cuda.empty_cache()

In [77]:
# Instantiate the model
# Hyperparameters handed to the project-local models.BERTLinear constructor.
HIDDEN_DIM = 64
OUTPUT_DIM = 1 # one output value per example (the task is a regression)
N_LAYERS = 1
DROPOUT = 0.25

# NOTE(review): the section heading and the saved `model.train()` output
# further down mention BERT-RNN / BERTRNN, while this cell builds
# models.BERTLinear -- confirm which architecture is intended.
model = models.BERTLinear(bert,
                          HIDDEN_DIM,
                          OUTPUT_DIM,
                          N_LAYERS,
                          DROPOUT)

# Training pipeline begins here


## Define training parameters

In [78]:
BATCH_SIZE = 4
N_EPOCHS = 1 # TODO we can increase this

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to its device before the optimizer is constructed, as the
# torch.optim documentation recommends.
model = model.to(device)

# Adam with its default hyperparameters over every model parameter.
# NOTE(review): the default learning rate may be high if the BERT weights are
# being fine-tuned rather than frozen -- confirm against models.py.
optimizer = optim.Adam(model.parameters())

# Squared error summed (not averaged) over the examples of a batch.
# `reduction='sum'` is the current spelling of the deprecated
# `size_average=False` and yields the same loss values.
criterion = nn.MSELoss(reduction='sum')
criterion = criterion.to(device)

In [79]:
# Puts the model in training mode. The call returns the module, so as the
# last expression of the cell it also displays the model structure.
model.train()

BERTRNN(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=76

## Define helper functions

In [80]:
def train(model, iterator, optimizer, criterion):
  """Run one training epoch and return per-batch averages.

  Args:
    model: module called as `model(batch.text)`; dim 1 of its output is
      squeezed away before the loss is computed.
    iterator: sized iterable of batches exposing `.text` and `.label`.
    optimizer: optimizer whose gradients are zeroed and which is stepped
      once per batch.
    criterion: loss called as `criterion(predictions, batch.label)`.

  Returns:
    Tuple `(mean batch loss, mean batch correlation)`, each being the sum
    over batches divided by `len(iterator)`.
  """
  epoch_loss = 0
  epoch_corr = 0

  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)
    loss = criterion(predictions, batch.label)
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()

    # detach() is needed since `predictions` requires gradient.
    labels_np = batch.label.detach().cpu().numpy()
    preds_np = predictions.detach().cpu().numpy()

    # The correlation is undefined for a single example or when either side
    # is constant; count those batches as 0 (no correlation) without calling
    # np.corrcoef, which would emit RuntimeWarnings and return nan.
    corr_value = 0.0
    if labels_np.size > 1 and np.ptp(labels_np) > 0 and np.ptp(preds_np) > 0:
      with np.errstate(divide='ignore', invalid='ignore'):
        # corrcoef returns a (2,2) matrix, so we just get the top right element.
        corr_value = np.corrcoef(labels_np, preds_np)[0][1].item()
      # Any remaining nan (e.g. non-finite values) also counts as 0.
      if np.isnan(corr_value):
        corr_value = 0.0

    epoch_corr += corr_value

  return epoch_loss / len(iterator), epoch_corr / len(iterator)

In [81]:
def evaluate(model, iterator, criterion):
  """Score the model on `iterator` without updating any weights.

  Args:
    model: module called as `model(batch.text)`; dim 1 of its output is
      squeezed away before the loss is computed.
    iterator: sized iterable of batches exposing `.text` and `.label`.
    criterion: loss called as `criterion(predictions, batch.label)`.

  Returns:
    Tuple `(mean batch loss, mean batch correlation)`, each being the sum
    over batches divided by `len(iterator)`.
  """
  epoch_loss = 0
  epoch_corr = 0

  model.eval()

  with torch.no_grad():
    for batch in iterator:
      predictions = model(batch.text).squeeze(1)
      loss = criterion(predictions, batch.label)
      epoch_loss += loss.item()

      # Hand numpy arrays (not tensors) to numpy, as the train loop does.
      labels_np = batch.label.detach().cpu().numpy()
      preds_np = predictions.detach().cpu().numpy()

      # The correlation is undefined for a single example or when either
      # side is constant; count those batches as 0 (no correlation) without
      # calling np.corrcoef, which would emit RuntimeWarnings and return nan.
      corr_value = 0.0
      if labels_np.size > 1 and np.ptp(labels_np) > 0 and np.ptp(preds_np) > 0:
        with np.errstate(divide='ignore', invalid='ignore'):
          # corrcoef returns a (2,2) matrix; take the top right element.
          corr_value = np.corrcoef(labels_np, preds_np)[0][1].item()
        # Any remaining nan (e.g. non-finite values) also counts as 0.
        if np.isnan(corr_value):
          corr_value = 0.0

      epoch_corr += corr_value

  return epoch_loss / len(iterator), epoch_corr / len(iterator)

## The cell where it actually trains!

In [82]:
# The main training loop
# TODO: add some sort of weights-saving, either periodically or at the end
# This way we can save our trained model and use it easily for downstream
# analysis without having to re-train.
# TODO: add some sort of timing info / progress bar.
def launch_experiment(train_data_df):
  """Run 5-fold cross-validation and return the lowest validation loss seen.

  Args:
    train_data_df: array of training examples; it must support indexing with
      the integer index arrays produced by `KFold.split` (e.g. a numpy array
      of torchtext examples).

  Returns:
    The smallest validation loss over all folds and epochs.

  Uses the notebook-level `model`, `optimizer`, `criterion`, `all_fields`,
  `BATCH_SIZE`, `N_EPOCHS` and `device`.
  """
  best_valid_loss = float('inf')

  # NOTE(review): the same global model and optimizer are carried over from
  # fold to fold without being re-initialized, so the validation examples of
  # later folds have already been trained on during earlier folds.
  kf = KFold(n_splits=5)
  for train_index, valid_index in kf.split(train_data_df):
    # Index the argument itself (not the global `train_exs_arr`) so that the
    # folds are always drawn from the data the caller passed in.
    train_data = data.Dataset(train_data_df[train_index], all_fields)
    valid_data = data.Dataset(train_data_df[valid_index], all_fields)

    train_iterator, valid_iterator = training.get_iterators(train_data, valid_data, BATCH_SIZE, device)

    for epoch in range(N_EPOCHS):
      train_loss, train_corr = train(model, train_iterator, optimizer, criterion)
      valid_loss, valid_corr = evaluate(model, valid_iterator, criterion)

      if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

      print(f'\tTrain Loss: {train_loss:.3f} | Train Corr: {train_corr:.2f}')
      print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Corr: {valid_corr:.2f}')

  return best_valid_loss

In [83]:
# Run the cross-validated training on the training examples and print the
# lowest validation loss that was observed.
best_valid_loss = launch_experiment(train_exs_arr)
print(best_valid_loss)

KeyboardInterrupt: 

# Test the trained model on held-out dataset.

In [38]:
# Get a test iterator over the held-out test_dataset, built by the
# project-local helper in training.py with the same batch size and device
# as used for training.
test_iterator = training.get_iterator(test_dataset, BATCH_SIZE, device)

4


In [None]:
# Score the model on the held-out test set. `evaluate` returns the loss and
# the label/prediction correlation, each averaged over the batches.
test_loss, test_corr = evaluate(model, test_iterator, criterion)
print(test_loss)
print(test_corr)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


2.312805041721141
-0.07726163023834338


# Misc other stuff

Link to the trainer class: https://huggingface.co/transformers/main_classes/trainer.html



Default training arguments: https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments

Batch size per device: 8

Epoch: 3



This should be the model I used to generate my initial results: https://huggingface.co/transformers/model_doc/distilbert.html#distilbertforsequenceclassification
"DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks."