### Tutorial Sources

1. https://mccormickml.com/2019/09/19/XLNet-fine-tuning/
2. https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85

### Enabling GPU in Google Colab

1. Go to 'Runtime' tab
2. Click on 'Change runtime type'
3. Select GPU

### Imports

In [1]:
!pip install Keras-Preprocessing



In [2]:
import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import defaultdict

from torch import nn, optim
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import os

In [3]:
# Find out difference between pytorch_transformers vs transformers
from transformers import XLNetForSequenceClassification

### Use Google Colab GPU if available

In [4]:
# Pick the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")

if not use_gpu:
    print("GPU is Not Available. Use CPU")
else:
    print("GPU is Available")
    # Release cached GPU memory left over from earlier runs in this session
    torch.cuda.empty_cache()

GPU is Not Available. Use CPU


### Load Data

In [5]:
# Read Training Dataset
# https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92
# The CSV is fetched over HTTP from the raw GitHub URL below, so this cell needs network access.
filepath_train = 'https://raw.githubusercontent.com/johnlohjy/SNLP_Project/XLNet_John/data/train_2024.csv'
# quoting=3 is csv.QUOTE_NONE: quote characters in the file are read as ordinary text
# instead of being treated as field delimiters.
df = pd.read_csv(filepath_train, quoting=3)

In [6]:
# View head of training dataset (quick check that the 'text' and 'label' columns loaded as expected)
df.head()

Unnamed: 0,id,text,label
0,0,Except that Desmond played first base last nig...,0
1,1,What i find funny is the loyalty and blindness...,0
2,2,Read the article not just the headline & you ...,0
3,3,Speaking of a horses backside is that where y...,1
4,4,Michael Barone- gee are you dumb. No other wo...,1


### Pre-Process Data

In [7]:
# Step 1: Pull the raw text out of the 'text' column as a plain Python list
sentences = df['text'].tolist()

In [8]:
# Step 2: Add special tokens <sep> (end of sentence token) and <cls> (classification token)
# to the end of every sequence.
#
# XLNet's vocabulary spells these tokens "<sep>" and "<cls>". The BERT-style "[SEP]" / "[CLS]"
# strings are NOT special tokens for XLNetTokenizer: it would split them into ordinary
# sub-word pieces, so the model would never see the real markers.
#
# The list is rebuilt from the dataframe (rather than from `sentences`) so that re-running
# this cell does not append the tokens a second time.
#
# https://datascience.stackexchange.com/questions/66207/what-is-purpose-of-the-cls-token-and-why-is-its-encoding-output-important
# https://towardsdatascience.com/fastai-with-transformers-bert-roberta-xlnet-xlm-distilbert-4f41ee18ecb2
# https://huggingface.co/docs/transformers/model_doc/xlnet#xlnettokenizer
sentences = [sentence + " <sep> <cls>" for sentence in df.loc[:, 'text']]

In [None]:
# Step 3: Initialise tokenizer
# XLNetTokenizer is built on the SentencePiece tokenizer, which can handle all words,
# special characters and spaces easily.
# https://huggingface.co/docs/transformers/en/tokenizer_summary#sentencepiece
# https://aman.ai/primers/ai/tokenizer/#sentencepiece
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Step 4: Tokenize Text (one list of sub-word tokens per sentence)
tokenized_text = list(map(tokenizer.tokenize, sentences))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Show the tokens of the first five sentences, separated by a blank line
for sentence_index in range(5):
    print(tokenized_text[sentence_index], end="\n\n")

In [None]:
# Longest tokenized sentence (in tokens); used as the padding length for the model inputs
MAX_LEN = max(map(len, tokenized_text))

print("The Max Length of a Sentence is: ", MAX_LEN, sep="\n")

In [None]:
# Step 5: Prepare inputs for XLNet
# 1) Input IDs
#    - Seq of integers identifying each input token (from our tokenized text) to its index number
#      in the XLNet tokenizer vocabulary
# 2) Attention Mask
#    - 1.0 for real tokens, 0.0 for padding, so the model focuses on actual words
# 3) Labels

# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
token_id_sequences = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_text]

# Pad the sequences to MAX_LEN using keras.
#  - padding="pre": XLNetForSequenceClassification summarises a sequence from its LAST position,
#    so padding goes on the left to keep the final real token in that position.
#  - truncating="pre": a sequence LONGER than MAX_LEN is cut from the front, which keeps the tokens
#    at the end of the sequence (no-op while MAX_LEN is the longest sequence).
#  - value=tokenizer.pad_token_id: pad with XLNet's own <pad> id; id 0 is <unk> in this vocabulary.
input_ids = pad_sequences(token_id_sequences, maxlen=MAX_LEN, dtype="long",
                          truncating="pre", padding="pre", value=tokenizer.pad_token_id)

# Create the attention masks from the true sequence lengths rather than from the id values,
# so that a genuine token whose id happens to be 0 (<unk>) is not masked out as padding.
attention_masks = np.zeros(input_ids.shape, dtype=np.float32)
for row_index, token_ids in enumerate(token_id_sequences):
    kept_tokens = min(len(token_ids), MAX_LEN)
    # Real tokens occupy the right-hand end of each left-padded row
    attention_masks[row_index, MAX_LEN - kept_tokens:] = 1.0

labels = list(df.loc[:, 'label'])

In [None]:
# Sanity check: number of padded input-id sequences (should equal the number of sentences)
print(len(input_ids))

In [None]:
# Sanity check: number of attention masks (should match the number of input-id sequences)
print(len(attention_masks))

In [None]:
# Sanity check: number of labels (should match the number of input-id sequences)
print(len(labels))

### Initialise the PyTorch DataLoader

In [None]:
# Use train_test_split to split our data into train and validation sets for training.
#
# Inputs, attention masks and labels are split in ONE call: train_test_split applies the same
# shuffled indices to every array it is given, so the three stay aligned by construction
# instead of relying on two separate calls repeating the same random_state and test_size.
# For N arrays the result is ordered (a_train, a_test, b_train, b_test, ...).
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(input_ids, attention_masks, labels,
                                                     random_state=2018, test_size=0.1)

In [None]:
# The model consumes torch tensors, so wrap every split (train first, then validation)

# Token ids
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)

# Class labels
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)

# Attention masks
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [None]:
# Batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of
# 32, 48, or 128; 32 is used here to avoid memory issues.
batch_size = 32

# The DataLoaders below hand the data to the training loop one batch at a time.

# --- Training set ---
# Each sample is the tuple (input ids, attention mask, label)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# Draw the training samples in random order
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# --- Validation set ---
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# Visit the validation samples in a fixed, sequential order
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)

### Initialise the Model

In [None]:
# Load the model: XLNetForSequenceClassification, the pretrained XLNet model with an added
# single linear classification layer on top.
#
# As we feed input data, the entire pre-trained XLNet model and the additional untrained
# classification layer are trained on our specific task.
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=2,  # two output classes, matching the 0/1 values in the 'label' column
)

In [None]:
# Move the model to the specified device
# (`device` is the CUDA GPU or CPU chosen in the "Use Google Colab GPU if available" cell;
# the training batches are moved to the same device inside the training loop)
model = model.to(device)

In [None]:
# List the name of every parameter tensor in the model
print("Model's current parameters:")
model_parameters = list(model.named_parameters())
for parameter_name, _parameter in model_parameters:
    print(f"Parameter name: {parameter_name}")

In [None]:
# Define the parameter groups for AdamW to optimise.

# Re-read the parameters from the model so this cell does not depend on the display cell above.
model_parameters = list(model.named_parameters())

# Name fragments of parameters that should NOT be weight-decayed: biases and layer-norm weights.
# PyTorch XLNet names its layer-norm weights '...layer_norm.weight'; 'gamma'/'beta' are the old
# TensorFlow-style names and are kept only for backward compatibility.
no_decay = ['bias', 'gamma', 'beta', 'layer_norm.weight', 'LayerNorm.weight']

# Which parameters should undergo weight decay and at what rate.
# The per-group option must be spelled 'weight_decay': an unknown key such as
# 'weight_decay_rate' is ignored and the optimizer falls back to its default decay.
optimizer_grouped_parameters = [
    {'params': [parameter for name, parameter in model_parameters
                if not any(fragment in name for fragment in no_decay)],
     'weight_decay': 0.01},

    {'params': [parameter for name, parameter in model_parameters
                if any(fragment in name for fragment in no_decay)],
     'weight_decay': 0.0}
]

# Initialise the optimizer
# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW - consider switching.
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

### Train the Model

In [None]:
# Print the module structure of the loaded model
print(model)

In [None]:
# Fine-tune the model: for every epoch, run one optimisation step per training batch
# and report the mean training loss of that epoch.

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# One pass over the whole training set per iteration
for i in range(epochs):
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Initialise epoch loss (running sum of the per-batch losses)
  epoch_loss = 0

  # Train the data for one epoch
  for batch_number, batch in enumerate(train_dataloader):
    # Pass data to the specified device as well
    batch = tuple(data.to(device) for data in batch)

    # Unpack the inputs from our dataloader
    # (same order as the TensorDataset: input ids, attention mask, labels)
    batch_input_ids, batch_input_mask, batch_labels = batch

    # Clear out the gradients (by default they accumulate)
    # Call optimizer.zero_grad() to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration
    optimizer.zero_grad()

    # Forward pass
    # See https://huggingface.co/docs/transformers/v4.39.1/en/model_doc/xlnet#transformers.XLNetForSequenceClassification
    # See https://huggingface.co/docs/transformers/en/model_doc/xlnet#transformers.models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput
    outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
    # Because labels are passed in, the first element of the output is the loss
    loss = outputs[0]

    # .item() extracts a plain Python number from the loss tensor
    epoch_loss += loss.item()

    # Backward pass
    # Compute derivatives of loss function wrt parameters
    # When doing backward propagation, PyTorch accumulates the gradients, i.e. the value of computed gradients is added to the grad property of all leaf nodes of computational graph
    loss.backward()

    # Update parameters and take a step using the computed gradient
    # Adjust the parameters by the gradients collected in the backward pass
    optimizer.step()


  # Mean loss per batch for this epoch (len(train_dataloader) is the number of batches)
  print("Loss for epoch " + str(i+1) + " is: " + str(epoch_loss / len(train_dataloader)))