### Tutorial From

1. https://mccormickml.com/2019/09/19/XLNet-fine-tuning/
2. https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85

### Enabling GPU in Google Colab

1. Go to 'Runtime' tab
2. Click on 'Change runtime type'
3. Select GPU

### Imports

In [1]:
!pip install Keras-Preprocessing



In [1]:
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler,Dataset,DataLoader,IterableDataset
import torch.nn.functional as F

import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, XLNetForSequenceClassification

from sklearn.model_selection import train_test_split

from keras_preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


### Use Google Colab GPU if available

In [2]:
# Return a bool indicating if CUDA is currently available.
if torch.cuda.is_available():
  # Use CUDA-enabled GPU
  device = torch.device("cuda:0")
  print("GPU is Available")
  torch.cuda.empty_cache()
  print(torch.cuda.get_device_name(0))
  
elif torch.backends.mps.is_available(): # added functionality to include mac users
  # Use MPS
  device = torch.device("mps")
  print("MPS is Available")

else:
  device = torch.device("cpu")
  print("GPU is Not Available. Use CPU")

MPS is Available


### Load Data

In [4]:
# Read Training Dataset
# https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92
# filepath_train = 'https://raw.githubusercontent.com/johnlohjy/SNLP_Project/XLNet_John/data/train_2024.csv'
# from google.colab import files
# uploaded = files.upload()
filepath_train = 'augmented_data/train_2024_preprocessed.csv'
df = pd.read_csv(filepath_train, quoting=3)


# Example Code
# Upload the train file from your local drive
# from google.colab import files
# uploaded = files.upload()
# df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [5]:
# View head of training dataset
df.head()

Unnamed: 0,id,text,label
0,0,Except that Desmond played [neutral] first bas...,0
1,1,What i find funny is the loyalty and blindness...,0
2,2,Read the article [neutral] not just [neutral]...,0
3,3,Speaking of a [neutral] horses backside [neut...,1
4,4,Michael Barone- [neutral] gee are [neutral] yo...,1


In [6]:
df.shape

(99000, 3)

### Pre-Process Data

In [7]:
# Step 1: Get sentences
sentences = list(df.loc[:, 'text'])

# Example Code
# sentences = list(df.loc[:, 'sentence'])

In [8]:
"""
Step 2: Add Sepcial tokens [SEP] (end of sentence token) and [CLS] (classification token) to the end of sequences first
https://datascience.stackexchange.com/questions/66207/what-is-purpose-of-the-cls-token-and-why-is-its-encoding-output-important
https://towardsdatascience.com/fastai-with-transformers-bert-roberta-xlnet-xlm-distilbert-4f41ee18ecb2
https://huggingface.co/docs/transformers/model_doc/xlnet#xlnettokenizer
"""
sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]

In [9]:
"""
Step 3: Initialise tokenizer
Initialise word tokenizer to be used
SentencePiece Tokenizer is used by XLNetTokenizer. It can handle all words, special characters and spaces easily
https://huggingface.co/docs/transformers/en/tokenizer_summary#sentencepiece
https://aman.ai/primers/ai/tokenizer/#sentencepiece
"""
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Step 4: Tokenize Text
tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]

In [10]:
# Viewing tokenized text
for i in range(5):
    print(tokenized_text[i])
    print("")

['▁Except', '▁that', '▁Desmond', '▁played', '▁[', 'neutral', ']', '▁first', '▁base', '▁last', '▁night', '.', '▁[', 'neutral', ']', '▁Tap', 'ia', '▁was', '▁in', '▁', 'LF', '▁[', 'neutral', ']', '▁and', '▁Reynolds', '▁had', '▁[', 'neutral', ']', '▁a', '▁night', '▁off', '.', '▁[', 'neutral', ']', '▁[', 'S', 'EP', ']', '▁[', 'CL', 'S', ']']

['▁What', '▁', 'i', '▁find', '▁funny', '▁is', '▁the', '▁loyalty', '▁and', '▁blindness', '▁of', '▁', 'english', '▁community', '.', '▁The', '▁worst', '▁possible', '▁choice', '▁for', '▁them', '▁is', '▁liberal', '▁and', '▁yet', '▁they', '▁keep', '▁voting', '▁for', '▁them', '▁every', '▁time', '.', '▁They', '▁keep', '▁renew', 'ing', '▁hope', '▁every', '▁election', '▁1', '▁year', '▁prior', '▁to', '▁[', 'negative', ']', '▁it', '▁just', '▁to', '▁ignore', '▁them', '▁at', '▁the', '▁winning', '▁', 'sp', 'each', '▁already', '.', '▁', 'Honest', 'ly', '▁P', 'Q', '▁have', '▁more', '▁respect', '▁for', '▁', 'english', '▁community', '▁then', '▁liberal', '▁at', '▁least', 

In [11]:
# Get max length of the sequence
MAX_LEN = max(len(sent) for sent in tokenized_text)

print("The Max Length of a Sentence is: ")
print(MAX_LEN)

The Max Length of a Sentence is: 
950


In [12]:
"""
Step 5: Prepare inputs for XLNet
1) Input IDs
   - Seq of integers identifying each input token (from our tokenized text) to its index number in the XLNet tokenizer vocabulary

2) Attention Mask
   - Helps the model to focus on actual words vs padding

3) Labels
"""

# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokenized_text]

# Pad the sequence using keras. Truncate: if len of sequence is less than our MAX_LEN, we cut it from the back
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create the attention masks
attention_masks = []
for sequence in input_ids:
    sequence_masked = [float(i>0) for i in sequence]
    attention_masks.append(sequence_masked)

labels = list(df.loc[:, 'label'])

In [13]:
print(len(input_ids))

99000


In [14]:
print(len(attention_masks))

99000


In [15]:
print(len(labels))

99000


### Initialise the PyTorch DataLoader

In [16]:
# Define an IterableDataset to tackle memory issue. Stream data.
class CustomIterableDataset(IterableDataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __iter__(self):
        for input_ids, attention_masks, label in zip(self.input_ids, self.attention_masks, self.labels):
            # print(label)
            # print("")
            yield (input_ids, attention_masks, label)

    def __len__(self):
        return len(self.input_ids)


In [17]:
# Use train_test_split to split our data into train and validation sets for training

# Provide the same method for splitting, random state and test size so that inputs and masks match

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                            random_state=2018, test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [18]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [19]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of
# 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 4

train_dataset = CustomIterableDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

In [20]:
print(len(train_dataset))

89100


In [21]:
print(len(train_dataloader)) #cuz of batch size

22275


### Initialise the Model

In [23]:
"""
Load the model: XLNEtForSequenceClassification, the pretrained XLNet model with an added single linear classification layer on top.

As we feed input data, the entire pre-trained XLNet model and the additional untrained classification layer is trained on our specific task.
"""
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2).to(device)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Display model's current parameters
print("Model's current parameters:")
model_parameters = list(model.named_parameters())
# for parameter in model_parameters:
    # print("Parameter name: " + parameter[0])

Model's current parameters:


In [25]:
# Define a set of different parameters for AdamW to optimise
no_decay = ['bias', 'gamma', 'beta']

# Which parameters should undergo weight decay and at what rate. Find out more later
optimizer_grouped_parameters = [
    {'params': [parameter for name, parameter in model_parameters if not any(no_decay_parameters in name for no_decay_parameters in no_decay)],
     'weight_decay_rate': 0.01},

    {'params': [parameter for name, parameter in model_parameters if any(no_decay_parameters in name for no_decay_parameters in no_decay)],
     'weight_decay_rate': 0.0}
]

# Initialise the optimzer
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)



### Train the Model

In [26]:
print(model)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [27]:
print(len(train_dataloader))

22275


In [28]:
# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# trange is a tqdm wrapper around the normal python range
for i in range(epochs):
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Initialise epoch loss
  epoch_loss = 0

  # Train the data for one epoch
  for batch_number, batch in enumerate(train_dataloader):
    # Pass data to the specified device as well
    batch = tuple(data for data in batch)

    # Unpack the inputs from our dataloader
    batch_input_ids, batch_input_mask, batch_labels = batch

    # Clear out the gradients (by default they accumulate)
    # Call optimizer.zero_grad() to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration
    optimizer.zero_grad()

    # Forward pass
    # See https://huggingface.co/docs/transformers/v4.39.1/en/model_doc/xlnet#transformers.XLNetForSequenceClassification
    # See https://huggingface.co/docs/transformers/en/model_doc/xlnet#transformers.models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput
    outputs = model(batch_input_ids.to(device), token_type_ids=None, attention_mask=batch_input_mask.to(device), labels=batch_labels.to(device))
    
    if not torch.device("cpu"):
      batch_input_ids, batch_input_mask, batch_labels = batch_input_ids.to("cpu"), batch_input_mask.to("cpu"), batch_labels.to("cpu") # move all the inputs back to cpu to free up gpu memory
    
    loss = outputs[0]

    epoch_loss += loss.item()

    # Backward pass
    # Compute derivatives of loss function wrt parameters
    # When doing backward propagation, PyTorch accumulates the gradients, i.e. the value of computed gradients is added to the grad property of all leaf nodes of computational graph
    loss.backward()

    # Update parameters and take a step using the computed gradient
    # Adjust the parameters by the gradients collected in the backward pass
    optimizer.step()

    # del batch_input_ids, batch_input_mask, batch_labels, outputs
    

    # print("")
    # print("")


  print("Loss for epoch " + str(i+1) + " is: " + str(epoch_loss / len(train_dataset)))

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), "xlnetmodel_with_augments.pt")