### Imports

In [4]:
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler,Dataset,DataLoader,IterableDataset
import torch.nn.functional as F

import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, XLNetForSequenceClassification

from sklearn.model_selection import train_test_split

from keras_preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd
import os

### Use Google Colab GPU if available

In [5]:
# Return a bool indicating if CUDA is currently available.
if torch.cuda.is_available():
  # Use CUDA-enabled GPU
  device = torch.device("cuda:0")
  print("GPU is Available")
  torch.cuda.empty_cache()
  print(torch.cuda.get_device_name(0))

# there is still a bug with mps so ima comment it out for now - Ryan
# elif torch.backends.mps.is_available(): # for mac os metal
#     device = torch.device("mps")
#     print("MPS available, using MPS instead")
    
else:
  device = torch.device("cpu")
  print("GPU is Not Available. Use CPU")

GPU is Not Available. Use CPU


### Load Data

In [6]:
# Read Training Dataset
# https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92
# filepath_train = 'https://raw.githubusercontent.com/johnlohjy/SNLP_Project/XLNet_John/data/train_2024.csv'
# from google.colab import files
# uploaded = files.upload()
filepath_train = 'data/test_2024.csv'
df = pd.read_csv(filepath_train, quoting=3)

### Pre-Process Data

In [7]:
# Step 1: Get sentences
sentences = list(df.loc[:, 'text'])

In [8]:
"""
Step 2: Add Sepcial tokens [SEP] (end of sentence token) and [CLS] (classification token) to the end of sequences first
https://datascience.stackexchange.com/questions/66207/what-is-purpose-of-the-cls-token-and-why-is-its-encoding-output-important
https://towardsdatascience.com/fastai-with-transformers-bert-roberta-xlnet-xlm-distilbert-4f41ee18ecb2
https://huggingface.co/docs/transformers/model_doc/xlnet#xlnettokenizer
"""
sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]
print(sentences[:3])

['I get the odd feeling Klastri  the head of the ACLU of Hawaii  will step in and defend this scum for freedom of speech. [SEP] [CLS]', "I couldn't disagree more with this column; Canadians have moved on and don't care how primarily the Progressive media and others label people. Why don't we stop putting labels on people.  Singh's biggest problem will be if he has to under the LEAP agenda. It really is that simple. [SEP] [CLS]", "Does the property owner have a vote in the  tax increase?  And maybe it's time to scale back the parks  and a government worker retirement that comes with these parks.  If the neighborhood wants the park let them maintain it without government. If the neighborhood doesn't like the park let it go to waste.  There must be greener pastures outside of the state and the ponytail man [SEP] [CLS]"]


In [9]:
"""
Step 3: Initialise tokenizer
Initialise word tokenizer to be used
SentencePiece Tokenizer is used by XLNetTokenizer. It can handle all words, special characters and spaces easily
https://huggingface.co/docs/transformers/en/tokenizer_summary#sentencepiece
https://aman.ai/primers/ai/tokenizer/#sentencepiece
"""
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Step 4: Tokenize Text
tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]

In [10]:
print(tokenized_text[:3])

[['▁I', '▁get', '▁the', '▁odd', '▁feeling', '▁Kla', 's', 'tri', '▁the', '▁head', '▁of', '▁the', '▁ACLU', '▁of', '▁Hawaii', '▁will', '▁step', '▁in', '▁and', '▁defend', '▁this', '▁', 's', 'cum', '▁for', '▁freedom', '▁of', '▁speech', '.', '▁[', 'S', 'EP', ']', '▁[', 'CL', 'S', ']'], ['▁I', '▁couldn', "'", 't', '▁disagree', '▁more', '▁with', '▁this', '▁column', ';', '▁Canadians', '▁have', '▁moved', '▁on', '▁and', '▁don', "'", 't', '▁care', '▁how', '▁primarily', '▁the', '▁Progressive', '▁media', '▁and', '▁others', '▁label', '▁people', '.', '▁Why', '▁don', "'", 't', '▁we', '▁stop', '▁putting', '▁labels', '▁on', '▁people', '.', '▁Singh', "'", 's', '▁biggest', '▁problem', '▁will', '▁be', '▁if', '▁he', '▁has', '▁to', '▁under', '▁the', '▁', 'LE', 'AP', '▁agenda', '.', '▁It', '▁really', '▁is', '▁that', '▁simple', '.', '▁[', 'S', 'EP', ']', '▁[', 'CL', 'S', ']'], ['▁Does', '▁the', '▁property', '▁owner', '▁have', '▁a', '▁vote', '▁in', '▁the', '▁tax', '▁increase', '?', '▁And', '▁maybe', '▁it', "'", 

In [35]:
# Get max length of the sequence just to see
MAX_LEN = 935
MAX_LEN_TEST = max(len(sent) for sent in tokenized_text)

print("The Max Length of a Sentence is: ")
print(MAX_LEN_TEST)

The Max Length of a Sentence is: 
352


In [36]:
"""
Step 5: Prepare inputs for XLNet
1) Input IDs
   - Seq of integers identifying each input token (from our tokenized text) to its index number in the XLNet tokenizer vocabulary

2) Attention Mask
   - Helps the model to focus on actual words vs padding

3) Labels
"""

# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokenized_text]

# Pad the sequence using keras. Truncate: if len of sequence is less than our MAX_LEN, we cut it from the back
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create the attention masks
attention_masks = []
for sequence in input_ids:
    sequence_masked = [float(i>0) for i in sequence]
    attention_masks.append(sequence_masked)

In [37]:
print(attention_masks[:3])
#print(sequence_masked[:3])

[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

### Initialise the PyTorch DataLoader

In [38]:
# Define an IterableDataset to tackle memory issue. Stream data.
class CustomIterableDataset(IterableDataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __iter__(self):
        for input_ids, attention_masks in zip(self.input_ids, self.attention_masks):
            yield (input_ids, attention_masks)

    def __len__(self):
        return len(self.input_ids)


In [39]:
# Convert all of our data into torch tensors, the required datatype for our model

inputs = torch.tensor(input_ids)

masks = torch.tensor(attention_masks)

In [40]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of
# 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

dataset = CustomIterableDataset(inputs,masks)
dataloader = DataLoader(dataset, batch_size=batch_size)

### Initialise and Load the Model

In [41]:
"""
Load the model: XLNEtForSequenceClassification, the pretrained XLNet model with an added single linear classification layer on top.

As we feed input data, the entire pre-trained XLNet model and the additional untrained classification layer is trained on our specific task.
"""
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Load the model
#model.load_state_dict(torch.load("xlnetmodel.pt",map_location=torch.device('cpu')))
model.load_state_dict(torch.load("xlnetmodel.pt",map_location=torch.device(device)))


<All keys matched successfully>

### Use the Model for Prediction

In [44]:
df.head()

Unnamed: 0,id,text,label
0,0,I get the odd feeling Klastri the head of the...,?
1,1,I couldn't disagree more with this column; Can...,?
2,2,Does the property owner have a vote in the ta...,?
3,3,Shawn do you think it may be due to the fact ...,?
4,4,You proved she turned over 100% of the relevan...,?


In [45]:
# Set the model to evaluation mode
model.eval()

predictions = []

with torch.no_grad():
    for batch in dataloader:
      # Pass data to the specified device as well
      
      batch = tuple(data.to(device) for data in batch)
      input_ids, attention_masks = batch
      #print(attention_masks)

      # See https://huggingface.co/docs/transformers/en/model_doc/xlnet#transformers.models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput
      outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
      # Prediction
      logits = outputs.logits  # Assuming model outputs logits

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()

      # Store predictions and true labels
      predictions.extend(np.argmax(logits, axis=1).flatten())

KeyboardInterrupt: 

In [29]:
df_prediction = pd.DataFrame({'id': list(df.loc[:, 'id']), 'label': predictions})

In [30]:
df_prediction.head()

Unnamed: 0,id,label
0,0,1
1,1,0
2,2,0
3,3,1
4,4,1


In [32]:
df_prediction.to_csv('Group29_submission.csv', index=False)