In [None]:
from transformers import XLNetModel, XLNetTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np

from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# --- Notebook-wide configuration -------------------------------------------
RANDOM_SEED = 10
MODEL = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained(MODEL)
NUMBER_LABELS = 5
BATCH_SIZE = 8

# Seed the libraries that draw random numbers in this notebook so that weight
# initialisation and dropout are repeatable across "Restart & Run All".
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `get_device_name` raises when no CUDA device is present, so only query it on
# GPU machines; on CPU the cell simply displays "cpu".
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"


# Load data

In [None]:
# Read the processed CSV, keep only the two columns used below and drop every
# row in which either of them is missing.
df = (
    pd.read_csv("../data/processed_data_v3.csv")
    .loc[:, ["Q2_Q", "Q2_A"]]
    .dropna()
)

In [None]:
# Show the loaded frame (the notebook renders the cell's last expression).
df

# Transform labels: shift every score down by one

In [None]:
# Keep the untouched scores in `Q2_A_raw` the first time this cell runs and
# always derive `Q2_A` from that copy. A plain `df.Q2_A = df.Q2_A - 1` shifts the
# labels again on every re-run of the cell, eventually producing negative ids.
if "Q2_A_raw" not in df.columns:
    df["Q2_A_raw"] = df["Q2_A"]
df["Q2_A"] = df["Q2_A_raw"] - 1
df.Q2_A.value_counts()

# Create a pytorch dataset class and dataloader

In [None]:
class LFG_dataset(Dataset):
  """Map-style dataset that tokenizes one text per item and pairs it with its score.

  Args:
    answers: indexable collection of texts.
    scores: indexable collection of integer labels, aligned with `answers`.
    tokenizer: callable used to encode a text; it is called with `max_length`,
      `padding`, `truncation` and `return_tensors` keyword arguments and must
      return a mapping with `input_ids` and `attention_mask` tensors.
    max_len: length every encoded sequence is padded / truncated to.
  """

  def __init__(self, answers, scores, tokenizer, max_len):
    self.answers = answers
    self.scores = scores
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.answers)

  def __getitem__(self, idx):
    """Return the raw text, its encoded tensors and its label for index `idx`."""
    answer = self.answers[idx]
    score = self.scores[idx]

    # Use the tokenizer handed to this instance (not a module-level global) so
    # the dataset works with whatever tokenizer the caller supplied.
    encoding = self.tokenizer(
      answer,
      max_length=self.max_len,
      padding='max_length',
      # Without truncation, texts longer than `max_len` yield longer tensors
      # than the rest and cannot be stacked into a batch by the DataLoader.
      truncation=True,
      return_tensors='pt',
    )

    return {
      'answer_text': answer,
      # The tokenizer returns tensors with a leading batch axis; flatten to 1-D.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'score': torch.tensor(score, dtype=torch.long)
    }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size=1, shuffle=False):
  """Wrap the `Q2_Q` / `Q2_A` columns of `df` in an `LFG_dataset` and a `DataLoader`.

  Args:
    df: DataFrame with a text column `Q2_Q` and a label column `Q2_A`; the
      labels are cast to `int`.
    tokenizer: tokenizer forwarded to `LFG_dataset`.
    max_len: sequence length forwarded to `LFG_dataset`.
    batch_size: number of items per batch (default 1).
    shuffle: when True the loader reshuffles the data every epoch; the default
      False keeps the original, sequential order.

  Returns:
    A `torch.utils.data.DataLoader` over the dataset.
  """
  ds = LFG_dataset(
    # `.to_numpy()` drops the pandas index so items are addressed positionally.
    answers=df.Q2_Q.to_numpy(),
    scores=df.Q2_A.to_numpy().astype(int),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle
  )

### TODO: find class centroids, taking only the centroids of abundant classes such as 2-3

In [None]:
# Sequence length passed to every data loader (was the literal 600, repeated).
MAX_LEN = 600

# Hold out 20% for testing, then 20% of the remainder for validation
# (i.e. 64% train / 16% validation / 20% test), both with the same seed.
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_train, df_val = train_test_split(df_train_val, test_size=0.2, random_state=RANDOM_SEED)

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
# The test loader relies on create_data_loader's default batch size of 1.
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN)

# Model and loss function

In [None]:
class LFG_grading(nn.Module):
    """Pretrained XLNet encoder followed by a three-layer MLP classification head.

    `forward` returns raw, unnormalised class scores (logits) of shape
    `(batch, n_classes)`. `nn.CrossEntropyLoss` applies log-softmax itself, so
    feeding it already-softmaxed probabilities normalises twice and flattens the
    gradients. `argmax` over logits equals `argmax` over probabilities; apply
    `self.softmax` to the output when explicit probabilities are needed.

    Args:
        n_classes: number of output classes.
    """

    def __init__(self, n_classes):
        super(LFG_grading, self).__init__()
        self.xlnet = XLNetModel.from_pretrained(MODEL)

        self.drop = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(self.xlnet.config.hidden_size, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, n_classes)
        # Not applied in `forward`; kept so callers can turn logits into
        # probabilities with `model.softmax(logits)`.
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Encode the batch with XLNet and return class logits."""
        output = self.xlnet(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        # output[0] is the sequence of hidden states. The summary (CLS) token
        # embedding is read from the LAST position here; when switching to
        # BERT, change the index from -1 to 0.
        cls_embeddings = output[0][:, -1, :]
        output = self.drop(cls_embeddings)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.drop(output)
        output = self.fc2(output)
        output = self.relu(output)
        output = self.drop(output)
        # Return logits: the loss function is responsible for the softmax.
        return self.fc3(output)

    def requires_grad_embeddings(self, val):
        """Freeze (`val=False`) or unfreeze (`val=True`) every XLNet parameter."""
        for param in self.xlnet.parameters():
            param.requires_grad = val
  

In [None]:
class OrdinalLoss(nn.Module):
    """Sum of squared differences between `output` and `target`.

    The element-wise squared error is summed over dimension 1 and then over the
    remaining entries, giving a single scalar for the whole batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, output, target):
        """Return the total squared error between `output` and `target`."""
        squared_error = nn.functional.mse_loss(output, target, reduction='none')
        per_row = squared_error.sum(dim=1)
        return per_row.sum()

# Training loop

In [None]:
EPOCHS = 6

# Size the classification head from the shared constant instead of a literal 5.
model = LFG_grading(NUMBER_LABELS).to(device)
# Start with the XLNet backbone frozen so that only the head is trainable.
model.requires_grad_embeddings(False)

# All parameters (including the currently frozen backbone) are registered with
# the optimizer; parameters without gradients are skipped by `step()`.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples,
                unfreeze_after=3):
  """Run one training pass over `data_loader`.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)`; must
      also provide `requires_grad_embeddings(bool)`.
    data_loader: iterable of batches with `input_ids`, `attention_mask` and
      `score` tensors.
    loss_fn: callable `loss_fn(outputs, targets)` returning a scalar tensor.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: number of examples used as the accuracy denominator.
    unfreeze_after: zero-based batch index at which
      `model.requires_grad_embeddings(True)` is called (default 3, the value
      previously hard-coded). `None` never unfreezes.

  Returns:
    `(accuracy, mean_loss)` where accuracy is a float64 tensor and mean_loss is
    the mean of the per-batch losses (NaN when the loader yields no batch).
  """
  model = model.train()

  losses = []
  # A tensor accumulator keeps the final `.double()` valid even when the loader
  # is empty (a plain int 0 has no `.double()` method).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for i, d in enumerate(tqdm(data_loader)):
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["score"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    # Unfreeze once, at the chosen batch. This batch's forward pass has already
    # run, so the unfrozen parameters first receive gradients on the next batch.
    if i == unfreeze_after:
      model.requires_grad_embeddings(True)

    loss = loss_fn(outputs, targets)
    losses.append(loss.item())
    loss.backward()

    predictions = outputs.argmax(dim=1)
    correct_predictions += torch.sum(predictions == targets)

    # Clip the global gradient norm before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
def val(model, data_loader, device, n_example):
    """Evaluate `model` on `data_loader` without tracking gradients.

    Args:
        model: module called as `model(input_ids=..., attention_mask=...)`.
        data_loader: iterable of batches with `input_ids`, `attention_mask`
            and `score` tensors.
        device: device the model and batch tensors are moved to.
        n_example: number of examples used as the accuracy denominator.

    Returns:
        `(accuracy, targets, predictions)`: accuracy is a float64 tensor;
        targets and predictions are 1-D long tensors in loader order.
    """
    # Seed both lists with an empty tensor so `torch.cat` also works when the
    # loader yields no batch at all.
    prediction_chunks = [torch.tensor([], dtype=torch.long, device=device)]
    target_chunks = [torch.tensor([], dtype=torch.long, device=device)]

    model = model.to(device)
    model = model.eval()
    # No autograd graph is needed for evaluation; disabling it saves memory.
    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            score = d["score"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            prediction_chunks.append(outputs.argmax(dim=1))
            target_chunks.append(score)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = torch.cat(prediction_chunks, dim=0)
    targets = torch.cat(target_chunks, dim=0)

    correct_predictions = torch.sum(predictions == targets)

    return (correct_predictions.double() / n_example), targets, predictions

In [None]:
best_accuracy = 0
# Per-epoch history of the values returned by train_epoch.
train_acc = []
train_loss = []

for epoch in range(EPOCHS):
  # Epoch banner followed by a ten-dash rule on its own line.
  print(f'Epoch {epoch + 1}/{EPOCHS}', '-' * 10, sep='\n')

  acc, loss = train_epoch(
    model=model,
    data_loader=train_data_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
    scheduler=scheduler,
    n_examples=len(df_train),
  )

  train_acc.append(acc)
  train_loss.append(loss)
  # Report the epoch's metrics, then leave one blank line.
  print(f'Train loss {loss} accuracy {acc}', end='\n\n')

In [None]:
# Score the trained model on the validation split; `true` and `predict` hold
# the target and predicted labels returned by `val`.
val_acc, true, predict = val(
    model=model,
    data_loader=val_data_loader,
    device=device,
    n_example=len(df_val),
)