In [1]:
from transformers import XLNetModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np

from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim

from model import *


RANDOM_SEED = 20
NUMBER_LABELS = 5 
NUMBER_FEATURES = 9
MODEL = 'charlieoneill/distilbert-base-uncased-finetuned-tweet_eval-offensive'
tokenizer = XLNetTokenizer.from_pretrained(MODEL)

BATCH_SIZE = 8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.get_device_name(0)
torch.cuda.empty_cache()

# Load data

In [4]:
df =  pd.read_csv("../../data/data_old_v2/processed_data_Q"+str(Q_NO)+".csv").drop(['plagiarized_score', 'plagiarized_index'],axis=1)

df = df[~df['ID'].isin(IDS)]

# transform label
df[VAR_S] = df[VAR_S]-1
df[VAR_S].value_counts()

scaler = StandardScaler()
features = pd.DataFrame(scaler.fit_transform(df.loc[:,'length_in_words':]))
df = pd.concat([df.loc[:,['ID',VAR_Q,VAR_S]], features], axis=1)

# define the downsampling condition
condition =  f'{VAR_S} == 1 |  {VAR_S} == 2'

# filter the dataframe based on the condition
downsampled_df = df.query(condition)

# randomly select a subset of rows to keep
frac_to_keep = 0.5  # fraction of rows to keep
downsampled_df = downsampled_df.sample(frac=frac_to_keep)

df = df.query(f'{VAR_S} != 1 & {VAR_S} != 2')

df = pd.concat([df,downsampled_df],axis=0)

In [5]:
df_train_val, df_test = train_test_split(df, test_size=0.15,random_state=RANDOM_SEED)
df_train, df_val = train_test_split(df_train_val,test_size=0.2,random_state=RANDOM_SEED)

df_train = df_train.dropna()
df_val = df_val[df_val['ID']!=1].dropna()
df_test = df_test[df_test['ID']!=1].dropna()

In [6]:
print(len(df_train))
print(len(df_val))
print(len(df_test))

1097
248
219


# Create a pytorch dataset class and dataloader

In [9]:
train_data_loader = create_data_loader(df_train.drop(columns=['ID']), tokenizer, 550, BATCH_SIZE)
val_data_loader = create_data_loader(df_val.drop(columns=['ID']), tokenizer, 550, BATCH_SIZE)
test_data_loader = create_data_loader(df_test.drop(columns=['ID']), tokenizer, 550)

# Model and loss function

# Training loop

In [14]:
EPOCHS = 6

model = LFG_grading(NUMBER_LABELS,NUMBER_FEATURES).to(device)
model.requires_grad_embeddings(True)

optimizer = optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = OrdinalLoss(weight=class_weights).to(device)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
save_model(model,f"{NAME}.pth")

Model saved to Q6.pth


In [19]:
# model = load_model(LFG_grading,NUMBER_LABELS,NUMBER_FEATURES,f"{NAME_TO_SAVE}.pth")