# Code for the clarity task in the SIOP ML Competition 2024
Author: Zihao Jia zjia2@gmu.edu


***Note: all the existing outputs in the current file were original outputs we used for submission.***

## Analysis strategy
### Fine-tune a pre-trained LLM (DeBERTa-V3-Large) to predict clarity scores

In [None]:
# mount google drive
# Colab-specific: relies on the google.colab package. The data paths defined
# further down all live under the '/content/drive' mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## install required packages
# NOTE(review): versions are not pinned; the recorded run resolved `datasets`
# to 2.18.0. Consider pinning every package so the notebook can be reproduced.
! pip install datasets
! pip install transformers
! pip install transformers[torch]
! pip install evaluate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

### Step 1: Data preparation

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

In [None]:
# Locations of the competition CSV files. Change DATA_DIR if your copy of the
# data lives somewhere else.
DATA_DIR = '/content/drive/MyDrive/2024 SIOP competition/Reproduce/data'

train_path = f'{DATA_DIR}/clarity_train.csv'
val_path = f'{DATA_DIR}/clarity_val_public.csv'
test_path = f'{DATA_DIR}/Copy of clarity_test_public.csv'

In [None]:
# Read the labelled training items and the unlabelled public test items.
raw_df_all = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Only the item text and its clarity rating are needed for fine-tuning.
raw_df = raw_df_all[['personality_item', 'clarity']]

# The first N_TRAIN_ROWS rows are used for training; every remaining row is
# held out as the validation split (6 rows in the recorded run).
N_TRAIN_ROWS = 24

# From here on the input column is called 'text' and the target 'labels'.
COLUMN_RENAMES = {'personality_item': 'text', 'clarity': 'labels'}
labelled_df = raw_df.rename(columns=COLUMN_RENAMES)

# Wrap each pandas slice in a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(labelled_df.iloc[:N_TRAIN_ROWS])
validation_dataset = Dataset.from_pandas(labelled_df.iloc[N_TRAIN_ROWS:])
test_dataset = Dataset.from_pandas(
    test_df[['personality_item']].rename(columns={'personality_item': 'text'})
)

# Bundle the three splits so they can be tokenized together later.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset,
})

In [None]:
# Sanity check: display the clarity targets of the training split.
dataset_dict['train']['labels']

[3.421052631578947,
 6.545454545454546,
 6.545454545454546,
 3.75,
 5.210526315789473,
 3.0,
 6.333333333333333,
 6.333333333333333,
 5.363636363636363,
 3.727272727272727,
 3.2,
 5.315789473684211,
 5.25,
 3.222222222222222,
 5.375,
 3.727272727272727,
 3.4,
 3.117647058823529,
 3.3125,
 3.529411764705882,
 6.636363636363637,
 5.2,
 5.333333333333333,
 6.666666666666667]

### Step 2: Tokenization and fine-tuning

**Tokenization**

In [None]:
# Tokenization with pre-trained tokenizer from DeBERTa-V3-large
from transformers import AutoTokenizer

# Checkpoint identifier; the same value is reused below when the model is loaded.
model_ckpt = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# define tokenization function
def tokenize(batch):
    """Encode the "text" entry of a batch with the module-level ``tokenizer``.

    Args:
        batch: mapping with a "text" entry holding the strings to encode.

    Returns:
        Whatever ``tokenizer`` returns for those strings when called with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
# apply tokenization to the whole data dictionary
# NOTE(review): per the `datasets` documentation, batch_size=None should hand
# each split to `tokenize` as one single batch, so padding=True pads every row
# of a split to that split's longest item - confirm for the installed version.
dataset_encoded = dataset_dict.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# tokenized data
# Display the tokenized splits to check the added columns and the row counts.
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6
    })
    test: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})

**Fine-tune model**

In [None]:
import torch

In [None]:
# set model name and working device. Set it to GPU. I used V100 on Colab.
# NOTE(review): AutoModel is not used in the visible cells, and model_ckpt
# repeats the value already assigned in the tokenizer cell.
from transformers import AutoModel

model_ckpt = "microsoft/deberta-v3-large"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head and move
# it to the working device. I used V100 on Colab.
from transformers import AutoModelForSequenceClassification, set_seed

# Fix the RNG state BEFORE the model is built so repeated runs start from the
# same weights. The classification head is not part of the pre-trained
# checkpoint and is presumably initialised randomly, so without a seed every
# run starts from a different head.
# The outputs recorded in this notebook were produced without this seed and
# will therefore not be matched exactly.
SEED = 42
set_seed(SEED)

# set num_labels to 1 to indicate a regression task
num_labels = 1
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device)

In [None]:
# eval metrics for regression task
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import r_regression
from scipy.stats import pearsonr

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for the ``Trainer`` evaluation loop.

    Args:
        eval_pred: pair ``(predictions, labels)`` of array-likes holding one
            predicted score and one target score per example. Either array
            may carry a trailing singleton dimension.

    Returns:
        dict with
            "mse": mean squared error between targets and predictions,
            "r":   Pearson correlation between targets and predictions
                   (a plain Python float).
    """
    logits, labels = eval_pred
    # Flatten both sides so the element-wise comparison never relies on
    # broadcasting between (N,) and (N, 1) shaped arrays.
    flat_preds = logits.reshape(-1)
    flat_labels = labels.reshape(-1)

    mse = mean_squared_error(flat_labels, flat_preds)
    # pearsonr returns (statistic, p-value); only the statistic is reported.
    r_statistic = pearsonr(flat_labels, flat_preds)[0]
    return {"mse": mse, "r": float(r_statistic)}

In [None]:
# parameter setting for the model.
# I forgot to set a seed here (lame). However, you can find all my original outputs here.
from transformers import Trainer, TrainingArguments

batch_size = 4
# Number of batches in one pass over the training split, so that the training
# loss is logged about once per epoch; never lower than 1.
logging_steps = max(1, len(dataset_dict["train"]) // batch_size)
# Checkpoints are written to a directory named after the checkpoint id.
model_name = f"{model_ckpt}-finetuned"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=10,
                                  learning_rate=5e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.1, # weight decay is important here
                                  evaluation_strategy="epoch",
                                  save_strategy = "epoch",
                                  disable_tqdm=False,
                                  save_total_limit = 1,
                                  logging_steps=logging_steps,
                                  # Reload the best checkpoint once training ends,
                                  # where "best" means the lowest validation MSE.
                                  load_best_model_at_end=True,
                                  metric_for_best_model="mse",
                                  # TrainingArguments has no `larger_is_better`
                                  # argument; the documented name is
                                  # `greater_is_better`. False = lower MSE wins.
                                  greater_is_better=False,
                                  log_level="error")

In [None]:
# Fine-tune the model with the settings defined above.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
    # Reports MSE and Pearson r on the validation split at every evaluation.
    compute_metrics=compute_metrics_for_regression,
)
# The trailing semicolon keeps the notebook from echoing the return value.
trainer.train();

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse,R
1,15.5855,8.100904,8.100904,-0.501141
2,3.4189,0.949205,0.949205,0.732339
3,1.4994,0.908699,0.908699,0.92457
4,1.1473,0.56797,0.56797,0.871088
5,0.5116,1.19951,1.19951,0.701061
6,0.3112,0.243839,0.243839,0.88898
7,0.194,0.252729,0.252729,0.881606
8,0.1917,1.014553,1.014553,0.813224
9,0.1836,0.402436,0.402436,0.873694
10,0.3755,0.193579,0.193579,0.908692


**Prediction and error analysis**

In [None]:
# With load_best_model_at_end=True and MSE as the selection metric, the trainer
# should now hold the best checkpoint (the last epoch in the recorded run).
# Re-predict on the validation split to double-check.
preds_output = trainer.predict(dataset_encoded["validation"])

In [None]:
# Metrics of the validation predictions; in the recorded output the keys carry
# a 'test_' prefix even though the validation split was used.
preds_output.metrics

{'test_loss': 0.1935788244009018,
 'test_mse': 0.1935788244009018,
 'test_r': 0.9086915281904749,
 'test_runtime': 0.1347,
 'test_samples_per_second': 44.552,
 'test_steps_per_second': 14.851}

In [None]:
# Get the predictions for the validation set.
# NOTE(review): np is imported here but not used in the visible cells.
import numpy as np
y_preds = preds_output.predictions
y_preds

array([3.618887 , 6.2174873, 6.5522437, 5.541268 , 6.1789093, 5.8596177],
      dtype=float32)

In [None]:
# Get the predictions for the test set (this split has no labels column).
test_output = trainer.predict(dataset_encoded["test"])

In [None]:
# Keep only the raw model outputs of the test prediction result.
test_preds = test_output.predictions

In [None]:
# Display the test predictions.
# NOTE(review): the recorded output below is a list rounded to two decimals,
# so it was presumably produced after a rounding step that is not shown here.
test_preds

[5.79,
 3.46,
 3.4,
 6.49,
 6.61,
 5.6,
 5.23,
 3.52,
 6.61,
 5.97,
 5.37,
 5.65,
 6.52,
 6.32,
 6.42,
 6.34,
 3.8,
 4.33,
 6.04,
 3.43,
 4.99,
 5.62,
 6.31,
 6.26,
 6.6,
 4.37,
 4.55,
 6.51,
 4.06,
 3.59,
 5.96,
 4.02,
 3.55,
 3.59,
 6.61,
 3.42,
 6.13,
 6.32,
 4.23,
 6.13,
 5.54,
 3.81,
 5.28,
 6.56,
 6.37,
 5.97,
 6.26,
 6.05,
 3.5,
 3.6,
 6.23,
 6.31,
 6.74,
 3.73,
 6.44,
 5.06,
 3.97,
 5.95,
 3.41,
 6.28,
 3.27,
 5.58,
 6.39,
 5.86,
 4.11,
 6.38,
 6.04,
 5.73,
 6.47,
 6.69]