In [1]:
import torch
import gc

gc.collect()  # Runs Python's garbage collector
torch.cuda.empty_cache()  # Clears PyTorch's cache

In [2]:
!pip install optuna
!pip install evaluate



In [3]:
import gc
import torch
import pandas as pd
import ast  # for list-like labels, if needed
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from evaluate import load
import numpy as np
import re

In [4]:
# -------------------------------
# Data Preparation and Tokenization
# -------------------------------

# Define column names and load datasets
columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']
df_texts = pd.read_csv("dontpatronizeme_pcl.tsv", sep="\t", header=None, names=columns)
df_labels = pd.read_csv("train_semeval_parids-labels.csv")

# Ensure the IDs are strings for merging
df_texts["par_id"] = df_texts["par_id"].astype(str)
df_labels["par_id"] = df_labels["par_id"].astype(str)

# Drop unnecessary label column from df_labels and create binary labels in df_texts
df_labels = df_labels.drop(columns=["label"])
df_texts["binary_label"] = df_texts["label"].apply(lambda x: 1 if x >= 2 else 0)
df_texts = df_texts.drop(columns=["label"])

# Merge datasets on paragraph ID and rename the binary label to "label"
df = df_labels.merge(df_texts, on="par_id", how="left")
df.rename(columns={"binary_label": "label"}, inplace=True)
df = df.dropna(subset=["text", "label"])  # Drop rows with missing data

# def clean_text(text):
#     text = re.sub(r"\s+", " ", text)  # Remove extra spaces
#     text = re.sub(r"[^a-zA-Z0-9.,!?'\"]", " ", text)  # Remove special characters
#     return text.strip()

# # Ensure text column has no NaN values before applying text cleaning
# df["text"] = df["text"].astype(str).apply(clean_text)

def clean_text(text):
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)
    # Optionally remove only very specific noise
    # For example, remove non-ASCII characters:
    # text = re.sub(r"[^\x00-\x7F]+", " ", text)
    return text.strip()
df["text"] = df["text"].astype(str).apply(clean_text)
print("Preprocessing and balancing complete!")

# Initialize the tokenizer and model
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base", num_labels=2, ignore_mismatched_sizes=True
)

# Tokenization function
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Tokenize the texts and stack tensors
df["tokenized"] = df["text"].apply(lambda x: tokenize_function(x))
input_ids = torch.cat([t["input_ids"] for t in df["tokenized"]], dim=0)
attention_masks = torch.cat([t["attention_mask"] for t in df["tokenized"]], dim=0)
labels = torch.tensor(df["label"].values, dtype=torch.long)

Preprocessing and balancing complete!


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# -------------------------------
# Create Dataset and Sampler
# -------------------------------

class PCLDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": self.labels[idx],
        }

# Split data into training and validation sets using indices
indices = list(range(len(df)))
train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)

train_ids, val_ids = input_ids[train_idx], input_ids[val_idx]
train_masks, val_masks = attention_masks[train_idx], attention_masks[val_idx]
train_labels, val_labels = labels[train_idx], labels[val_idx]

train_dataset = PCLDataset(train_ids, train_masks, train_labels)
val_dataset = PCLDataset(val_ids, val_masks, val_labels)


In [7]:
# -------------------------------
# Define Metric Computation
# -------------------------------

def compute_metrics(eval_pred):
    """Computes Accuracy and F1 Score"""
    accuracy_metric = load("accuracy")
    f1_metric = load("f1")
    predictions = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    accuracy_score = accuracy_metric.compute(predictions=predictions, references=references)
    f1_score = f1_metric.compute(predictions=predictions, references=references)
    return {"accuracy": accuracy_score["accuracy"], "f1": f1_score["f1"]}


In [9]:
import optuna
# -------------------------------
# Define the Hyperparameter Tuning Objective
# -------------------------------

def objective(trial):
    # Suggest hyperparameters (updated to use suggest_float with log=True)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 0.3)

    # best_params = study.best_trial.params
    learning_rate = learning_rate
    weight_decay = weight_decay
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=learning_rate,
        optim="adamw_torch",
        warmup_ratio=0.1,
        evaluation_strategy="epoch",
        save_strategy="no", # change to no
        per_device_train_batch_size=4,  # Reduce from 8 or 16 to 4 or even 2
        per_device_eval_batch_size=4,   # Match train batch size
        num_train_epochs=2,
        weight_decay=weight_decay,
        logging_dir="./logs",
        logging_steps=50,
        save_total_limit=2,
        lr_scheduler_type="linear",
        load_best_model_at_end=False,
        metric_for_best_model="f1",
        report_to="none",
        fp16=False,  # Enables mixed precision training (reduces memory usage)
        bf16=False,  # Keep this False unless on Ampere GPUs (A100, RTX 30xx)
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics

    )

    # Train and evaluate the model
    trainer.train()
    eval_result = trainer.evaluate()

    # Use F1 score as the objective to maximize
    f1 = eval_result["eval_f1"]
    print(f"Trial completed with F1: {f1}")
    return f1



# -------------------------------
# Run Hyperparameter Tuning with Optuna
# -------------------------------

# Create a study that maximizes the F1 score
study = optuna.create_study(direction="maximize")
# Run the optimization for a fixed number of trials 
study.optimize(objective, n_trials=4)

# Print the best hyperparameters found
print("Best hyperparameters:", study.best_trial.params)

[I 2025-03-02 06:28:23,192] A new study created in memory with name: no-name-f863f8fb-ebac-43fa-9ff9-9de310d449f5


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3418,0.288491,0.914627,0.375546
2,0.1535,0.333167,0.920597,0.519856


[I 2025-03-02 07:01:07,761] Trial 0 finished with value: 0.51985559566787 and parameters: {'learning_rate': 5.5191723330166645e-06, 'weight_decay': 0.15474235967147895}. Best is trial 0 with value: 0.51985559566787.


Trial completed with F1: 0.51985559566787




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2622,0.328028,0.922388,0.5
2,0.1279,0.45895,0.915821,0.494624


[I 2025-03-02 07:33:34,687] Trial 1 finished with value: 0.4946236559139785 and parameters: {'learning_rate': 2.767535104036935e-05, 'weight_decay': 0.0012240300749508549}. Best is trial 0 with value: 0.51985559566787.


Trial completed with F1: 0.4946236559139785




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0888,0.542527,0.912836,0.48951
2,0.0949,0.602129,0.914627,0.47619


[I 2025-03-02 08:06:00,952] Trial 2 finished with value: 0.47619047619047616 and parameters: {'learning_rate': 3.180170877736629e-05, 'weight_decay': 0.2054146010958673}. Best is trial 0 with value: 0.51985559566787.


Trial completed with F1: 0.47619047619047616




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.741714,0.918806,0.403509
2,0.086,0.771509,0.91403,0.503448


[I 2025-03-02 08:38:27,224] Trial 3 finished with value: 0.503448275862069 and parameters: {'learning_rate': 5.276570067741112e-06, 'weight_decay': 0.09815468178932993}. Best is trial 0 with value: 0.51985559566787.


Trial completed with F1: 0.503448275862069
Best hyperparameters: {'learning_rate': 5.5191723330166645e-06, 'weight_decay': 0.15474235967147895}


Evaluating on official dev set

In [10]:
columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

df_dev = pd.read_csv("dev_semeval_parids-labels.csv")
df_texts = pd.read_csv("dontpatronizeme_pcl.tsv", sep="\t", header=None, names=columns)

df_dev["par_id"] = df_dev["par_id"].astype(str)
df_texts["par_id"] = df_texts["par_id"].astype(str)

df_dev = df_dev.drop(columns=["label"])
df_texts["binary_label"] = df_texts["label"].apply(lambda x: 1 if x >= 2 else 0)
df_texts = df_texts.drop(columns=["label"])

# Merge datasets on paragraph ID and rename the binary label to "label"
df_dev = df_dev.merge(df_texts, on="par_id", how="left")
df_dev.rename(columns={"binary_label": "label"}, inplace=True)
print(len(df_dev))


df_dev["text"] = df_dev["text"].fillna("")
print(len(df_dev))
def clean_text(text):
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces
    #text = re.sub(r"[^a-zA-Z0-9.,!?'\"]", " ", text)  # Remove special characters
    return text.strip()

# Ensure text column has no NaN values before applying text cleaning
df_dev["text"] = df_dev["text"].astype(str).apply(clean_text)
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Tokenize the texts and stack tensors
df_dev["tokenized"] = df_dev["text"].apply(lambda x: tokenize_function(x))

dev_input_ids = torch.cat([t["input_ids"] for t in df_dev["tokenized"]], dim=0)
dev_attention_masks = torch.cat([t["attention_mask"] for t in df_dev["tokenized"]], dim=0)
dev_labels = torch.tensor(df_dev["label"].values, dtype=torch.long)

df_dev["tokenized"] = df_dev["text"].apply(lambda x: tokenize_function(x))

# Create a Dataset instance (assuming you have a PCLDataset defined similarly)
dev_dataset = PCLDataset(dev_input_ids, dev_attention_masks, dev_labels)

2094
2094


In [9]:


# -------------------------------
# Final Training with Best Hyperparameters and Early Stopping
# -------------------------------

final_training_args = TrainingArguments(
       output_dir="./results",
       learning_rate=5.5191723330166645e-06,
       optim="adamw_torch",
       warmup_ratio=0.1,
       evaluation_strategy="epoch",
       save_strategy="no", # change to no
       per_device_train_batch_size=4,  # Reduce from 8 or 16 to 4 or even 2
       per_device_eval_batch_size=4,   # Match train batch size
       num_train_epochs=2,
       weight_decay=0.15474235967147895,
       logging_dir="./logs",
       logging_steps=50,
       save_total_limit=2,
       lr_scheduler_type="linear",
       load_best_model_at_end=False,
       metric_for_best_model="f1",
       report_to="none",
       fp16=False,  # Enables mixed precision training (reduces memory usage)
       bf16=False,  # Keep this False unless on Ampere GPUs (A100, RTX 30xx)
   )

final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics

)

final_trainer.train()
final_results = final_trainer.evaluate()
print("Final evaluation results:", final_results)

# Save the final model and tokenizer
model.save_pretrained("./deberta_base_model_with_hp_tuning")
tokenizer.save_pretrained("./deberta_base_model_with_hp_tuning")



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3526,0.282672,0.913433,0.426877
2,0.17,0.340563,0.92,0.510949


Final evaluation results: {'eval_loss': 0.34056326746940613, 'eval_accuracy': 0.92, 'eval_f1': 0.5109489051094891, 'eval_runtime': 133.9467, 'eval_samples_per_second': 12.505, 'eval_steps_per_second': 3.128, 'epoch': 2.0}


('./deberta_base_model_with_hp_tuning/tokenizer_config.json',
 './deberta_base_model_with_hp_tuning/special_tokens_map.json',
 './deberta_base_model_with_hp_tuning/vocab.json',
 './deberta_base_model_with_hp_tuning/merges.txt',
 './deberta_base_model_with_hp_tuning/added_tokens.json')

In [11]:

# Load the saved model and tokenizer
model = DebertaForSequenceClassification.from_pretrained("./deberta_base_model_with_hp_tuning")
tokenizer = DebertaTokenizer.from_pretrained("./deberta_base_model_with_hp_tuning")


# -------------------------------
# Final Training with Best Hyperparameters and Early Stopping
# -------------------------------

final_training_args = TrainingArguments(
       output_dir="./results",
       learning_rate=8.28712997391875e-06,
       optim="adamw_torch",
       warmup_ratio=0.1,
       evaluation_strategy="epoch",
       save_strategy="no", # change to no
       per_device_train_batch_size=4,  # Reduce from 8 or 16 to 4 or even 2
       per_device_eval_batch_size=4,   # Match train batch size
       num_train_epochs=2,
       weight_decay=0.15278837974836498,
       logging_dir="./logs",
       logging_steps=50,
       save_total_limit=2,
       lr_scheduler_type="linear",
       load_best_model_at_end=False,
       metric_for_best_model="f1",
       report_to="none",
       fp16=False,  # Enables mixed precision training (reduces memory usage)
       bf16=False,  # Keep this False unless on Ampere GPUs (A100, RTX 30xx)
   )

final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics

)

#final_trainer.train()
final_results = final_trainer.evaluate()
print("Final evaluation results:", final_results)





Final evaluation results: {'eval_loss': 0.29554784297943115, 'eval_model_preparation_time': 0.0031, 'eval_accuracy': 0.9274116523400191, 'eval_f1': 0.5159235668789809, 'eval_runtime': 167.1228, 'eval_samples_per_second': 12.53, 'eval_steps_per_second': 3.135}
