In [1]:
!nvidia-smi

Sun Mar  2 08:54:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A30                     Off |   00000000:02:00.0 Off |                   On |
| N/A   30C    P0             26W /  165W |      51MiB /  24576MiB |     N/A      Default |
|                                         |                        |              Enabled |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [2]:
!pip install imbalanced-learn




In [3]:
%pip install evaluate

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install optuna


Note: you may need to restart the kernel to use updated packages.


In [5]:
import gc
import torch
import pandas as pd
import ast  # for list-like labels, if needed
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from evaluate import load
import numpy as np
import optuna  # Using optuna for hyperparameter optimization

In [6]:
# Clean up GPU memory
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [7]:
import re

In [8]:
# -------------------------------
# Data Preparation and Tokenization
# -------------------------------

# Define column names and load datasets
columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']
df_texts = pd.read_csv("dontpatronizeme_pcl.tsv", sep="\t", header=None, names=columns)
df_labels = pd.read_csv("train_semeval_parids-labels.csv")

# Ensure the IDs are strings for merging
df_texts["par_id"] = df_texts["par_id"].astype(str)
df_labels["par_id"] = df_labels["par_id"].astype(str)

# Drop unnecessary label column from df_labels and create binary labels in df_texts
df_labels = df_labels.drop(columns=["label"])
df_texts["binary_label"] = df_texts["label"].apply(lambda x: 1 if x >= 2 else 0)
df_texts = df_texts.drop(columns=["label"])

# Merge datasets on paragraph ID and rename the binary label to "label"
df = df_labels.merge(df_texts, on="par_id", how="left")
df.rename(columns={"binary_label": "label"}, inplace=True)
df = df.dropna(subset=["text", "label"])  # Drop rows with missing data

# def clean_text(text):
#     text = re.sub(r"\s+", " ", text)  # Remove extra spaces
#     text = re.sub(r"[^a-zA-Z0-9.,!?'\"]", " ", text)  # Remove special characters
#     return text.strip()

# # Ensure text column has no NaN values before applying text cleaning
# df["text"] = df["text"].astype(str).apply(clean_text)

def clean_text(text):
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)
    # Optionally remove only very specific noise
    # For example, remove non-ASCII characters:
    # text = re.sub(r"[^\x00-\x7F]+", " ", text)
    return text.strip()
df["text"] = df["text"].astype(str).apply(clean_text)
print("Preprocessing and balancing complete!")

# Initialize the tokenizer and model
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base", num_labels=2, ignore_mismatched_sizes=True
)

# Tokenization function
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Tokenize the texts and stack tensors
df["tokenized"] = df["text"].apply(lambda x: tokenize_function(x))
input_ids = torch.cat([t["input_ids"] for t in df["tokenized"]], dim=0)
attention_masks = torch.cat([t["attention_mask"] for t in df["tokenized"]], dim=0)
labels = torch.tensor(df["label"].values, dtype=torch.long)

Preprocessing and balancing complete!


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# -------------------------------
# Create Dataset and Sampler
# -------------------------------

class PCLDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": self.labels[idx],
        }

# Split data into training and validation sets using indices
indices = list(range(len(df)))
train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)

train_ids, val_ids = input_ids[train_idx], input_ids[val_idx]
train_masks, val_masks = attention_masks[train_idx], attention_masks[val_idx]
train_labels, val_labels = labels[train_idx], labels[val_idx]

train_dataset = PCLDataset(train_ids, train_masks, train_labels)
val_dataset = PCLDataset(val_ids, val_masks, val_labels)


In [10]:
from sklearn.utils import resample

# -------------------------------
# Perform Oversampling on the Training Set
# -------------------------------

# Create a DataFrame for the training data
train_data = pd.DataFrame({
    "input_ids": train_ids.tolist(),
    "attention_masks": train_masks.tolist(),
    "labels": train_labels.tolist()
})

# Separate the minority and majority classes
minority_class = train_data[train_data["labels"] == 1]
majority_class = train_data[train_data["labels"] == 0]

# Upsample minority class
minority_upsampled = resample(
    minority_class,
    replace=True,  # Sample with replacement
    n_samples=len(majority_class),  # To match the majority class
    random_state=42
)

# Combine majority class with the upsampled minority class
train_data_oversampled = pd.concat([majority_class, minority_upsampled])

# Shuffle the oversampled dataset
train_data_oversampled = train_data_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Extract the oversampled training data back into input_ids, attention_masks, and labels
train_ids_resampled = torch.tensor(train_data_oversampled["input_ids"].tolist())
train_masks_resampled = torch.tensor(train_data_oversampled["attention_masks"].tolist())
train_labels_resampled = torch.tensor(train_data_oversampled["labels"].tolist())

# Create the resampled dataset
train_dataset_resampled = PCLDataset(train_ids_resampled, train_masks_resampled, train_labels_resampled)
val_dataset = PCLDataset(val_ids, val_masks, val_labels)


In [11]:
# -------------------------------
# Define Metric Computation
# -------------------------------

def compute_metrics(eval_pred):
    """Computes Accuracy and F1 Score"""
    accuracy_metric = load("accuracy")
    f1_metric = load("f1")
    predictions = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    accuracy_score = accuracy_metric.compute(predictions=predictions, references=references)
    f1_score = f1_metric.compute(predictions=predictions, references=references)
    return {"accuracy": accuracy_score["accuracy"], "f1": f1_score["f1"]}


In [None]:
# -------------------------------
# Define the Hyperparameter Tuning Objective
# -------------------------------

def objective(trial):
    # Suggest hyperparameters (updated to use suggest_float with log=True)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 0.3)

    # Update TrainingArguments with trial hyperparameters
    training_args = TrainingArguments(
       output_dir="./results",
       learning_rate=learning_rate,
       optim="adamw_torch",
       warmup_ratio=0.1,
       evaluation_strategy="epoch",
       save_strategy="no", # change to no
       per_device_train_batch_size=4,  # Reduce from 8 or 16 to 4 or even 2
       per_device_eval_batch_size=4,   # Match train batch size
       num_train_epochs=2,
       weight_decay=weight_decay,
       logging_dir="./logs",
       logging_steps=50,
       save_total_limit=2,
       lr_scheduler_type="linear",
       load_best_model_at_end=False,
       metric_for_best_model="f1",
       report_to="none",
       fp16=False,  # Enables mixed precision training (reduces memory usage)
       bf16=False,  # Keep this False unless on Ampere GPUs (A100, RTX 30xx)
   )


    # Initialize Trainer with early stopping callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_resampled,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate the model
    trainer.train()
    eval_result = trainer.evaluate()

    # Use F1 score as the objective to maximize
    f1 = eval_result["eval_f1"]
    print(f"Trial completed with F1: {f1}")
    return f1

# -------------------------------
# Run Hyperparameter Tuning with Optuna
# -------------------------------

# Create a study that maximizes the F1 score
study = optuna.create_study(direction="maximize")
# Run the optimization for a fixed number of trials 
study.optimize(objective, n_trials=4)

# Print the best hyperparameters found
print("Best hyperparameters:", study.best_trial.params)

[I 2025-02-25 19:02:31,397] A new study created in memory with name: no-name-cdcbdffa-b146-45e8-81ed-5166999a7a04


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2693,0.557898,0.890149,0.530612
2,0.0402,0.620602,0.918806,0.558442


[I 2025-02-25 20:53:44,442] Trial 0 finished with value: 0.5584415584415584 and parameters: {'learning_rate': 8.28712997391875e-06, 'weight_decay': 0.15278837974836498}. Best is trial 0 with value: 0.5584415584415584.


Trial completed with F1: 0.5584415584415584




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0298,0.797172,0.912836,0.549383
2,0.0008,0.861862,0.917612,0.517483


[I 2025-02-25 22:44:43,493] Trial 1 finished with value: 0.5174825174825175 and parameters: {'learning_rate': 1.232471077113667e-06, 'weight_decay': 0.11837821610641207}. Best is trial 0 with value: 0.5584415584415584.


Trial completed with F1: 0.5174825174825175




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1321,0.651439,0.912836,0.5
2,0.0,0.761058,0.917015,0.501792


[I 2025-02-26 00:35:47,278] Trial 2 finished with value: 0.5017921146953405 and parameters: {'learning_rate': 2.0041958513416804e-05, 'weight_decay': 0.15348649731418104}. Best is trial 0 with value: 0.5584415584415584.


Trial completed with F1: 0.5017921146953405




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.976531,0.920597,0.523297
2,0.0,1.04613,0.918209,0.519298


[I 2025-02-26 02:26:49,179] Trial 3 finished with value: 0.519298245614035 and parameters: {'learning_rate': 3.1945503991827538e-06, 'weight_decay': 0.25145094924062084}. Best is trial 0 with value: 0.5584415584415584.


Trial completed with F1: 0.519298245614035
Best hyperparameters: {'learning_rate': 8.28712997391875e-06, 'weight_decay': 0.15278837974836498}


Evaluating on official dev set

In [16]:
columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label']
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

df_dev = pd.read_csv("dev_semeval_parids-labels.csv")
df_texts = pd.read_csv("dontpatronizeme_pcl.tsv", sep="\t", header=None, names=columns)

df_dev["par_id"] = df_dev["par_id"].astype(str)
df_texts["par_id"] = df_texts["par_id"].astype(str)

df_dev = df_dev.drop(columns=["label"])
df_texts["binary_label"] = df_texts["label"].apply(lambda x: 1 if x >= 2 else 0)
df_texts = df_texts.drop(columns=["label"])

# Merge datasets on paragraph ID and rename the binary label to "label"
df_dev = df_dev.merge(df_texts, on="par_id", how="left")
df_dev.rename(columns={"binary_label": "label"}, inplace=True)

#DON'T DROP
#df_dev = df_dev.dropna(subset=["text", "label"])  # Drop rows with missing data
df_dev["text"] = df_dev["text"].fillna("")

print(len(df_dev))

def clean_text(text):
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces
    #text = re.sub(r"[^a-zA-Z0-9.,!?'\"]", " ", text)  # Remove special characters
    return text.strip()

# Ensure text column has no NaN values before applying text cleaning
df_dev["text"] = df_dev["text"].astype(str).apply(clean_text)
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Tokenize the texts and stack tensors
df_dev["tokenized"] = df_dev["text"].apply(lambda x: tokenize_function(x))

dev_input_ids = torch.cat([t["input_ids"] for t in df_dev["tokenized"]], dim=0)
dev_attention_masks = torch.cat([t["attention_mask"] for t in df_dev["tokenized"]], dim=0)
dev_labels = torch.tensor(df_dev["label"].values, dtype=torch.long)


# Create a Dataset instance (assuming you have a PCLDataset defined similarly)
dev_dataset = PCLDataset(dev_input_ids, dev_attention_masks, dev_labels)

2094


In [None]:

# Load the saved model and tokenizer
model = DebertaForSequenceClassification.from_pretrained("./deberta_oversampling_model_new")
tokenizer = DebertaTokenizer.from_pretrained("./deberta_oversampling_model_new")


# -------------------------------
# Final Training with Best Hyperparameters and Early Stopping
# -------------------------------

final_training_args = TrainingArguments(
       output_dir="./results",
       learning_rate=8.28712997391875e-06,
       optim="adamw_torch",
       warmup_ratio=0.1,
       evaluation_strategy="epoch",
       save_strategy="no", # change to no
       per_device_train_batch_size=4,  # Reduce from 8 or 16 to 4 or even 2
       per_device_eval_batch_size=4,   # Match train batch size
       num_train_epochs=2,
       weight_decay=0.15278837974836498,
       logging_dir="./logs",
       logging_steps=50,
       save_total_limit=2,
       lr_scheduler_type="linear",
       load_best_model_at_end=False,
       metric_for_best_model="f1",
       report_to="none",
       fp16=False,  # Enables mixed precision training (reduces memory usage)
       bf16=False,  # Keep this False unless on Ampere GPUs (A100, RTX 30xx)
   )

final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=train_dataset_resampled,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics

)

#final_trainer.train()
final_results = final_trainer.evaluate()
print("Final evaluation results:", final_results)





Final evaluation results: {'eval_loss': 0.32855427265167236, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.9230769230769231, 'eval_f1': 0.5360230547550432, 'eval_runtime': 166.6208, 'eval_samples_per_second': 12.561, 'eval_steps_per_second': 3.145}
