### Finetuning the DepRoBERTa Model

In [1]:
# 📌 Install dependencies
!pip install -q transformers datasets evaluate scikit-learn

# 📌 Load libraries
import pandas as pd
import numpy as np
from datasets import Dataset, train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import torch

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'train_test_split' from 'datasets' (/Users/joaomata/Desktop/LBMP/.venv/lib/python3.11/site-packages/datasets/__init__.py)

In [None]:
# 📌 Load your dataset (upload your CSV here)
from google.colab import files
uploaded = files.upload()  # Upload your e-daic.csv

df = pd.read_csv("e-daic.csv")  # Ensure 'text' and 'label' columns
print(df.head())


In [None]:
# 📌 Optional: Normalize PHQ scores to [0, 1]
normalize = True
if normalize:
    max_score = df["label"].max()
    df["label"] = df["label"] / max_score


In [None]:
# 📌 Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)


In [None]:
# 📌 Load DepRoBERTa
model_name = "tuhinjubcse/DepRoBERTa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)


In [None]:
# 📌 Tokenize
def preprocess(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(preprocess)


In [None]:
# 📌 Train-test split
split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
eval_dataset = split["test"]


In [None]:
# 📌 Evaluation metrics
mse = evaluate.load("mean_squared_error")
mae = evaluate.load("mean_absolute_error")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.squeeze(logits)
    return {
        "mse": mse.compute(predictions=preds, references=labels)["mean_squared_error"],
        "mae": mae.compute(predictions=preds, references=labels)["mean_absolute_error"],
    }


In [None]:
# 📌 Training arguments and trainer
training_args = TrainingArguments(
    output_dir="./DepRoBERTa-e-daic",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="mse"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


In [None]:
# 📌 Save the fine-tuned model
model.save_pretrained("DepRoBERTa-finetuned-E-DAIC")
tokenizer.save_pretrained("DepRoBERTa-finetuned-E-DAIC")
