In [2]:
# Intended to be run on a Lambda A6000 machine.
import os

# Weights & Biases settings, exported before any run is created.
os.environ.update(
    WANDB_PROJECT="hatebert_sbf2",
    WANDB_NOTEBOOK_NAME="sweep2_hatebert.ipynb",
)


In [3]:
import os
from huggingface_hub import login
from huggingface_hub import whoami
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification,pipeline
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from datasets import Dataset
import evaluate
import tqdm


In [2]:
# Report whether PyTorch can see a CUDA device and, if it can, the name of device 0.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("GPU name:", "No GPU")



CUDA available: False
GPU name: No GPU


In [None]:


# Authenticate with Hugging Face.
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook. When the variable is unset, `token=None` makes
# `login` fall back to its interactive prompt instead of raising a NameError on
# an undefined `hf_token`.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)
# Verifying login
user_info = whoami()
print(f" logged in as: {user_info['name']}")

 logged in as: ms-mielnic


In [5]:

# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None


In [6]:
import importlib
import finetuning_eval_func
importlib.reload(finetuning_eval_func)
from finetuning_eval_func import create_eval_summary_df,compute_class_weights, process_csv_social_bias,tokenize_function,custom_collate_fn,evaluation_report,conf_matrix

import re

/Users/mariamielniczuk/anaconda3/envs/capstone_env/bin/python
/Users/mariamielniczuk/anaconda3/envs/capstone_env/bin/python


In [7]:
# dir_path is the path to the directory containing the CSV files.
# It defaults to the original local location but can be redirected with the
# CAPSTONE_DIR environment variable, so the notebook is not tied to one machine.
capstone_dir = os.environ.get("CAPSTONE_DIR", '/Users/mariamielniczuk/Documents/capstone')
train_df, val_df, test_df, label2id, id2label = process_csv_social_bias(capstone_dir)

Train size: 29790
Validation size: 3724
Test size: 3724


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bias_type'].fillna('Neutral', inplace=True)


In [8]:
# Initialize the tokenizer and the sequence-classification model to fine-tune.

model_name = "GroNLP/hateBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classification head is sized from the label mapping, and the mapping is
# stored in the model config in both directions.
classifier_kwargs = dict(
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Compute class weights from the training split with the project helper
# `compute_class_weights` (described by the author as inverse-frequency weights).
# The result is later used as the focal-loss `alpha` after a `.to(device)` call,
# so it is presumably a 1-D torch tensor indexed by label id — TODO confirm
# against finetuning_eval_func.
class_weights = compute_class_weights(train_df, label2id)

In [10]:
# Evaluation metrics

# Accuracy metric loaded from the Hugging Face `evaluate` library.
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: An EvalPrediction-style object exposing `predictions` (per-class
            scores, reduced with argmax over axis 1) and `label_ids`.

    Returns:
        Whatever `accuracy.compute` returns for the predicted vs. reference labels.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    # Additional metrics could be merged into this result here.
    return accuracy.compute(predictions=predicted_classes, references=p.label_ids)

In [11]:

class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw logits.

    The per-sample loss is ``alpha[target] * (1 - p_t) ** gamma * CE``, where
    ``CE`` is the unreduced cross-entropy and ``p_t = exp(-CE)`` is the
    probability the model assigns to the true class. Implementation follows
    https://medium.com/data-scientists-diary/implementing-focal-loss-in-pytorch-for-class-imbalance-24d8aa3b59d9

    Args:
        alpha: Optional per-class weights of shape ``[num_classes]``. May be a
            tensor or anything ``torch.as_tensor`` accepts (list, ndarray).
            ``None`` disables class weighting.
        gamma: Focusing parameter; larger values down-weight easy examples more.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample loss.
    """
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        # Accept lists/arrays as well as tensors so `forward` can always index it.
        self.alpha = None if alpha is None else torch.as_tensor(alpha)  # Weighting factor
        self.gamma = gamma  # Focusing parameter
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (raw logits) and ``targets`` (class indices)."""
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # Probability of the true class

        if self.alpha is not None:
            # Move the weights next to the logits so a CPU-resident alpha does
            # not raise a device-mismatch error on GPU, then pick each sample's
            # class weight.
            alpha_tensor = self.alpha.to(inputs.device)[targets]
            focal_loss = alpha_tensor * (1 - pt) ** self.gamma * ce_loss
        else:
            focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss



In [12]:
# Define the sweep configuration for Focal Loss
focal_sweep_config = {
    "method": "bayes", # or 'random', 'grid'
    "metric": {
        # The Hugging Face W&B callback rewrites evaluation keys to an "eval/"
        # prefix before logging, so the Trainer's "eval_loss" reaches W&B as
        # "eval/loss". The sweep has to optimize the name that is actually logged.
        "name": "eval/loss",
        "goal": "minimize"
    },
    "parameters": {
        "learning_rate": {
            "min": 1e-6,
            "max": 5e-5
        },
        "weight_decay": {
            "values": [0.0, 0.01, 0.1]
        },
        "per_device_train_batch_size": {
            "values": [8, 16]
        },
        "num_train_epochs": {
            "values": [1, 2]
        },
        "gamma": { # Focal-loss focusing parameter, tuned alongside the optimizer settings
            "min": 0.5,
            "max": 5.0
        }
    }
}



In [None]:

# Requires FocalLoss, class_weights, train_df, val_df, tokenize_function, tokenizer,
#   label2id, id2label, compute_metrics and the model setup from earlier cells.

# Define the FocalLossTrainer class to ensure it's in scope and can accept gamma
class FocalLossTrainer(Trainer):
    """Trainer that optimizes a class-weighted focal loss computed from the model's logits.

    Args:
        gamma: Focal-loss focusing parameter forwarded to ``FocalLoss``.
        *args: Positional arguments passed unchanged to ``Trainer.__init__``.
        loss_weights: Optional per-class weight tensor used as the focal-loss
            ``alpha``. When omitted, the notebook-level ``class_weights`` is used.
        **kwargs: Keyword arguments passed unchanged to ``Trainer.__init__``.
    """
    def __init__(self, gamma=2.0, *args, loss_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        self.class_weights = class_weights if loss_weights is None else loss_weights

    # num_items_in_batch is part of the signature because the Trainer passes it
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the focal loss (plus the outputs when requested)."""
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Use the weights stored on this trainer, placed on the logits' device.
        # Reading the device from the logits also works when `model` is a
        # wrapper that has no `.device` attribute.
        alpha = self.class_weights.to(logits.device)
        loss = FocalLoss(alpha=alpha, gamma=self.gamma)(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Columns kept in the tokenized datasets. "token_type_ids" is kept alongside the
# original three because the tokenizer may emit it and the model accepts it;
# names that are absent from a dataset are simply never matched.
_SWEEP_KEEP_COLUMNS = ("input_ids", "attention_mask", "token_type_ids", "label")


def _build_sweep_dataset(df, fraction=0.3):
    """Tokenize ``df``, drop non-model columns and keep the leading ``fraction`` of rows."""
    dataset = Dataset.from_pandas(df).map(tokenize_function, batched=True)
    extra_columns = [col for col in dataset.column_names if col not in _SWEEP_KEEP_COLUMNS]
    # remove_columns returns a new dataset, so the result has to be reassigned;
    # rebinding a loop variable (as before) left the datasets untouched.
    dataset = dataset.remove_columns(extra_columns)
    return dataset.select(range(int(fraction * len(dataset))))


# Define the sweep_train function to ensure it's in scope
def sweep_train():
    """Run one W&B sweep trial: fine-tune with focal loss, then save metrics and model.

    Hyperparameters (learning_rate, weight_decay, per_device_train_batch_size,
    num_train_epochs, gamma) are read from the config of the run opened by
    ``wandb.init()``. Each trial trains on a 30% subset of the train/validation
    data for speed.

    Relies on notebook-level names: model_name, tokenizer, label2id, id2label,
    train_df, val_df, tokenize_function, compute_metrics and FocalLossTrainer.
    """
    with wandb.init() as run:
        config = run.config
        run_id = run.id

        model_dir = f"hatebert_models/focal_model_{run_id}"
        log_dir   = f"hatebert_logs/focal_logs_{run_id}"

        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(log_dir, exist_ok=True)

        training_args = TrainingArguments(
            output_dir=model_dir,
            logging_dir=log_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_total_limit=1,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            per_device_train_batch_size=config.per_device_train_batch_size,
            per_device_eval_batch_size=config.per_device_train_batch_size,
            num_train_epochs=config.num_train_epochs,
            report_to=["wandb", "tensorboard"],
            run_name=f"focal_sweep_{run_id}",
        )

        # Subset to 30% for faster sweeping
        small_train_dataset = _build_sweep_dataset(train_df)
        small_val_dataset = _build_sweep_dataset(val_df)

        # Load a fresh copy of the pretrained model for every trial. Reusing one
        # shared model object would start each trial from the previous trial's
        # fine-tuned weights and make the hyperparameter comparison meaningless.
        run_model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(label2id),
            id2label=id2label,
            label2id=label2id,
        )

        trainer = FocalLossTrainer(
            model=run_model,
            args=training_args,
            train_dataset=small_train_dataset,
            eval_dataset=small_val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
            gamma=config.gamma # Pass gamma to the custom trainer
        )

        trainer.train()

        pd.DataFrame(trainer.state.log_history).to_csv(
            f"hatebert_logs/focal_metrics_{run_id}.csv", index=False
        )

        # Save model
        trainer.save_model(model_dir)
        print(f"✅ Model and metrics saved to: {model_dir}, {log_dir}")

        # Drop this trial's model before the next one is loaded in the same process.
        del trainer, run_model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()




In [None]:
# # Initialize and start the new sweep
# # Assuming focal_sweep_config is accessible
# focal_sweep_id = wandb.sweep(focal_sweep_config, project="hatebert_sbf1")
# wandb.agent(focal_sweep_id, function=sweep_train, count=10)

In [13]:
# Summarize the per-run metric files found in the focal-loss logs directory.
# The base directory defaults to the original local path but can be redirected
# with the CAPSTONE_DIR environment variable.
capstone_dir = os.environ.get("CAPSTONE_DIR", '/Users/mariamielniczuk/Documents/capstone')
create_eval_summary_df(os.path.join(capstone_dir, "logs_focal"))

Found 10 metric files in /Users/mariamielniczuk/Documents/capstone/logs_focal


Unnamed: 0,run_id,eval_loss,eval_accuracy,epoch,step
3,wpwb9oxw,0.876728,0.707252,2.0,2236.0
4,zapg8lmm,0.422696,0.666965,2.0,1118.0
6,midmdhlr,0.423562,0.665175,1.0,1118.0
8,fd415zp6,0.95068,0.660698,1.0,1118.0
2,83v4udtp,0.76244,0.640107,1.0,1118.0
5,k84xbxys,0.507011,0.640107,2.0,1118.0
7,zzzkhebt,0.403139,0.639212,2.0,1118.0
9,q9avuxel,0.516203,0.63026,2.0,1118.0
1,l6g6yt2p,0.421465,0.607878,1.0,559.0
0,2cfhjojw,0.439704,0.600716,1.0,559.0
