In [20]:
import pandas as pd
from transformers import AutoTokenizer,DataCollatorWithPadding,Trainer,EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, TrainingArguments,SchedulerType
from transformers import trainer_utils
from transformers.trainer_utils import IntervalStrategy as SaveStrategy
from torch.utils.data import DataLoader, WeightedRandomSampler
from datasets import load_dataset
from datasets import Dataset, DatasetDict
import torch
from torch.serialization import add_safe_globals
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from typing import Dict, List, Tuple, Any
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
import math
import ipywidgets
import os



In [21]:
# Turn the `datasets` progress bars on so the later `Dataset.map` calls report progress.
from datasets.utils.logging import enable_progress_bar
enable_progress_bar()

In [22]:
# Environment report: interpreter, PyTorch and Transformers versions, plus the
# `pip show` record of the installed transformers package.
import sys, torch
print("Python:", sys.version)
print("PyTorch:", torch.__version__)
import transformers; print("Transformers:", transformers.__version__, transformers.__file__)
# pip is run through sys.executable, i.e. the interpreter running this kernel.
# As the last expression of the cell, the returned CompletedProcess is echoed as the cell output.
import subprocess; subprocess.run([sys.executable, "-m", "pip", "show", "transformers"])

Python: 3.10.12 (main, May 27 2025, 17:12:29) [GCC 11.4.0]
PyTorch: 2.7.0
Transformers: 4.55.0 /home/ubuntu/.local/lib/python3.10/site-packages/transformers/__init__.py
Name: transformers
Version: 4.55.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /home/ubuntu/.local/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


CompletedProcess(args=['/usr/bin/python3', '-m', 'pip', 'show', 'transformers'], returncode=0)

In [23]:
# Short version summary. `transformers` is only bound as a module name by the
# environment-report cell above, so that cell has to run first.
print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers.__version__)

PyTorch version: 2.7.0
Transformers version: 4.55.0


In [24]:
# Import the local helper module and reload it, so that edits made to it are
# picked up without restarting the kernel, then bind the data-preparation function.
import importlib
import finetuning_eval_func
importlib.reload(finetuning_eval_func)
from finetuning_eval_func import process_csv_social_bias


/usr/bin/python3


In [25]:
# Build the train/validation/test DataFrames and the label <-> id mappings.
# NOTE(review): presumably reads social_bias.csv from the given directory — confirm in finetuning_eval_func.
train_df, val_df, test_df, label2id, id2label = process_csv_social_bias('./')

Train size: 29790
Validation size: 3724
Test size: 3724


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['bias_type'].map(label2id)


In [26]:
# Report whether a CUDA device is usable and, if so, the name of device 0.
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
# NOTE(review): this is set after `transformers` was already imported in the first cell —
# confirm that it still has the intended effect, or move it above the imports.
os.environ["TRANSFORMERS_NO_TF"] = "1"

CUDA available: True
GPU name: NVIDIA RTX A6000


In [27]:
print(torch.version.cuda)         # CUDA version of this PyTorch build
print(torch.backends.cudnn.version())  # cuDNN version, reported as an integer
print(torch.cuda.is_available())  # whether a CUDA device can be used

12.8
90800
True


In [28]:
def tokenize_function(examples):
    """Tokenize one batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping from column name to a list of values, as passed by
            ``Dataset.map(..., batched=True)``. The text is read from the
            ``"post"`` column when it is present, otherwise from ``"text"``.

    Returns:
        The tokenizer output for the batch. Sequences are truncated but not
        padded; padding is left to the data collator, which pads dynamically.
    """
    # The data-prep cell may pick either "text" or "post" as the text column,
    # so do not assume that "post" is always available.
    text_col = "post" if "post" in examples else "text"
    return tokenizer(examples[text_col], truncation=True)

### Loading the training arguments of the best cross-entropy-loss HateBERT model from sweep1_htebert.ipynb

In [29]:
def compute_class_weights(train_df, label2id):
    """Compute 'balanced' per-class weights from the training labels.

    Args:
        train_df: DataFrame with a ``label`` column holding either numeric
            class ids or label names.
        label2id: Mapping from label name to class id. Its sorted values define
            the class order of the returned weights.

    Returns:
        A float tensor with one weight per class id.
    """
    labels = train_df['label']
    # Non-numeric labels are treated as label names and translated through label2id.
    is_numeric = np.issubdtype(labels.dtype, np.number)
    label_ids = labels.values if is_numeric else labels.map(label2id).values

    class_ids = np.array(sorted(label2id.values()))
    balanced = compute_class_weight(class_weight='balanced', classes=class_ids, y=label_ids)
    return torch.tensor(balanced, dtype=torch.float)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy, optionally weighted per class."""

    def __init__(self, *args, class_weights=None, **kwargs):
        """Forward all arguments to ``Trainer`` and remember the optional class weights.

        Args:
            class_weights: Optional torch tensor with one weight per class.
                ``None`` selects the unweighted loss.
        """
        super().__init__(*args, **kwargs)
        # Stored as given; compute_loss moves it to the device of the logits.
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the cross-entropy loss.

        The ``"labels"`` entry is removed from ``inputs`` before the forward
        pass, so the model itself does not compute a loss.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits  # [batch, num_labels] for sequence classification

        # The weights have to live on the same device as the logits.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        criterion = CrossEntropyLoss(weight=weight)

        # Flatten to (N, num_labels) / (N,); a no-op for sequence classification.
        num_classes = logits.size(-1)
        loss = criterion(logits.view(-1, num_classes), targets.view(-1))
        if return_outputs:
            return loss, outputs
        return loss

In [30]:
# Metric objects are loaded once and reused by every evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair of logits and reference labels; the predicted class is
            the arg-max over the last axis of the logits.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    acc_result = accuracy.compute(predictions=predicted, references=labels)
    f1_result = f1.compute(predictions=predicted, references=labels, average="macro")
    return {"accuracy": acc_result["accuracy"], "f1_macro": f1_result["f1"]}

In [31]:
print("torch.version.cuda:", torch.version.cuda)       # None on a CPU-only build
print("torch.cuda.is_available():", torch.cuda.is_available())  # False when no CUDA device is usable
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))  # None when the variable is unset

torch.version.cuda: 12.8
torch.cuda.is_available(): True
CUDA_VISIBLE_DEVICES: None


In [32]:
# Device selection: take the GPU when CUDA is usable, otherwise the CPU (Metal is deliberately not considered)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``.

    When CUDA is available, the generators of all CUDA devices are seeded too.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)


Using device: cuda


In [None]:
# Load the model from a checkpoint and inherit its parameters
CKPT_DIR = "checkpoint-2236"  # path to a checkpoint directory, e.g. .../checkpoint-2500
DATA_DIR = "./"              # dir that contains social_bias.csv

torch.set_float32_matmul_precision("high")   # the A6000 used for this run can handle this
torch.backends.cuda.matmul.allow_tf32 = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prefer bf16 when the GPU supports it; the AttributeError guard covers torch
# builds that do not provide torch.cuda.is_bf16_supported.
try:
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
except AttributeError:
    use_bf16 = False
# fp16 is the fallback when CUDA is available but bf16 is not.
use_fp16 = torch.cuda.is_available() and not use_bf16


# Load the tokenizer and the classification model from the checkpoint; the model is moved to `device`
tokenizer = AutoTokenizer.from_pretrained(CKPT_DIR, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(CKPT_DIR).to(device)
# The collator needs the tokenizer, so it is created after the tokenizer is loaded
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8  # Tensor Cores like multiples of 8; set to None if CPU-only
)


class WeightedSamplerTrainer(WeightedTrainer):  # inherits weighted-loss compute_loss
    """WeightedTrainer that draws training examples with inverse-class-frequency sampling."""

    def get_train_dataloader(self):
        """Build the training DataLoader around a ``WeightedRandomSampler``.

        Every example is given the weight ``1 / count(its class)``, and
        ``len(dataset)`` examples are drawn per epoch with replacement.

        Returns:
            A ``DataLoader`` over ``self.train_dataset`` that uses the
            trainer's collator, batch size and dataloader settings.

        Raises:
            ValueError: If there is no training dataset, or if it has neither
                a ``"label"`` nor a ``"labels"`` column.
        """
        dset = self.train_dataset  # HuggingFace Dataset
        if dset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # Accept both spellings so the sampler keeps working if the label
        # column is renamed to "labels" before the dataset reaches the trainer.
        label_col = next((c for c in ("label", "labels") if c in dset.column_names), None)
        if label_col is None:
            raise ValueError(
                "train_dataset needs a 'label' or 'labels' column for weighted sampling; "
                f"found columns: {dset.column_names}"
            )
        labels = np.array(dset[label_col], dtype=int)

        # inverse-frequency sampling
        num_labels = self.model.config.num_labels
        counts = np.bincount(labels, minlength=num_labels)
        inv = 1.0 / np.maximum(counts, 1)  # max(..., 1) avoids dividing by zero for absent classes
        sample_w = inv[labels]  # per-example weight, looked up by the example's class id

        sampler = WeightedRandomSampler(
            weights=torch.from_numpy(sample_w).double(),
            num_samples=len(sample_w),
            replacement=True
        )

        return DataLoader(
            dset,
            batch_size=self.args.train_batch_size,
            sampler=sampler,                 # no shuffle when sampler is set
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )




In [34]:

# ---- Best-effort hyperparameter recovery (missing values are omitted, never defaulted) ----
def load_training_args_best_effort(ckpt_dir):
    """Recover the training hyperparameters stored next to a checkpoint.

    ``training_args.bin`` is tried first; ``training_args.json`` is the
    fallback. A load failure is reported with ``print`` and does not raise.

    Args:
        ckpt_dir: Directory that may contain ``training_args.bin`` and/or
            ``training_args.json``.

    Returns:
        Dict of the hyperparameters that could be recovered. Keys whose value
        is missing or ``None`` are left out, so the dict is empty when nothing
        could be loaded. ``lr_scheduler_type`` is returned as the plain
        scheduler name (e.g. ``"linear"``), also when it was stored as an enum.
    """
    import json  # local import: the notebook's import cell does not provide json

    args_bin  = os.path.join(ckpt_dir, "training_args.bin")
    args_json = os.path.join(ckpt_dir, "training_args.json")
    hp = None

    if os.path.exists(args_bin):
        try:
            # SECURITY: weights_only=False unpickles arbitrary objects and can
            # execute code; only use this on checkpoints you created or trust.
            hp = torch.load(args_bin, map_location="cpu", weights_only=False)
            print("Loaded training_args from BIN.")
        except Exception as e:
            print("BIN load failed:", repr(e))

    if hp is None and os.path.exists(args_json):
        try:
            with open(args_json, "r") as f:
                hp = json.load(f)
            print("Loaded training_args from JSON.")
        except Exception as e:
            print("JSON load failed:", repr(e))

    if hp is None:
        return {}

    # The loaded object is either a dict (JSON) or an object with attributes (BIN).
    def getv(obj, k):
        return obj.get(k, None) if isinstance(obj, dict) else getattr(obj, k, None)

    d = {
        "learning_rate": getv(hp, "learning_rate"),
        "weight_decay": getv(hp, "weight_decay"),
    }

    # Some runs store warmup as steps, not ratio: in that case the ratio is
    # reported as 0.0 and has to be set manually by the caller.
    wr = getv(hp, "warmup_ratio")
    ws = getv(hp, "warmup_steps")
    d["warmup_ratio"] = wr if wr is not None else (0.0 if ws else None)

    # str() on an enum member gives "SchedulerType.LINEAR", which is not a
    # usable scheduler name; take the member's value ("linear") when there is one.
    lst = getv(hp, "lr_scheduler_type")
    d["lr_scheduler_type"] = str(getattr(lst, "value", lst)) if lst is not None else None

    # Nice-to-haves:
    for k in ["adam_beta1","adam_beta2","adam_epsilon","max_grad_norm",
              "per_device_train_batch_size","per_device_eval_batch_size",
              "gradient_accumulation_steps","eval_steps","logging_steps","save_steps"]:
        d[k] = getv(hp, k)

    return {k: v for k, v in d.items() if v is not None}

# Recover whatever hyperparameters were stored with the checkpoint; keys that
# could not be recovered are simply absent from the dict.
saved_hp = load_training_args_best_effort(CKPT_DIR)
print("Recovered HPs (non-strict):", saved_hp)



Loaded training_args from BIN.
Recovered HPs (non-strict): {'learning_rate': 1.7011977627094936e-05, 'weight_decay': 0.01, 'warmup_ratio': 0.0, 'lr_scheduler_type': 'SchedulerType.LINEAR', 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'logging_steps': 500, 'save_steps': 500}


In [None]:
# pick the text column that is copied into the datasets (tokenize_function reads the text from it)
TEXT_COL = "text" if "text" in train_df.columns else "post"

# ensure label is clean ints (only if not already done)
for df_ in (train_df, val_df, test_df):
    assert "label" in df_.columns and df_["label"].notna().all(), "Labels must be present and non-NaN"
    df_["label"] = df_["label"].astype(int)

# keep only what Trainer/model need
cols_to_keep = {"input_ids", "attention_mask", "label"}


def _build_tokenized_split(df):
    """Turn one DataFrame split into a tokenized Dataset holding only ``cols_to_keep``."""
    # Copy only the text and label columns and avoid carrying the pandas index along.
    dataset = Dataset.from_pandas(df[[TEXT_COL, "label"]], preserve_index=False)
    dataset = dataset.map(tokenize_function, batched=True)
    # remove_columns returns a new Dataset, so the result has to be returned/reassigned.
    extra_cols = [c for c in dataset.column_names if c not in cols_to_keep]
    return dataset.remove_columns(extra_cols)


train_dataset = _build_tokenized_split(train_df)
val_dataset   = _build_tokenized_split(val_df)
test_dataset  = _build_tokenized_split(test_df)

# Do NOT rename "label" to "labels" here:
#  * WeightedSamplerTrainer.get_train_dataloader reads the "label" column to build its sampler;
#  * DataCollatorWithPadding renames "label" to "labels" when it assembles a batch.
# (A loop of the form `for d in (...): d = d.rename_column(...)` would only rebind the
#  loop variable and leave the three datasets unchanged anyway.)


Map:   0%|          | 0/29790 [00:00<?, ? examples/s]

Map:   0%|          | 0/3724 [00:00<?, ? examples/s]

Map:   0%|          | 0/3724 [00:00<?, ? examples/s]

In [38]:
# Precision: the A6000 supports bf16, which is used here to cut memory use (it differs from fp16).
# Use the recovered hyperparams
OUTPUT_DIR = "./hatebert_fresh_linear_rebalancedclasses"  # new folder

PER_DEV_TRAIN_BS = saved_hp.get("per_device_train_batch_size", 8)
PER_DEV_EVAL_BS  = saved_hp.get("per_device_eval_batch_size", 8)
GRAD_ACCUM       = saved_hp.get("gradient_accumulation_steps", 1)

NEW_LR        = saved_hp.get("learning_rate", 1.7011977627094936e-05)
WARMUP_RATIO  = saved_hp.get("warmup_ratio", 0.0)
SCHEDULER_TYPE = SchedulerType.LINEAR  # fixed on purpose; the recovered scheduler type is not used
LOG_STEPS     = saved_hp.get("logging_steps", 500)
SAVE_STEPS    = saved_hp.get("save_steps", 500)

# Informational only: optimizer steps per epoch for the chosen batch size / accumulation.
steps_per_epoch = math.ceil(len(train_dataset) / (PER_DEV_TRAIN_BS * max(1, GRAD_ACCUM)))
eval_save_steps = SAVE_STEPS  # can be adjusted to other than the inherited

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,

    per_device_train_batch_size=PER_DEV_TRAIN_BS,
    per_device_eval_batch_size=PER_DEV_EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM,

    num_train_epochs=3,                 # my choice
    learning_rate=NEW_LR,
    weight_decay=saved_hp.get("weight_decay", 0.01),
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type=SCHEDULER_TYPE,
    adam_beta1=saved_hp.get("adam_beta1", 0.9),
    adam_beta2=saved_hp.get("adam_beta2", 0.999),
    adam_epsilon=saved_hp.get("adam_epsilon", 1e-8),
    max_grad_norm=saved_hp.get("max_grad_norm", 1.0),

    # Evaluate and save on the same cadence. Without an explicit eval_steps the
    # evaluation interval silently follows logging_steps, while
    # load_best_model_at_end needs checkpoints that line up with evaluations.
    eval_strategy="steps",
    eval_steps=eval_save_steps,
    save_strategy="steps",
    save_steps=eval_save_steps,
    logging_strategy="steps",
    logging_steps=LOG_STEPS,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",   # key returned by compute_metrics
    greater_is_better=True,

    bf16=use_bf16,
    fp16=not use_bf16 and torch.cuda.is_available(),
    report_to="none",
)

# No class_weights are passed, so the inherited compute_loss uses the unweighted
# cross-entropy; class rebalancing comes from the weighted sampler only.
trainer = WeightedSamplerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when f1_macro has not improved for 12 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=12)],
)




  super().__init__(*args, **kwargs)


In [39]:
# Train. With resume=False the run starts from the already loaded model weights only;
# set resume=True to also restore the optimizer/scheduler state from CKPT_DIR.


resume = False
trainer.train(resume_from_checkpoint=CKPT_DIR if resume else None)

# validation metrics (uses the eval_dataset given to the trainer)
val_metrics = trainer.evaluate()
print("Validation:", val_metrics)

# test (metrics only); the returned keys still carry the "eval_" prefix
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test:", test_metrics)

# see which checkpoint was best (since load_best_model_at_end=True)
print("Best checkpoint:", trainer.state.best_model_checkpoint)

# save the model currently held by the trainer (the best one, see above) + tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Step,Training Loss,Validation Loss,Accuracy,F1 Macro
500,1.0109,1.293368,0.626477,0.477198
1000,0.8224,1.3127,0.611708,0.473804
1500,0.6564,1.27285,0.633459,0.48495
2000,0.5954,1.294479,0.639098,0.475058
2500,0.5386,1.429958,0.627282,0.484533
3000,0.515,1.395388,0.609023,0.462868
3500,0.5078,1.435999,0.630236,0.474977
4000,0.4943,1.307667,0.648228,0.486964
4500,0.4625,1.390218,0.643394,0.482384
5000,0.4266,1.51866,0.630773,0.478398


Validation: {'eval_loss': 1.3797491788864136, 'eval_accuracy': 0.6723952738990333, 'eval_f1_macro': 0.48769175837929923, 'eval_runtime': 5.7032, 'eval_samples_per_second': 652.966, 'eval_steps_per_second': 81.708, 'epoch': 3.0}
Test: {'eval_loss': 1.310603141784668, 'eval_accuracy': 0.6812567132116004, 'eval_f1_macro': 0.4950052710480407, 'eval_runtime': 6.557, 'eval_samples_per_second': 567.94, 'eval_steps_per_second': 71.069, 'epoch': 3.0}
Best checkpoint: ./hatebert_fresh_linear_rebalancedclasses/checkpoint-6500


('./hatebert_fresh_linear_rebalancedclasses/tokenizer_config.json',
 './hatebert_fresh_linear_rebalancedclasses/special_tokens_map.json',
 './hatebert_fresh_linear_rebalancedclasses/vocab.txt',
 './hatebert_fresh_linear_rebalancedclasses/added_tokens.json',
 './hatebert_fresh_linear_rebalancedclasses/tokenizer.json')