## BERT Fine-Tuning

### Setup

In [1]:
#!pip install evaluate datasets transformers accelerate==0.26.0

### Load Data and Preprocessing

In [2]:
from datasets import load_dataset
from datasets import DatasetDict, Dataset

# Load the three JSON splits and gather them in one DatasetDict.
# Each file is loaded on its own, so its rows are read from the "train" split
# of the object returned by load_dataset.
split_files = {
    "train": "data/train.json",
    "test": "data/test.json",
    "valid": "data/valid.json",
}
dataset = DatasetDict(
    {
        split: load_dataset("json", data_files=path)["train"]
        for split, path in split_files.items()
    }
)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 7059
    })
    test: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 882
    })
    valid: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 883
    })
})

In [3]:
# Peek at the first training example to check the available fields and the text layout.
dataset["train"][0]

{'text': "Prompt Level: 2 [SEP] Prompt: What activities do you do at school? [SEP] Response: I'm studying computer science and engineering, I'm learning programming languages like python, java, c++, and I'm doing project in android app development and web development, I'm also learning data science",
 'level': 2,
 'item_id': 80,
 'synthetic': True,
 'label': 2,
 '__index_level_0__': 8371}

In [4]:
from transformers import AutoTokenizer
# Tokenizer for the "bert-base-uncased" checkpoint. It is stored in the notebook-level
# name `tokenizer`, which tokenize_function reads when it is called.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [5]:
# Sanity check: tokenize one raw example (no padding/truncation options) and inspect the ids.
tokenizer(dataset["train"][0]["text"])

{'input_ids': [101, 25732, 2504, 1024, 1016, 102, 25732, 1024, 2054, 3450, 2079, 2017, 2079, 2012, 2082, 1029, 102, 3433, 1024, 1045, 1005, 1049, 5702, 3274, 2671, 1998, 3330, 1010, 1045, 1005, 1049, 4083, 4730, 4155, 2066, 18750, 1010, 9262, 1010, 1039, 1009, 1009, 1010, 1998, 1045, 1005, 1049, 2725, 2622, 1999, 11924, 10439, 2458, 1998, 4773, 2458, 1010, 1045, 1005, 1049, 2036, 4083, 2951, 2671, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    ``tokenizer`` is looked up when the function is called, so rebinding that
    global changes which tokenizer is applied.

    Args:
        examples: Mapping with a ``"text"`` entry (used with
            ``Dataset.map(..., batched=True)`` in this notebook).

    Returns:
        The tokenizer's output for the given texts.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

In [7]:
# Apply tokenize_function to the training split in batched mode.
tokenized_train = dataset["train"].map(tokenize_function, batched=True)

In [8]:
# Apply tokenize_function to the test split in batched mode.
tokenized_test = dataset["test"].map(tokenize_function, batched=True)

Map:   0%|          | 0/882 [00:00<?, ? examples/s]

In [9]:
# Apply tokenize_function to the validation split in batched mode.
tokenized_valid = dataset["valid"].map(tokenize_function, batched=True)

In [10]:
# Number of distinct label values seen in the training split; used below as the
# size of the classification head.
# NOTE(review): assumes every class occurs in the training split — confirm.
train_label_column = dataset["train"]["label"]
unique_labels = set(train_label_column)
num_labels = len(unique_labels)
num_labels

5

### BERT Model Fine-Tuning with Trainer

In [11]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
# BERT encoder with a sequence-classification head sized to `num_labels` outputs.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import numpy as np 
import evaluate

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, cohen_kappa_score
from scipy.stats import pearsonr

# NOTE(review): this `metric` object is not referenced by compute_metrics below,
# which computes its scores with scikit-learn and SciPy directly.
metric = evaluate.load("accuracy")

In [13]:
def compute_metrics(eval_pred):
    """Compute classification and ordinal-agreement metrics for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is reduced to class
            predictions with an arg-max over its last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        (precision/recall/F1 are support-weighted averages), ``cohen_kappa``
        (quadratic-weighted) and ``pearson_corr``. All values are plain Python
        floats.

    Notes:
        Pearson's correlation is undefined when the labels or the predictions
        contain fewer than two distinct values (e.g. a model that predicts a
        single class). In that case ``pearson_corr`` is reported as ``0.0``
        instead of NaN, so every metric stays finite and comparable across
        checkpoints.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # logits -> predicted class ids

    # Accuracy
    accuracy = accuracy_score(labels, predictions)

    # Precision, recall and F1 (support-weighted). zero_division=0 yields the same
    # value as the default for never-predicted classes, without the warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )

    # Cohen's kappa with quadratic weights
    cohen_kappa = cohen_kappa_score(labels, predictions, weights="quadratic")

    # Pearson correlation: pearsonr returns (coefficient, p-value); keep the coefficient.
    # Guard the degenerate case where either side is constant (or too short).
    if np.unique(labels).size < 2 or np.unique(predictions).size < 2:
        pearson_corr = 0.0
    else:
        pearson_corr, _ = pearsonr(labels, predictions)

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "cohen_kappa": float(cohen_kappa),
        "pearson_corr": float(pearson_corr),
    }


In [26]:
args = TrainingArguments(
    # --- Checkpointing: save every 250 steps, with save_total_limit set to 4 ---
    output_dir="../../../model_saved/bert-ft-efcamdat-augmented",
    save_strategy="steps",
    save_steps=250,
    save_total_limit=4,
    # --- Evaluation: same 250-step interval as saving (IMPORTANT: keep them aligned) ---
    evaluation_strategy="steps",
    eval_steps=250,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # --- Optimisation: linear schedule with 10% warm-up ---
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=6,
    # --- Batching, precision and logging ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    logging_steps=100,
)



In [27]:
# Trainer for the BERT model: trains on the tokenized training split and scores
# with compute_metrics.
# NOTE(review): the evaluation dataset given here is the "test" split, while the
# "valid" split is used for the per-item analysis further down — confirm this
# split usage is intended.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [28]:
# Run fine-tuning with the arguments configured above.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=2652, training_loss=0.1628522438700922, metrics={'train_runtime': 569.1805, 'train_samples_per_second': 74.412, 'train_steps_per_second': 4.659, 'total_flos': 1.1144105805920256e+16, 'train_loss': 0.1628522438700922, 'epoch': 6.0})

In [29]:
# Evaluate without an explicit dataset, i.e. on the eval_dataset given to the Trainer (tokenized_test).
# NOTE(review): with load_best_model_at_end=True this presumably scores the best checkpoint by f1 — confirm.
trainer.evaluate()

{'eval_loss': 1.3224540948867798,
 'eval_accuracy': 0.7505668934240363,
 'eval_precision': 0.7586663097399037,
 'eval_recall': 0.7505668934240363,
 'eval_f1': 0.7523943522058482,
 'eval_cohen_kappa': 0.879843377147302,
 'eval_pearson_corr': 0.8830760135726944,
 'eval_runtime': 3.3232,
 'eval_samples_per_second': 265.409,
 'eval_steps_per_second': 16.851,
 'epoch': 6.0}

### RoBERTa Model Fine-Tuning

In [34]:
from transformers import AutoTokenizer
# Rebind the notebook-level `tokenizer` to the RoBERTa checkpoint. tokenize_function reads
# this global at call time, so the splits must be tokenized again after this cell.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [35]:
# Sanity check: tokenize one raw example with the RoBERTa tokenizer.
# NOTE(review): in the recorded output the same four-id run appears where the text
# contains the literal "[SEP]" marker, which suggests it is split into ordinary
# sub-tokens rather than mapped to a separator id — confirm this is acceptable.
tokenizer(dataset["train"][0]["text"])

{'input_ids': [0, 35396, 3320, 12183, 35, 132, 646, 3388, 510, 742, 42944, 35, 653, 1713, 109, 47, 109, 23, 334, 116, 646, 3388, 510, 742, 19121, 35, 38, 437, 7739, 3034, 2866, 8, 4675, 6, 38, 437, 2239, 8326, 11991, 101, 39825, 6, 46900, 6, 740, 42964, 6, 8, 38, 437, 608, 695, 11, 42492, 1553, 709, 8, 3748, 709, 6, 38, 437, 67, 2239, 414, 2866, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [36]:
# Re-tokenize the three splits with the tokenizer currently bound to `tokenizer`,
# in the order train, test, valid.
tokenized_train, tokenized_test, tokenized_valid = (
    dataset[split].map(tokenize_function, batched=True)
    for split in ("train", "test", "valid")
)

Map:   0%|          | 0/7059 [00:00<?, ? examples/s]

Map:   0%|          | 0/882 [00:00<?, ? examples/s]

Map:   0%|          | 0/883 [00:00<?, ? examples/s]

In [37]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer
# RoBERTa encoder with a sequence-classification head sized to `num_labels` outputs.
# This rebinds `model`, replacing the BERT model used earlier in the notebook.
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=num_labels)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
args = TrainingArguments(
    # --- Checkpointing: save every 250 steps, with save_total_limit set to 4 ---
    output_dir="../../../model_saved/roberta-ft-efcamdat-augmented",
    save_strategy="steps",
    save_steps=250,
    save_total_limit=4,
    # --- Evaluation: same 250-step interval as saving (IMPORTANT: keep them aligned) ---
    evaluation_strategy="steps",
    eval_steps=250,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # --- Optimisation: linear schedule with 10% warm-up ---
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=6,
    # --- Batching, precision and logging ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    logging_steps=100,
)



In [39]:
# Trainer for the RoBERTa model; this rebinds `trainer`, so later cells that use
# `trainer` operate on this model.
# NOTE(review): the evaluation dataset given here is the "test" split, while the
# "valid" split is used for the per-item analysis further down — confirm this
# split usage is intended.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [40]:
# Run fine-tuning of the RoBERTa model with the arguments configured above.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=2652, training_loss=0.4273020706924738, metrics={'train_runtime': 555.7837, 'train_samples_per_second': 76.206, 'train_steps_per_second': 4.772, 'total_flos': 1.1144105805920256e+16, 'train_loss': 0.4273020706924738, 'epoch': 6.0})

In [41]:
# Evaluate without an explicit dataset, i.e. on the eval_dataset given to the Trainer (tokenized_test).
# NOTE(review): with load_best_model_at_end=True this presumably scores the best checkpoint by f1 — confirm.
trainer.evaluate()

{'eval_loss': 0.8538001775741577,
 'eval_accuracy': 0.7766439909297053,
 'eval_precision': 0.7833616463198084,
 'eval_recall': 0.7766439909297053,
 'eval_f1': 0.7774833635057077,
 'eval_cohen_kappa': 0.8961203926498139,
 'eval_pearson_corr': 0.8996142462785286,
 'eval_runtime': 3.0751,
 'eval_samples_per_second': 286.823,
 'eval_steps_per_second': 18.211,
 'epoch': 6.0}

### Evaluation dataset analysis

In [42]:
# Distinct prompt ids (item_id) present in the validation split.
# Sorted so that the per-item results are always produced in the same order;
# plain set iteration order is not guaranteed.
list_topic = dataset["valid"]["item_id"]
list_t_set = set(list_topic)
unique_t = sorted(list_t_set)


In [43]:
list_r = []

# Per-prompt breakdown: for every item_id in `unique_t`, score only that prompt's
# validation examples with the current `trainer` and record one row of metrics.
for t in unique_t:
    sub_ds = tokenized_valid.filter(lambda example: example['item_id'] == t)

    # Raw logits from the trainer, arg-maxed over the class axis to get predicted labels.
    predictions = trainer.predict(sub_ds)
    outputs = predictions.predictions
    predicted_labels = np.argmax(outputs, axis=-1)
    ref_label = predictions.label_ids

    # All metrics take (reference, prediction) in that order.
    accuracy = accuracy_score(ref_label, predicted_labels)
    # zero_division=0 yields the same value as the default for classes that are
    # never predicted in this subset, without emitting a warning per item.
    precision, recall, f1, _ = precision_recall_fscore_support(
        ref_label, predicted_labels, average="weighted", zero_division=0
    )
    ck = round(cohen_kappa_score(ref_label, predicted_labels, weights="quadratic"), 2)

    # Pearson's r is undefined when either side has fewer than two distinct values
    # (single-sample item, or constant labels/predictions); record it as missing.
    if np.unique(ref_label).size < 2 or np.unique(predicted_labels).size < 2:
        pearson_corr = float("nan")
    else:
        pearson_corr, _ = pearsonr(ref_label, predicted_labels)

    r = {
        "item_id": t,
        # level / synthetic are taken from the first example of the item.
        "level": sub_ds["level"][0],
        "synthetic": sub_ds["synthetic"][0],
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "ck": ck,
        "pearson": pearson_corr,
        "n_samples": len(sub_ds)
    }
    list_r.append(r)



Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

Filter:   0%|          | 0/883 [00:00<?, ? examples/s]

In [44]:
import pandas as pd
# Gather the per-item metric rows into a table with a fixed column order, then preview it.
result_columns = [
    "item_id",
    "level",
    "synthetic",
    "accuracy",
    "precision",
    "recall",
    "f1",
    "ck",
    "pearson",
    "n_samples",
]
df_eval_results = pd.DataFrame(list_r, columns=result_columns)
df_eval_results.head(n=10)

Unnamed: 0,item_id,level,synthetic,accuracy,precision,recall,f1,ck,pearson,n_samples
0,0,1,False,0.75,0.79375,0.75,0.745022,0.8,0.846947,20
1,1,1,False,1.0,1.0,1.0,1.0,1.0,1.0,16
2,2,1,False,0.666667,0.763889,0.666667,0.660317,0.57,0.658145,18
3,3,1,False,0.708333,0.836806,0.708333,0.700674,0.8,0.825914,24
4,4,1,False,0.833333,0.883333,0.833333,0.831625,0.84,0.855049,18
5,5,2,False,0.777778,0.888889,0.777778,0.795062,0.88,0.875,18
6,6,2,False,0.85,0.886111,0.85,0.856495,0.88,0.896312,20
7,7,2,False,0.777778,0.866667,0.777778,0.77886,0.84,0.850427,18
8,8,2,False,0.625,0.678125,0.625,0.607819,0.74,0.824151,16
9,9,2,False,0.789474,0.815789,0.789474,0.781287,0.89,0.908601,19


In [45]:
# Write the per-item results to CSV without the DataFrame index.
# NOTE(review): the file name says "roberta"; the rows come from whichever `trainer`
# was bound when the per-item loop ran — keep the two in sync.
df_eval_results.to_csv("result_eval_data_roberta_efcamdat_augmented.csv", index=False)


### Hyperparameter optimization

In [22]:
#!pip install ray
#!pip install "ray[tune]"

In [23]:
import ray
from pprint import pprint

ModuleNotFoundError: No module named 'ray'

In [None]:
# Start Ray with an explicit temp directory.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
ray.init(_temp_dir="/home/ec2-user/model_saved/ray_tmp")

In [None]:
# Show the resources Ray reports for the cluster.
pprint(ray.cluster_resources())

In [None]:
# Scaling knobs read by the training function and the Ray trainer configuration below.
use_gpu = True  # set this to False to run on CPUs
num_workers = 1  # set this to number of GPUs or CPUs you want to use

In [None]:
import ray.data

# Convert each Hugging Face split into a Ray Dataset.
# The splits loaded at the top of the notebook live in `dataset` under the keys
# "train", "valid" and "test"; the validation split is exposed here as "validation".
ray_datasets = {
    "train": ray.data.from_huggingface(dataset["train"]),
    "validation": ray.data.from_huggingface(dataset["valid"]),
    "test": ray.data.from_huggingface(dataset["test"]),
}
ray_datasets

In [None]:
import numpy as np
from typing import Dict

# Tokenize input sentences
def collate_fn(examples: Dict[str, np.array]):
    """Turn a batch of raw examples into tokenized PyTorch tensors.

    The texts are tokenized with the notebook-level ``tokenizer``, truncated, and
    padded to the longest sequence of the batch. The labels are added under the
    ``"labels"`` key as a ``LongTensor``.

    Args:
        examples: Batch mapping with a ``"text"`` and a ``"label"`` entry.

    Returns:
        The tokenizer output with an extra ``"labels"`` tensor. Tensors are moved
        to the GPU only when ``use_gpu`` is set and CUDA is available; otherwise
        they stay on the CPU.
    """
    # Imported here because this cell comes before the notebook's `import torch` cell.
    import torch

    outputs = tokenizer(
        list(examples["text"]),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    outputs["labels"] = torch.LongTensor(examples["label"])

    # Move all input tensors to GPU, but only when a GPU run was requested and a
    # CUDA device actually exists (an unconditional .cuda() fails on CPU-only hosts).
    if use_gpu and torch.cuda.is_available():
        for key, value in outputs.items():
            outputs[key] = value.cuda()

    return outputs

In [None]:
import torch
import numpy as np

from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback


# Configuration shared by the Ray training function below.
model_checkpoint = "bert-base-uncased"
task = "review"
batch_size = 16

num_labels = 5
metric_name = (
    "accuracy"
)
# Last path component of the checkpoint id (identical to it when there is no "/").
model_name = model_checkpoint.split("/")[-1]

# Run name; passed as the first positional argument of TrainingArguments in train_func.
name = f"{model_name}-finetuned-{task}"

# Calculate the maximum steps per epoch based on the number of rows in the training dataset.
# Make sure to scale by the total number of training workers and the per device batch size.
max_steps_per_epoch = ray_datasets["train"].count() // (batch_size * num_workers)


def train_func(config):
    """Per-worker training function handed to Ray's ``TorchTrainer``.

    Loads the tokenizer and the sequence-classification model for
    ``model_checkpoint``, reads the "train" and "eval" dataset shards as tokenized
    torch batches, and runs a Hugging Face ``Trainer`` that reports to Ray through
    ``RayTrainReportCallback``.

    Args:
        config: Hyperparameter dict. Optional keys: ``"learning_rate"``
            (default 2e-5), ``"epochs"`` (default 2) and ``"weight_decay"``
            (default 0.01).
    """
    # Imported locally so the function does not depend on an earlier notebook cell.
    import evaluate

    print(f"Is CUDA available: {torch.cuda.is_available()}")

    # Use the metric named by `metric_name` instead of the GLUE/CoLA metric, which
    # belongs to a different task than this multi-class classifier.
    metric = evaluate.load(metric_name)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    def collate_batch(batch):
        """Tokenize one batch with the tokenizer loaded for ``model_checkpoint``.

        Using the local tokenizer (rather than a notebook-level one that may have
        been rebound to another checkpoint) keeps the token ids consistent with
        the model's vocabulary.
        """
        encoded = tokenizer(
            list(batch["text"]),
            truncation=True,
            padding="longest",
            return_tensors="pt",
        )
        encoded["labels"] = torch.LongTensor(batch["label"])

        # Move tensors to the GPU only for GPU runs on a host that has one.
        if use_gpu and torch.cuda.is_available():
            for key, value in encoded.items():
                encoded[key] = value.cuda()

        return encoded

    train_ds = ray.train.get_dataset_shard("train")
    eval_ds = ray.train.get_dataset_shard("eval")

    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_batch
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_batch
    )

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    epochs = config.get("epochs", 2)

    args = TrainingArguments(
        name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=epochs,
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        # Explicit step budget derived from the dataset row count and the epoch count.
        max_steps=max_steps_per_epoch * epochs,
        disable_tqdm=True,  # declutter the output a little
        no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
        report_to="none",
    )

    def compute_metrics(eval_pred):
        """Arg-max the logits and score them against the labels with ``metric``."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model,
        args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Forward the Trainer's reports to Ray Train.
    trainer.add_callback(RayTrainReportCallback())

    trainer = prepare_trainer(trainer)

    print("Starting training")
    trainer.train()

In [None]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

# Worker layout: `num_workers` workers, each requesting one GPU and one CPU.
scaling_config = ScalingConfig(
    num_workers=num_workers,
    resources_per_worker={"GPU": 1, "CPU": 1},
    use_gpu=use_gpu,
)

# Datasets exposed to train_func under the shard names it requests ("train" and "eval").
trainer_datasets = {
    "train": ray_datasets["train"],
    "eval": ray_datasets["validation"],
}

# Checkpoint policy: keep one checkpoint, scored by "eval_loss" with order "min".
run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=1,
        checkpoint_score_attribute="eval_loss",
        checkpoint_score_order="min",
    ),
)

# NOTE: this rebinds `trainer`, which earlier cells use for the Hugging Face Trainer.
trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    datasets=trainer_datasets,
    run_config=run_config,
)

In [None]:
# Launch the Ray training run and keep the returned result object.
result = trainer.fit()

In [None]:
# Display the result returned by trainer.fit().
result