## BERT Fine-Tuning

### Setup

In [3]:
#!pip install evaluate datasets

### Load Data and Preprocessing

In [4]:
from datasets import load_dataset
from datasets import DatasetDict, Dataset

# Build one DatasetDict from the three JSON split files. Each file is read on
# its own and load_dataset exposes its rows under the "train" key, which is
# why every split is pulled out with ["train"].
dataset = DatasetDict(
    {
        split_name: load_dataset("json", data_files=json_path)["train"]
        for split_name, json_path in (
            ("train", "data/train.json"),
            ("test", "data/test.json"),
            ("valid", "data/valid.json"),
        )
    }
)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 7059
    })
    test: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 882
    })
    valid: Dataset({
        features: ['text', 'level', 'item_id', 'synthetic', 'label', '__index_level_0__'],
        num_rows: 883
    })
})

In [5]:
# Display the first training record to inspect its fields.
dataset["train"][0]

{'text': "Prompt Level: 2 [SEP] Prompt: What activities do you do at school? [SEP] Response: I'm studying computer science and engineering, I'm learning programming languages like python, java, c++, and I'm doing project in android app development and web development, I'm also learning data science",
 'level': 2,
 'item_id': 80,
 'synthetic': True,
 'label': 2,
 '__index_level_0__': 8371}

In [6]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [7]:
# Tokenize a single training text to see which fields the tokenizer returns.
tokenizer(dataset["train"][0]["text"])

{'input_ids': [101, 25732, 2504, 1024, 1016, 102, 25732, 1024, 2054, 3450, 2079, 2017, 2079, 2012, 2082, 1029, 102, 3433, 1024, 1045, 1005, 1049, 5702, 3274, 2671, 1998, 3330, 1010, 1045, 1005, 1049, 4083, 4730, 4155, 2066, 18750, 1010, 9262, 1010, 1039, 1009, 1009, 1010, 1998, 1045, 1005, 1049, 2725, 2622, 1999, 11924, 10439, 2458, 1998, 4773, 2458, 1010, 1045, 1005, 1049, 2036, 4083, 2951, 2671, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch with the notebook's tokenizer.

    Every sequence is truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry, as handed over by
            ``Dataset.map``.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

In [9]:
# Tokenize the training split in batches.
tokenized_train = dataset["train"].map(tokenize_function, batched=True)

In [10]:
# Tokenize the test split in batches.
tokenized_test = dataset["test"].map(tokenize_function, batched=True)

In [11]:
# Tokenize the validation split in batches.
tokenized_valid = dataset["valid"].map(tokenize_function, batched=True)

In [12]:
# Count the distinct label values in the training split; the count is later
# passed as `num_labels` when the classification model is created.
# NOTE(review): assumes the labels are the contiguous integers
# 0..num_labels-1 — confirm against the data.
unique_labels = set(dataset['train']['label'])
num_labels = len(unique_labels)
num_labels

5

### Model fine-tuning with Trainer

In [13]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
# Load "bert-base-uncased" with a sequence-classification head sized to num_labels.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import numpy as np 
import evaluate

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, cohen_kappa_score
from scipy.stats import pearsonr

# Load the "accuracy" metric through the `evaluate` library.
# NOTE(review): compute_metrics below uses scikit-learn/scipy directly, so this
# object looks unused in the visible notebook — confirm before removing.
metric = evaluate.load("accuracy")

Downloading builder script: 0.00B [00:00, ?B/s]

In [15]:
def compute_metrics(eval_pred):
    """Compute classification and agreement metrics for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        (precision/recall/F1 are weighted averages), ``cohen_kappa``
        (quadratically weighted) and ``pearson_corr``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Convert logits into predicted class indices

    # Accuracy
    accuracy = accuracy_score(labels, predictions)

    # Weighted precision, recall and F1. zero_division=0 yields the same values
    # as the default but without an UndefinedMetricWarning when some class is
    # never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )

    # Quadratically weighted Cohen's kappa
    cohen_kappa = cohen_kappa_score(labels, predictions, weights="quadratic")

    # Pearson correlation; pearsonr returns (coefficient, p-value) and only the
    # coefficient is kept.
    pearson_corr, _ = pearsonr(labels, predictions)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "cohen_kappa": cohen_kappa,
        "pearson_corr": pearson_corr
    }


In [49]:
# Training configuration for the Hugging Face Trainer.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="../../../model_saved/bert-ft-efcamdat-augmented",
    evaluation_strategy="steps",  # Evaluate on the same step interval as checkpoint saving
    save_strategy="steps",  # Save a checkpoint every `save_steps` steps
    save_steps=500,
    eval_steps=500,  # IMPORTANT: evaluate at the same steps as the saves
    save_total_limit=2,  # Keep at most 2 checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,  # Reload the best checkpoint once training ends
    metric_for_best_model="f1",  # "f1" is one of the keys returned by compute_metrics
    logging_steps=100,
    fp16=True,
)



In [50]:
# Wire the model, training arguments, datasets and metric function together.
# NOTE(review): the *test* split is used as eval_dataset, and the arguments
# enable load_best_model_at_end, so the checkpoint is selected on the test
# split; scores later reported on that split are therefore optimistic.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [51]:
# Run fine-tuning with the configuration above.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=1326, training_loss=0.4529891096808432, metrics={'train_runtime': 281.9305, 'train_samples_per_second': 75.114, 'train_steps_per_second': 4.703, 'total_flos': 5572052902960128.0, 'train_loss': 0.4529891096808432, 'epoch': 3.0})

In [52]:
# Evaluate on the trainer's eval_dataset (the tokenized test split).
trainer.evaluate()

{'eval_loss': 0.7070058584213257,
 'eval_accuracy': 0.7244897959183674,
 'eval_precision': 0.7370498685301681,
 'eval_recall': 0.7244897959183674,
 'eval_f1': 0.7262590048784439,
 'eval_cohen_kappa': 0.8671895334854895,
 'eval_pearson_corr': 0.8744201406528138,
 'eval_runtime': 3.2913,
 'eval_samples_per_second': 267.978,
 'eval_steps_per_second': 17.014,
 'epoch': 3.0}

### Evaluation dataset analysis

In [53]:
# Distinct item_id values of the validation split. They are sorted because a
# set has no guaranteed iteration order, and the per-item report and CSV
# should list items in a reproducible order.
list_topic = dataset["valid"]["item_id"]
list_t_set = set(list_topic)
unique_t = sorted(list_t_set)


In [56]:
list_r = []

# Score the fine-tuned model separately on every item_id of the validation
# split. `unique_t` holds the distinct item_ids and `trainer` is the
# Hugging Face Trainer built earlier in the notebook.
for t in unique_t:
    # Keep only the validation examples belonging to the current item.
    sub_ds = tokenized_valid.filter(lambda example: example['item_id'] == t)
    # Run inference on the subset with the trainer.
    predictions = trainer.predict(sub_ds)
    # Raw output logits, one row per example.
    outputs = predictions.predictions
    # Predicted class = argmax over the class axis.
    predicted_labels = np.argmax(outputs, axis=-1)
    ref_label = predictions.label_ids

    # Quadratically weighted kappa, rounded to 2 decimals. Arguments are given
    # as (reference, prediction) like every other metric call here.
    ck = round(cohen_kappa_score(ref_label, predicted_labels, weights="quadratic"), 2)
    pearson_corr, _ = pearsonr(ref_label, predicted_labels)
    accuracy = accuracy_score(ref_label, predicted_labels)
    # zero_division=0 yields the same values as the default but without an
    # UndefinedMetricWarning when a class is never predicted in a small subset.
    precision, recall, f1, _ = precision_recall_fscore_support(
        ref_label, predicted_labels, average="weighted", zero_division=0
    )

    r = {
        "item_id": t,
        # level / synthetic are read from the first row of the subset.
        "level": sub_ds["level"][0],
        "synthetic": sub_ds["synthetic"][0],
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "ck": ck,
        "pearson": pearson_corr,
        "n_samples": len(sub_ds)
    }
    list_r.append(r)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [57]:
import pandas as pd
# Turn the per-item metric dicts into a table with an explicit column order,
# then preview the first 10 rows.
df_eval_results = pd.DataFrame(list_r, columns=["item_id", "level", "synthetic", "accuracy", "precision", "recall", "f1", "ck", "pearson", "n_samples"])
df_eval_results.head(n=10)

Unnamed: 0,item_id,level,synthetic,accuracy,precision,recall,f1,ck,pearson,n_samples
0,0,1,False,0.7,0.690476,0.7,0.692308,0.77,0.776062,20
1,1,1,False,0.8125,0.859375,0.8125,0.818452,0.88,0.899101,16
2,2,1,False,0.722222,0.811508,0.722222,0.741799,0.64,0.703679,18
3,3,1,False,0.625,0.866071,0.625,0.584008,0.78,0.860165,24
4,4,1,False,0.833333,0.883333,0.833333,0.833842,0.84,0.854965,18
5,5,2,False,0.611111,0.736111,0.611111,0.622782,0.66,0.685061,18
6,6,2,False,0.9,0.905556,0.9,0.893137,0.9,0.912871,20
7,7,2,False,0.611111,0.638889,0.611111,0.615995,0.6,0.605431,18
8,8,2,False,0.625,0.6625,0.625,0.5875,0.72,0.819948,16
9,9,2,False,0.684211,0.807018,0.684211,0.672515,0.66,0.720142,19


In [58]:
# Persist the per-item results next to the notebook, without the index column.
df_eval_results.to_csv("result_eval_data_bert_efcamdat_augmented.csv", index=False)


### Hyperparameter optimization

In [None]:
#!pip install ray
#!pip install "ray[tune]"

In [None]:
import ray
from pprint import pprint

ModuleNotFoundError: No module named 'ray'

In [None]:
# Initialize Ray with a custom temporary directory.
# NOTE(review): the path is absolute and machine-specific (/home/ec2-user/...);
# it must exist and be writable wherever this notebook is run.
ray.init(_temp_dir="/home/ec2-user/model_saved/ray_tmp")

In [None]:
# Show the resources Ray reports for the cluster.
pprint(ray.cluster_resources())

In [None]:
# Worker settings read by the Ray training cells below.
use_gpu = True  # set this to False to run on CPUs
num_workers = 1  # set this to number of GPUs or CPUs you want to use

In [None]:
import ray.data

# Convert the Hugging Face splits loaded at the top of the notebook into Ray
# datasets. The notebook's DatasetDict is named `dataset` and its validation
# split is keyed "valid"; no `raw_dataset` / "eval" split is defined here.
ray_datasets = {
    "train": ray.data.from_huggingface(dataset["train"]),
    "validation": ray.data.from_huggingface(dataset["valid"]),
    "test": ray.data.from_huggingface(dataset["test"]),
}
ray_datasets

In [None]:
import numpy as np
from typing import Dict

# Tokenize input sentences
def collate_fn(examples: Dict[str, np.ndarray]):
    """Tokenize a batch of texts and attach its labels as a tensor.

    Args:
        examples: Batch mapping with a ``"text"`` entry (the raw strings) and
            a ``"label"`` entry (the integer class ids).

    Returns:
        The tokenizer output, padded to the longest sequence of the batch,
        with an extra ``"labels"`` int64 tensor. All tensors are moved to the
        GPU when CUDA is available and are left on the CPU otherwise.
    """
    # Imported locally because this cell is defined before the notebook's
    # top-level `import torch`.
    import torch

    outputs = tokenizer(
        list(examples["text"]),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    outputs["labels"] = torch.as_tensor(examples["label"], dtype=torch.long)

    # Move tensors to the GPU only when one is present, so CPU-only runs
    # (use_gpu = False) do not crash on `.cuda()`.
    if torch.cuda.is_available():
        for key, value in outputs.items():
            outputs[key] = value.cuda()

    return outputs

In [None]:
import torch
import numpy as np

from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback


# Settings shared by the Ray training function below.
model_checkpoint = "bert-base-uncased"
task = "review"
batch_size = 16

# NOTE(review): this hard-codes the label count and rebinds the `num_labels`
# computed from the training split earlier in the notebook.
num_labels = 5
metric_name = (
    "accuracy"
)
# Last path component of the checkpoint id.
model_name = model_checkpoint.split("/")[-1]

# Run name; train_func passes it as the first TrainingArguments argument.
name = f"{model_name}-finetuned-{task}"

# Calculate the maximum steps per epoch based on the number of rows in the training dataset.
# Make sure to scale by the total number of training workers and the per device batch size.
max_steps_per_epoch = ray_datasets["train"].count() // (batch_size * num_workers)


def train_func(config):
    """Training loop that Ray Train runs through ``TorchTrainer``.

    Builds the tokenizer, the classification model and a Hugging Face
    ``Trainer`` fed from the Ray dataset shards named "train" and "eval",
    attaches Ray's report callback, and starts training.

    Args:
        config: Dict of hyperparameters. Optional keys: ``"learning_rate"``
            (default 2e-5), ``"epochs"`` (default 2) and ``"weight_decay"``
            (default 0.01).
    """
    # `evaluate` is already used earlier in the notebook and replaces the
    # deprecated `datasets.load_metric`; imported locally so the function
    # carries its own dependency.
    import evaluate

    print(f"Is CUDA available: {torch.cuda.is_available()}")

    # Load the metric named by `metric_name` rather than the GLUE/CoLA metric,
    # which does not match the metric declared for this 5-class task.
    metric = evaluate.load(metric_name)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    # Dataset shards registered under these names on the TorchTrainer.
    train_ds = ray.train.get_dataset_shard("train")
    eval_ds = ray.train.get_dataset_shard("eval")

    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    epochs = config.get("epochs", 2)

    args = TrainingArguments(
        name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=epochs,
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        # The datasets are batch iterables, so the total number of steps is
        # given explicitly.
        max_steps=max_steps_per_epoch * epochs,
        disable_tqdm=True,  # declutter the output a little
        no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
        report_to="none",
    )

    def compute_metrics(eval_pred):
        """Score argmax predictions against the reference labels."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model,
        args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Report Trainer metrics and checkpoints back to Ray Train.
    trainer.add_callback(RayTrainReportCallback())

    trainer = prepare_trainer(trainer)

    print("Starting training")
    trainer.train()

In [None]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

# Distributed trainer that runs `train_func` on `num_workers` Ray workers.
# NOTE: this rebinds the notebook-level name `trainer`, which earlier cells
# use for the Hugging Face Trainer; re-running those cells afterwards needs
# the Hugging Face trainer to be rebuilt first.
trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        # Reserve a GPU per worker only when GPU training is requested;
        # asking for a GPU while use_gpu is False is an inconsistent setup.
        resources_per_worker={"GPU": 1, "CPU": 1} if use_gpu else {"CPU": 1},
        use_gpu=use_gpu,
    ),
    datasets={
        "train": ray_datasets["train"],
        "eval": ray_datasets["validation"],
    },
    run_config=RunConfig(
        checkpoint_config=CheckpointConfig(
            # Keep only the single checkpoint with the lowest eval_loss.
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
)

In [None]:
# Launch the Ray training run and keep what fit() returns.
result = trainer.fit()

In [None]:
# Display the object returned by trainer.fit().
result