## BERT Fine-Tuning

### Setup

In [None]:
# !pip install evaluate

### Load Data and Preprocessing

In [None]:
from datasets import load_dataset, DatasetDict
# Load the Yelp full-review dataset by its Hugging Face Hub identifier.
raw_dataset = load_dataset("Yelp/yelp_review_full")
# Echo the loaded object so the notebook shows its splits.
raw_dataset

In [None]:
# Work on small fixed samples instead of the full dataset: 1,000 rows for
# training (from the "train" split) and two disjoint 1,000-row slices of the
# "test" split for testing and evaluation.
indices_1 = range(1000)
indices_2 = range(1001, 2001)
indices_3 = range(2002, 3002)

dataset_dict = dict(
    train=raw_dataset["train"].select(indices_1),
    test=raw_dataset["test"].select(indices_2),
    eval=raw_dataset["test"].select(indices_3),
)

# NOTE: this rebinds `raw_dataset` to the sampled splits, so re-running the
# cell without reloading the full dataset selects from the samples instead.
raw_dataset = DatasetDict(dataset_dict)
raw_dataset

In [None]:
# Inspect the first training example (a dict with the "text" and "label" fields used below).
raw_dataset["train"][0]

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that matches the "bert-base-uncased" checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Sanity check: tokenize the first training review and display the encoding.
tokenizer(raw_dataset["train"][0]["text"])

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Every sequence is padded to, and truncated at, a fixed length of 128
    tokens. Uses the notebook-level ``tokenizer``.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

In [None]:
# Tokenize the training sample; batched=True passes batches of rows to tokenize_function.
tokenized_train = raw_dataset["train"].map(tokenize_function, batched=True)

In [None]:
# Tokenize the test sample the same way as the training sample.
tokenized_test = raw_dataset["test"].map(tokenize_function, batched=True)

In [None]:
# Size the classifier head from the dataset schema rather than from the
# labels that happen to occur in the 1,000-row training sample: if a class
# were missing from the sample, counting observed labels would produce too
# few outputs and label ids could fall outside the head's range.
label_feature = raw_dataset["train"].features["label"]
unique_labels = set(raw_dataset["train"]["label"])
# Fall back to the observed-label count when the column is not a ClassLabel
# (i.e. the feature exposes no `num_classes`).
num_labels = getattr(label_feature, "num_classes", None) or len(unique_labels)
num_labels

### Model Fine-Tuning with Trainer

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
# Load "bert-base-uncased" as a sequence classifier with `num_labels` output classes.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

In [None]:
import numpy as np 
import evaluate

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        (taken over the last axis of the logits) against the labels.
    """
    class_scores, gold_labels = eval_pred
    predicted_labels = np.argmax(class_scores, axis=-1)
    return metric.compute(predictions=predicted_labels, references=gold_labels)

In [None]:
# Configuration for the single-process Hugging Face Trainer run.
args = TrainingArguments(
    output_dir="../../model_saved/bert-ft-review",
    # Optimisation hyperparameters.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on the same per-epoch schedule, then restore
    # the checkpoint with the best accuracy when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
# Bundle the model, training configuration, tokenized samples and metric
# callback into a Trainer; evaluation runs against the tokenized test sample.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset (tokenized_test).
trainer.evaluate()

### Hyperparameter optimization

In [None]:
#!pip install ray
#!pip install "ray[tune]"

In [None]:
import ray
from pprint import pprint

In [None]:
# Start Ray, pointing its temporary directory at the given path.
# NOTE(review): the path is machine-specific (an ec2-user home directory) —
# adjust it when running elsewhere.
ray.init(_temp_dir="/home/ec2-user/model_saved/ray_tmp")

In [None]:
# Show the resources Ray reports for the cluster before sizing the workers.
pprint(ray.cluster_resources())

In [None]:
# Scaling knobs for the Ray Train job configured in the cells below.
use_gpu = True  # set this to False to run on CPUs
num_workers = 1  # set this to number of GPUs or CPUs you want to use

In [None]:
import ray.data

# Convert each sampled Hugging Face split into a Ray Dataset. Note that the
# Hugging Face "eval" split is exposed under the Ray key "validation".
ray_datasets = {
    ray_key: ray.data.from_huggingface(raw_dataset[hf_split])
    for ray_key, hf_split in (
        ("train", "train"),
        ("validation", "eval"),
        ("test", "test"),
    )
}
ray_datasets

In [None]:
import numpy as np
from typing import Dict

# Tokenize input sentences
def collate_fn(examples: Dict[str, np.ndarray]):
    """Turn one batch of raw rows into tokenized PyTorch tensors.

    Args:
        examples: Mapping from column name to the batch's values; the
            ``"text"`` and ``"label"`` columns are read.

    Returns:
        The tokenizer output (truncation enabled, padded to the longest
        sequence in the batch, PyTorch tensors) with an added ``"labels"``
        tensor. Tensors are moved to the GPU only when ``use_gpu`` is set
        and CUDA is available; otherwise they are left on the CPU.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported torch first.
    import torch

    outputs = tokenizer(
        list(examples["text"]),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    outputs["labels"] = torch.LongTensor(examples["label"])

    # Move all input tensors to GPU, but only for GPU runs: an unconditional
    # .cuda() raises on CPU-only machines and contradicts use_gpu=False.
    if use_gpu and torch.cuda.is_available():
        for key, value in outputs.items():
            outputs[key] = value.cuda()

    return outputs

In [None]:
import torch
import numpy as np

from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback


# Run configuration shared by the Ray training function below.
model_checkpoint = "bert-base-uncased"
task = "review"
batch_size = 16
num_labels = 5
metric_name = "accuracy"

# Output directory name: the checkpoint's last path component plus the task.
model_name = model_checkpoint.rsplit("/", 1)[-1]
name = f"{model_name}-finetuned-{task}"

# The training data is streamed to the Trainer, so the number of optimizer
# steps per epoch is derived up front: training rows divided by the global
# batch size (per-device batch size times the number of training workers).
max_steps_per_epoch = ray_datasets["train"].count() // (num_workers * batch_size)


def train_func(config):
    """Fine-tune the classifier inside one Ray Train worker.

    Builds the tokenizer, model and Hugging Face ``Trainer`` on the worker,
    feeds them from this worker's shards of the "train" and "eval" Ray
    datasets, and runs training.

    Args:
        config: Hyperparameter mapping. The optional keys
            ``"learning_rate"`` (default ``2e-5``), ``"epochs"``
            (default ``2``) and ``"weight_decay"`` (default ``0.01``)
            are read.
    """
    # Use the same `evaluate` API as the earlier single-process cells;
    # imported locally so the function carries its own dependency.
    import evaluate

    print(f"Is CUDA available: {torch.cuda.is_available()}")

    # Score with the metric named by `metric_name` ("accuracy") instead of
    # the GLUE/CoLA metric (Matthews correlation), which did not match the
    # declared metric for this 5-class review task and relied on the legacy
    # `datasets.load_metric` API.
    metric = evaluate.load(metric_name)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    # Each worker only sees its own shard of the datasets handed to the
    # Ray trainer under the "train" and "eval" keys.
    train_ds = ray.train.get_dataset_shard("train")
    eval_ds = ray.train.get_dataset_shard("eval")

    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    # Read once so num_train_epochs and max_steps cannot drift apart.
    epochs = config.get("epochs", 2)

    args = TrainingArguments(
        name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=epochs,
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        # The batch iterables have no length, so the total step count is
        # given explicitly.
        max_steps=max_steps_per_epoch * epochs,
        disable_tqdm=True,  # declutter the output a little
        no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
        report_to="none",
    )

    def compute_metrics(eval_pred):
        """Compute the configured metric from a (logits, labels) pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model,
        args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Hook the Trainer into Ray Train (result reporting callback plus
    # Ray's trainer preparation step) before starting.
    trainer.add_callback(RayTrainReportCallback())

    trainer = prepare_trainer(trainer)

    print("Starting training")
    trainer.train()

In [None]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

# Configure the Ray Train job that runs `train_func` on `num_workers` workers.
# NOTE: this rebinds `trainer`, which previously held the Hugging Face Trainer.
trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        # Request a GPU per worker only for GPU runs: asking for "GPU" while
        # use_gpu is False is an inconsistent configuration that Ray rejects.
        resources_per_worker=(
            {"GPU": 1, "CPU": 1} if use_gpu else {"CPU": 1}
        ),
        use_gpu=use_gpu,
    ),
    # Keys here are the shard names looked up inside train_func.
    datasets={
        "train": ray_datasets["train"],
        "eval": ray_datasets["validation"],
    },
    run_config=RunConfig(
        # Keep only the single checkpoint with the lowest evaluation loss.
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
)

In [None]:
# Launch the Ray Train job and keep the object returned by fit().
result = trainer.fit()

In [None]:
# Display the training result as the cell output.
result