<a href="https://colab.research.google.com/github/IdanKanat/COVID_NLP_Advanced_DL_Project/blob/main/AdvancedTopicsDL_Project_IdanKanat%26IdoShahar_COVID_NLP_21.8.2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%run ./01_EDA_and_data_preprocessing.ipynb
%run ./02_finetune_WITHOUT_HF_Trainer.ipynb



ModuleNotFoundError: No module named 'google.colab'

ModuleNotFoundError: No module named 'google.colab'

## **HP Tuning using HuggingFace Trainer**

In [None]:
# Instantiate the four evaluation metrics (accuracy, precision, recall, F1) from the
# `evaluate` library once, so that compute_metrics can reuse them on every evaluation.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

# Metric callback handed to the Hugging Face Trainer
def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into a dictionary of scores.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``.
            The predicted class is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are macro-averaged over the classes.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Arguments shared by every metric call
    shared_kwargs = {"predictions": predicted_classes, "references": labels}

    scores = {"accuracy": accuracy_metric.compute(**shared_kwargs)["accuracy"]}
    macro_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for score_name, metric in macro_metrics:
        scores[score_name] = metric.compute(**shared_kwargs, average="macro")[score_name]
    return scores

In [None]:
# Objective function for Optuna hyperparameter tuning
def objective_HF(trial, architecture):
    """Run a single Optuna trial: fine-tune a tweet classifier with the HF Trainer.

    The trial samples a hyperparameter combination, freezes the RoBERTa backbone
    except for its last ``num_layers_finetune`` encoder layers (the classification
    head is always trainable), trains on the "train_reduced" split with early
    stopping, and scores the reloaded best checkpoint on the "validation" split.
    Every trial is logged as its own Weights & Biases run.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.
        architecture: "twitter-roberta-base" selects "cardiffnlp/twitter-roberta-base";
            any other value falls back to "vinai/bertweet-base".

    Returns:
        The "eval_accuracy" value returned by ``trainer.evaluate()``
        (0.0 if that key is missing) - the quantity the study maximizes.

    Raises:
        optuna.exceptions.TrialPruned: if the validation accuracy is NaN.
    """

    # Initializing the model & tokenizer from HF, depending on the specified architecture:
    if architecture == "twitter-roberta-base":
        model_name = "cardiffnlp/twitter-roberta-base"
        pretokenized_dir = ("data/tokenized_twitter_roberta_base")  # folder holding the pre-tokenized dataset
    else:
        model_name = "vinai/bertweet-base"
        pretokenized_dir = ("data/tokenized_bertweet_base")  # folder holding the pre-tokenized dataset

    # Load model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5) # 5 labels for the 5 sentiments
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base_model = model.roberta # Base model for both models (RoBERTa-Base-Tweet & BERTweet-Base) - RoBERTa

    # Hyperparameter search space
    # suggest_float(..., log=True) is the non-deprecated replacement for suggest_loguniform
    # and samples from the same log-uniform distribution.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience = trial.suggest_int("patience", 7, 10)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    num_layers_finetune = trial.suggest_int("num_layers_finetune", 0, 3)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "polynomial"])

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir)  # loads the pre-tokenized HF dataset splits saved on disk
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here - the collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Freezing and Unfreezing layers: freeze the whole backbone, then re-enable
    # the last `num_layers_finetune` encoder layers and the classification head.
    for p in base_model.parameters():
        p.requires_grad = False
    if num_layers_finetune > 0:  # safety guard: layer[-0:] would select ALL layers
        for p in base_model.encoder.layer[-num_layers_finetune:].parameters():
            p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # If a W&B run is still active (e.g. left over from a previous trial), close it first.
    if wandb.run is not None:
        wandb.finish()

    # Initialize Weights & Biases - the values in the config are the properties of each trial.
    wandb.init(
        project=f"{architecture}_HF_CORONA_NLP_Twitter_Sentiment_Analysis_14.8.2025_FULL_HP_TUNING",
        entity="idoshahar96-tel-aviv-university",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers_finetune": num_layers_finetune,
            "lr_scheduler_type": lr_scheduler_type,
            "architecture": architecture,
            "dataset": "CORONA-NLP-Train_Twitter-Sentiment-Analysis"
        },
        name=f"trial_{trial.number}",
        reinit=True
    )

    # TrainingArguments for the Hugging Face Trainer
    # NOTE(review): the trial folder name does not include `architecture`, so studies run
    # for different architectures write to the same HF-results/trial_<n> folders - confirm
    # that overwriting is intended.
    training_args = TrainingArguments(
        output_dir=f"HF-results/trial_{trial.number}",  # where checkpoints will be saved
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_train_epochs=20,           # Setting the number of epochs for training - 20
        eval_strategy="epoch",        # evaluate at the end of each epoch
        save_strategy="epoch",        # save a checkpoint at the end of each epoch
        logging_strategy="epoch",     # log metrics at the end of each epoch
        load_best_model_at_end=True,  # reload the best checkpoint (based on metric_for_best_model)
        metric_for_best_model="accuracy", # optimize w.r.t accuracy
        greater_is_better=True,
        save_total_limit=1,           # limit the number of checkpoints kept on disk
        report_to="wandb",            # log to Weights & Biases
        lr_scheduler_type=lr_scheduler_type
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train_reduced"],
        eval_dataset=ds["validation"],
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
    )

    # Train the model
    trainer.train()

    # Save best trial results
    trainer.save_model(f"HF-results/trial_{trial.number}")  # ensures config.json + weights are there
    tokenizer.save_pretrained(f"HF-results/trial_{trial.number}")

    # Evaluate the best model on the validation set
    eval_metrics = trainer.evaluate()
    wandb.finish()

    # Optuna uses the validation accuracy as the optimization target
    acc = eval_metrics.get("eval_accuracy", 0.0)
    if np.isnan(acc):
        raise optuna.exceptions.TrialPruned()

    return acc

#### **RoBERTa-Base-Tweet:**

In [None]:
# Creating an Optuna Study - RoBERTa -Base-Tweet (rec5):
import os  # local import: only needed for the output-folder check below

# Define the path to save the file in Google Drive with REC5 naming.
# The path is resolved and checked BEFORE the study starts, so an undefined
# `basic_drive_path` or a missing folder fails immediately rather than after all trials.
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_model_roberta_base_tweet_rec5_hyperparams.json"
if not os.path.isdir(hp_root):
    raise FileNotFoundError(f"Hyperparameter output folder does not exist: {hp_root}")

study_roberta_base_tweet_rec5 = optuna.create_study(direction="maximize")  # Specifies that the goal of the optimization is to maximize the objective function - accuracy in our case.
study_roberta_base_tweet_rec5.optimize(lambda trial: objective_HF(trial, "twitter-roberta-base"), n_trials=12) # Specified 12 trials

print("Best objective value (validation accuracy):", study_roberta_base_tweet_rec5.best_value)
print("The chosen HP combination:", study_roberta_base_tweet_rec5.best_params)
print("Trial number of the best objective (validation accuracy) value:", study_roberta_base_tweet_rec5.best_trial.number)

# Persist the best hyperparameter combination as JSON
with open(drive_path, "w") as f:
    json.dump(study_roberta_base_tweet_rec5.best_params, f)

print(f"\nBest hyperparameters saved to {drive_path}")

#### **BERTweet-Base:**

In [None]:
# Creating an Optuna Study - BERTweet-Base (rec5):
import os  # local import: only needed for the output-folder check below

# Define the path to save the file in Google Drive with REC5 naming.
# The path is resolved and checked BEFORE the study starts, so an undefined
# `basic_drive_path` or a missing folder fails immediately rather than after all trials.
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_model_bertweet_base_rec5_hyperparams.json"
if not os.path.isdir(hp_root):
    raise FileNotFoundError(f"Hyperparameter output folder does not exist: {hp_root}")

study_bertweet_base_rec5 = optuna.create_study(direction="maximize")  # Specifies that the goal of the optimization is to maximize the objective function - accuracy in our case.
study_bertweet_base_rec5.optimize(lambda trial: objective_HF(trial, "bertweet-base"), n_trials=12) # Specified 12 trials

print("Best objective value (validation accuracy):", study_bertweet_base_rec5.best_value)
print("The chosen HP combination:", study_bertweet_base_rec5.best_params)
print("Trial number of the best objective (validation accuracy) value:", study_bertweet_base_rec5.best_trial.number)

# Persist the best hyperparameter combination as JSON
with open(drive_path, "w") as f:
    json.dump(study_bertweet_base_rec5.best_params, f)

print(f"\nBest hyperparameters saved to {drive_path}")

## **Final Training using HuggingFace Trainer**

After finding the best trial (hyperparameter combination) with the `objective_HF` function, the `train_model_with_hyperparams_HF` function is called to train the final model using that hyperparameter combination. It closely mirrors the way each model was trained under each trial's specification inside the Optuna-based `objective_HF` function. In addition, this function saves the trained model and is generalized to work with each model architecture. Note that in this function the dataset passed to the Trainer as the evaluation ("validation") set is the actual test set.

In [None]:
def train_model_with_hyperparams_HF(architecture, best_params, save_path):
    """Fine-tune one architecture with a fixed hyperparameter set and save the result.

    The model is trained on the shuffled concatenation of the "train_reduced" and
    "validation" splits and evaluated after every epoch on the "test" split. Early
    stopping and best-checkpoint selection therefore use the test-set accuracy.
    The run is logged to Weights & Biases under the name "FINAL_TRAINING".

    Args:
        architecture: "twitter-roberta-base" selects "cardiffnlp/twitter-roberta-base";
            any other value falls back to "vinai/bertweet-base".
        best_params: Mapping with the keys "learning_rate", "weight_decay", "patience",
            "batch_size", "num_layers_finetune" and "lr_scheduler_type".
        save_path: Directory used both for the Trainer checkpoints and for the final
            saved model and tokenizer.

    Returns:
        None. The model and tokenizer are written to ``save_path``.
    """
    # Pick the HF checkpoint and the folder holding the matching pre-tokenized dataset.
    uses_twitter_roberta = architecture == "twitter-roberta-base"
    model_name = "cardiffnlp/twitter-roberta-base" if uses_twitter_roberta else "vinai/bertweet-base"
    pretokenized_dir = (
        "data/tokenized_twitter_roberta_base" if uses_twitter_roberta else "data/tokenized_bertweet_base"
    )

    # Classifier with a 5-way head (one output per sentiment) and its tokenizer
    classifier_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    encoder_backbone = classifier_model.roberta  # both checkpoints expose a RoBERTa backbone

    # Load the pre-tokenized splits and enforce int64 columns with torch-formatted output
    dataset_splits = load_from_disk(pretokenized_dir)
    for split_name in list(dataset_splits):
        prepared_split = (
            dataset_splits[split_name]
            .cast_column("input_ids", Sequence(Value("int64")))
            .cast_column("attention_mask", Sequence(Value("int64")))
            .cast_column("labels", Value("int64"))
        )
        prepared_split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        dataset_splits[split_name] = prepared_split

    # Final training data: train + validation merged, then shuffled with a fixed seed
    full_train_dataset = concatenate_datasets(
        [dataset_splits["train_reduced"], dataset_splits["validation"]]
    ).shuffle(seed=42)

    # Dynamic padding only - batches are padded at collation time, nothing is re-tokenized
    padding_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Freeze the backbone, then unfreeze the last N encoder layers and the classifier head
    encoder_backbone.requires_grad_(False)
    layers_to_unfreeze = best_params["num_layers_finetune"]
    if layers_to_unfreeze > 0:  # guard: a "-0" slice would select every layer
        encoder_backbone.encoder.layer[-layers_to_unfreeze:].requires_grad_(True)
    classifier_model.classifier.requires_grad_(True)

    # Close any W&B run that is still active before opening a new one
    if wandb.run is not None:
        wandb.finish()

    # W&B run config: the tuned hyperparameters plus run metadata
    tracked_keys = (
        "learning_rate",
        "weight_decay",
        "patience",
        "batch_size",
        "num_layers_finetune",
        "lr_scheduler_type",
    )
    run_config = {key: best_params[key] for key in tracked_keys}
    run_config["architecture"] = architecture
    run_config["dataset"] = "CORONA-NLP-Train_Twitter-Sentiment-Analysis"

    wandb.init(
        project=f"{architecture}_HF_CORONA_NLP_Twitter_Sentiment_Analysis_19.8.2025_FULL_TRAINING",
        entity="idoshahar96-tel-aviv-university",
        config=run_config,
        name="FINAL_TRAINING",
        reinit=True,
    )

    # Trainer configuration: evaluate, checkpoint and log once per epoch (25 epochs max)
    per_device_batch = best_params["batch_size"]
    training_args = TrainingArguments(
        output_dir=save_path,
        per_device_train_batch_size=per_device_batch,
        per_device_eval_batch_size=per_device_batch,
        learning_rate=best_params["learning_rate"],
        weight_decay=best_params["weight_decay"],
        lr_scheduler_type=best_params["lr_scheduler_type"],
        num_train_epochs=25,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,       # reload the best checkpoint when training ends
        metric_for_best_model="accuracy",  # "best" is judged by accuracy
        greater_is_better=True,
        save_total_limit=1,
        report_to="wandb",
    )

    early_stopping = EarlyStoppingCallback(early_stopping_patience=best_params["patience"])
    trainer = Trainer(
        model=classifier_model,
        args=training_args,
        train_dataset=full_train_dataset,
        eval_dataset=dataset_splits["test"],  # the test split serves as the evaluation set
        tokenizer=tokenizer,
        data_collator=padding_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    trainer.train()

    # Persist the model and tokenizer side by side
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    wandb.finish()

#### **RoBERTa-Base-Tweet:**

In [None]:
# best_params = study_roberta_base_tweet_rec5.best_params  # get best HPs from the model's Optuna study
best_params = {'learning_rate': 0.0000860370374400373, 'weight_decay': 0.00008459884214639005, 'patience': 10, 'batch_size': 128, 'num_layers_finetune': 3, 'lr_scheduler_type': 'polynomial'} # Manually typed the best_params for future use
name_path = "/best_model_roberta_base_tweet_rec5"
save_path = model_root + name_path # initialize & define save path for the model's weights

# Training the Model (1), using Optuna-study's best trial HPs - RoBERTa-Base-Tweet:
train_model_with_hyperparams_HF(architecture="twitter-roberta-base", best_params=best_params,save_path=save_path)

# Zip the whole model folder
shutil.make_archive(save_path, "zip", save_path)

# Download the zip to your computer.
# The browser download helper exists only inside Google Colab; elsewhere the archive
# is simply left on disk instead of crashing the cell after training has finished.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab - model archive kept at {save_path}.zip")
else:
    files.download(f"{save_path}.zip")

#### **BERTweet-Base:**

In [None]:
# best_params = study_bertweet_base_rec5.best_params  # get best HPs from the model's Optuna study
best_params = {'learning_rate': 7.668855564109297e-05, 'weight_decay': 4.8978169582912055e-06, 'patience': 9, 'batch_size': 64, 'num_layers_finetune': 3, 'lr_scheduler_type': 'linear'} # Manually typed the best_params for future use
name_path = "/best_model_bertweet_base_rec5"
save_path = model_root + name_path # initialize & define save path for the model's weights

# Training the Model (2), using Optuna-study's best trial HPs - BERTweet-Base:
train_model_with_hyperparams_HF(architecture="bertweet-base", best_params=best_params,save_path=save_path)

# Zip the whole model folder
shutil.make_archive(save_path, "zip", save_path)

# Download the zip to your computer.
# The browser download helper exists only inside Google Colab; elsewhere the archive
# is simply left on disk instead of crashing the cell after training has finished.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab - model archive kept at {save_path}.zip")
else:
    files.download(f"{save_path}.zip")