<a href="https://colab.research.google.com/github/IdanKanat/COVID_NLP_Advanced_DL_Project/blob/main/AdvancedTopicsDL_Project_IdanKanat%26IdoShahar_COVID_NLP_21.8.2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Execute the EDA / preprocessing notebook inside this kernel before any cell below runs.
# NOTE(review): names used further down without a local definition (e.g. `device`, `label2id`,
# `train_df_reduced`, `basic_drive_path`) are presumably created by that notebook - confirm.
%run ./01_EDA_and_data_preprocessing.ipynb



ModuleNotFoundError: No module named 'google.colab'

ModuleNotFoundError: No module named 'google.colab'

## **Importing the Models - *RoBERTa-Base-Tweet* and *BERTweet-Base***

In [None]:
# Hugging Face checkpoint id shared by the tokenizer and the classifier below
model_name = "cardiffnlp/twitter-roberta-base"

# Tokenizer that belongs to this checkpoint
tokenizer_twitter_roberta_base = AutoTokenizer.from_pretrained(model_name)

# First model - RoBERTa encoder checkpoint for tweets, loaded with a 5-way
# sequence-classification head (one output per sentiment class)
roberta_tweets_1_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)
roberta_tweets_1_model = roberta_tweets_1_model.to(device)

# Last expression of the cell: lets the notebook print the architecture
roberta_tweets_1_model

## **Helper Functions**

### **Tweet Dataset Class**

In [None]:
# TweetDataset: a map-style dataset (init / len / getitem) that can be handed to a PyTorch DataLoader
class TweetDataset(Dataset):
    """Expose cleaned tweets and their integer sentiment labels by index.

    Items are returned as raw text plus label; this class itself never calls
    the tokenizer - it only stores it (together with ``max_length``) as
    attributes.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        """
        Args:
            dataframe (pd.DataFrame): must provide the 'CleanTweet' and 'Sentiment' columns
            tokenizer: HuggingFace tokenizer, stored on the instance
            max_length (int): maximum sequence length, stored on the instance
        """
        tweet_column = dataframe['CleanTweet']
        sentiment_column = dataframe['Sentiment']

        self.texts = tweet_column.tolist()
        # Sentiment names -> integer ids through the module-level label2id mapping
        self.labels = sentiment_column.map(label2id).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of tweets held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the idx-th sample as ``{"text": ..., "label": ...}``."""
        return dict(text=self.texts[idx], label=self.labels[idx])

### **Early Stopping Check**

In [None]:
# Early-stopping bookkeeping, used as a regulariser: called once per epoch with the
# current validation accuracy, it reports the best value/epoch so far and whether to stop.
def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch, current_val_accuracy, current_val_accuracy_epoch):
    """Update the best validation accuracy and decide whether training should stop.

    Args:
        patience: number of epochs without improvement that is still tolerated.
        best_val_accuracy: best validation accuracy seen before this epoch.
        best_val_accuracy_epoch: epoch at which that best value was observed.
        current_val_accuracy: validation accuracy of the current epoch.
        current_val_accuracy_epoch: index of the current epoch.

    Returns:
        tuple: ``(best_val_accuracy, best_val_accuracy_epoch, early_stop_flag)``.
        The flag is True only when the current epoch did not strictly improve
        and lies MORE than ``patience`` epochs after the best one.
    """
    # A strict improvement resets the reference point and never triggers a stop
    if current_val_accuracy > best_val_accuracy:
        return current_val_accuracy, current_val_accuracy_epoch, False

    epochs_since_best = current_val_accuracy_epoch - best_val_accuracy_epoch
    if epochs_since_best > patience:
        return best_val_accuracy, best_val_accuracy_epoch, True
    return best_val_accuracy, best_val_accuracy_epoch, False

## **Model Training**
    The train_model_with_hyperparams function trains the model using the given training and validation loaders,
    with early stopping.
    Logs training and validation performance to Weights & Biases (accuracy, precision, recall, F1-score, and confusion matrix).
    Returns the best validation accuracy and restores the best-epoch weights into the model before returning.

      Args:
        model (.from_pretrained): Transformer encoder model, imported from HuggingFace
        train_loader (DataLoader): DataLoader for training data
        val_loader (DataLoader): DataLoader for validation data
        optimizer (torch.optim.Optimizer): Optimizer
        criterion (nn.Module): Loss function
        epochs (int): Max number of epochs
        patience (int): Early stopping patience
        trial (optuna.trial.Trial): Current Optuna trial
    Returns:
        float: Best validation accuracy

In [None]:
def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial):
    """Train ``model`` with mixed precision, optional validation and early stopping.

    Every epoch runs one pass over ``train_loader``. When ``val_loader`` is given,
    the model is then evaluated on it, train/validation metrics are logged to the
    active Weights & Biases run (if there is one), and a CPU copy of the weights of
    the best validation-accuracy epoch is kept. Those weights are loaded back into
    ``model`` before returning.

    Batches must be dicts holding "input_ids", "attention_mask" and "labels"
    tensors, and the model output must expose ``.logits``.

    Args:
        model: Transformer sequence-classification model (already on its device).
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader | None): DataLoader for validation data; ``None``
            skips validation, logging and best-weight tracking.
        optimizer (torch.optim.Optimizer): Optimizer.
        criterion (nn.Module): Loss function applied to ``(logits, labels)``.
        epochs (int): Max number of epochs.
        patience (int | None): Early stopping patience; ``None`` disables early
            stopping (the best epoch is still tracked).
        trial: Current Optuna trial. Not used by the training loop itself; kept
            in the signature for callers.

    Returns:
        float: Best validation accuracy observed (0.0 if no validation was run).
    """
    # speed toggles (safe to call each time)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    device_ = next(model.parameters()).device  # robust device grab

    # Mixed precision only makes sense when the model actually lives on a GPU;
    # on CPU both the autocast context and the scaler become no-ops.
    use_amp = device_.type == "cuda"
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val_accuracy = 0.0 # Initialize best validation accuracy
    best_val_accuracy_epoch = 0 # Track epoch with the best validation accuracy
    early_stop_flag = False
    best_model_state = None # CPU snapshot of the best weights seen so far

    for epoch in range(1, epochs + 1):
        model.train() # Enable training mode
        train_loss = 0.0 # Cumulative (sample-weighted) training loss of this epoch
        total_train = 0 # Number of training samples seen this epoch
        correct_train = 0 # Number of correctly classified training samples

        train_preds = [] # Store predicted classes for metrics
        train_targets = []  # Store true labels for metrics

        for batch in train_loader: # Each batch: padded token ids, attention mask and integer labels
            # Non-blocking H2D copies (works best with pin_memory=True on DataLoader)
            input_ids      = batch["input_ids"].to(device_, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device_, non_blocking=True)
            labels         = batch["labels"].to(device_, non_blocking=True)

            optimizer.zero_grad(set_to_none=True) # Reset gradients

            # AMP forward/backward
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits  = outputs.logits
                loss    = criterion(logits, labels)

            scaler.scale(loss).backward()
            # (Optional) gradient clipping for extra stability:
            # scaler.unscale_(optimizer)
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            # Compute metrics (loss is a batch mean, so weight it by the batch size)
            bs = labels.size(0)
            train_loss += loss.item() * bs
            total_train += bs

            preds = logits.argmax(dim=1)
            correct_train += (preds == labels).sum().item()
            train_preds.extend(preds.detach().cpu().tolist())
            train_targets.extend(labels.detach().cpu().tolist())

        # max(..., 1) guards against an empty loader
        train_loss /= max(total_train, 1)
        train_accuracy = correct_train / max(total_train, 1)
        train_f1 = f1_score(train_targets, train_preds, average='macro', zero_division=0)
        train_precision = precision_score(train_targets, train_preds, average='macro', zero_division=0)
        train_recall = recall_score(train_targets, train_preds, average='macro', zero_division=0)

        # Validation check
        if val_loader is not None:
            model.eval()
            val_loss_sum = 0.0
            total_val = 0
            correct_val = 0
            val_preds, val_targets = [], []

            with torch.no_grad():
                for batch in val_loader:
                    input_ids      = batch["input_ids"].to(device_, non_blocking=True)
                    attention_mask = batch["attention_mask"].to(device_, non_blocking=True)
                    labels         = batch["labels"].to(device_, non_blocking=True)

                    # AMP also speeds up eval
                    with torch.cuda.amp.autocast(enabled=use_amp):
                        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                        logits  = outputs.logits
                        loss    = criterion(logits, labels)

                    bs = labels.size(0)
                    val_loss_sum += loss.item() * bs
                    total_val += bs

                    preds = logits.argmax(dim=1)
                    correct_val += (preds == labels).sum().item()
                    val_preds.extend(preds.detach().cpu().tolist())
                    val_targets.extend(labels.detach().cpu().tolist())

            val_loss = val_loss_sum / max(total_val, 1)
            val_accuracy = correct_val / max(total_val, 1)
            val_precision = precision_score(val_targets, val_preds, average='macro', zero_division=0)
            val_recall = recall_score(val_targets, val_preds, average='macro', zero_division=0)
            val_f1 = f1_score(val_targets, val_preds, average='macro', zero_division=0)

            # Check for Early stopping (& updates best_val_accuracy & epoch)
            if patience is not None:
                best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
                    patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
                )
            elif val_accuracy > best_val_accuracy:
                # Without early stopping the best epoch must still be tracked here;
                # otherwise the function would return 0.0 and restore the LAST epoch.
                best_val_accuracy = val_accuracy
                best_val_accuracy_epoch = epoch

            # Save best-so-far weights (>= to handle ties)
            if val_accuracy >= best_val_accuracy and total_val > 0:
                best_model_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

            # W & B logging (if active)
            if wandb.run is not None:
                wandb.log({
                    "Epoch": epoch,
                    "Train Loss": train_loss,
                    "Train Accuracy": train_accuracy,
                    "Train F1 Score": train_f1,
                    "Train Precision": train_precision,
                    "Train Recall": train_recall,
                    "Validation Loss": val_loss,
                    "Validation Accuracy": val_accuracy,
                    "Validation Precision": val_precision,
                    "Validation Recall": val_recall,
                    "Validation F1": val_f1,
                })

            if early_stop_flag:
                break

    # Restore best weights into the model before returning best_val_accuracy
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return best_val_accuracy

## **HP Tuning without HuggingFace's Trainer**

Optuna objective function for tuning the given Transformer encoder model on twitter data.

Each trial runs training with a different set of hyperparameters and logs key training & validation metrics to Weights & Biases.


In [None]:
# Objective Function for Optuna:
def objective(trial, architecture):
    """Run one Optuna trial: sample hyperparameters, train, return validation accuracy.

    A fresh classifier is loaded for every trial, the pre-tokenized dataset is read
    from disk, all encoder layers except the last ``num_layers_finetune`` are frozen,
    and the model is trained for up to 15 epochs while logging to Weights & Biases.

    Args:
        trial (optuna.trial.Trial): Current Optuna trial.
        architecture (str): "twitter-roberta-base" selects
            "cardiffnlp/twitter-roberta-base"; any other value selects
            "vinai/bertweet-base".

    Returns:
        float: Best validation accuracy of the trial (the value Optuna maximizes).
    """
    # Only the checkpoint id and the pre-tokenized data folder differ between the
    # two architectures; both expose their encoder as `model.roberta`.
    if architecture == "twitter-roberta-base":
        model_name = "cardiffnlp/twitter-roberta-base"
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        model_name = "vinai/bertweet-base"
        pretokenized_dir = "data/tokenized_bertweet_base"

    # num_labels=5 -> one output per sentiment class
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base_model = model.roberta

    # Hyperparameter suggestions (log-uniform sampling for the two float HPs).
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience = trial.suggest_int("patience", 7, 10)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    num_layers_finetune = trial.suggest_int("num_layers_finetune", 0, 3)

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir)  # Arrow-backed HF DatasetDict written by the Pre-tokenization step
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here - collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # build loaders from the pretokenized HF dataset
    train_loader = DataLoader(
        ds["train_reduced"], batch_size=batch_size, shuffle=True,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )
    val_loader = DataLoader(
        ds["validation"], batch_size=min(2*batch_size, 128), shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )

    # Freezing and unfreezing layers: freeze the whole encoder, then re-enable
    # the last `num_layers_finetune` encoder layers and the classification head
    for p in base_model.parameters():
        p.requires_grad = False
    if num_layers_finetune > 0:  # safety guard: layer[-0:] would select ALL layers
        for p in base_model.encoder.layer[-num_layers_finetune:].parameters():
            p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # Define optimizer and loss function
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Initialize Weights & Biases - the values in the config are the properties of each trial.
    wandb.init(project=f"{architecture}_CORONA_NLP_Twitter_Sentiment_Analysis_13.8.2025_FULL_HP_TUNING",
               entity = "idoshahar96-tel-aviv-university",
               config={
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "patience": patience,
        "batch_size": batch_size,
        "num_layers_finetune": num_layers_finetune,
        "architecture": architecture,
        "dataset": "CORONA-NLP-Train_Twitter-Sentiment-Analysis"},
        name=f"trial_{trial.number}") # The name that will be saved in the W&B platform

    # Train the model and get the best validation accuracy; always close the
    # W&B run, even if training fails, so the next trial starts a clean run.
    try:
        best_val_accuracy = train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs=15, patience=patience, trial=trial)
    finally:
        wandb.finish() # Finish the Weights & Biases run

    return best_val_accuracy # Return best validation accuracy as the objective to maximize

## **Pre-tokenization**

Tokenization is CPU-heavy. If we do it all over again in each of the Optuna trials, then re-tokenizing wastes time.
Pre-tokenization makes that cost zero for subsequent runs.

It includes:
Running the tokenizer once over the whole dataset and applying truncation with a fixed ceiling MAX_LEN.
For each sample i, we store a variable-length token sequence whose length len_i is min(original_len_i, MAX_LEN),
so each saved sample can have a different length.
We save the result to disk (Arrow format) with the columns input_ids, attention_mask, and labels.
So, after this step, no trial needs to call the tokenizer; every trial just loads these IDs.

In this step we are not doing any padding at all; padding happens later, at batch time:
The DataLoader pulls a batch from the disk. Then, the collator looks at the lengths in that batch, finds the longest sequence in the batch, and pads only up to this length.
This is dynamic padding: it happens per batch, at runtime, and never re-tokenizes - it only adds pad tokens so that the tensors in the batch share the same shape.
Dynamic padding keeps tensors tight to the batch's real lengths → fewer pad tokens → fewer FLOPs in the model's forward pass.

In [None]:
def _encode_sentiment(frame):
    """Return a copy of `frame` whose `Sentiment` column is replaced by an integer `label` column."""
    label_ids = frame["Sentiment"].map(label2id)
    return frame.assign(label=label_ids).drop(columns=["Sentiment"])


# Sentiment names -> integer ids (via label2id); the original Sentiment column is dropped
train_df_reduced_ = _encode_sentiment(train_df_reduced)
val_df_ = _encode_sentiment(val_df)
test_df_ = _encode_sentiment(test_df)

# SANITY CHECK - uncomment to verify the pipeline end to end on a small sample of the data
# (combine with few trials & few epochs per trial).
# train_df_reduced_ = train_df_reduced_.sample(n=300, random_state=42)  # pick only 300 training rows
# val_df_ = val_df_.sample(n=100, random_state=42)                      # pick only 100 validation rows
# test_df_ = test_df_.sample(n=100, random_state=42)                    # optional: smaller test set too

# Wrap the three pandas frames as HuggingFace Datasets inside a single DatasetDict
raw_ds = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ("train_reduced", train_df_reduced_),
        ("validation", val_df_),
        ("test", test_df_),
    )
})


def pretokenize_one(model_name: str, save_dir: str):
    """Tokenize every split of `raw_ds` once with `model_name`'s tokenizer and save it to `save_dir`.

    The truncation cap MAX_LEN is the 95th percentile of the untruncated training
    tweet lengths (in tokens), clamped to the range [64, 128]. No padding is
    applied here. The `label` column is renamed to `labels` before saving.
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # One untruncated pass over the training tweets, only to measure token counts
    untruncated = tok(train_df_reduced["CleanTweet"].tolist(), truncation=False)
    token_counts = [len(ids) for ids in untruncated["input_ids"]]
    p95_length = int(np.percentile(token_counts, 95))
    MAX_LEN = max(64, min(p95_length, 128))
    print(f"[{model_name}] MAX_LEN={MAX_LEN}")

    def _encode_batch(batch):
        # Truncate to MAX_LEN but leave padding to the collator at batch time
        return tok(batch["CleanTweet"], truncation=True, max_length=MAX_LEN, padding=False)

    tokenized = raw_ds.map(_encode_batch, batched=True, remove_columns=["CleanTweet"])
    tokenized.rename_column("label", "labels").save_to_disk(save_dir)
    print(f"Saved to: {save_dir}")

# Run once per architecture you plan to use: (checkpoint id, output folder)
for checkpoint, out_dir in (
    ("cardiffnlp/twitter-roberta-base", "data/tokenized_twitter_roberta_base"),
    ("vinai/bertweet-base", "data/tokenized_bertweet_base"),
):
    pretokenize_one(checkpoint, out_dir)

### **Running the Models**

#### ***Model (1) - RoBERTa-Base-Tweet***

In [None]:
# Creating an Optuna Study - RoBERTa-Base-Tweet (rec4):
def _roberta_base_tweet_objective(trial):
    """Bind the architecture so Optuna can call the objective with the trial only."""
    return objective(trial, "twitter-roberta-base")


# direction="maximize": the objective returns validation accuracy, which should be maximized
study = optuna.create_study(direction="maximize")
study.optimize(_roberta_base_tweet_objective, n_trials=10)  # 10 trials

In [None]:
# Documenting best hyperparameter combination - first model - RoBERTa-Base-Tweet - Rec4 code:
# Model-specific alias for the same study object, so the results stay reachable
# even if the generic name `study` is rebound later.
study_roberta_base_tweet_rec4 = study

# The alias and `study` are the same object, so the summary is printed once only.
print("Best objective value (validation accuracy):", study_roberta_base_tweet_rec4.best_value)
print("The chosen HP combination:", study_roberta_base_tweet_rec4.best_params)
print("Trial number of the best objective (validation accuracy) value:", study_roberta_base_tweet_rec4.best_trial.number)

In [None]:
import os

# Define the path to save the file in Google Drive with REC4 naming
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_roberta_base_tweet_rec4_hyperparams.json"

# open(..., "w") does not create missing parent folders, so make sure they exist first
os.makedirs(hp_root, exist_ok=True)

with open(drive_path, "w") as f:
    # Alternative when the Optuna study is still in memory:
    # json.dump(study_roberta_base_tweet_rec4.best_params, f)
    json.dump({'learning_rate': 0.0003834791389042033, 'weight_decay': 2.88286253103848e-06, 'patience': 7, 'batch_size': 128, 'num_layers_finetune': 3}, f) # Manually typed the best_params for future use

print(f"\nBest hyperparameters saved to {drive_path}")

#### ***Model (2) - BERTweet-Base***

In [None]:
# Hugging Face checkpoint id shared by the tokenizer and the classifier below
model_name = "vinai/bertweet-base"

# Tokenizer that belongs to this checkpoint
tokenizer_bertweet_base = AutoTokenizer.from_pretrained(model_name)

# Second model - BERTweet encoder checkpoint, loaded with a 5-way
# sequence-classification head (one output per sentiment class)
bertweet_base_2_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)
bertweet_base_2_model = bertweet_base_2_model.to(device)

# Last expression of the cell: lets the notebook print the architecture
bertweet_base_2_model

In [None]:
# Creating an Optuna Study - BERTweet-Base (rec4):
def _bertweet_base_objective(trial):
    """Bind the architecture so Optuna can call the objective with the trial only."""
    return objective(trial, "bertweet-base")


# direction="maximize": the objective returns validation accuracy, which should be maximized
study_bertweet_base_rec4 = optuna.create_study(direction="maximize")
study_bertweet_base_rec4.optimize(_bertweet_base_objective, n_trials=10)  # 10 trials

In [None]:
import os

# Documenting best hyperparameter combination - Second Model - BERTweet-Base - Rec4 code:

# Uncomment to print the summary when the Optuna study is still in memory:
# print("Best objective value (validation accuracy):", study_bertweet_base_rec4.best_value)
# print("The chosen HP combination:", study_bertweet_base_rec4.best_params)
# print("Trial number of the best objective (validation accuracy) value:", study_bertweet_base_rec4.best_trial.number)

# Define the path to save the file in Google Drive with REC4 naming
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_bertweet_base_rec4_hyperparams.json"

# open(..., "w") does not create missing parent folders, so make sure they exist first
os.makedirs(hp_root, exist_ok=True)

with open(drive_path, "w") as f:
    # Alternative when the Optuna study is still in memory:
    # json.dump(study_bertweet_base_rec4.best_params, f)
    json.dump({'learning_rate': 0.0001184412471705182, 'weight_decay': 1.2699696348040995e-05, 'patience': 10, 'batch_size': 128, 'num_layers_finetune': 3}, f) # Manually typed the best_params for future use

print(f"\nBest hyperparameters saved to {drive_path}")

## **Final Training WITHOUT using HuggingFace functions (Trainer)**

After finding the best trial (hyperparameter combination) using the objective function, `FINAL_train_model_with_hyperparams` is called for the final model training with the obtained hyperparameter combination. It is similar to the way we trained each model under each trial specification in the Optuna-based objective function. This additional function also supports model saving, and it is generalized to both model architectures. It's worth noting that in practice, the validation dataset in this function would be the actual test set.

In [None]:
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
# Root project folder
project_root = f"{basic_drive_path}/Project_COVID_NLP"

# Folder inside the project that holds all trained weights
model_root = project_root + "/Model_Weights"

In [None]:
def FINAL_train_model_with_hyperparams(architecture, best_params, save_path):
    """Train the final model with the chosen hyperparameters and save it to ``save_path``.

    The "train_reduced" and "validation" splits are merged into one training set;
    the "test" split is passed to the training loop as its validation loader.
    Training runs for up to 25 epochs, is logged to a Weights & Biases run named
    "FINAL_TRAINING", and the model plus tokenizer are written to ``save_path``.

    Args:
        architecture (str): "twitter-roberta-base" selects
            "cardiffnlp/twitter-roberta-base"; any other value selects
            "vinai/bertweet-base".
        best_params (dict): must contain "learning_rate", "weight_decay",
            "patience", "batch_size" and "num_layers_finetune".
        save_path (str): output folder for ``save_pretrained``.

    Returns:
        float: Best accuracy reached on the "test" split during training.
    """
    # Only the checkpoint id and the pre-tokenized data folder differ between the
    # two architectures; both expose their encoder as `model.roberta`.
    if architecture == "twitter-roberta-base":
        model_name = "cardiffnlp/twitter-roberta-base"
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        model_name = "vinai/bertweet-base"
        pretokenized_dir = "data/tokenized_bertweet_base"

    # num_labels=5 -> one output per sentiment class
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base_model = model.roberta

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir)  # Arrow-backed HF DatasetDict written by the Pre-tokenization step
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here - collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Merge train + validation for final training
    full_train_dataset = concatenate_datasets([ds["train_reduced"], ds["validation"]])
    full_train_dataset = full_train_dataset.shuffle(seed=42) # Shuffle the model's training data to add randomness

    # build loaders from the pretokenized HF dataset
    train_loader = DataLoader(
        full_train_dataset, batch_size=best_params["batch_size"], shuffle=True,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )
    # NOTE(review): the test split drives early stopping and best-weight selection
    # in the training loop, so the resulting accuracy is not an unbiased test
    # estimate - confirm this is intended.
    val_loader = DataLoader(
        ds["test"], batch_size=min(2*best_params["batch_size"], 128), shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )

    # Freezing and unfreezing layers: freeze the whole encoder, then re-enable
    # the last `num_layers_finetune` encoder layers and the classification head
    for p in base_model.parameters():
        p.requires_grad = False
    if best_params["num_layers_finetune"] > 0:  # safety guard: layer[-0:] would select ALL layers
        for p in base_model.encoder.layer[-best_params["num_layers_finetune"]:].parameters():
            p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # Define optimizer and loss function
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=best_params["learning_rate"], weight_decay=best_params["weight_decay"])

    # If a W&B run is still open (e.g. left over from an earlier cell), close it first
    if wandb.run is not None:
        wandb.finish()

    # Initialize Weights & Biases - the values in the config are the chosen hyperparameters.
    wandb.init(project=f"{architecture}_CORONA_NLP_Twitter_Sentiment_Analysis_19.8.2025_FULL_TRAINING",
               entity = "idoshahar96-tel-aviv-university",
               config={
        "learning_rate": best_params["learning_rate"],
        "weight_decay": best_params["weight_decay"],
        "patience": best_params["patience"],
        "batch_size": best_params["batch_size"],
        "num_layers_finetune": best_params["num_layers_finetune"],
        "architecture": architecture,
        "dataset": "CORONA-NLP-Train_Twitter-Sentiment-Analysis"},
        name="FINAL_TRAINING", # The name that will be saved in the W&B platform
        reinit=True)

    # Train the model; always close the W&B run, even if training fails
    try:
        best_val_accuracy = train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs=25, patience=best_params["patience"], trial=None)
    finally:
        wandb.finish() # Finish the Weights & Biases run

    # Save model and tokenizer side by side so the folder can be reloaded with from_pretrained
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    return best_val_accuracy

In [None]:
# best_params = study_roberta_base_tweet_rec4.best_params  # get best HPs from the model's Optuna study (are under # but the hashtag sign # can be removed if needed, for the sake of manual inscription of best param, check the row below)
best_params = {'learning_rate': 0.0003834791389042033, 'weight_decay': 2.88286253103848e-06, 'patience': 7, 'batch_size': 128, 'num_layers_finetune': 3} # Manually typed the best_params for future use
name_path = "/best_model_roberta_base_tweet_rec4"
save_path = model_root + name_path # initialize & define save path for the model's weights

# Training the Model (1), using Optuna-study's best trial HPs - RoBERTa-Base-Tweet:
FINAL_train_model_with_hyperparams(architecture="twitter-roberta-base", best_params=best_params,save_path=save_path)

# Zip the whole model folder (creates "<save_path>.zip")
shutil.make_archive(save_path, "zip", save_path)

# Download the zip to your computer. The browser download only exists inside Colab,
# so outside Colab the archive is simply left on disk instead of crashing after training.
# NOTE(review): assumes the `files` used here is google.colab.files - confirm.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; the archive was left at {save_path}.zip")
else:
    files.download(f"{save_path}.zip")

In [None]:
# best_params = study_bertweet_base_rec4.best_params  # get best HPs from the model's Optuna study (are under # but the hashtag sign # can be removed if needed, for the sake of manual inscription of best param, check the row below)
best_params = {'learning_rate': 0.0001184412471705182, 'weight_decay': 1.2699696348040995e-05, 'patience': 10, 'batch_size': 128, 'num_layers_finetune': 3} # Manually typed the best_params for future use
name_path = "/best_model_bertweet_base_rec4"
save_path = model_root + name_path # initialize & define save path for the model's weights

# Training the Model (2), using Optuna-study's best trial HPs - BERTweet-Base:
FINAL_train_model_with_hyperparams(architecture="bertweet-base", best_params=best_params,save_path=save_path)

# Zip the whole model folder (creates "<save_path>.zip")
shutil.make_archive(save_path, "zip", save_path)

# Download the zip to your computer. The browser download only exists inside Colab,
# so outside Colab the archive is simply left on disk instead of crashing after training.
# NOTE(review): assumes the `files` used here is google.colab.files - confirm.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; the archive was left at {save_path}.zip")
else:
    files.download(f"{save_path}.zip")