In [3]:
# run_final_experiment.py
import os
import numpy as np
import pandas as pd
import torch
import evaluate
import demoji
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

# --- CONFIGURATION SECTION ---

# 1. Define the models for the bake-off
MODELS_TO_TEST = [
    "vinai/bertweet-base",
    "cardiffnlp/twitter-roberta-base-emotion"
]

# 2. Define the hyperparameter search space for the winning model
HYPERPARAMETER_SEARCH_SPACE = [
    {"learning_rate": 2e-5, "num_train_epochs": 5, "per_device_train_batch_size": 16},
    {"learning_rate": 3e-5, "num_train_epochs": 8, "per_device_train_batch_size": 16},
    {"learning_rate": 5e-5, "num_train_epochs": 8, "per_device_train_batch_size": 16},
    {"learning_rate": 3e-5, "num_train_epochs": 8, "per_device_train_batch_size": 32}, # Try if GPU has enough memory
]

# 3. Define raw data files
FRIENDS_DATA_PATH = '../data/data1.xlsx' # Your 1K high-quality data
KAGGLE_DATA_PATHS = ['../data/data2.xlsx', '../data/data3.xlsx'] # The generic data

# --- STEP 1: ADVANCED DATA PREPARATION ---

# --- STEP 1: ADVANCED DATA PREPARATION ---

print("--- STEP 1: Preparing Datasets from Excel Files ---")

# Load and combine data
df_friends = pd.read_excel(FRIENDS_DATA_PATH)
df_kaggle_list = [pd.read_excel(p) for p in KAGGLE_DATA_PATHS]
df_kaggle = pd.concat(df_kaggle_list, ignore_index=True)

# --- NEW: Standardize and Rename Columns ---
# First, make all column names lowercase and remove leading/trailing spaces
df_friends.columns = [col.strip().lower() for col in df_friends.columns]
df_kaggle.columns = [col.strip().lower() for col in df_kaggle.columns]

# Now, rename the 'entry' column to 'text' so the rest of the script works
# We will check if the 'entry' column exists before trying to rename it
if 'entry' in df_friends.columns:
    df_friends.rename(columns={'entry': 'text'}, inplace=True)
if 'entry' in df_kaggle.columns:
    df_kaggle.rename(columns={'entry': 'text'}, inplace=True)

print("Standardized Columns in Friends Data:", df_friends.columns)
print("Standardized Columns in Kaggle Data:", df_kaggle.columns)
# -------------------------------------------

# Clean and drop duplicates (This will now work)
df_friends.dropna(subset=['text', 'emotion'], inplace=True)
df_kaggle.dropna(subset=['text', 'emotion'], inplace=True)
df_friends.drop_duplicates(subset=['text'], inplace=True)
df_kaggle.drop_duplicates(subset=['text'], inplace=True)

# Isolate a "golden" test set from your friends' data
test_df = df_friends.sample(frac=0.2, random_state=42)

# The rest of the data forms the training pool
train_pool_df = pd.concat([df_friends.drop(test_df.index), df_kaggle], ignore_index=True)

# Split the pool into training and validation sets
train_df, val_df = train_test_split(train_pool_df, test_size=0.1, random_state=42, stratify=train_pool_df['emotion'])

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set (friends only): {len(test_df)}")

# Convert pandas DataFrames to Hugging Face Datasets
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

# --- STEP 2: ENHANCED PREPROCESSING & CLASS WEIGHTS ---

print("\n--- STEP 2: Setting up Preprocessing and Class Weights ---")

# Initialize demoji for emoji translation
demoji.download_codes()

def preprocess_text(batch):
    # Convert emojis to text descriptions
    batch['text'] = [demoji.replace_with_desc(str(text), sep=" ") for text in batch['text']]
    return batch

train_ds = train_ds.map(preprocess_text, batched=True)
val_ds = val_ds.map(preprocess_text, batched=True)
test_ds = test_ds.map(preprocess_text, batched=True)

# Create label mappings from the training data
unique_labels = train_df['emotion'].unique()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

def encode_labels(batch):
    batch['label'] = [label2id[label] for label in batch['emotion']]
    return batch

train_ds = train_ds.map(encode_labels, batched=True)
val_ds = val_ds.map(encode_labels, batched=True)
test_ds = test_ds.map(encode_labels, batched=True)

# Calculate class weights for handling imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(list(label2id.keys())),
    y=train_df['emotion']
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda")

# Create a Custom Trainer to use the class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Define a shared metric computation function
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels, average="weighted")

# --- CORE TRAINING FUNCTION ---

def train_and_evaluate(model_checkpoint, training_args_dict, run_name):
    print(f"\n{'='*50}\nSTARTING RUN: {run_name} | MODEL: {model_checkpoint}\n{'='*50}\n")
    
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    def tokenize_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    
    tokenized_train_ds = train_ds.map(tokenize_fn, batched=True)
    tokenized_val_ds = val_ds.map(tokenize_fn, batched=True)
    tokenized_test_ds = test_ds.map(tokenize_fn, batched=True)
    
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True  # <-- ADD THIS LINE
    )
    
    output_dir = f"./results/{run_name}"
    training_args = TrainingArguments(output_dir=output_dir, **training_args_dict)

    trainer = CustomTrainer( # Use the CustomTrainer with class weights
        model=model,
        args=training_args,
        train_dataset=tokenized_train_ds,
        eval_dataset=tokenized_val_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    
    trainer.train()
    
    # Evaluate on the sacred, friend-only test set
    test_results = trainer.evaluate(eval_dataset=tokenized_test_ds)
    print(f"\n{'*'*20} RESULTS FOR RUN: {run_name} {'*'*20}")
    print(f"Final Test F1 Score on Friends Data: {test_results['eval_f1']:.4f}")
    
    return test_results

# --- MAIN EXECUTION SCRIPT ---

if __name__ == "__main__":
    
    # --- STEP 3: The Model Bake-Off ---
    print("\n\n--- STEP 3: BASELINE MODEL BAKE-OFF ---")
    
    baseline_results = {}
    baseline_args = {
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 16,
        "num_train_epochs": 5,
        "weight_decay": 0.01,
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "save_total_limit": 1, # Only keep the single best checkpoint
        "lr_scheduler_type": 'linear',
        "warmup_ratio": 0.1,
        "seed": 42,
    }

    for model_name in MODELS_TO_TEST:
        run_name = f"baseline_{model_name.replace('/', '_')}"
        results = train_and_evaluate(model_name, baseline_args, run_name)
        baseline_results[model_name] = results['eval_f1']

    winning_model = max(baseline_results, key=baseline_results.get)
    print(f"\n\n--- BAKE-OFF COMPLETE ---\nScores: {baseline_results}\nWINNING MODEL: {winning_model}\n\n")

    # --- STEP 4: Hyperparameter Tuning the Winner ---
    print(f"--- STEP 4: HYPERPARAMETER TUNING FOR {winning_model} ---")
    
    tuning_results = []
    
    for i, params in enumerate(HYPERPARAMETER_SEARCH_SPACE):
        run_args = baseline_args.copy()
        run_args.update(params)
        
        run_name = f"tuning_{winning_model.replace('/', '_')}_run_{i+1}"
        results = train_and_evaluate(winning_model, run_args, run_name)
        
        tuning_results.append({
            "run_name": run_name,
            "params": params,
            "f1_score": results['eval_f1']
        })

    best_run = max(tuning_results, key=lambda x: x['f1_score'])
    
    print(f"\n\n{'='*60}\n--- EXPERIMENT COMPLETE ---\n")
    print(f"Best Run: {best_run['run_name']}")
    print(f"Best Hyperparameters: {best_run['params']}")
    print(f"Best Test F1 Score on Friends Data: {best_run['f1_score']:.4f}")
    print(f"\nYour final, best model is saved in: ./results/{best_run['run_name']}")
    print(f"{'='*60}")

--- STEP 1: Preparing Datasets from Excel Files ---
Standardized Columns in Friends Data: Index(['text', 'emotion'], dtype='object')
Standardized Columns in Kaggle Data: Index(['text', 'emotion'], dtype='object')
Train set size: 2509
Validation set size: 279
Test set (friends only): 200


  demoji.download_codes()



--- STEP 2: Setting up Preprocessing and Class Weights ---


Map: 100%|██████████| 2509/2509 [00:01<00:00, 2165.72 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 1945.02 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1605.58 examples/s]
Map: 100%|██████████| 2509/2509 [00:00<00:00, 384659.29 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 110773.46 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 64542.65 examples/s]




--- STEP 3: BASELINE MODEL BAKE-OFF ---

STARTING RUN: baseline_vinai_bertweet-base | MODEL: vinai/bertweet-base



Map: 100%|██████████| 2509/2509 [00:00<00:00, 4647.80 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 4282.08 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 2273.48 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 157/785 [00:47<03:36,  2.90it/s]
 20%|██        | 157/785 [00:50<03:36,  2.90it/s]

{'eval_loss': 1.3656238317489624, 'eval_f1': 0.4993275054905067, 'eval_runtime': 2.3168, 'eval_samples_per_second': 120.424, 'eval_steps_per_second': 15.107, 'epoch': 1.0}


 40%|████      | 314/785 [01:59<02:50,  2.77it/s]
 40%|████      | 314/785 [02:03<02:50,  2.77it/s]

{'eval_loss': 0.9663352966308594, 'eval_f1': 0.7099553811490019, 'eval_runtime': 3.1937, 'eval_samples_per_second': 87.36, 'eval_steps_per_second': 10.959, 'epoch': 2.0}


 60%|██████    | 471/785 [03:07<01:31,  3.41it/s]
 60%|██████    | 471/785 [03:08<01:31,  3.41it/s]

{'eval_loss': 0.868071973323822, 'eval_f1': 0.7479345638765481, 'eval_runtime': 1.4314, 'eval_samples_per_second': 194.908, 'eval_steps_per_second': 24.451, 'epoch': 3.0}


 64%|██████▎   | 500/785 [03:19<01:13,  3.89it/s]

{'loss': 1.2143, 'grad_norm': 8.512378692626953, 'learning_rate': 8.073654390934846e-06, 'epoch': 3.18}


 80%|████████  | 628/785 [04:02<00:56,  2.79it/s]
 80%|████████  | 628/785 [04:05<00:56,  2.79it/s]

{'eval_loss': 0.8703208565711975, 'eval_f1': 0.7456291924396639, 'eval_runtime': 2.5455, 'eval_samples_per_second': 109.605, 'eval_steps_per_second': 13.75, 'epoch': 4.0}


100%|██████████| 785/785 [05:04<00:00,  3.02it/s]
100%|██████████| 785/785 [05:07<00:00,  3.02it/s]

{'eval_loss': 0.8760215044021606, 'eval_f1': 0.7513380833739648, 'eval_runtime': 2.6922, 'eval_samples_per_second': 103.631, 'eval_steps_per_second': 13.0, 'epoch': 5.0}


100%|██████████| 785/785 [05:16<00:00,  3.02it/s]

{'train_runtime': 316.3042, 'train_samples_per_second': 39.661, 'train_steps_per_second': 2.482, 'train_loss': 1.0029763580127886, 'epoch': 5.0}


100%|██████████| 785/785 [05:16<00:00,  2.48it/s]
100%|██████████| 25/25 [00:01<00:00, 16.24it/s]



******************** RESULTS FOR RUN: baseline_vinai_bertweet-base ********************
Final Test F1 Score on Friends Data: 0.6437

STARTING RUN: baseline_cardiffnlp_twitter-roberta-base-emotion | MODEL: cardiffnlp/twitter-roberta-base-emotion



Map: 100%|██████████| 2509/2509 [00:00<00:00, 11084.51 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 3207.74 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1905.46 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 157/785 [01:02<03:51,  2.71it/s]
 20%|██        | 157/785 [01:04<03:51,  2.71it/s]

{'eval_loss': 1.03691828250885, 'eval_f1': 0.6754469748030677, 'eval_runtime': 2.2547, 'eval_samples_per_second': 123.744, 'eval_steps_per_second': 15.523, 'epoch': 1.0}


 40%|████      | 314/785 [02:08<03:16,  2.40it/s]
 40%|████      | 314/785 [02:11<03:16,  2.40it/s]

{'eval_loss': 0.8898560404777527, 'eval_f1': 0.7235571183779923, 'eval_runtime': 2.8884, 'eval_samples_per_second': 96.592, 'eval_steps_per_second': 12.117, 'epoch': 2.0}


 60%|██████    | 471/785 [03:05<01:40,  3.12it/s]
 60%|██████    | 471/785 [03:08<01:40,  3.12it/s]

{'eval_loss': 0.8357837200164795, 'eval_f1': 0.7482092207851361, 'eval_runtime': 2.2808, 'eval_samples_per_second': 122.325, 'eval_steps_per_second': 15.345, 'epoch': 3.0}


 64%|██████▎   | 500/785 [03:22<01:27,  3.24it/s]

{'loss': 0.9776, 'grad_norm': 12.913827896118164, 'learning_rate': 8.073654390934846e-06, 'epoch': 3.18}


 80%|████████  | 628/785 [04:06<00:54,  2.90it/s]
 80%|████████  | 628/785 [04:08<00:54,  2.90it/s]

{'eval_loss': 0.8653876185417175, 'eval_f1': 0.7477334032502476, 'eval_runtime': 2.4658, 'eval_samples_per_second': 113.149, 'eval_steps_per_second': 14.194, 'epoch': 4.0}


100%|██████████| 785/785 [05:16<00:00,  2.31it/s]
100%|██████████| 785/785 [05:18<00:00,  2.31it/s]

{'eval_loss': 0.8834185600280762, 'eval_f1': 0.7507998610534573, 'eval_runtime': 2.0465, 'eval_samples_per_second': 136.329, 'eval_steps_per_second': 17.102, 'epoch': 5.0}


100%|██████████| 785/785 [05:22<00:00,  2.31it/s]

{'train_runtime': 322.2679, 'train_samples_per_second': 38.927, 'train_steps_per_second': 2.436, 'train_loss': 0.801519133938346, 'epoch': 5.0}


100%|██████████| 785/785 [05:22<00:00,  2.43it/s]
100%|██████████| 25/25 [00:01<00:00, 20.82it/s]



******************** RESULTS FOR RUN: baseline_cardiffnlp_twitter-roberta-base-emotion ********************
Final Test F1 Score on Friends Data: 0.6875


--- BAKE-OFF COMPLETE ---
Scores: {'vinai/bertweet-base': 0.6436663001065445, 'cardiffnlp/twitter-roberta-base-emotion': 0.6874817441757812}
WINNING MODEL: cardiffnlp/twitter-roberta-base-emotion


--- STEP 4: HYPERPARAMETER TUNING FOR cardiffnlp/twitter-roberta-base-emotion ---

STARTING RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_1 | MODEL: cardiffnlp/twitter-roberta-base-emotion



Map: 100%|██████████| 2509/2509 [00:00<00:00, 7499.54 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 2896.12 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 8348.12 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 157/785 [00:56<03:38,  2.87it/s]
 20%|██        | 157/785 [00:59<03:38,  2.87it/s]

{'eval_loss': 1.03691828250885, 'eval_f1': 0.6754469748030677, 'eval_runtime': 2.4777, 'eval_samples_per_second': 112.604, 'eval_steps_per_second': 14.126, 'epoch': 1.0}


 40%|████      | 314/785 [01:56<02:48,  2.80it/s]
 40%|████      | 314/785 [01:58<02:48,  2.80it/s]

{'eval_loss': 0.8898560404777527, 'eval_f1': 0.7235571183779923, 'eval_runtime': 1.9463, 'eval_samples_per_second': 143.351, 'eval_steps_per_second': 17.983, 'epoch': 2.0}


 60%|██████    | 471/785 [02:46<01:11,  4.37it/s]
 60%|██████    | 471/785 [02:48<01:11,  4.37it/s]

{'eval_loss': 0.8357837200164795, 'eval_f1': 0.7482092207851361, 'eval_runtime': 1.4088, 'eval_samples_per_second': 198.044, 'eval_steps_per_second': 24.844, 'epoch': 3.0}


 64%|██████▎   | 500/785 [02:56<01:09,  4.09it/s]

{'loss': 0.9776, 'grad_norm': 12.913827896118164, 'learning_rate': 8.073654390934846e-06, 'epoch': 3.18}


 80%|████████  | 628/785 [03:42<00:39,  3.96it/s]
 80%|████████  | 628/785 [03:44<00:39,  3.96it/s]

{'eval_loss': 0.8653876185417175, 'eval_f1': 0.7477334032502476, 'eval_runtime': 2.1947, 'eval_samples_per_second': 127.127, 'eval_steps_per_second': 15.948, 'epoch': 4.0}


100%|██████████| 785/785 [04:44<00:00,  2.79it/s]
100%|██████████| 785/785 [04:47<00:00,  2.79it/s]

{'eval_loss': 0.8834185600280762, 'eval_f1': 0.7507998610534573, 'eval_runtime': 3.1645, 'eval_samples_per_second': 88.166, 'eval_steps_per_second': 11.06, 'epoch': 5.0}


100%|██████████| 785/785 [04:50<00:00,  2.79it/s]

{'train_runtime': 290.7653, 'train_samples_per_second': 43.145, 'train_steps_per_second': 2.7, 'train_loss': 0.801519133938346, 'epoch': 5.0}


100%|██████████| 785/785 [04:50<00:00,  2.70it/s]
100%|██████████| 25/25 [00:01<00:00, 23.55it/s]



******************** RESULTS FOR RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_1 ********************
Final Test F1 Score on Friends Data: 0.6875

STARTING RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_2 | MODEL: cardiffnlp/twitter-roberta-base-emotion



Map: 100%|██████████| 2509/2509 [00:00<00:00, 10518.76 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 9506.95 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 5154.23 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▎        | 157/1256 [00:53<04:42,  3.88it/s]
 12%|█▎        | 157/1256 [00:56<04:42,  3.88it/s]

{'eval_loss': 1.0137062072753906, 'eval_f1': 0.6784975715127859, 'eval_runtime': 2.2782, 'eval_samples_per_second': 122.467, 'eval_steps_per_second': 15.363, 'epoch': 1.0}


 25%|██▌       | 314/1256 [02:08<07:58,  1.97it/s]
 25%|██▌       | 314/1256 [02:10<07:58,  1.97it/s]

{'eval_loss': 0.8821491599082947, 'eval_f1': 0.7212448325428287, 'eval_runtime': 1.846, 'eval_samples_per_second': 151.142, 'eval_steps_per_second': 18.96, 'epoch': 2.0}


 38%|███▊      | 471/1256 [03:03<03:01,  4.32it/s]
 38%|███▊      | 471/1256 [03:04<03:01,  4.32it/s]

{'eval_loss': 0.8608065247535706, 'eval_f1': 0.7489946131852784, 'eval_runtime': 1.4331, 'eval_samples_per_second': 194.686, 'eval_steps_per_second': 24.423, 'epoch': 3.0}


 40%|███▉      | 501/1256 [03:13<03:04,  4.09it/s]

{'loss': 0.9447, 'grad_norm': 19.133054733276367, 'learning_rate': 2.007079646017699e-05, 'epoch': 3.18}


 50%|█████     | 628/1256 [03:44<02:28,  4.22it/s]
 50%|█████     | 628/1256 [03:46<02:28,  4.22it/s]

{'eval_loss': 0.9502195715904236, 'eval_f1': 0.7494591907247844, 'eval_runtime': 1.4552, 'eval_samples_per_second': 191.728, 'eval_steps_per_second': 24.052, 'epoch': 4.0}


 62%|██████▎   | 785/1256 [04:26<01:51,  4.23it/s]
 62%|██████▎   | 785/1256 [04:27<01:51,  4.23it/s]

{'eval_loss': 1.0534489154815674, 'eval_f1': 0.7392917833352347, 'eval_runtime': 1.4375, 'eval_samples_per_second': 194.086, 'eval_steps_per_second': 24.348, 'epoch': 5.0}


 75%|███████▌  | 942/1256 [05:08<01:14,  4.21it/s]
 75%|███████▌  | 942/1256 [05:09<01:14,  4.21it/s]

{'eval_loss': 1.1791267395019531, 'eval_f1': 0.7536157774520043, 'eval_runtime': 1.5082, 'eval_samples_per_second': 184.991, 'eval_steps_per_second': 23.207, 'epoch': 6.0}


 80%|███████▉  | 1000/1256 [05:25<01:04,  3.98it/s]

{'loss': 0.2524, 'grad_norm': 0.9636558890342712, 'learning_rate': 6.79646017699115e-06, 'epoch': 6.37}


 88%|████████▊ | 1099/1256 [05:50<00:37,  4.20it/s]
 88%|████████▊ | 1099/1256 [05:51<00:37,  4.20it/s]

{'eval_loss': 1.309918999671936, 'eval_f1': 0.7505590424184257, 'eval_runtime': 1.4514, 'eval_samples_per_second': 192.225, 'eval_steps_per_second': 24.114, 'epoch': 7.0}


100%|██████████| 1256/1256 [06:32<00:00,  4.19it/s]
100%|██████████| 1256/1256 [06:33<00:00,  4.19it/s]

{'eval_loss': 1.3308149576187134, 'eval_f1': 0.7599985419960621, 'eval_runtime': 1.4527, 'eval_samples_per_second': 192.055, 'eval_steps_per_second': 24.093, 'epoch': 8.0}


100%|██████████| 1256/1256 [06:35<00:00,  3.17it/s]


{'train_runtime': 395.6131, 'train_samples_per_second': 50.736, 'train_steps_per_second': 3.175, 'train_loss': 0.4977211602933847, 'epoch': 8.0}


100%|██████████| 25/25 [00:00<00:00, 26.61it/s]



******************** RESULTS FOR RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_2 ********************
Final Test F1 Score on Friends Data: 0.7137

STARTING RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_3 | MODEL: cardiffnlp/twitter-roberta-base-emotion



Map: 100%|██████████| 2509/2509 [00:00<00:00, 16391.30 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 14854.54 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 14196.08 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▎        | 157/1256 [00:38<04:20,  4.22it/s]
 12%|█▎        | 157/1256 [00:39<04:20,  4.22it/s]

{'eval_loss': 1.0079212188720703, 'eval_f1': 0.6691762410822509, 'eval_runtime': 1.4462, 'eval_samples_per_second': 192.919, 'eval_steps_per_second': 24.201, 'epoch': 1.0}


 25%|██▌       | 314/1256 [01:20<03:41,  4.26it/s]
 25%|██▌       | 314/1256 [01:21<03:41,  4.26it/s]

{'eval_loss': 0.8993061184883118, 'eval_f1': 0.7230815343536356, 'eval_runtime': 1.4405, 'eval_samples_per_second': 193.688, 'eval_steps_per_second': 24.298, 'epoch': 2.0}


 38%|███▊      | 471/1256 [02:01<03:06,  4.22it/s]
 38%|███▊      | 471/1256 [02:03<03:06,  4.22it/s]

{'eval_loss': 0.9878883361816406, 'eval_f1': 0.711335621209246, 'eval_runtime': 1.4527, 'eval_samples_per_second': 192.052, 'eval_steps_per_second': 24.093, 'epoch': 3.0}


 40%|███▉      | 500/1256 [02:12<03:06,  4.05it/s]

{'loss': 0.9131, 'grad_norm': 6.141289234161377, 'learning_rate': 3.345132743362832e-05, 'epoch': 3.18}


 50%|█████     | 628/1256 [02:43<02:29,  4.21it/s]
 50%|█████     | 628/1256 [02:45<02:29,  4.21it/s]

{'eval_loss': 1.0376378297805786, 'eval_f1': 0.7267045800273023, 'eval_runtime': 1.4617, 'eval_samples_per_second': 190.88, 'eval_steps_per_second': 23.946, 'epoch': 4.0}


 62%|██████▎   | 785/1256 [03:25<01:52,  4.17it/s]
 62%|██████▎   | 785/1256 [03:27<01:52,  4.17it/s]

{'eval_loss': 1.1268608570098877, 'eval_f1': 0.7515824076844541, 'eval_runtime': 1.4549, 'eval_samples_per_second': 191.765, 'eval_steps_per_second': 24.057, 'epoch': 5.0}


 75%|███████▌  | 942/1256 [04:07<01:14,  4.20it/s]
 75%|███████▌  | 942/1256 [04:08<01:14,  4.20it/s]

{'eval_loss': 1.1340246200561523, 'eval_f1': 0.7816471366327522, 'eval_runtime': 1.455, 'eval_samples_per_second': 191.752, 'eval_steps_per_second': 24.055, 'epoch': 6.0}


 80%|███████▉  | 1000/1256 [04:24<01:02,  4.10it/s]

{'loss': 0.2114, 'grad_norm': 0.13736696541309357, 'learning_rate': 1.1327433628318584e-05, 'epoch': 6.37}


 88%|████████▊ | 1099/1256 [04:49<00:37,  4.22it/s]
 88%|████████▊ | 1099/1256 [04:50<00:37,  4.22it/s]

{'eval_loss': 1.25164794921875, 'eval_f1': 0.8033604871938292, 'eval_runtime': 1.4525, 'eval_samples_per_second': 192.083, 'eval_steps_per_second': 24.096, 'epoch': 7.0}


100%|██████████| 1256/1256 [05:31<00:00,  4.09it/s]
100%|██████████| 1256/1256 [05:32<00:00,  4.09it/s]

{'eval_loss': 1.3045963048934937, 'eval_f1': 0.7776270078219826, 'eval_runtime': 1.4411, 'eval_samples_per_second': 193.597, 'eval_steps_per_second': 24.286, 'epoch': 8.0}


100%|██████████| 1256/1256 [05:35<00:00,  3.75it/s]


{'train_runtime': 334.8635, 'train_samples_per_second': 59.941, 'train_steps_per_second': 3.751, 'train_loss': 0.4608848763119643, 'epoch': 8.0}


100%|██████████| 25/25 [00:00<00:00, 25.64it/s]



******************** RESULTS FOR RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_3 ********************
Final Test F1 Score on Friends Data: 0.7352

STARTING RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_4 | MODEL: cardiffnlp/twitter-roberta-base-emotion



Map: 100%|██████████| 2509/2509 [00:00<00:00, 16850.77 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 11358.62 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 13461.40 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▎        | 79/632 [00:34<03:31,  2.62it/s]
 12%|█▎        | 79/632 [00:36<03:31,  2.62it/s]

{'eval_loss': 1.087255835533142, 'eval_f1': 0.633933393655002, 'eval_runtime': 1.4588, 'eval_samples_per_second': 191.247, 'eval_steps_per_second': 23.992, 'epoch': 1.0}


 25%|██▌       | 158/632 [01:13<03:08,  2.51it/s]
 25%|██▌       | 158/632 [01:14<03:08,  2.51it/s]

{'eval_loss': 0.9204866290092468, 'eval_f1': 0.712683375753676, 'eval_runtime': 1.4757, 'eval_samples_per_second': 189.063, 'eval_steps_per_second': 23.718, 'epoch': 2.0}


 38%|███▊      | 237/632 [01:51<02:32,  2.58it/s]
 38%|███▊      | 237/632 [01:52<02:32,  2.58it/s]

{'eval_loss': 0.8400378227233887, 'eval_f1': 0.7184390178951897, 'eval_runtime': 1.4821, 'eval_samples_per_second': 188.246, 'eval_steps_per_second': 23.615, 'epoch': 3.0}


 50%|█████     | 316/632 [02:28<01:50,  2.86it/s]
 50%|█████     | 316/632 [02:30<01:50,  2.86it/s]

{'eval_loss': 0.8847299814224243, 'eval_f1': 0.7325553785053422, 'eval_runtime': 1.4852, 'eval_samples_per_second': 187.848, 'eval_steps_per_second': 23.565, 'epoch': 4.0}


 62%|██████▎   | 395/632 [03:06<01:28,  2.69it/s]
 62%|██████▎   | 395/632 [03:08<01:28,  2.69it/s]

{'eval_loss': 0.9525181651115417, 'eval_f1': 0.7524498573739182, 'eval_runtime': 1.4341, 'eval_samples_per_second': 194.548, 'eval_steps_per_second': 24.406, 'epoch': 5.0}


 75%|███████▌  | 474/632 [03:45<01:00,  2.61it/s]
 75%|███████▌  | 474/632 [03:46<01:00,  2.61it/s]

{'eval_loss': 0.9755450487136841, 'eval_f1': 0.7471660749616624, 'eval_runtime': 1.4619, 'eval_samples_per_second': 190.845, 'eval_steps_per_second': 23.941, 'epoch': 6.0}


 79%|███████▉  | 500/632 [03:59<00:56,  2.32it/s]

{'loss': 0.7075, 'grad_norm': 1.85690176486969, 'learning_rate': 6.971830985915493e-06, 'epoch': 6.33}


 88%|████████▊ | 553/632 [04:23<00:28,  2.74it/s]
 88%|████████▊ | 553/632 [04:24<00:28,  2.74it/s]

{'eval_loss': 1.039838194847107, 'eval_f1': 0.7460399795347236, 'eval_runtime': 1.4812, 'eval_samples_per_second': 188.354, 'eval_steps_per_second': 23.629, 'epoch': 7.0}


100%|██████████| 632/632 [05:01<00:00,  2.77it/s]
100%|██████████| 632/632 [05:02<00:00,  2.77it/s]

{'eval_loss': 1.0504496097564697, 'eval_f1': 0.7495195699324888, 'eval_runtime': 1.4657, 'eval_samples_per_second': 190.357, 'eval_steps_per_second': 23.88, 'epoch': 8.0}


100%|██████████| 632/632 [05:05<00:00,  2.77it/s]

{'train_runtime': 305.3529, 'train_samples_per_second': 65.734, 'train_steps_per_second': 2.07, 'train_loss': 0.5965528276902211, 'epoch': 8.0}


100%|██████████| 632/632 [05:05<00:00,  2.07it/s]
100%|██████████| 25/25 [00:01<00:00, 24.66it/s]


******************** RESULTS FOR RUN: tuning_cardiffnlp_twitter-roberta-base-emotion_run_4 ********************
Final Test F1 Score on Friends Data: 0.6380


--- EXPERIMENT COMPLETE ---

Best Run: tuning_cardiffnlp_twitter-roberta-base-emotion_run_3
Best Hyperparameters: {'learning_rate': 5e-05, 'num_train_epochs': 8, 'per_device_train_batch_size': 16}
Best Test F1 Score on Friends Data: 0.7352

Your final, best model is saved in: ./results/tuning_cardiffnlp_twitter-roberta-base-emotion_run_3



