In [1]:
# Cell 1: Imports and Setup
import os
import numpy as np
import pandas as pd
import torch
import evaluate
import demoji
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AdamW,
    get_scheduler
)
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

print("Imports complete. Setup ready.")

  from .autonotebook import tqdm as notebook_tqdm


Imports complete. Setup ready.


In [2]:
# Cell 2: Configuration - SET YOUR FILE PATHS HERE
# Define the champion model we are going to tune
MODEL_CHECKPOINT = "cardiffnlp/twitter-roberta-base-emotion"

# Define raw data files (MAKE SURE THESE PATHS ARE CORRECT FOR YOUR NOTEBOOK)
FRIENDS_DATA_PATH = '../data/data1.xlsx'
KAGGLE_DATA_PATHS = ['../data/data2.xlsx', '../data/data3.xlsx']

# Define a single, strong set of hyperparameters for this run
# We will combine these with the differential learning rates later
TRAINING_PARAMS = {
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "per_device_train_batch_size": 16,
    "num_train_epochs": 8,
    "weight_decay": 0.01,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "save_total_limit": 1,
    "seed": 42,
}

# Define a name for this experimental run
RUN_NAME = "final_tune_roberta_augmented_diff_lr"

print("Configuration loaded.")



Configuration loaded.


In [3]:
# Cell 3: Data Loading, Cleaning, and Splitting
print("--- STEP 1: Preparing Datasets from Excel Files ---")

# Load data
df_friends = pd.read_excel(FRIENDS_DATA_PATH)
df_kaggle_list = [pd.read_excel(p) for p in KAGGLE_DATA_PATHS]
df_kaggle = pd.concat(df_kaggle_list, ignore_index=True)

# Standardize and Rename Columns
df_friends.columns = [col.strip().lower() for col in df_friends.columns]
df_kaggle.columns = [col.strip().lower() for col in df_kaggle.columns]
if 'entry' in df_friends.columns:
    df_friends.rename(columns={'entry': 'text'}, inplace=True)
if 'entry' in df_kaggle.columns:
    df_kaggle.rename(columns={'entry': 'text'}, inplace=True)

# Clean and drop duplicates
df_friends.dropna(subset=['text', 'emotion'], inplace=True)
df_kaggle.dropna(subset=['text', 'emotion'], inplace=True)
df_friends.drop_duplicates(subset=['text'], inplace=True)
df_kaggle.drop_duplicates(subset=['text'], inplace=True)

# Strategic Split: Isolate a "golden" test set from friends' data
test_df = df_friends.sample(frac=0.2, random_state=42)
train_pool_df = pd.concat([df_friends.drop(test_df.index), df_kaggle], ignore_index=True)
train_df, val_df = train_test_split(train_pool_df, test_size=0.1, random_state=42, stratify=train_pool_df['emotion'])

print(f"Original train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Sacred Test set (friends only): {len(test_df)}")

--- STEP 1: Preparing Datasets from Excel Files ---
Original train set size: 2509
Validation set size: 279
Sacred Test set (friends only): 200


In [5]:
# Cell 4: Data Augmentation
print("\n--- STEP 2: Augmenting Training Data ---")

# Setup Augmenters
char_inserter = nac.KeyboardAug(aug_char_p=0.03, aug_word_p=0.01)
word_substituter = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute", aug_p=0.15, device="cuda")

def augment_text(df, num_augmented_samples=1500):
    """Augments a dataframe to create new training samples."""
    augmented_texts = []
    original_df = df.copy()
    
    # Generate augmented samples
    print(f"Generating {num_augmented_samples} new samples... (This may take a while)")
    for i in range(num_augmented_samples):
        if (i + 1) % 100 == 0:
            print(f"  ...augmented {i+1}/{num_augmented_samples}")
            
        sample = original_df.sample(1)
        original_text = sample['text'].iloc[0]
        original_emotion = sample['emotion'].iloc[0]
        
        # --- THIS IS THE FIX ---
        # Apply augmentations and take the first element [0] from the returned list
        augmented_text_1 = char_inserter.augment(original_text)[0]
        augmented_text_2 = word_substituter.augment(augmented_text_1)[0]
        # -----------------------
        
        augmented_texts.append({'text': augmented_text_2, 'emotion': original_emotion})
        
    return pd.concat([original_df, pd.DataFrame(augmented_texts)], ignore_index=True)

# Augment the training data
train_df_augmented = augment_text(train_df)
print(f"Augmented train set size: {len(train_df_augmented)}")

# Convert all DataFrames to Hugging Face Datasets
# This will now succeed because the 'text' column contains only strings
train_ds = Dataset.from_pandas(train_df_augmented)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

print("Data augmentation and conversion to Datasets complete.")


--- STEP 2: Augmenting Training Data ---
Generating 1500 new samples... (This may take a while)
  ...augmented 100/1500
  ...augmented 200/1500
  ...augmented 300/1500
  ...augmented 400/1500
  ...augmented 500/1500
  ...augmented 600/1500
  ...augmented 700/1500
  ...augmented 800/1500
  ...augmented 900/1500
  ...augmented 1000/1500
  ...augmented 1100/1500
  ...augmented 1200/1500
  ...augmented 1300/1500
  ...augmented 1400/1500
  ...augmented 1500/1500
Augmented train set size: 4009
Data augmentation and conversion to Datasets complete.


In [6]:
# Cell 5: Final Preprocessing, Class Weights, and Custom Trainer
print("\n--- STEP 3: Final Preprocessing and Setup ---")

demoji.download_codes()
def preprocess_text_and_labels(batch):
    # Convert emojis to text descriptions
    batch['text'] = [demoji.replace_with_desc(str(text), sep=" ") for text in batch['text']]
    # Encode labels
    batch['label'] = [label2id[label] for label in batch['emotion']]
    return batch

unique_labels = train_df['emotion'].unique()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

# Apply final preprocessing
train_ds = train_ds.map(preprocess_text_and_labels, batched=True)
val_ds = val_ds.map(preprocess_text_and_labels, batched=True)
test_ds = test_ds.map(preprocess_text_and_labels, batched=True)

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.array(list(label2id.keys())), y=train_df['emotion'])
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda")

# Create Custom Trainer for class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels, average="weighted")

print("Setup complete. Ready for training.")

  demoji.download_codes()



--- STEP 3: Final Preprocessing and Setup ---


Map: 100%|██████████| 4009/4009 [00:02<00:00, 1892.36 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 2244.92 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1502.52 examples/s]


Setup complete. Ready for training.


In [7]:
# Cell 6: The Final Training Run
print(f"\n{'='*50}\nSTARTING FINAL RUN: {RUN_NAME}\n{'='*50}\n")

# Load tokenizer and tokenize all datasets
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized_train_ds = train_ds.map(tokenize_fn, batched=True)
tokenized_val_ds = val_ds.map(tokenize_fn, batched=True)
tokenized_test_ds = test_ds.map(tokenize_fn, batched=True)

# Load the model, authorizing the head replacement
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Setup Differential Learning Rates
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if "classifier" not in n], "lr": 2e-6},
    {"params": [p for n, p in model.named_parameters() if "classifier" in n], "lr": 3e-5},
]
optimizer = AdamW(optimizer_grouped_parameters)

output_dir = f"./results/{RUN_NAME}"
training_args = TrainingArguments(output_dir=output_dir, **TRAINING_PARAMS)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    optimizers=(optimizer, None)
)

# Launch Training!
trainer.train()


STARTING FINAL RUN: final_tune_roberta_augmented_diff_lr



Map: 100%|██████████| 4009/4009 [00:00<00:00, 24087.93 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 19223.49 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 12391.40 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                    
 12%|█▎        | 251/2008 [18:31<1:51:55,  3.82s/it]

{'eval_loss': 1.225176215171814, 'eval_f1': 0.5828963668291316, 'eval_runtime': 2.3209, 'eval_samples_per_second': 120.212, 'eval_steps_per_second': 15.08, 'epoch': 1.0}


 25%|██▍       | 500/2008 [37:44<1:55:24,  4.59s/it]

{'loss': 1.3323, 'grad_norm': 12.245803833007812, 'learning_rate': 1.50199203187251e-06, 'epoch': 1.99}


                                                    
 25%|██▌       | 502/2008 [37:54<1:39:50,  3.98s/it]

{'eval_loss': 1.0887625217437744, 'eval_f1': 0.5977087344465216, 'eval_runtime': 2.2654, 'eval_samples_per_second': 123.158, 'eval_steps_per_second': 15.45, 'epoch': 2.0}


                                                    
 38%|███▊      | 753/2008 [58:00<1:28:03,  4.21s/it]

{'eval_loss': 1.0091911554336548, 'eval_f1': 0.6515686383002268, 'eval_runtime': 1.8613, 'eval_samples_per_second': 149.899, 'eval_steps_per_second': 18.804, 'epoch': 3.0}


 50%|████▉     | 1000/2008 [1:15:57<1:12:20,  4.31s/it]

{'loss': 1.0774, 'grad_norm': 15.448877334594727, 'learning_rate': 1.0039840637450198e-06, 'epoch': 3.98}


                                                       
 50%|█████     | 1004/2008 [1:16:14<1:01:41,  3.69s/it]

{'eval_loss': 0.9652299880981445, 'eval_f1': 0.6592583349793465, 'eval_runtime': 2.1331, 'eval_samples_per_second': 130.794, 'eval_steps_per_second': 16.408, 'epoch': 4.0}


                                                       
 62%|██████▎   | 1255/2008 [1:29:19<33:39,  2.68s/it]

{'eval_loss': 0.9431164264678955, 'eval_f1': 0.6931609790547192, 'eval_runtime': 1.5992, 'eval_samples_per_second': 174.459, 'eval_steps_per_second': 21.886, 'epoch': 5.0}


 75%|███████▍  | 1500/2008 [1:42:29<26:55,  3.18s/it]

{'loss': 0.9784, 'grad_norm': 14.455154418945312, 'learning_rate': 5.059760956175299e-07, 'epoch': 5.98}


                                                     
 75%|███████▌  | 1506/2008 [1:42:48<24:01,  2.87s/it]

{'eval_loss': 0.9236071705818176, 'eval_f1': 0.6845298535363846, 'eval_runtime': 1.5977, 'eval_samples_per_second': 174.623, 'eval_steps_per_second': 21.906, 'epoch': 6.0}


                                                     
 88%|████████▊ | 1757/2008 [1:53:44<10:11,  2.44s/it]

{'eval_loss': 0.9201882481575012, 'eval_f1': 0.6878212087259226, 'eval_runtime': 1.6321, 'eval_samples_per_second': 170.947, 'eval_steps_per_second': 21.445, 'epoch': 7.0}


100%|█████████▉| 2000/2008 [2:03:54<00:20,  2.51s/it]

{'loss': 0.9282, 'grad_norm': 12.525500297546387, 'learning_rate': 7.96812749003984e-09, 'epoch': 7.97}


                                                     
100%|██████████| 2008/2008 [2:04:15<00:00,  2.20s/it]

{'eval_loss': 0.915812075138092, 'eval_f1': 0.6885497897890427, 'eval_runtime': 1.6279, 'eval_samples_per_second': 171.389, 'eval_steps_per_second': 21.5, 'epoch': 8.0}


100%|██████████| 2008/2008 [2:04:18<00:00,  3.71s/it]

{'train_runtime': 7457.9486, 'train_samples_per_second': 4.3, 'train_steps_per_second': 0.269, 'train_loss': 1.078308966292803, 'epoch': 8.0}





TrainOutput(global_step=2008, training_loss=1.078308966292803, metrics={'train_runtime': 7457.9486, 'train_samples_per_second': 4.3, 'train_steps_per_second': 0.269, 'total_flos': 2109700207742976.0, 'train_loss': 1.078308966292803, 'epoch': 8.0})

In [8]:
# Cell 7: Final Evaluation
print("\n--- Evaluating the final model on the sacred test set ---")

# The trainer automatically loaded the best model from the run because of load_best_model_at_end=True
test_results = trainer.evaluate(eval_dataset=tokenized_test_ds)

print(f"\n\n{'='*60}\n--- EXPERIMENT COMPLETE ---\n")
print(f"Final Model: {RUN_NAME}")
print(f"Previous Best F1 Score: 0.7352")
print(f"New Best Test F1 Score on Friends Data: {test_results['eval_f1']:.4f}")
print(f"\nYour final, best model is saved in: {output_dir}")

if test_results['eval_f1'] > 0.7352:
    print("\nCongratulations! You have successfully improved the model's performance!")
else:
    print("\nThe model performance was similar to the previous run. This is a solid result, and the model is now more robust due to augmentation!")
print(f"{'='*60}")


--- Evaluating the final model on the sacred test set ---


100%|██████████| 25/25 [00:01<00:00, 23.41it/s]



--- EXPERIMENT COMPLETE ---

Final Model: final_tune_roberta_augmented_diff_lr
Previous Best F1 Score: 0.7352
New Best Test F1 Score on Friends Data: 0.6076

Your final, best model is saved in: ./results/final_tune_roberta_augmented_diff_lr

The model performance was similar to the previous run. This is a solid result, and the model is now more robust due to augmentation!



