In [1]:
# Cell 1: Imports and Setup
import os
import numpy as np
import pandas as pd
import torch
import evaluate
import demoji
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import random # Add this import for the new augmentation

print("Imports complete. Setup ready.")

  from .autonotebook import tqdm as notebook_tqdm


Imports complete. Setup ready.


In [2]:
# Cell 2: New Configuration and Augmentation Setup

# --- NEW CONFIGURATION ---
MODEL_CHECKPOINT = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest"
RUN_NAME = "final_tune_random_deletion" # A new name for our final run

# --- Define paths to your data ---
FRIENDS_DATA_PATH = '../data/data1.xlsx'
KAGGLE_DATA_PATHS = ['../data/data2.xlsx', '../data/data3.xlsx']

# --- NEW: Setup Random Deletion Augmenter ---
# This function requires no new libraries!
def augment_with_random_deletion(df, p=0.15, num_augmented_samples=1000):
    augmented_texts = []
    original_df = df.copy()
    
    print(f"Generating {num_augmented_samples} new samples via Random Deletion...")
    # Make sure there is text to sample from
    if len(original_df.dropna(subset=['text'])) == 0:
        print("Warning: DataFrame to augment is empty or contains no text. Skipping augmentation.")
        return original_df

    for i in range(num_augmented_samples):
        # Sample only from rows that have valid text
        sample = original_df.dropna(subset=['text']).sample(1)
        original_text = sample['text'].iloc[0]
        original_emotion = sample['emotion'].iloc[0]
        
        words = str(original_text).split()
        # Only augment if the text is reasonably long
        if len(words) < 5: 
            continue
            
        # Keep each word with probability (1-p)
        new_words = [word for word in words if random.random() > p]
        
        if len(new_words) > 0:
            new_text = " ".join(new_words)
            augmented_texts.append({'text': new_text, 'emotion': original_emotion})
            
    return pd.concat([original_df, pd.DataFrame(augmented_texts)], ignore_index=True)

print("Advanced setup with Random Deletion complete.")

Advanced setup with Random Deletion complete.


In [3]:
# Cell 3: Full Data Preparation with New Augmentation

print("\n--- STEP 1: Preparing Datasets with Augmentation ---")

# Load and combine data
df_friends = pd.read_excel(FRIENDS_DATA_PATH)
df_kaggle_list = [pd.read_excel(p) for p in KAGGLE_DATA_PATHS]
df_kaggle = pd.concat(df_kaggle_list, ignore_index=True)

# Standardize and rename columns
df_friends.columns = [col.strip().lower() for col in df_friends.columns]
df_kaggle.columns = [col.strip().lower() for col in df_kaggle.columns]
if 'entry' in df_friends.columns: df_friends.rename(columns={'entry': 'text'}, inplace=True)
if 'entry' in df_kaggle.columns: df_kaggle.rename(columns={'entry': 'text'}, inplace=True)

# Clean and drop duplicates
df_friends.dropna(subset=['text', 'emotion'], inplace=True)
df_kaggle.dropna(subset=['text', 'emotion'], inplace=True)
df_friends.drop_duplicates(subset=['text'], inplace=True)
df_kaggle.drop_duplicates(subset=['text'], inplace=True)

# Create the same strategic split as before
test_df = df_friends.sample(frac=0.2, random_state=42)
train_friends_df = df_friends.drop(test_df.index) # Specialist data
train_pool_df = pd.concat([train_friends_df, df_kaggle], ignore_index=True)
train_pool_df, val_df = train_test_split(train_pool_df, test_size=0.1, random_state=42, stratify=train_pool_df['emotion'])

# --- APPLY THE NEW AUGMENTATION ---
train_pool_df = augment_with_random_deletion(train_pool_df)
# ------------------------------------

# Convert pandas DataFrames to Hugging Face Datasets
train_pool_ds = Dataset.from_pandas(train_pool_df)
train_friends_ds = Dataset.from_pandas(train_friends_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

print(f"\nAugmented General Training Pool size: {len(train_pool_df)}")
print(f"Specialist Training set (friends only): {len(train_friends_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Sacred Test set (friends only): {len(test_df)}")


--- STEP 1: Preparing Datasets with Augmentation ---
Generating 1000 new samples via Random Deletion...

Augmented General Training Pool size: 3486
Specialist Training set (friends only): 801
Validation set size: 279
Sacred Test set (friends only): 200


In [4]:
# Cell 4: Preprocessing, Custom Trainer, and Tokenization

print("\n--- STEP 2: Final Preprocessing and Setup ---")

# Initialize demoji for emoji translation
demoji.download_codes()

# Create label mappings from the full training data
unique_labels = sorted(train_pool_df['emotion'].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

def preprocess_text_and_labels(batch):
    # Convert emojis to text descriptions
    batch['text'] = [demoji.replace_with_desc(str(text), sep=" ") for text in batch['text']]
    # Encode labels
    batch['label'] = [label2id[label] for label in batch['emotion']]
    return batch

# Apply preprocessing to all datasets
train_pool_ds = train_pool_ds.map(preprocess_text_and_labels, batched=True, remove_columns=['emotion'])
train_friends_ds = train_friends_ds.map(preprocess_text_and_labels, batched=True, remove_columns=['emotion'])
val_ds = val_ds.map(preprocess_text_and_labels, batched=True, remove_columns=['emotion'])
test_ds = test_ds.map(preprocess_text_and_labels, batched=True, remove_columns=['emotion'])

# Calculate class weights for handling imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(unique_labels),
    y=train_pool_df['emotion'] # Use the augmented pool for weights
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda")

# Create a Custom Trainer to use the class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Define a shared metric computation function
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels, average="weighted")

# Initialize tokenizer and tokenize datasets
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized_train_pool_ds = train_pool_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
tokenized_train_friends_ds = train_friends_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
tokenized_val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
tokenized_test_ds = test_ds.map(tokenize_fn, batched=True, remove_columns=['text'])

print("Setup complete. Ready for Two-Stage Training.")

  demoji.download_codes()



--- STEP 2: Final Preprocessing and Setup ---


Map: 100%|██████████| 3486/3486 [00:01<00:00, 2821.93 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 1798.69 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 2568.66 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1496.99 examples/s]
Map: 100%|██████████| 3486/3486 [00:00<00:00, 20063.59 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 21641.57 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 14680.67 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 16521.14 examples/s]

Setup complete. Ready for Two-Stage Training.





In [5]:
# Cell 5: Two-Stage Training and Final Evaluation

print(f"\n{'='*50}\nSTARTING TWO-STAGE TRAINING: {RUN_NAME}\n{'='*50}\n")

# Load the initial model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True # Important for multilabel -> single-label
)

output_dir = f"./results/{RUN_NAME}"

# --- STAGE 1: Generalist Training on Augmented Data ---
print("\n--- STAGE 1: Training on General Pool (Friends + Kaggle + Augmented) ---")
stage1_args = TrainingArguments(
    output_dir=f"{output_dir}/stage1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5, # 5 epochs should be enough for the larger dataset
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42,
)

trainer_stage1 = CustomTrainer(
    model=model,
    args=stage1_args,
    train_dataset=tokenized_train_pool_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer_stage1.train()


# --- STAGE 2: Specialist Training on Friends Data ---
print("\n--- STAGE 2: Fine-tuning on Specialist Data (Friends Only) ---")
stage2_args = TrainingArguments(
    output_dir=output_dir, # Save the FINAL model here
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5, # Use a smaller learning rate for specialization
    per_device_train_batch_size=8, # A smaller batch size can help on small datasets
    num_train_epochs=8, # More epochs on the specialist data
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42,
)

trainer_stage2 = CustomTrainer(
    model=trainer_stage1.model, # Use the model that finished Stage 1
    args=stage2_args,
    train_dataset=tokenized_train_friends_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], # More patience here
)

trainer_stage2.train()

# --- FINAL EVALUATION ---
print("\n--- Evaluating the final SPECIALIST model on the sacred test set ---")
test_results = trainer_stage2.evaluate(eval_dataset=tokenized_test_ds)

print(f"\n\n{'='*60}\n--- FINAL EXPERIMENT COMPLETE ---\n")
print(f"Final Model: {RUN_NAME}")
print(f"Previous Best F1 Score: 0.69") # Our score to beat
print(f"Final Specialist F1 Score on Friends Data: {test_results['eval_f1']:.4f}")
print(f"\nYour final, best specialist model is saved in: {output_dir}")

if test_results['eval_f1'] > 0.69:
    print("\nSUCCESS! The new strategy improved performance.")
else:
    print("\nPerformance was similar to the previous run. This is still a highly robust model.")
print(f"{'='*60}")


STARTING TWO-STAGE TRAINING: final_tune_random_deletion



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion-multilabel-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([11, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([11]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- STAGE 1: Training on General Pool (Friends + Kaggle + Augmented) ---


                                                  
 20%|██        | 218/1090 [00:49<03:11,  4.55it/s]

{'eval_loss': 0.9040754437446594, 'eval_f1': 0.6777127477305828, 'eval_runtime': 1.252, 'eval_samples_per_second': 222.844, 'eval_steps_per_second': 27.955, 'epoch': 1.0}


                                                  
 40%|████      | 436/1090 [01:41<02:22,  4.59it/s]

{'eval_loss': 0.8899373412132263, 'eval_f1': 0.7038165537003402, 'eval_runtime': 1.2994, 'eval_samples_per_second': 214.71, 'eval_steps_per_second': 26.935, 'epoch': 2.0}


 46%|████▌     | 500/1090 [01:57<02:13,  4.41it/s]

{'loss': 0.9479, 'grad_norm': 25.551973342895508, 'learning_rate': 1.6238532110091743e-05, 'epoch': 2.29}


                                                  
 60%|██████    | 654/1090 [02:34<01:39,  4.40it/s]

{'eval_loss': 0.9623381495475769, 'eval_f1': 0.7327338213899331, 'eval_runtime': 1.3662, 'eval_samples_per_second': 204.214, 'eval_steps_per_second': 25.618, 'epoch': 3.0}


                                                  
 80%|████████  | 872/1090 [03:28<00:52,  4.18it/s]

{'eval_loss': 0.9936513304710388, 'eval_f1': 0.7297673536646305, 'eval_runtime': 1.4367, 'eval_samples_per_second': 194.189, 'eval_steps_per_second': 24.361, 'epoch': 4.0}


 92%|█████████▏| 1001/1090 [04:01<00:21,  4.16it/s]

{'loss': 0.374, 'grad_norm': 41.7637825012207, 'learning_rate': 2.4770642201834866e-06, 'epoch': 4.59}


                                                   
100%|██████████| 1090/1090 [04:24<00:00,  4.20it/s]

{'eval_loss': 1.0543166399002075, 'eval_f1': 0.7412929737899686, 'eval_runtime': 1.409, 'eval_samples_per_second': 198.008, 'eval_steps_per_second': 24.84, 'epoch': 5.0}


100%|██████████| 1090/1090 [04:26<00:00,  4.09it/s]


{'train_runtime': 266.0447, 'train_samples_per_second': 65.515, 'train_steps_per_second': 4.097, 'train_loss': 0.626404221560977, 'epoch': 5.0}

--- STAGE 2: Fine-tuning on Specialist Data (Friends Only) ---


 12%|█▎        | 101/808 [00:14<01:35,  7.39it/s]
 12%|█▎        | 101/808 [00:16<01:35,  7.39it/s]

{'eval_loss': 1.0658799409866333, 'eval_f1': 0.8004439240028748, 'eval_runtime': 1.3888, 'eval_samples_per_second': 200.899, 'eval_steps_per_second': 25.202, 'epoch': 1.0}


 25%|██▍       | 201/808 [00:32<01:28,  6.85it/s]
 25%|██▌       | 202/808 [00:33<01:28,  6.85it/s]

{'eval_loss': 1.1217749118804932, 'eval_f1': 0.7913037909410715, 'eval_runtime': 1.38, 'eval_samples_per_second': 202.167, 'eval_steps_per_second': 25.361, 'epoch': 2.0}


 37%|███▋      | 302/808 [00:49<01:15,  6.68it/s]
 38%|███▊      | 303/808 [00:51<01:15,  6.68it/s]

{'eval_loss': 1.1040960550308228, 'eval_f1': 0.7965370697816657, 'eval_runtime': 1.3935, 'eval_samples_per_second': 200.21, 'eval_steps_per_second': 25.116, 'epoch': 3.0}


 50%|████▉     | 403/808 [01:08<00:59,  6.84it/s]
 50%|█████     | 404/808 [01:09<00:59,  6.84it/s]

{'eval_loss': 1.1084553003311157, 'eval_f1': 0.8123984948187972, 'eval_runtime': 1.3842, 'eval_samples_per_second': 201.563, 'eval_steps_per_second': 25.286, 'epoch': 4.0}


 62%|██████▏   | 501/808 [01:25<00:44,  6.85it/s]

{'loss': 0.199, 'grad_norm': 0.14616155624389648, 'learning_rate': 3.8118811881188123e-06, 'epoch': 4.95}


 62%|██████▏   | 504/808 [01:25<00:44,  6.89it/s]
 62%|██████▎   | 505/808 [01:27<00:44,  6.89it/s]

{'eval_loss': 1.2167236804962158, 'eval_f1': 0.8165956546253277, 'eval_runtime': 1.401, 'eval_samples_per_second': 199.141, 'eval_steps_per_second': 24.982, 'epoch': 5.0}


 75%|███████▍  | 605/808 [01:43<00:30,  6.61it/s]
 75%|███████▌  | 606/808 [01:45<00:30,  6.61it/s]

{'eval_loss': 1.2397997379302979, 'eval_f1': 0.8103734038569294, 'eval_runtime': 1.4201, 'eval_samples_per_second': 196.465, 'eval_steps_per_second': 24.646, 'epoch': 6.0}


 88%|████████▊ | 707/808 [02:01<00:13,  7.43it/s]
 88%|████████▊ | 707/808 [02:02<00:13,  7.43it/s]

{'eval_loss': 1.2387895584106445, 'eval_f1': 0.8100678991840738, 'eval_runtime': 1.4187, 'eval_samples_per_second': 196.66, 'eval_steps_per_second': 24.671, 'epoch': 7.0}


100%|█████████▉| 807/808 [02:19<00:00,  6.74it/s]
100%|██████████| 808/808 [02:20<00:00,  6.74it/s]

{'eval_loss': 1.2494066953659058, 'eval_f1': 0.8104053634802086, 'eval_runtime': 1.425, 'eval_samples_per_second': 195.792, 'eval_steps_per_second': 24.562, 'epoch': 8.0}


100%|██████████| 808/808 [02:23<00:00,  5.65it/s]


{'train_runtime': 142.9412, 'train_samples_per_second': 44.83, 'train_steps_per_second': 5.653, 'train_loss': 0.1428397327366442, 'epoch': 8.0}

--- Evaluating the final SPECIALIST model on the sacred test set ---


100%|██████████| 25/25 [00:00<00:00, 25.75it/s]



--- FINAL EXPERIMENT COMPLETE ---

Final Model: final_tune_random_deletion
Previous Best F1 Score: 0.69
Final Specialist F1 Score on Friends Data: 0.7250

Your final, best specialist model is saved in: ./results/final_tune_random_deletion

SUCCESS! The new strategy improved performance.



