In [1]:
# Cell 1: Imports
import os
import numpy as np
import pandas as pd
import torch
import evaluate
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    pipeline # Import the pipeline for back-translation
)

print("Imports complete. Setup ready for the final experiment.")

  from .autonotebook import tqdm as notebook_tqdm


Imports complete. Setup ready for the final experiment.


In [2]:
# Cell 2: Configuration and Back-Translation Augmentation

# --- CONFIGURATION ---
# Using the same powerful model as last time
MODEL_CHECKPOINT = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest"
RUN_NAME = "final_run_clean_data_backtranslation"

# --- Define paths to your data ---
FRIENDS_DATA_PATH = '../data/data1.xlsx' # <-- Using your newly cleaned file!
KAGGLE_DATA_PATHS = ['../data/data2.xlsx', '../data/data3.xlsx']

# --- Setup Back-Translation Augmenter using transformers ---
print("Loading translation models for back-translation (this may take a moment)...")
translator_to_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=0 if torch.cuda.is_available() else -1)
translator_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=0 if torch.cuda.is_available() else -1)
print("Translation models loaded.")

def augment_with_backtranslation(df, num_augmented_samples=1000):
    augmented_texts = []
    original_df = df.copy().dropna(subset=['text'])

    print(f"Generating {num_augmented_samples} new samples via Back-Translation...")
    if len(original_df) == 0:
        print("Warning: DataFrame is empty. Skipping augmentation.")
        return original_df

    for i in range(num_augmented_samples):
        if (i + 1) % 50 == 0:
            print(f"  ...augmented {i+1}/{num_augmented_samples}")
            
        sample = original_df.sample(1)
        original_text = sample['text'].iloc[0]
        original_emotion = sample['emotion'].iloc[0]

        # Translate to German and back to English to create a paraphrase
        translated = translator_to_de(original_text, max_length=128)[0]['translation_text']
        paraphrase = translator_to_en(translated, max_length=128)[0]['translation_text']
        
        augmented_texts.append({'text': paraphrase, 'emotion': original_emotion})
        
    return pd.concat([original_df, pd.DataFrame(augmented_texts)], ignore_index=True)

Loading translation models for back-translation (this may take a moment)...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Translation models loaded.


In [3]:
# Cell 3: Full Data Preparation with Back-Translation

print("\n--- STEP 1: Preparing Datasets with Clean Data and Augmentation ---")

# Load and combine data
df_friends = pd.read_excel(FRIENDS_DATA_PATH)
df_kaggle_list = [pd.read_excel(p) for p in KAGGLE_DATA_PATHS]
df_kaggle = pd.concat(df_kaggle_list, ignore_index=True)

# Standardize and rename columns
df_friends.columns = [col.strip().lower() for col in df_friends.columns]
df_kaggle.columns = [col.strip().lower() for col in df_kaggle.columns]
if 'entry' in df_friends.columns: df_friends.rename(columns={'entry': 'text'}, inplace=True)
if 'entry' in df_kaggle.columns: df_kaggle.rename(columns={'entry': 'text'}, inplace=True)

# Clean and drop duplicates
df_friends.dropna(subset=['text', 'emotion'], inplace=True)
df_kaggle.dropna(subset=['text', 'emotion'], inplace=True)
df_friends.drop_duplicates(subset=['text'], inplace=True)
df_kaggle.drop_duplicates(subset=['text'], inplace=True)

# Create the same strategic split as before
test_df = df_friends.sample(frac=0.2, random_state=42)
train_friends_df = df_friends.drop(test_df.index) # Specialist data
train_pool_df = pd.concat([train_friends_df, df_kaggle], ignore_index=True)
train_pool_df, val_df = train_test_split(train_pool_df, test_size=0.1, random_state=42, stratify=train_pool_df['emotion'])

# --- APPLY BACK-TRANSLATION AUGMENTATION ---
train_pool_df = augment_with_backtranslation(train_pool_df)
# ---------------------------------------------

# Convert pandas DataFrames to Hugging Face Datasets
train_pool_ds = Dataset.from_pandas(train_pool_df)
train_friends_ds = Dataset.from_pandas(train_friends_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

print(f"\nAugmented General Training Pool size: {len(train_pool_df)}")
print(f"Specialist Training set (friends only): {len(train_friends_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Sacred Test set (friends only): {len(test_df)}")


--- STEP 1: Preparing Datasets with Clean Data and Augmentation ---
Generating 1000 new samples via Back-Translation...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Your input_length: 174 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 50/1000
  ...augmented 100/1000
  ...augmented 150/1000
  ...augmented 200/1000
  ...augmented 250/1000


Your input_length: 178 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 300/1000
  ...augmented 350/1000
  ...augmented 400/1000
  ...augmented 450/1000


Your input_length: 186 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 500/1000


Your input_length: 120 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 550/1000
  ...augmented 600/1000


Your input_length: 174 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 650/1000


Your input_length: 182 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 120 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 700/1000
  ...augmented 750/1000


Your input_length: 184 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 800/1000
  ...augmented 850/1000
  ...augmented 900/1000


Your input_length: 192 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 198 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 127 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


  ...augmented 950/1000
  ...augmented 1000/1000

Augmented General Training Pool size: 3509
Specialist Training set (friends only): 801
Validation set size: 279
Sacred Test set (friends only): 200


In [4]:
# Cell 4: Preprocessing, Custom Trainer, and Tokenization

print("\n--- STEP 2: Final Preprocessing and Setup ---")

# Create label mappings from the full training data
unique_labels = sorted(train_pool_df['emotion'].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

# --- CRITICAL CHANGE: We no longer translate emojis ---
def preprocess_and_encode(batch):
    # The text remains as is, with raw emojis.
    # We only encode the labels.
    batch['label'] = [label2id[label] for label in batch['emotion']]
    return batch

# Apply preprocessing to all datasets
train_pool_ds = train_pool_ds.map(preprocess_and_encode, batched=True, remove_columns=['emotion'])
train_friends_ds = train_friends_ds.map(preprocess_and_encode, batched=True, remove_columns=['emotion'])
val_ds = val_ds.map(preprocess_and_encode, batched=True, remove_columns=['emotion'])
test_ds = test_ds.map(preprocess_and_encode, batched=True, remove_columns=['emotion'])

# Calculate class weights for handling imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(unique_labels),
    y=train_pool_df['emotion']
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda")

# Create a Custom Trainer to use the class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Define a shared metric computation function
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels, average="weighted")

# Initialize tokenizer and tokenize datasets
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized_train_pool_ds = train_pool_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
tokenized_train_friends_ds = train_friends_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
tokenized_val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
tokenized_test_ds = test_ds.map(tokenize_fn, batched=True, remove_columns=['text'])

print("Setup complete. Ready for the ultimate Two-Stage Training.")


--- STEP 2: Final Preprocessing and Setup ---


Map: 100%|██████████| 3509/3509 [00:00<00:00, 447281.96 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 160181.06 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 92984.57 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 80427.69 examples/s]
Map: 100%|██████████| 3509/3509 [00:00<00:00, 23212.28 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 14689.19 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 17422.15 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 15429.03 examples/s]

Setup complete. Ready for the ultimate Two-Stage Training.





In [5]:
# Cell 5: Two-Stage Training and Final Evaluation

print(f"\n{'='*50}\nSTARTING TWO-STAGE TRAINING: {RUN_NAME}\n{'='*50}\n")

# Load the initial model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

output_dir = f"./results/{RUN_NAME}"

# --- STAGE 1: Generalist Training on Augmented Data ---
print("\n--- STAGE 1: Training on General Pool (Cleaned + Kaggle + Back-Translation) ---")
stage1_args = TrainingArguments(
    output_dir=f"{output_dir}/stage1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42,
)

trainer_stage1 = CustomTrainer(
    model=model,
    args=stage1_args,
    train_dataset=tokenized_train_pool_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer_stage1.train()


# --- STAGE 2: Specialist Training on Cleaned Friends Data ---
print("\n--- STAGE 2: Fine-tuning on Specialist Data (Cleaned Friends Only) ---")
stage2_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42,
)

trainer_stage2 = CustomTrainer(
    model=trainer_stage1.model,
    args=stage2_args,
    train_dataset=tokenized_train_friends_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer_stage2.train()

# --- FINAL EVALUATION ---
print("\n--- Evaluating the final SPECIALIST model on the sacred test set ---")
test_results = trainer_stage2.evaluate(eval_dataset=tokenized_test_ds)

print(f"\n\n{'='*60}\n--- FINAL EXPERIMENT COMPLETE ---\n")
print(f"Final Model: {RUN_NAME}")
print(f"Previous Best F1 Score: 0.7250")
print(f"Final Specialist F1 Score on Friends Data: {test_results['eval_f1']:.4f}")
print(f"\nYour final, best specialist model is saved in: {output_dir}")

if test_results['eval_f1'] > 0.7250:
    print("\nCHAMPION! This is the new best model!")
else:
    print("\nPerformance was similar. The cleaned data and new augmentation have made the model more robust.")
print(f"{'='*60}")


STARTING TWO-STAGE TRAINING: final_run_clean_data_backtranslation



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion-multilabel-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([11, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([11]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- STAGE 1: Training on General Pool (Cleaned + Kaggle + Back-Translation) ---


                                                  
 20%|██        | 220/1100 [00:51<02:48,  5.24it/s]

{'eval_loss': 0.9580445885658264, 'eval_f1': 0.668790288439472, 'eval_runtime': 1.2761, 'eval_samples_per_second': 218.638, 'eval_steps_per_second': 27.428, 'epoch': 1.0}


                                                  
 40%|████      | 440/1100 [01:59<03:48,  2.89it/s]

{'eval_loss': 0.9182518124580383, 'eval_f1': 0.6670591323691726, 'eval_runtime': 2.2989, 'eval_samples_per_second': 121.365, 'eval_steps_per_second': 15.225, 'epoch': 2.0}


 46%|████▌     | 501/1100 [02:19<02:14,  4.45it/s]

{'loss': 0.9189, 'grad_norm': 16.875532150268555, 'learning_rate': 1.6363636363636363e-05, 'epoch': 2.27}


                                                  
 60%|██████    | 660/1100 [02:56<01:25,  5.17it/s]

{'eval_loss': 0.9603327512741089, 'eval_f1': 0.7056607869709697, 'eval_runtime': 1.3256, 'eval_samples_per_second': 210.472, 'eval_steps_per_second': 26.403, 'epoch': 3.0}


                                                  
 80%|████████  | 880/1100 [03:52<00:44,  4.95it/s]

{'eval_loss': 1.0790836811065674, 'eval_f1': 0.7138750894114849, 'eval_runtime': 1.3765, 'eval_samples_per_second': 202.695, 'eval_steps_per_second': 25.428, 'epoch': 4.0}


 91%|█████████ | 1000/1100 [04:23<00:24,  4.06it/s]

{'loss': 0.3522, 'grad_norm': 0.9848214387893677, 'learning_rate': 2.7272727272727272e-06, 'epoch': 4.55}


                                                   
100%|██████████| 1100/1100 [04:48<00:00,  4.97it/s]

{'eval_loss': 1.1269820928573608, 'eval_f1': 0.7216312957439908, 'eval_runtime': 1.3761, 'eval_samples_per_second': 202.749, 'eval_steps_per_second': 25.434, 'epoch': 5.0}


100%|██████████| 1100/1100 [04:50<00:00,  3.78it/s]


{'train_runtime': 290.7028, 'train_samples_per_second': 60.354, 'train_steps_per_second': 3.784, 'train_loss': 0.5980449728532271, 'epoch': 5.0}

--- STAGE 2: Fine-tuning on Specialist Data (Cleaned Friends Only) ---


 12%|█▎        | 101/808 [00:23<02:34,  4.56it/s]
 12%|█▎        | 101/808 [00:24<02:34,  4.56it/s]

{'eval_loss': 0.9879629611968994, 'eval_f1': 0.7654741971361175, 'eval_runtime': 1.3213, 'eval_samples_per_second': 211.16, 'eval_steps_per_second': 26.49, 'epoch': 1.0}


                                                 
 25%|██▌       | 202/808 [00:50<02:09,  4.67it/s]

{'eval_loss': 1.0310834646224976, 'eval_f1': 0.7844232648781769, 'eval_runtime': 1.3356, 'eval_samples_per_second': 208.9, 'eval_steps_per_second': 26.206, 'epoch': 2.0}


 38%|███▊      | 303/808 [01:15<01:49,  4.60it/s]
 38%|███▊      | 303/808 [01:17<01:49,  4.60it/s]

{'eval_loss': 1.0360987186431885, 'eval_f1': 0.8111528382969713, 'eval_runtime': 1.3089, 'eval_samples_per_second': 213.157, 'eval_steps_per_second': 26.74, 'epoch': 3.0}


 50%|█████     | 404/808 [01:42<01:30,  4.45it/s]
 50%|█████     | 404/808 [01:43<01:30,  4.45it/s]

{'eval_loss': 1.1280171871185303, 'eval_f1': 0.8186603584788058, 'eval_runtime': 1.3048, 'eval_samples_per_second': 213.833, 'eval_steps_per_second': 26.825, 'epoch': 4.0}


 62%|██████▏   | 501/808 [02:07<01:10,  4.34it/s]

{'loss': 0.1769, 'grad_norm': 0.14687153697013855, 'learning_rate': 3.8118811881188123e-06, 'epoch': 4.95}


 62%|██████▎   | 505/808 [02:08<01:04,  4.70it/s]
 62%|██████▎   | 505/808 [02:10<01:04,  4.70it/s]

{'eval_loss': 1.1726030111312866, 'eval_f1': 0.8075795906522949, 'eval_runtime': 1.3211, 'eval_samples_per_second': 211.188, 'eval_steps_per_second': 26.493, 'epoch': 5.0}


 75%|███████▌  | 606/808 [02:35<00:41,  4.87it/s]
 75%|███████▌  | 606/808 [02:36<00:41,  4.87it/s]

{'eval_loss': 1.2375930547714233, 'eval_f1': 0.7939227090350217, 'eval_runtime': 1.3136, 'eval_samples_per_second': 212.4, 'eval_steps_per_second': 26.645, 'epoch': 6.0}


 88%|████████▊ | 707/808 [03:01<00:22,  4.51it/s]
 88%|████████▊ | 707/808 [03:03<00:22,  4.51it/s]

{'eval_loss': 1.2343744039535522, 'eval_f1': 0.8071103999881813, 'eval_runtime': 1.2942, 'eval_samples_per_second': 215.574, 'eval_steps_per_second': 27.043, 'epoch': 7.0}


 88%|████████▊ | 707/808 [03:05<00:26,  3.81it/s]


{'train_runtime': 185.3493, 'train_samples_per_second': 34.573, 'train_steps_per_second': 4.359, 'train_loss': 0.137194749489532, 'epoch': 7.0}

--- Evaluating the final SPECIALIST model on the sacred test set ---


100%|██████████| 25/25 [00:00<00:00, 27.45it/s]



--- FINAL EXPERIMENT COMPLETE ---

Final Model: final_run_clean_data_backtranslation
Previous Best F1 Score: 0.7250
Final Specialist F1 Score on Friends Data: 0.7570

Your final, best specialist model is saved in: ./results/final_run_clean_data_backtranslation

CHAMPION! This is the new best model!



