In [1]:
# Cell 1: Imports and Setup (No Changes)
import os
import numpy as np
import pandas as pd
import torch
import evaluate
import demoji
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AdamW
)

print("Imports complete. Setup ready.")

  from .autonotebook import tqdm as notebook_tqdm


Imports complete. Setup ready.


In [2]:
# Cell 2: Configuration (No Changes)
MODEL_CHECKPOINT = "cardiffnlp/twitter-roberta-base-emotion"
FRIENDS_DATA_PATH = '../data/data1.xlsx'
KAGGLE_DATA_PATHS = ['../data/data2.xlsx', '../data/data3.xlsx']
RUN_NAME = "specialist_two_stage_tune"
print("Configuration loaded.")

Configuration loaded.


In [3]:
# Cell 3: Data Loading and Splitting (Slight Modification)
print("--- STEP 1: Preparing Datasets for Two-Stage Training ---")
# Load and clean data
df_friends = pd.read_excel(FRIENDS_DATA_PATH)
df_kaggle_list = [pd.read_excel(p) for p in KAGGLE_DATA_PATHS]
df_kaggle = pd.concat(df_kaggle_list, ignore_index=True)
df_friends.columns = [col.strip().lower() for col in df_friends.columns]
df_kaggle.columns = [col.strip().lower() for col in df_kaggle.columns]
if 'entry' in df_friends.columns:
    df_friends.rename(columns={'entry': 'text'}, inplace=True)
if 'entry' in df_kaggle.columns:
    df_kaggle.rename(columns={'entry': 'text'}, inplace=True)
df_friends.dropna(subset=['text', 'emotion'], inplace=True)
df_kaggle.dropna(subset=['text', 'emotion'], inplace=True)
df_friends.drop_duplicates(subset=['text'], inplace=True)
df_kaggle.drop_duplicates(subset=['text'], inplace=True)

# Strategic Split
test_df = df_friends.sample(frac=0.2, random_state=42)
train_friends_df = df_friends.drop(test_df.index)
train_pool_df = pd.concat([train_friends_df, df_kaggle], ignore_index=True)
train_pool_df, val_df = train_test_split(train_pool_df, test_size=0.1, random_state=42, stratify=train_pool_df['emotion'])

# We now have THREE dataframes for training:
# 1. train_pool_df: The large general dataset (Friends + Kaggle)
# 2. train_friends_df: The smaller specialist dataset (Friends only)
# 3. val_df: The validation set (from the general pool)
# 4. test_df: The sacred test set (from friends only)

print(f"General Training Pool size: {len(train_pool_df)}")
print(f"Specialist Training set (friends only): {len(train_friends_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Sacred Test set (friends only): {len(test_df)}")

# Convert to Hugging Face Datasets
train_pool_ds = Dataset.from_pandas(train_pool_df)
train_friends_ds = Dataset.from_pandas(train_friends_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)


--- STEP 1: Preparing Datasets for Two-Stage Training ---
General Training Pool size: 2509
Specialist Training set (friends only): 801
Validation set size: 279
Sacred Test set (friends only): 200


In [4]:
# Cell 4: Preprocessing and Setup (No Changes)
print("\n--- STEP 2: Final Preprocessing and Setup ---")
demoji.download_codes()
def preprocess_text_and_labels(batch):
    batch['text'] = [demoji.replace_with_desc(str(text), sep=" ") for text in batch['text']]
    batch['label'] = [label2id[label] for label in batch['emotion']]
    return batch

unique_labels = train_pool_df['emotion'].unique()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

# Apply preprocessing to all datasets
train_pool_ds = train_pool_ds.map(preprocess_text_and_labels, batched=True)
train_friends_ds = train_friends_ds.map(preprocess_text_and_labels, batched=True)
val_ds = val_ds.map(preprocess_text_and_labels, batched=True)
test_ds = test_ds.map(preprocess_text_and_labels, batched=True)

class_weights = compute_class_weight('balanced', classes=np.array(list(label2id.keys())), y=train_pool_df['emotion'])
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda")

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels, average="weighted")

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized_train_pool_ds = train_pool_ds.map(tokenize_fn, batched=True)
tokenized_train_friends_ds = train_friends_ds.map(tokenize_fn, batched=True)
tokenized_val_ds = val_ds.map(tokenize_fn, batched=True)
tokenized_test_ds = test_ds.map(tokenize_fn, batched=True)

print("Setup complete. Ready for Two-Stage Training.")

  demoji.download_codes()



--- STEP 2: Final Preprocessing and Setup ---


Map: 100%|██████████| 2509/2509 [00:01<00:00, 2096.78 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 1445.16 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 2359.07 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1350.31 examples/s]
Map: 100%|██████████| 2509/2509 [00:00<00:00, 11386.91 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 21606.08 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 18571.53 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 16647.04 examples/s]

Setup complete. Ready for Two-Stage Training.





In [6]:
# Cell 5: Two-Stage Training
print(f"\n{'='*50}\nSTARTING TWO-STAGE TRAINING: {RUN_NAME}\n{'='*50}\n")

# Load the initial model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

output_dir = f"./results/{RUN_NAME}"

# --- STAGE 1: Generalist Training ---
print("\n--- STAGE 1: Training on General Pool (Friends + Kaggle) ---")
stage1_args = TrainingArguments(
    output_dir=f"{output_dir}/stage1_checkpoints", # Save intermediate checkpoints here
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42,
)

# Initialize the first trainer
trainer_stage1 = CustomTrainer(
    model=model,
    args=stage1_args,
    train_dataset=tokenized_train_pool_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Run the first stage of training
trainer_stage1.train()

# The best model from stage 1 is now loaded into trainer_stage1.model


# --- STAGE 2: Specialist Training ---
print("\n--- STAGE 2: Fine-tuning on Specialist Data (Friends Only) ---")

stage2_args = TrainingArguments(
    output_dir=output_dir, # Save the FINAL model to the main run directory
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42,
)

# --- THIS IS THE FIX ---
# Create a NEW Trainer for the second stage.
# It uses the SAME model object, which now contains the weights from Stage 1.
trainer_stage2 = CustomTrainer(
    model=trainer_stage1.model, # Use the model that finished Stage 1
    args=stage2_args,
    train_dataset=tokenized_train_friends_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
# --------------------

# Run the second, specialist stage of training
trainer_stage2.train()


STARTING TWO-STAGE TRAINING: specialist_two_stage_tune



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- STAGE 1: Training on General Pool (Friends + Kaggle) ---


 25%|██▌       | 157/628 [00:47<02:17,  3.42it/s]
 25%|██▌       | 157/628 [00:48<02:17,  3.42it/s]

{'eval_loss': 0.9491251111030579, 'eval_f1': 0.699398630570837, 'eval_runtime': 1.6556, 'eval_samples_per_second': 168.515, 'eval_steps_per_second': 21.14, 'epoch': 1.0}


 50%|█████     | 314/628 [01:55<01:57,  2.68it/s]
 50%|█████     | 314/628 [01:57<01:57,  2.68it/s]

{'eval_loss': 0.8604706525802612, 'eval_f1': 0.7403891023656725, 'eval_runtime': 2.6594, 'eval_samples_per_second': 104.912, 'eval_steps_per_second': 13.161, 'epoch': 2.0}


 75%|███████▌  | 471/628 [03:00<01:00,  2.57it/s]
 75%|███████▌  | 471/628 [03:02<01:00,  2.57it/s]

{'eval_loss': 0.8669783473014832, 'eval_f1': 0.738660690144155, 'eval_runtime': 1.9336, 'eval_samples_per_second': 144.291, 'eval_steps_per_second': 18.101, 'epoch': 3.0}


 80%|███████▉  | 500/628 [03:18<00:55,  2.31it/s]

{'loss': 0.8451, 'grad_norm': 6.169551849365234, 'learning_rate': 6.114649681528663e-06, 'epoch': 3.18}


100%|██████████| 628/628 [04:02<00:00,  3.07it/s]
100%|██████████| 628/628 [04:05<00:00,  3.07it/s]

{'eval_loss': 0.8949955701828003, 'eval_f1': 0.745667436341887, 'eval_runtime': 2.3235, 'eval_samples_per_second': 120.08, 'eval_steps_per_second': 15.064, 'epoch': 4.0}


100%|██████████| 628/628 [04:09<00:00,  3.07it/s]

{'train_runtime': 249.7721, 'train_samples_per_second': 40.181, 'train_steps_per_second': 2.514, 'train_loss': 0.7610877395435504, 'epoch': 4.0}


100%|██████████| 628/628 [04:10<00:00,  2.51it/s]



--- STAGE 2: Fine-tuning on Specialist Data (Friends Only) ---


 17%|█▋        | 101/606 [00:27<02:19,  3.62it/s]
 17%|█▋        | 101/606 [00:29<02:19,  3.62it/s]

{'eval_loss': 0.8314382433891296, 'eval_f1': 0.7886173534495715, 'eval_runtime': 2.217, 'eval_samples_per_second': 125.843, 'eval_steps_per_second': 15.787, 'epoch': 1.0}


 33%|███▎      | 202/606 [00:56<02:18,  2.91it/s]
 33%|███▎      | 202/606 [00:58<02:18,  2.91it/s]

{'eval_loss': 0.8567943572998047, 'eval_f1': 0.8083294503562307, 'eval_runtime': 2.8122, 'eval_samples_per_second': 99.21, 'eval_steps_per_second': 12.446, 'epoch': 2.0}


 50%|█████     | 303/606 [01:37<02:21,  2.15it/s]
 50%|█████     | 303/606 [01:39<02:21,  2.15it/s]

{'eval_loss': 0.9350295066833496, 'eval_f1': 0.8047111907412341, 'eval_runtime': 2.2499, 'eval_samples_per_second': 124.005, 'eval_steps_per_second': 15.556, 'epoch': 3.0}


 67%|██████▋   | 404/606 [02:09<00:51,  3.89it/s]
 67%|██████▋   | 404/606 [02:11<00:51,  3.89it/s]

{'eval_loss': 0.939440131187439, 'eval_f1': 0.8201078033639704, 'eval_runtime': 1.9943, 'eval_samples_per_second': 139.902, 'eval_steps_per_second': 17.55, 'epoch': 4.0}


 83%|████████▎ | 501/606 [02:38<00:25,  4.09it/s]

{'loss': 0.2706, 'grad_norm': 0.4594554305076599, 'learning_rate': 1.7491749174917493e-06, 'epoch': 4.95}


 83%|████████▎ | 505/606 [02:39<00:28,  3.59it/s]
 83%|████████▎ | 505/606 [02:42<00:28,  3.59it/s]

{'eval_loss': 0.9866503477096558, 'eval_f1': 0.813989877363965, 'eval_runtime': 2.3724, 'eval_samples_per_second': 117.605, 'eval_steps_per_second': 14.753, 'epoch': 5.0}


100%|█████████▉| 605/606 [03:05<00:00,  7.00it/s]
100%|██████████| 606/606 [03:06<00:00,  7.00it/s]

{'eval_loss': 0.9987325668334961, 'eval_f1': 0.8066750478935846, 'eval_runtime': 1.3378, 'eval_samples_per_second': 208.555, 'eval_steps_per_second': 26.163, 'epoch': 6.0}


100%|██████████| 606/606 [03:09<00:00,  3.20it/s]

{'train_runtime': 189.3307, 'train_samples_per_second': 25.384, 'train_steps_per_second': 3.201, 'train_loss': 0.24605793213293498, 'epoch': 6.0}





TrainOutput(global_step=606, training_loss=0.24605793213293498, metrics={'train_runtime': 189.3307, 'train_samples_per_second': 25.384, 'train_steps_per_second': 3.201, 'total_flos': 316139286555648.0, 'train_loss': 0.24605793213293498, 'epoch': 6.0})

In [7]:
# Cell 6: Final Evaluation
print("\n--- Evaluating the final SPECIALIST model on the sacred test set ---")
test_results = trainer.evaluate(eval_dataset=tokenized_test_ds)
print(f"\n\n{'='*60}\n--- FINAL EXPERIMENT COMPLETE ---\n")
print(f"Final Model: {RUN_NAME} (Two-Stage Trained)")
print(f"Previous Honest Baseline F1 Score: 0.5771")
print(f"Final Specialist F1 Score on Friends Data: {test_results['eval_f1']:.4f}")
print(f"\nYour final, best specialist model is saved in: {output_dir}")

if test_results['eval_f1'] > 0.5771:
    print("\nSUCCESS! The two-stage training strategy improved performance.")
else:
    print("\nThis is still a strong result. The model is now highly specialized for your data.")
print(f"{'='*60}")


--- Evaluating the final SPECIALIST model on the sacred test set ---


100%|██████████| 25/25 [00:00<00:00, 27.21it/s]



--- FINAL EXPERIMENT COMPLETE ---

Final Model: specialist_two_stage_tune (Two-Stage Trained)
Previous Honest Baseline F1 Score: 0.5771
Final Specialist F1 Score on Friends Data: 0.6665

Your final, best specialist model is saved in: ./results/specialist_two_stage_tune

SUCCESS! The two-stage training strategy improved performance.



