In [1]:
# Cell 1: Imports and Setup (No Changes)
import os
import numpy as np
import pandas as pd
import torch
import evaluate
import demoji
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AdamW,
    get_scheduler
)

print("Imports complete. Setup ready.")

  from .autonotebook import tqdm as notebook_tqdm


Imports complete. Setup ready.


In [4]:
# Cell 2: Configuration — checkpoint, data locations and Trainer settings.

# Pretrained checkpoint loaded for both the tokenizer and the classifier.
MODEL_CHECKPOINT = "cardiffnlp/twitter-roberta-base-emotion"

# Excel workbooks: one "friends" file plus the Kaggle files.
FRIENDS_DATA_PATH = '../data/data1.xlsx'
KAGGLE_DATA_PATHS = [
    '../data/data2.xlsx',
    '../data/data3.xlsx',
]

# Keyword arguments that the training cell unpacks into TrainingArguments.
TRAINING_PARAMS = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    seed=42,
)

# Label for this run; the training cell derives the output directory from it.
RUN_NAME = "controlled_run_no_augmentation"

print("Configuration loaded.")

Configuration loaded.


In [5]:
# Cell 3: Data Loading, Cleaning, and Splitting
print("--- STEP 1: Preparing Datasets from Excel Files ---")


def _read_emotion_sheet(path):
    """Read one Excel workbook and normalise its headers.

    Headers are stripped and lower-cased, and an ``entry`` column (if any) is
    renamed to ``text``. Normalising per file, before any concatenation, keeps
    differently cased/padded headers from turning into separate columns.
    """
    sheet = pd.read_excel(path)
    sheet.columns = [col.strip().lower() for col in sheet.columns]
    # rename() ignores missing keys, so no membership check is needed.
    return sheet.rename(columns={'entry': 'text'})


def _drop_unusable_rows(frame):
    """Return a copy without rows lacking text/emotion and without repeated texts."""
    return frame.dropna(subset=['text', 'emotion']).drop_duplicates(subset=['text'])


df_friends = _drop_unusable_rows(_read_emotion_sheet(FRIENDS_DATA_PATH))
df_kaggle = _drop_unusable_rows(
    pd.concat([_read_emotion_sheet(p) for p in KAGGLE_DATA_PATHS], ignore_index=True)
)

# Held-out test set: 20% of the friends data, never used for training.
test_df = df_friends.sample(frac=0.2, random_state=42)

train_pool_df = pd.concat([df_friends.drop(test_df.index), df_kaggle], ignore_index=True)
# Leakage guard: the per-source de-duplication above cannot see a text that
# lives in both sources. Remove every pool row whose text is in the test set,
# then collapse cross-source repeats so the same text cannot end up in both
# train and validation. Friends rows come first in the pool, so they win ties.
train_pool_df = train_pool_df[~train_pool_df['text'].isin(test_df['text'])]
train_pool_df = train_pool_df.drop_duplicates(subset=['text']).reset_index(drop=True)

train_df, val_df = train_test_split(
    train_pool_df,
    test_size=0.1,
    random_state=42,
    stratify=train_pool_df['emotion'],
)
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Sacred Test set (friends only): {len(test_df)}")

--- STEP 1: Preparing Datasets from Excel Files ---
Train set size: 2509
Validation set size: 279
Sacred Test set (friends only): 200


In [6]:
# Cell 4: Convert to Datasets (AUGMENTATION IS REMOVED)
print("\n--- STEP 2: Converting to Datasets (No Augmentation) ---")

# No augmentation in this run: every split is wrapped as-is, straight from
# the pandas frames produced by the splitting cell.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

print("Conversion to Datasets complete.")


--- STEP 2: Converting to Datasets (No Augmentation) ---
Conversion to Datasets complete.


In [7]:
# Cell 5: Final Preprocessing and Setup (No Changes)
print("\n--- STEP 3: Final Preprocessing and Setup ---")
# Prepare demoji's emoji table before the emoji-to-text replacement below.
# NOTE(review): the saved cell output echoes this exact line as a warning,
# which suggests the installed demoji release flags the call (likely as
# deprecated) — confirm against the demoji changelog before removing it.
demoji.download_codes()
def preprocess_text_and_labels(batch):
    """Spell out emojis in ``text`` and attach an integer ``label`` column.

    Args:
        batch: Mapping of column name to list of values, as handed over by
            ``Dataset.map(..., batched=True)``. Must contain ``text`` and
            ``emotion``.

    Returns:
        The same ``batch`` object, with ``text`` replaced by its
        emoji-described form and a new ``label`` list looked up in the
        module-level ``label2id`` mapping.

    Raises:
        KeyError: If an emotion is missing from ``label2id``.
    """
    described_texts = []
    for raw_text in batch['text']:
        # str() guards against non-string cells coming out of the spreadsheet.
        described_texts.append(demoji.replace_with_desc(str(raw_text), sep=" "))
    batch['text'] = described_texts

    batch['label'] = list(map(label2id.__getitem__, batch['emotion']))
    return batch
# Label vocabulary: ids follow the order in which emotions first appear in
# the training frame; both lookup directions are built from the same order.
unique_labels = train_df['emotion'].unique()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

# Apply emoji description + label encoding to every split.
train_ds = train_ds.map(preprocess_text_and_labels, batched=True)
val_ds = val_ds.map(preprocess_text_and_labels, batched=True)
test_ds = test_ds.map(preprocess_text_and_labels, batched=True)

# "balanced" weights computed on the training emotions only; `classes` is
# passed in label-id order so weight i belongs to label id i.
class_weights = compute_class_weight(
    'balanced',
    classes=np.array(list(label2id.keys())),
    y=train_df['emotion'],
)
# Pick the device by availability instead of hard-coding "cuda", which raises
# on machines without a CUDA device.
class_weights_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(class_weights_device)
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: The model to run the forward pass on.
            inputs: Batch mapping; its ``labels`` entry is popped before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with Transformers
                releases that pass this keyword to ``compute_loss``; unused
                because the loss below is already a (weighted) mean.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Follow the logits' device rather than assuming the global weights
        # already live where the model runs (no-op when they match).
        weights = class_weights_tensor.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
# F1 metric object, loaded once and reused for every evaluation pass.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn raw evaluation output into a weighted-F1 score.

    Args:
        eval_pred: Pair of ``(logits, labels)`` supplied by the trainer.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions,
        using ``average="weighted"``.
    """
    class_scores, gold_ids = eval_pred
    # Highest-scoring class along the last axis is the predicted label id.
    predicted_ids = np.argmax(class_scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=gold_ids,
        average="weighted",
    )


print("Setup complete. Ready for training.")

  demoji.download_codes()



--- STEP 3: Final Preprocessing and Setup ---


Map: 100%|██████████| 2509/2509 [00:01<00:00, 2293.11 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 2410.56 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1846.72 examples/s]


Setup complete. Ready for training.


In [8]:
# Cell 6: The Controlled Training Run
print(f"\n{'='*50}\nSTARTING CONTROLLED RUN: {RUN_NAME}\n{'='*50}\n")

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_fn(batch):
    """Tokenize a batch of texts, truncating/padding every row to 128 tokens."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)


tokenized_train_ds = train_ds.map(tokenize_fn, batched=True)
tokenized_val_ds = val_ds.map(tokenize_fn, batched=True)
tokenized_test_ds = test_ds.map(tokenize_fn, batched=True)

# The checkpoint's classification head has a different number of outputs than
# this label set, so mismatched head weights are re-initialised on load.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Two learning rates: a small one for every parameter outside the classifier
# head, a larger one for the parameters whose name contains "classifier".
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if "classifier" not in n], "lr": 2e-6},
    {"params": [p for n, p in model.named_parameters() if "classifier" in n], "lr": 3e-5},
]
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the defaults of the class being replaced so the
# optimisation itself is unchanged.
# NOTE(review): because a ready-made optimizer is handed to the Trainer, the
# "weight_decay" entry in TRAINING_PARAMS is presumably not applied — confirm
# whether decay was intended and, if so, set it here.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, eps=1e-6, weight_decay=0.0)

output_dir = f"./results/{RUN_NAME}"
training_args = TrainingArguments(output_dir=output_dir, **TRAINING_PARAMS)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop when the tracked metric fails to improve for 3 evaluations in a row.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    # Custom optimizer, scheduler left to the Trainer (None).
    optimizers=(optimizer, None)
)
trainer.train()


STARTING CONTROLLED RUN: controlled_run_no_augmentation



Map: 100%|██████████| 2509/2509 [00:00<00:00, 10182.35 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 19644.63 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 16540.68 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                  
 12%|█▎        | 157/1256 [02:12<14:08,  1.29it/s]

{'eval_loss': 1.316928744316101, 'eval_f1': 0.5470190852510645, 'eval_runtime': 1.2646, 'eval_samples_per_second': 220.624, 'eval_steps_per_second': 27.677, 'epoch': 1.0}


                                                  
 25%|██▌       | 314/1256 [05:24<19:36,  1.25s/it]

{'eval_loss': 1.212338924407959, 'eval_f1': 0.5924467254331978, 'eval_runtime': 2.8969, 'eval_samples_per_second': 96.309, 'eval_steps_per_second': 12.082, 'epoch': 2.0}


                                                    
 38%|███▊      | 471/1256 [08:55<15:25,  1.18s/it]

{'eval_loss': 1.1269726753234863, 'eval_f1': 0.6098513399744158, 'eval_runtime': 1.7276, 'eval_samples_per_second': 161.494, 'eval_steps_per_second': 20.259, 'epoch': 3.0}


 40%|███▉      | 500/1256 [09:41<16:57,  1.35s/it]

{'loss': 1.2667, 'grad_norm': 21.793153762817383, 'learning_rate': 1.2038216560509554e-06, 'epoch': 3.18}


                                                  
 50%|█████     | 628/1256 [12:18<14:48,  1.42s/it]

{'eval_loss': 1.0786001682281494, 'eval_f1': 0.61896156601311, 'eval_runtime': 4.2663, 'eval_samples_per_second': 65.396, 'eval_steps_per_second': 8.204, 'epoch': 4.0}


                                                  
 62%|██████▎   | 785/1256 [16:09<09:57,  1.27s/it]

{'eval_loss': 1.0460480451583862, 'eval_f1': 0.6229294896686146, 'eval_runtime': 4.2856, 'eval_samples_per_second': 65.102, 'eval_steps_per_second': 8.167, 'epoch': 5.0}


                                                  
 75%|███████▌  | 942/1256 [20:05<06:18,  1.21s/it]

{'eval_loss': 1.0209320783615112, 'eval_f1': 0.6392388465040618, 'eval_runtime': 4.7813, 'eval_samples_per_second': 58.352, 'eval_steps_per_second': 7.32, 'epoch': 6.0}


 80%|███████▉  | 1000/1256 [21:35<06:02,  1.41s/it]

{'loss': 1.0088, 'grad_norm': 11.8970308303833, 'learning_rate': 4.0764331210191083e-07, 'epoch': 6.37}


                                                   
 88%|████████▊ | 1099/1256 [23:56<03:23,  1.29s/it]

{'eval_loss': 1.009823203086853, 'eval_f1': 0.6582077430105637, 'eval_runtime': 3.495, 'eval_samples_per_second': 79.828, 'eval_steps_per_second': 10.014, 'epoch': 7.0}


                                                   
100%|██████████| 1256/1256 [27:27<00:00,  1.01s/it]

{'eval_loss': 1.0074412822723389, 'eval_f1': 0.655455357535635, 'eval_runtime': 1.2796, 'eval_samples_per_second': 218.041, 'eval_steps_per_second': 27.353, 'epoch': 8.0}


100%|██████████| 1256/1256 [27:29<00:00,  1.31s/it]

{'train_runtime': 1649.5704, 'train_samples_per_second': 12.168, 'train_steps_per_second': 0.761, 'train_loss': 1.1004562377929688, 'epoch': 8.0}





TrainOutput(global_step=1256, training_loss=1.1004562377929688, metrics={'train_runtime': 1649.5704, 'train_samples_per_second': 12.168, 'train_steps_per_second': 0.761, 'total_flos': 1320338693246976.0, 'train_loss': 1.1004562377929688, 'epoch': 8.0})

In [9]:
# Cell 7: Final Evaluation
print("\n--- Evaluating the final model on the sacred test set ---")
test_results = trainer.evaluate(eval_dataset=tokenized_test_ds)

# Write the model held by the trainer directly into output_dir so the
# "saved in" message below is true. Without this, output_dir only holds
# checkpoint sub-folders. TRAINING_PARAMS enables load_best_model_at_end, so
# the trainer is expected to hold the best checkpoint at this point.
trainer.save_model(output_dir)

print(f"\n\n{'='*60}\n--- CONTROLLED EXPERIMENT COMPLETE ---\n")
print(f"Final Model: {RUN_NAME}")
print("This run did NOT use data augmentation.")
print(f"New Honest Baseline F1 Score on Friends Data: {test_results['eval_f1']:.4f}")
print(f"\nYour final, best model is saved in: {output_dir}")
print(f"{'='*60}")


--- Evaluating the final model on the sacred test set ---


100%|██████████| 25/25 [00:00<00:00, 28.11it/s]



--- CONTROLLED EXPERIMENT COMPLETE ---

Final Model: controlled_run_no_augmentation
This run did NOT use data augmentation.
New Honest Baseline F1 Score on Friends Data: 0.5771

Your final, best model is saved in: ./results/controlled_run_no_augmentation



