In [2]:
# Cell 1: Imports and Configuration
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from datasets import Dataset
from sklearn.metrics import classification_report

# --- CONFIGURATION ---
# Spreadsheet from which the held-out test split is recreated.
FRIENDS_DATA_PATH = '../data/data1.xlsx'

# IMPORTANT: point these at the final checkpoints of your two best runs.
# Tip: the checkpoint number is the name of a sub-folder inside each run's
# results folder; it is usually the folder with the highest number.
MODEL_1_PATH = './results/final_tune_random_deletion/checkpoint-505'  # <-- CHECK AND UPDATE THIS!
MODEL_2_PATH = './results/final_run_clean_data_backtranslation/checkpoint-404'  # <-- CHECK AND UPDATE THIS!

print("Configuration loaded. Ready to ensemble.")

Configuration loaded. Ready to ensemble.


In [3]:
# Cell 2: Load Both Models and the Test Data

def _load_model_with_trainer(checkpoint_path):
    """Load a sequence-classification checkpoint and wrap it in a default Trainer.

    Returns a ``(model, trainer)`` pair; the Trainer is built with no extra
    arguments and is only used later for ``predict``.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
    return loaded_model, Trainer(model=loaded_model)


print("Loading models and tokenizer...")
# A single tokenizer (taken from the second checkpoint) serves both models.
# NOTE(review): this assumes both checkpoints were trained with the same
# tokenizer/vocabulary -- confirm if either run is swapped out.
tokenizer = AutoTokenizer.from_pretrained(MODEL_2_PATH)

model1, trainer1 = _load_model_with_trainer(MODEL_1_PATH)
print(f"Loaded Model 1 from: {MODEL_1_PATH}")

model2, trainer2 = _load_model_with_trainer(MODEL_2_PATH)
print(f"Loaded Model 2 from: {MODEL_2_PATH}")

print("\nLoading and preparing the sacred test set...")
# Load the friends dataset and clean it in a single chained pipeline.
# --- Data Cleaning (must be identical to your previous runs) ---
#   1. normalise column names (strip whitespace, lower-case)
#   2. rename an 'entry' column to 'text' when present (no-op otherwise)
#   3. drop rows missing either the text or the emotion label
#   4. drop rows whose text duplicates an earlier row
df_friends = (
    pd.read_excel(FRIENDS_DATA_PATH)
    .rename(columns=lambda name: name.strip().lower())
    .rename(columns={'entry': 'text'})
    .dropna(subset=['text', 'emotion'])
    .drop_duplicates(subset=['text'])
)

# --- Recreate the EXACT same test set ---
# A fixed random_state makes the 20% sample reproducible as long as the
# cleaned frame above contains the same rows in the same order.
test_df = df_friends.sample(frac=0.2, random_state=42)
test_ds = Dataset.from_pandas(test_df)

# --- Preprocessing (must be identical to your last run - NO demoji) ---
def tokenize_fn(batch):
    """Tokenize the "text" field of a batch, truncated/padded to 128 tokens."""
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch["text"], **encode_options)


# Tokenize in batches and drop the raw text column from the result.
tokenized_test_ds = test_ds.map(tokenize_fn, batched=True, remove_columns=['text'])

num_test_entries = len(test_df)
print(f"Loaded {num_test_entries} test entries.")

Loading models and tokenizer...
Loaded Model 1 from: ./results/final_tune_random_deletion/checkpoint-505
Loaded Model 2 from: ./results/final_run_clean_data_backtranslation/checkpoint-404

Loading and preparing the sacred test set...


Map: 100%|██████████| 200/200 [00:00<00:00, 9092.26 examples/s]

Loaded 200 test entries.





In [4]:
# Cell 3: Get Predictions from Each Model

def _logits_as_tensor(prediction_output):
    """Wrap the ``predictions`` array of a ``Trainer.predict`` result in a torch tensor."""
    return torch.from_numpy(prediction_output.predictions)


# Model 1: run inference on the tokenized test set and keep the raw logits.
print("Getting predictions from Model 1 (Random Deletion)...")
predictions1 = trainer1.predict(tokenized_test_ds)
logits1 = _logits_as_tensor(predictions1)

# Model 2: same procedure on the same test set.
print("Getting predictions from Model 2 (Back-Translation)...")
predictions2 = trainer2.predict(tokenized_test_ds)
logits2 = _logits_as_tensor(predictions2)

print("Individual predictions complete.")

Getting predictions from Model 1 (Random Deletion)...


100%|██████████| 25/25 [00:00<00:00, 30.81it/s]


Getting predictions from Model 2 (Back-Translation)...


100%|██████████| 25/25 [00:00<00:00, 31.01it/s]

Individual predictions complete.





In [6]:
# Cell 4: Ensemble Predictions and Evaluate Final Performance

# Weighted F1 of the best single model from earlier runs; the ensemble is
# compared against this value. Defined once so the printed baseline and the
# comparison below cannot drift apart.
PREVIOUS_BEST_F1 = 0.7570
# Weighted F1 at or above which the milestone message is printed.
MILESTONE_F1 = 0.80

print("\n--- Ensembling Results ---")

# Sanity checks: averaging logits element-wise only makes sense when both
# models emit one score per class for the same examples, in the same order.
if logits1.shape != logits2.shape:
    raise ValueError(
        f"Cannot ensemble: logits shapes differ "
        f"({tuple(logits1.shape)} vs {tuple(logits2.shape)})."
    )
if model1.config.id2label != model2.config.id2label:
    # Not fatal (the maps may differ only in naming), but if the class ORDER
    # differs the averaged logits and the decoded labels below are meaningless.
    print("WARNING: model1 and model2 have different id2label maps; "
          "verify that both use the same class index order.")

# The Ensemble Strategy: average the logits from both models
ensembled_logits = (logits1 + logits2) / 2.0

# Final predicted class index per example (argmax over the class dimension)
ensembled_preds_indices = torch.argmax(ensembled_logits, dim=1).numpy()

# Map indices to string labels using the config of our best model (model2).
# int(...) converts the NumPy integer to a plain Python int for the lookup.
id2label = model2.config.id2label
ensembled_preds_labels = [id2label[int(i)] for i in ensembled_preds_indices]

# Ground-truth labels, in the same row order as the predictions
y_true = test_df['emotion'].tolist()

# Generate and print the final classification report
print("\n--- Final Ensemble Classification Report ---")
print(classification_report(y_true, ensembled_preds_labels, digits=4))

# Extract the final weighted F1-score to compare against the baseline
report = classification_report(y_true, ensembled_preds_labels, output_dict=True)
ensemble_f1 = report['weighted avg']['f1-score']
f1_delta = ensemble_f1 - PREVIOUS_BEST_F1

print(f"\n{'='*60}")
print(f"Previous Best Single Model F1 Score: {PREVIOUS_BEST_F1:.4f}")
print(f"Final Ensemble F1 Score: {ensemble_f1:.4f}")

if ensemble_f1 > PREVIOUS_BEST_F1:
    print("\nSUCCESS! The ensemble strategy improved performance!")
    if ensemble_f1 >= MILESTONE_F1:
        print("MILESTONE ACHIEVED: You have broken the 80% F1-score barrier!")
else:
    # Report the actual gap instead of claiming the scores are "similar":
    # this branch also covers an ensemble that is clearly worse.
    print(f"\nThe ensemble did not improve on the best single model "
          f"(F1 change: {f1_delta:+.4f}).")
print(f"{'='*60}")


--- Ensembling Results ---

--- Final Ensemble Classification Report ---
              precision    recall  f1-score   support

       Anger     0.7143    0.8000    0.7547        25
     Disgust     0.8462    0.6875    0.7586        16
        Fear     0.8571    0.8000    0.8276        30
         Joy     0.9020    0.9020    0.9020        51
     Neutral     0.7188    0.6765    0.6970        34
     Sadness     0.8125    0.8864    0.8478        44

    accuracy                         0.8150       200
   macro avg     0.8085    0.7920    0.7979       200
weighted avg     0.8165    0.8150    0.8142       200


Previous Best Single Model F1 Score: 0.7570
Final Ensemble F1 Score: 0.8142

SUCCESS! The ensemble strategy improved performance!
MILESTONE ACHIEVED: You have broken the 80% F1-score barrier!
