In [4]:
# Cell 1: Imports and Configuration
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from datasets import Dataset
import demoji
import torch.nn.functional as F

# --- CONFIGURATION ---
# Input spreadsheet that is read and reviewed further down.
FRIENDS_DATA_PATH = '../data/data1.xlsx'
# Excel file the suspected mislabels are written to.
OUTPUT_CORRECTION_FILE = 'label_correction_sheet.xlsx'

# IMPORTANT: point this at the final checkpoint of your best run.
# Tip: to find the checkpoint number, look inside the 'final_tune_random_deletion'
# folder. It's usually the sub-folder with the highest number, e.g. 'checkpoint-235'.
BEST_MODEL_PATH = './results/final_tune_random_deletion/checkpoint-505' #<-- CHECK AND UPDATE THIS!

print("Configuration loaded. Ready to find potential mislabels.")

Configuration loaded. Ready to find potential mislabels.


In [5]:
# Cell 2: Load Model and Full Dataset

# Restore the tokenizer and classifier from the chosen checkpoint.
print(f"Loading model from: {BEST_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
# Bare Trainer: used further down only to run batched predictions.
trainer = Trainer(model=model)

print(f"Loading data from: {FRIENDS_DATA_PATH}")
# Read the ENTIRE friends dataset and clean it in one pass.
# --- Data Cleaning (must be identical to your training script) ---
review_df = (
    pd.read_excel(FRIENDS_DATA_PATH)
    # Standardize column names: trim whitespace, lower-case.
    .rename(columns=lambda col: col.strip().lower())
    # Older sheets call the text column 'entry'; a no-op when it is absent.
    .rename(columns={'entry': 'text'})
    # Rows without text or without a label cannot be reviewed.
    .dropna(subset=['text', 'emotion'])
    # Keep the first occurrence of each distinct text.
    .drop_duplicates(subset=['text'])
    # Fresh 0..n-1 index so positions line up with model predictions.
    .reset_index(drop=True)
)

print(f"Loaded {len(review_df)} unique entries to review.")



Loading model from: ./results/final_tune_random_deletion/checkpoint-505
Loading data from: ../data/data1.xlsx
Loaded 1001 unique entries to review.


In [6]:
# Cell 3: Preprocess Data and Get Model Predictions

# Convert to Hugging Face Dataset
review_ds = Dataset.from_pandas(review_df)

# --- Preprocessing (identical to your training script) ---
# NOTE: demoji.download_codes() is intentionally no longer called here.
# demoji >= 1.0 bundles its emoji data and the call only emits a deprecation
# warning (visible in this cell's previously recorded output). If you are
# pinned to demoji < 1.0, re-add the call before running this cell.


def preprocess_for_prediction(batch):
    """Replace emoji in a batch of texts with their textual descriptions.

    Args:
        batch: mapping with a 'text' entry holding a list of raw entries
            (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The same mapping with every 'text' value coerced to ``str`` and each
        emoji replaced by its description, delimited by spaces.
    """
    # Only run demoji conversion
    batch['text'] = [demoji.replace_with_desc(str(text), sep=" ") for text in batch['text']]
    return batch


def tokenize_fn(batch):
    """Tokenize a batch of texts, truncating/padding each to 128 tokens.

    Args:
        batch: mapping with a 'text' entry holding a list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)


print("Preprocessing and tokenizing all entries...")
review_ds = review_ds.map(preprocess_for_prediction, batched=True)
tokenized_review_ds = review_ds.map(tokenize_fn, batched=True)

print("Running model to get predictions...")
predictions = trainer.predict(tokenized_review_ds)

# Get the predicted label index and the raw logits
predicted_indices = np.argmax(predictions.predictions, axis=1)
logits = torch.from_numpy(predictions.predictions)

# Get the confidence score (softmax probability) for the predicted class
confidence_scores = F.softmax(logits, dim=1).max(dim=1).values.numpy()

# Convert predicted indices back to string labels.
# int(...) turns the numpy integer into a plain Python int before the lookup.
predicted_labels = [model.config.id2label[int(i)] for i in predicted_indices]

# The next cell attaches these results to review_df positionally, so the
# counts must match exactly.
assert len(predicted_labels) == len(review_df) == len(confidence_scores), (
    "Number of predictions does not match number of entries in review_df"
)

print("Prediction complete.")

  demoji.download_codes()


Preprocessing and tokenizing all entries...


Map: 100%|██████████| 1001/1001 [00:00<00:00, 1836.78 examples/s]
Map: 100%|██████████| 1001/1001 [00:00<00:00, 16964.66 examples/s]


Running model to get predictions...


100%|██████████| 126/126 [00:04<00:00, 29.64it/s]

Prediction complete.





In [7]:
# Cell 4: Generate and Save the Correction Sheet

# Add the new information to our DataFrame
review_df['model_suggestion'] = predicted_labels
review_df['confidence'] = confidence_scores


def _normalize_labels(labels):
    """Return a label Series as stripped, case-folded strings.

    Used only for comparison, so that differences in surrounding whitespace
    or letter case between the spreadsheet labels and the model's label names
    are not reported as disagreements.
    """
    return labels.astype(str).str.strip().str.casefold()


# Rows where the model's suggestion differs from the original label
# (ignoring whitespace / case differences).
is_disagreement = (
    _normalize_labels(review_df['emotion'])
    != _normalize_labels(review_df['model_suggestion'])
)

disagreements_df = (
    review_df.loc[is_disagreement]
    # Highest model confidence first: these are the most likely candidates
    # for an incorrect original label.
    .sort_values(by='confidence', ascending=False)
    # Blank column for the reviewer's final decision.
    .assign(human_corrected_label='')
)

# Reorder columns for clarity
final_sheet = disagreements_df[['text', 'emotion', 'model_suggestion', 'confidence', 'human_corrected_label']]

# Save the candidates to an Excel file
final_sheet.to_excel(OUTPUT_CORRECTION_FILE, index=False)

print("\n--- SURGICAL LABEL CORRECTION SHEET ---")
print(f"Found {len(final_sheet)} potential mislabels.")
print(f"Saved them to '{OUTPUT_CORRECTION_FILE}' for your review.")
print("\n--- Top 20 Candidates for Review (Most Confident Model Suggestions) ---")
print(final_sheet.head(20))


--- SURGICAL LABEL CORRECTION SHEET ---
Found 61 potential mislabels.
Saved them to 'label_correction_sheet.xlsx' for your review.

--- Top 20 Candidates for Review (Most Confident Model Suggestions) ---
                                                  text  emotion  \
371  They changed the app again, and now nothin' wo...    Anger   
78   Everyone was speaking, and for some reason, th...  Disgust   
521  All my life I've seen my friends lead a chill ...  Disgust   
584  Practised Kerala entrance biology diagrams, co...      Joy   
593  Realised I haven’t touched English literature ...     Fear   
107           Saw my childhood pics and felt nostalgic      Joy   
72   Yesterday Virat retired and today the Warriors...  Sadness   
526  Blank on Article 32 in Polity viva. Switching ...    Anger   
261  Mid-sem blues: Indian Economy presentation oka...    Anger   
139  We never started as strangers, but we kept it ...  Sadness   
432  The fan is humming, kinda annoying but also co...  Ne