In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm
import numpy as np

# Banner announcing the start of the run.
print("=" * 80, "EMOTION CLASSIFICATION - FULL DATASET", "=" * 80, sep="\n")

# ============================================================================
# 1. LOAD DATASET
# ============================================================================

# Read the posts CSV into the working frame that every later step operates on.
df = pd.read_csv("reddit_posts_finance_labeled_cleaned.csv")
loaded_count = len(df)
print(f"✓ Loaded dataset: {loaded_count:,} posts")

# ============================================================================
# 2. SETUP DEVICE
# ============================================================================

# Select the Apple Silicon GPU by name rather than by ordinal: an integer
# device is treated as a CUDA index by transformers releases that predate MPS
# ordinal support, whereas the string "mps" is unambiguous. The getattr guard
# keeps the script working on torch builds that do not ship an MPS backend.
mps_backend = getattr(torch.backends, "mps", None)
if mps_backend is not None and mps_backend.is_available():
    device = "mps"
    print("✓ Using Apple Silicon GPU (MPS)")
else:
    device = -1  # CPU
    print("⚠️ MPS not available, using CPU")

# ============================================================================
# 3. LOAD EMOTION LABELS
# ============================================================================

from datasets import load_dataset

# The emotion names are read from the label feature of the "train" split;
# emotion_labels[i] is the name for class index i.
dataset = load_dataset("go_emotions")
label_feature = dataset["train"].features["labels"].feature
emotion_labels = label_feature.names

print(f"✓ Loaded {len(emotion_labels)} emotion labels")

# ============================================================================
# 4. LOAD MODEL
# ============================================================================

model_path = "./mentalbert-goemotions-final"
print(f"✓ Loading model from: {model_path}")

# Model weights and tokenizer are both read from the same directory.
# top_k=None is kept so the classifier output can be iterated as one
# {label, score} entry per class further down.
pipeline_options = {
    "model": model_path,
    "tokenizer": model_path,
    "device": device,
    "top_k": None,
}
emotion_classifier = pipeline("text-classification", **pipeline_options)

print("✓ Model loaded successfully")

# ============================================================================
# 5. PREPARE TEXT
# ============================================================================

print("\n⚙️  Preparing text...")
# Missing titles/bodies become empty strings so the concatenation never
# produces NaN.
title_part = df['title'].fillna('')
body_part = df['text'].fillna('')
df['combined_text'] = title_part + ' ' + body_part

# Keep only rows whose combined text has at least one non-whitespace character.
has_text = df['combined_text'].str.strip().str.len() > 0
df = df.loc[has_text].copy()
print(f"✓ {len(df):,} posts with valid text")

# ============================================================================
# 6. EMOTION CLASSIFICATION FUNCTION
# ============================================================================

def get_emotion_scores(text, threshold=0.1):
    """
    Get emotions above threshold for a text.

    Only the first 512 characters of the text are classified. The module-level
    ``emotion_classifier`` pipeline and ``emotion_labels`` list are used to
    score the text and to translate class indices into emotion names.

    Args:
        text: The text to classify. NaN/None and blank strings yield ``{}``.
        threshold: Minimum score (exclusive) an emotion needs to be reported.

    Returns:
        dict of {emotion_name: score} with scores rounded to 4 decimals.
        An empty dict is returned for empty input or when classification
        fails (the error is printed, not raised, so one bad post does not
        abort a long run).
    """
    if pd.isna(text) or len(str(text).strip()) == 0:
        return {}

    try:
        full_text = str(text)[:512]  # Truncate to 512 chars
        # A 512-character string can still tokenize to more tokens than the
        # model accepts (e.g. one token per character plus special tokens),
        # so also let the tokenizer truncate to its own maximum length.
        results = emotion_classifier(full_text, truncation=True)[0]

        emotions = {}
        for r in results:
            score = r['score']
            if score <= threshold:
                continue

            # Labels normally look like "LABEL_<index>"; if the model config
            # already carries readable names, use the label text as-is
            # instead of failing on the index parse.
            label = str(r['label'])
            prefix, _, suffix = label.rpartition('_')
            if prefix.upper() == 'LABEL' and suffix.isdigit():
                emotion_name = emotion_labels[int(suffix)]
            else:
                emotion_name = label

            emotions[emotion_name] = round(score, 4)

        return emotions
    except Exception as e:
        # Best-effort: report and skip this text rather than stop the batch.
        print(f"Error: {str(e)[:50]}")
        return {}

# ============================================================================
# 7. PROCESS ALL POSTS
# ============================================================================

print("\n" + "=" * 80, "PROCESSING ALL POSTS", "=" * 80, sep="\n")

# Register tqdm's progress_apply on pandas objects, then classify every post
# one at a time while showing a progress bar.
tqdm.pandas(desc="Classifying emotions")
texts_to_classify = df['combined_text']
df['emotions'] = texts_to_classify.progress_apply(get_emotion_scores)

print(f"\n✓ Processed {len(df):,} posts")

# ============================================================================
# 8. EXTRACT TOP EMOTION
# ============================================================================

print("\n⚙️  Extracting top emotions...")

def get_top_emotion(emotion_dict):
    """Return the (name, score) pair with the highest score.

    Falls back to ('none', 0.0) when the mapping is empty or missing.
    On ties the entry that appears first in the mapping wins.
    """
    if emotion_dict:
        best_name = max(emotion_dict, key=emotion_dict.get)
        return (best_name, emotion_dict[best_name])
    return ('none', 0.0)

# Compute the (name, score) pair once per post and build the two columns from
# plain lists. Wrapping every row's result in a pd.Series inside apply() is
# very slow on a frame this size and fails when the frame is empty.
top_pairs = [get_top_emotion(found) for found in df['emotions']]
df['top_emotion'] = [name for name, _ in top_pairs]
df['top_emotion_score'] = [score for _, score in top_pairs]

print("✓ Top emotions extracted")

# ============================================================================
# 9. CREATE INDIVIDUAL EMOTION COLUMNS
# ============================================================================

print("\n⚙️  Creating individual emotion columns...")


def _score_lookup(label):
    """Build a function returning the score stored for `label`, or 0.0."""
    def _lookup(found):
        if isinstance(found, dict):
            return found.get(label, 0.0)
        return 0.0
    return _lookup


def _emotion_count(found):
    """Number of emotions recorded for one post (0 for non-dict values)."""
    return len(found) if isinstance(found, dict) else 0


# One numeric column per known emotion, named emotion_<label>.
for emotion in emotion_labels:
    df[f'emotion_{emotion}'] = df['emotions'].map(_score_lookup(emotion))

print(f"✓ Created {len(emotion_labels)} emotion score columns")

# Count emotions per post
df['num_emotions'] = df['emotions'].map(_emotion_count)

# ============================================================================
# 10. SUMMARY STATISTICS
# ============================================================================

print("\n" + "=" * 80, "SUMMARY STATISTICS", "=" * 80, sep="\n")

# How often each emotion is the highest-scoring one for a post.
print("\nTop 15 Most Common Emotions (as primary emotion):")
top_emotions = df['top_emotion'].value_counts().head(15)
post_total = len(df)
for emotion, count in top_emotions.items():
    percentage = (count / post_total) * 100
    print(f"  {emotion:15s}: {count:6,} ({percentage:5.2f}%)")

# Mean of every emotion_<label> column, highest first.
print("\nTop 15 Emotions by Average Score Across All Posts:")
emotion_cols = [name for name in df.columns if name.startswith('emotion_')]
column_means = df[emotion_cols].mean()
avg_scores = column_means.sort_values(ascending=False).head(15)
for col, score in avg_scores.items():
    emotion = col.replace('emotion_', '')
    print(f"  {emotion:15s}: {score:.4f}")

# Distribution of how many emotions were recorded per post.
per_post = df['num_emotions']
print("\nEmotion Co-occurrence:")
print(f"  Posts with 0 emotions: {(per_post == 0).sum():,}")
print(f"  Posts with 1 emotion:  {(per_post == 1).sum():,}")
print(f"  Posts with 2 emotions: {(per_post == 2).sum():,}")
print(f"  Posts with 3+ emotions: {(per_post >= 3).sum():,}")
print(f"  Average emotions/post: {per_post.mean():.2f}")
print(f"  Max emotions in a post: {per_post.max()}")

# ============================================================================
# 11. FINANCE vs NON-FINANCE COMPARISON
# ============================================================================

print("\n" + "=" * 80, "FINANCE vs NON-FINANCE COMPARISON", "=" * 80, sep="\n")

# Split the posts by their finance_label value.
is_finance = df['finance_label'] == 'finance'
is_not_finance = df['finance_label'] == 'not finance'
finance_posts = df[is_finance]
non_finance_posts = df[is_not_finance]

print(f"\nFinance posts: {len(finance_posts):,}")
print(f"Non-finance posts: {len(non_finance_posts):,}")

# Same top-10 report for each group; a group with no posts is skipped.
report_groups = (
    ("\nTop 10 emotions in FINANCE posts:", finance_posts),
    ("\nTop 10 emotions in NON-FINANCE posts:", non_finance_posts),
)
for heading, subset in report_groups:
    if len(subset) == 0:
        continue
    print(heading)
    group_size = len(subset)
    for emotion, count in subset['top_emotion'].value_counts().head(10).items():
        pct = (count / group_size) * 100
        print(f"  {emotion:15s}: {count:5,} ({pct:5.2f}%)")

# ============================================================================
# 12. SAVE RESULTS
# ============================================================================

print("\n" + "=" * 80, "SAVING RESULTS", "=" * 80, sep="\n")

# Full dataset with all columns
output_full = "reddit_with_emotions_full.csv"
df.to_csv(output_full, index=False)
row_count, column_count = df.shape
print(f"✓ Full dataset saved: {output_full}")
print(f"  Size: {row_count:,} rows, {column_count} columns")

# Summary version (without individual emotion columns); columns that are not
# present in the frame are silently left out.
wanted_cols = (
    'author', 'title', 'score', 'created', 'link', 'text',
    'year', 'year_month', 'finance_label', 'finance_score',
    'emotions', 'top_emotion', 'top_emotion_score', 'num_emotions',
)
summary_cols = [name for name in wanted_cols if name in df.columns]

output_summary = "reddit_emotions_summary.csv"
summary_frame = df[summary_cols]
summary_frame.to_csv(output_summary, index=False)
print(f"✓ Summary saved: {output_summary}")
print(f"  Size: {len(df):,} rows, {len(summary_cols)} columns")

# ============================================================================
# 13. SAMPLE OUTPUT
# ============================================================================

print("\n" + "="*80)
print("SAMPLE RESULTS (First 5 Posts)")
print("="*80)

# Show at most the first five posts.
for post_number, (_, row) in enumerate(df.head(5).iterrows(), start=1):
    # A post can have body text but no title; the title column itself is never
    # filled, so guard against NaN before slicing.
    title = row['title'] if isinstance(row['title'], str) else ''
    found = row['emotions']

    print(f"\n--- Post {post_number} ---")
    print(f"Title: {title[:80]}...")
    print(f"Finance: {row['finance_label']}")
    print(f"Top emotion: {row['top_emotion']} ({row['top_emotion_score']:.3f})")
    if len(found) > 0:
        # Only the first five emotions are printed to keep the sample short.
        print(f"All emotions ({len(found)}): {dict(list(found.items())[:5])}...")
    else:
        print("All emotions: none detected")

print("\n" + "="*80)
print("✓ PROCESSING COMPLETE!")
print("="*80)
print(f"\nTotal posts processed: {len(df):,}")
print("\nOutput files created:")
print(f"  1. {output_full}")
print(f"  2. {output_summary}")
print("\nYou can now use these files for further analysis and visualization!")

EMOTION CLASSIFICATION - FULL DATASET
✓ Loaded dataset: 106,963 posts
✓ Using Apple Silicon GPU (MPS)
✓ Loaded 28 emotion labels
✓ Loading model from: ./mentalbert-goemotions-final


Device set to use mps:0


✓ Model loaded successfully

⚙️  Preparing text...
✓ 106,963 posts with valid text

PROCESSING ALL POSTS


Classifying emotions:  29%|██▉       | 31139/106963 [1:31:05<6:06:00,  3.45it/s] 