### Examining the facial emotion results

I used the [vit-Facial-Expression-Recognition](https://huggingface.co/mo-thecreator/vit-Facial-Expression-Recognition) model to classify emotions on faces. This notebook analyzes the results and creates similar visualizations to the text sentiment analysis.

In [8]:
import pandas as pd
import json
from collections import defaultdict
import numpy as np

In [9]:
# Load facial emotions data
df = pd.read_csv('results_facial_emotions.csv')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20603 entries, 0 to 20602
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   episode_key           20603 non-null  int64  
 1   sketch_id             20603 non-null  int64  
 2   frame_num_in_episode  20603 non-null  int64  
 3   character_id          20603 non-null  int64  
 4   bbox_x1               20603 non-null  float64
 5   bbox_y1               20603 non-null  float64
 6   bbox_x2               20603 non-null  float64
 7   bbox_y2               20603 non-null  float64
 8   det_score             20603 non-null  float64
 9   emotion               20603 non-null  object 
 10  confidence            20603 non-null  float64
 11  timestamp             20603 non-null  float64
dtypes: float64(7), int64(4), object(1)
memory usage: 1.9+ MB


In [11]:
# Load metadata
metadata = pd.read_csv('episode_metadata.csv')

# Merge facial emotion data with metadata
df_merged = df.merge(metadata, left_on='sketch_id', right_on='id', how='left')

# Basic stats
print(f"Total face detections: {len(df)}")
print(f"Total sketches: {df['sketch_id'].nunique()}")
print(f"Total episodes: {df_merged.groupby(['season', 'episode']).ngroups if 'season' in df_merged.columns else 'N/A'}")
print(f"Total seasons: {df_merged['season'].nunique() if 'season' in df_merged.columns else 'N/A'}")
print(f"\nUnique emotions: {df['emotion'].nunique()}")
print(f"Unique categories: {df_merged['category'].nunique() if 'category' in df_merged.columns else 'N/A'}")
print(f"Unique characters: {df['character_id'].nunique()}")
print(f"\nEmotion types: {sorted(df['emotion'].unique())}")

Total face detections: 20603
Total sketches: 86
Total episodes: 18
Total seasons: 3

Unique emotions: 7
Unique categories: 22
Unique characters: 65

Emotion types: ['anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']


In [19]:
# 1. Overall emotion distribution
emotion_counts = df['emotion'].value_counts()
emotion_counts
# Calculate percentages
emotion_counts_pct = emotion_counts / emotion_counts.sum() * 100

# Combine into a DataFrame for export
emotion_dist_df = pd.DataFrame({
    'emotion': emotion_counts.index,
    'count': emotion_counts.values,
    'percentage': emotion_counts_pct.values
})

# Export to CSV
emotion_dist_df.to_csv('facial_overall_emotion_distribution.csv', index=False)

# Optionally, print for quick view
print(emotion_dist_df)
print("Exported facial_overall_emotion_distribution.csv")


    emotion  count  percentage
0   neutral   5688   27.607630
1       sad   4748   23.045188
2     happy   4726   22.938407
3  surprise   1750    8.493909
4     anger   1559    7.566859
5   disgust   1492    7.241664
6      fear    640    3.106344
Exported facial_overall_emotion_distribution.csv


In [None]:
# 2. Emotions by category
if 'category' in df_merged.columns:
    category_emotion = df_merged.groupby(['category', 'emotion']).size().reset_index(name='count')
    category_totals = df_merged.groupby('category').size()
    
    # Calculate percentages
    category_emotion_pct = category_emotion.merge(
        category_totals.reset_index(name='total'), 
        on='category'
    )
    category_emotion_pct['percentage'] = (category_emotion_pct['count'] / category_emotion_pct['total']) * 100
    
    # Top emotions per category
    top_emotions_by_category = category_emotion_pct.sort_values(['category', 'count'], ascending=[True, False]).groupby('category').head(5)
    print("Top 5 emotions by category:")
    print(top_emotions_by_category)

Top 5 emotions by category:
    category   emotion  count  total  percentage
4      class   neutral    135    442   30.542986
5      class       sad    135    442   30.542986
3      class     happy     70    442   15.837104
2      class      fear     34    442    7.692308
6      class  surprise     28    442    6.334842
..       ...       ...    ...    ...         ...
149     tour       sad    183    420   43.571429
148     tour   neutral     99    420   23.571429
147     tour     happy     90    420   21.428571
144     tour     anger     24    420    5.714286
146     tour      fear     13    420    3.095238

[110 rows x 5 columns]


In [21]:
# Export top emotions by category
if 'top_emotions_by_category' in locals():
    top_emotions_by_category.to_csv('facial_top_emotions_by_category.csv', index=False)
    print(f"Exported facial_top_emotions_by_category.csv")

Exported facial_top_emotions_by_category.csv


In [22]:
# 3. Emotions by season
if 'season' in df_merged.columns:
    season_emotion = df_merged.groupby(['season', 'emotion']).size().reset_index(name='count')
    season_totals = df_merged.groupby('season').size()
    
    season_emotion_pct = season_emotion.merge(
        season_totals.reset_index(name='total'),
        on='season'
    )
    season_emotion_pct['percentage'] = (season_emotion_pct['count'] / season_emotion_pct['total']) * 100
    
    print("Emotion distribution by season:")
    print(season_emotion_pct.sort_values(['season', 'count'], ascending=[True, False]).groupby('season').head(5))

Emotion distribution by season:
    season   emotion  count  total  percentage
4        1   neutral   2079   7024   29.598519
3        1     happy   1689   7024   24.046128
5        1       sad   1601   7024   22.793280
6        1  surprise    603   7024    8.584852
1        1   disgust    488   7024    6.947608
12       2       sad   1676   6634   25.263793
11       2   neutral   1669   6634   25.158276
10       2     happy   1336   6634   20.138680
13       2  surprise    665   6634   10.024118
7        2     anger    542   6634    8.170033
18       3   neutral   1940   6945   27.933765
17       3     happy   1701   6945   24.492441
19       3       sad   1471   6945   21.180706
14       3     anger    623   6945    8.970482
15       3   disgust    509   6945    7.329014


In [23]:
# 4. Emotion evolution within sketches
# Calculate relative position in sketch using timestamp
# First, get sketch duration from metadata or calculate from data
sketch_timestamps = df_merged.groupby('sketch_id')['timestamp'].agg(['min', 'max'])
sketch_timestamps['duration'] = sketch_timestamps['max'] - sketch_timestamps['min']

# Merge back to get relative position
df_merged = df_merged.merge(sketch_timestamps[['duration']], left_on='sketch_id', right_index=True)
df_merged['relative_position'] = (df_merged['timestamp'] - df_merged.groupby('sketch_id')['timestamp'].transform('min')) / df_merged['duration'].replace(0, 1)

# Bin into beginning, middle, end
df_merged['position_bin'] = pd.cut(df_merged['relative_position'], 
                            bins=[0, 0.25, 0.75, 1.0], 
                            labels=['beginning', 'middle', 'end'],
                            include_lowest=True)

position_emotion = df_merged.groupby(['position_bin', 'emotion'], observed=True).size().reset_index(name='count')
print("Emotions by position in sketch:")
print(position_emotion.sort_values(['position_bin', 'count'], ascending=[True, False]).groupby('position_bin', observed=True).head(5))

Emotions by position in sketch:
   position_bin   emotion  count
3     beginning     happy   1968
4     beginning   neutral   1500
5     beginning       sad   1056
6     beginning  surprise    395
1     beginning   disgust    307
11       middle   neutral   2797
12       middle       sad   2479
10       middle     happy   1849
13       middle  surprise    939
7        middle     anger    864
18          end   neutral   1391
19          end       sad   1213
17          end     happy    909
20          end  surprise    416
14          end     anger    414


In [24]:
# 5. Most emotional vs. most neutral sketches
sketch_stats = df_merged.groupby('sketch_id').agg({
    'emotion': lambda x: (x == 'neutral').sum() / len(x) if len(x) > 0 else 0,  # % neutral
    'confidence': ['mean', 'std'],
    'timestamp': 'count'  # number of detections
}).reset_index()
sketch_stats.columns = ['sketch_id', 'pct_neutral', 'avg_confidence', 'confidence_std', 'num_detections']

# Add sketch names and category
sketch_stats = sketch_stats.merge(
    df_merged[['sketch_id', 'name']].drop_duplicates(),
    on='sketch_id'
)
if 'category' in df_merged.columns:
    sketch_stats = sketch_stats.merge(
        df_merged[['sketch_id', 'category']].drop_duplicates(),
        on='sketch_id',
        how='left'
    )

# Show 10 sketches with lowest % neutral (most emotional)
most_emotional = sketch_stats.nsmallest(10, 'pct_neutral')[['sketch_id', 'name', 'category', 'pct_neutral', 'avg_confidence']]
print("Most emotional sketches (top 10):")
display(most_emotional)

# Show 10 sketches with highest % neutral (most neutral)
most_neutral = sketch_stats.nlargest(10, 'pct_neutral')[['sketch_id', 'name', 'category', 'pct_neutral', 'avg_confidence']]
print("\nMost neutral sketches (top 10):")
display(most_neutral)

Most emotional sketches (top 10):


Unnamed: 0,sketch_id,name,category,pct_neutral,avg_confidence
52,53,Little Buff Boys Competition,commercial,0.040816,0.717186
41,42,Grambles Lorelei Lounge,restaurant,0.061674,0.699221
11,12,Which Hand,magic show,0.067073,0.679913
32,33,Corncob TV,commercial,0.069565,0.625446
56,57,Tammy Craps,commercial,0.087302,0.696341
51,52,Joanie's Birthday,party,0.100671,0.654336
3,4,Instagram,social media,0.107914,0.741256
59,60,Barley Tonight,reality tv,0.113402,0.636396
38,39,Diner Wink,family,0.135417,0.624211
46,47,Wife Joke,party,0.136095,0.669517



Most neutral sketches (top 10):


Unnamed: 0,sketch_id,name,category,pct_neutral,avg_confidence
77,78,Summer Loving Farewell Package,dating,0.75,0.798941
14,15,New Joe,funeral,0.621795,0.676037
39,40,The Shops at the Creeks,commercial,0.538462,0.778902
84,85,Don Bondarley,party,0.489071,0.680591
17,18,A Christmas Carol,sci-fi/fantasy,0.471264,0.572971
21,22,Choking,restaurant,0.458042,0.668321
34,35,Little Buff Boys,game show,0.455,0.643889
62,63,Dad Video,family,0.436709,0.676519
40,41,Baby Cries,party,0.426786,0.712872
19,20,Traffic,driving,0.412541,0.688461


In [25]:
# 6. Emotions by character
character_emotion = df.groupby(['character_id', 'emotion']).size().reset_index(name='count')
character_totals = df.groupby('character_id').size()

character_emotion_pct = character_emotion.merge(
    character_totals.reset_index(name='total'),
    on='character_id'
)
character_emotion_pct['percentage'] = (character_emotion_pct['count'] / character_emotion_pct['total']) * 100

print("Top emotions by character:")
print(character_emotion_pct.sort_values(['character_id', 'count'], ascending=[True, False]).groupby('character_id').head(3))

Top emotions by character:
     character_id  emotion  count  total  percentage
4               0  neutral    953   3817   24.967252
5               0      sad    900   3817   23.578727
3               0    happy    824   3817   21.587634
11              1  neutral    874   3253   26.867507
12              1      sad    762   3253   23.424531
..            ...      ...    ...    ...         ...
283            86    happy      4      4  100.000000
284            89    happy      1      1  100.000000
286            92     fear      3      6   50.000000
287            92      sad      2      6   33.333333
285            92    anger      1      6   16.666667

[163 rows x 5 columns]


## Heatmap: Emotions Over Time for Each Sketch

Prepare data for a heatmap showing emotion evolution across all sketches.
- Y-axis: Sketches (ordered by season, episode, sketch_id)
- X-axis: Time (normalized position within sketch, binned)
- Color: Emotion type

In [26]:
# Prepare data for sketch × time heatmap
# Normalize time position for each sketch (0-100)
df_merged['time_percent'] = df_merged['relative_position'] * 100

# Create time bins (e.g., 0-5%, 5-10%, etc.)
# Using 20 bins (5% increments) for good resolution without too much granularity
NUM_TIME_BINS = 20
df_merged['time_bin'] = pd.cut(df_merged['time_percent'], 
                        bins=NUM_TIME_BINS, 
                        labels=[f"{i*100/NUM_TIME_BINS:.1f}-{(i+1)*100/NUM_TIME_BINS:.1f}%" 
                                for i in range(NUM_TIME_BINS)],
                        include_lowest=True)

# Get sketch metadata for ordering
if 'season' in df_merged.columns and 'episode' in df_merged.columns:
    sketch_order = df_merged[['sketch_id', 'name', 'season', 'episode']].drop_duplicates().sort_values(
        ['season', 'episode', 'sketch_id']
    ).reset_index(drop=True)
else:
    sketch_order = df_merged[['sketch_id', 'name']].drop_duplicates().sort_values('sketch_id').reset_index(drop=True)
    sketch_order['season'] = 0
    sketch_order['episode'] = 0

sketch_order['sketch_order'] = sketch_order.index

# Add ordering to main dataframe
df_merged = df_merged.merge(sketch_order[['sketch_id', 'sketch_order']], on='sketch_id')

print(f"Time bins: {NUM_TIME_BINS} bins ({100/NUM_TIME_BINS:.1f}% each)")
print(f"Total sketches: {len(sketch_order)}")
print(f"\nFirst few sketches in order:")
print(sketch_order.head(10))

Time bins: 20 bins (5.0% each)
Total sketches: 86

First few sketches in order:
   sketch_id                            name  season  episode  sketch_order
0          1                       Both Ways       1        1             0
1          2  Has This Ever Happened to You?       1        1             1
2          3                Baby of the Year       1        1             2
3          4                       Instagram       1        1             3
4          5                    Gift Receipt       1        1             4
5          6                       Biker Guy       1        2             5
6          7             River Mountain High       1        2             6
7          8                Wilson's Toupees       1        2             7
8          9                        Pink Bag       1        2             8
9         10             River Mountain High       1        2             9


In [27]:
# For each sketch × time_bin, capture ALL emotions (not just dominant)
heatmap_data_all = df_merged.groupby(['sketch_id', 'time_bin', 'emotion'], observed=True).size().reset_index(name='count')

# Calculate total detections per bin for percentages
bin_totals = df_merged.groupby(['sketch_id', 'time_bin'], observed=True).size().reset_index(name='total_detections')
heatmap_data_all = heatmap_data_all.merge(bin_totals, on=['sketch_id', 'time_bin'])
heatmap_data_all['percentage'] = (heatmap_data_all['count'] / heatmap_data_all['total_detections']) * 100

# Also determine the dominant emotion for comparison
heatmap_data_sorted = heatmap_data_all.sort_values(['sketch_id', 'time_bin', 'count'], ascending=[True, True, False])
dominant_emotions = heatmap_data_sorted.groupby(['sketch_id', 'time_bin'], observed=True).first().reset_index()
dominant_emotions = dominant_emotions[['sketch_id', 'time_bin', 'emotion', 'count', 'total_detections']]
dominant_emotions['dominance_ratio'] = dominant_emotions['count'] / dominant_emotions['total_detections']

print("Sample of ALL emotions by sketch × time bin:")
print(heatmap_data_all.head(20))
print(f"\nTotal emotion × time_bin combinations: {len(heatmap_data_all)}")
print(f"Average emotions per sketch × time_bin: {heatmap_data_all.groupby(['sketch_id', 'time_bin'], observed=True).size().mean():.2f}")

Sample of ALL emotions by sketch × time bin:
    sketch_id    time_bin   emotion  count  total_detections  percentage
0           1    0.0-5.0%     anger      5                 9   55.555556
1           1    0.0-5.0%     happy      2                 9   22.222222
2           1    0.0-5.0%       sad      2                 9   22.222222
3           1   5.0-10.0%     anger      5                13   38.461538
4           1   5.0-10.0%     happy      5                13   38.461538
5           1   5.0-10.0%   neutral      1                13    7.692308
6           1   5.0-10.0%       sad      2                13   15.384615
7           1  10.0-15.0%     anger      1                 6   16.666667
8           1  10.0-15.0%     happy      3                 6   50.000000
9           1  10.0-15.0%   neutral      1                 6   16.666667
10          1  10.0-15.0%       sad      1                 6   16.666667
11          1  15.0-20.0%   neutral      1                 5   20.000000
12    

In [28]:
# Add sketch metadata and ordering to ALL emotions dataset
heatmap_all_emotions = heatmap_data_all.merge(
    sketch_order[['sketch_id', 'name', 'season', 'episode', 'sketch_order']],
    on='sketch_id'
)

# Add category if available
if 'category' in df_merged.columns:
    sketch_categories = df_merged[['sketch_id', 'category', 'category2']].drop_duplicates()
    heatmap_all_emotions = heatmap_all_emotions.merge(sketch_categories, on='sketch_id', how='left')

# Convert time_bin to numeric for easier sorting/plotting
heatmap_all_emotions['time_bin_start'] = heatmap_all_emotions['time_bin'].astype(str).str.split('-').str[0].str.rstrip('%').astype(float)
heatmap_all_emotions['time_bin_end'] = heatmap_all_emotions['time_bin'].astype(str).str.split('-').str[1].str.rstrip('%').astype(float)
heatmap_all_emotions['time_bin_center'] = (heatmap_all_emotions['time_bin_start'] + heatmap_all_emotions['time_bin_end']) / 2

# Sort by sketch order, time bin, and count (descending)
heatmap_all_emotions = heatmap_all_emotions.sort_values(['sketch_order', 'time_bin_start', 'count'], ascending=[True, True, False]).reset_index(drop=True)

# Also create dominant-only version for comparison
heatmap_export = dominant_emotions.merge(
    sketch_order[['sketch_id', 'name', 'season', 'episode', 'sketch_order']],
    on='sketch_id'
)
if 'category' in df_merged.columns:
    heatmap_export = heatmap_export.merge(sketch_categories, on='sketch_id', how='left')
heatmap_export['time_bin_start'] = heatmap_export['time_bin'].astype(str).str.split('-').str[0].str.rstrip('%').astype(float)
heatmap_export['time_bin_end'] = heatmap_export['time_bin'].astype(str).str.split('-').str[1].str.rstrip('%').astype(float)
heatmap_export['time_bin_center'] = (heatmap_export['time_bin_start'] + heatmap_export['time_bin_end']) / 2
heatmap_export = heatmap_export.sort_values(['sketch_order', 'time_bin_start']).reset_index(drop=True)

print(f"ALL emotions heatmap data shape: {heatmap_all_emotions.shape}")
print(f"Dominant-only heatmap data shape: {heatmap_export.shape}")
print(f"\nSample of ALL emotions data:")
print(heatmap_all_emotions[['name', 'time_bin', 'emotion', 'count', 'percentage']].head(20))

ALL emotions heatmap data shape: (6189, 15)
Dominant-only heatmap data shape: (1662, 15)

Sample of ALL emotions data:
         name    time_bin   emotion  count  percentage
0   Both Ways    0.0-5.0%     anger      5   55.555556
1   Both Ways    0.0-5.0%     happy      2   22.222222
2   Both Ways    0.0-5.0%       sad      2   22.222222
3   Both Ways   5.0-10.0%     anger      5   38.461538
4   Both Ways   5.0-10.0%     happy      5   38.461538
5   Both Ways   5.0-10.0%       sad      2   15.384615
6   Both Ways   5.0-10.0%   neutral      1    7.692308
7   Both Ways  10.0-15.0%     happy      3   50.000000
8   Both Ways  10.0-15.0%     anger      1   16.666667
9   Both Ways  10.0-15.0%   neutral      1   16.666667
10  Both Ways  10.0-15.0%       sad      1   16.666667
11  Both Ways  15.0-20.0%       sad      3   60.000000
12  Both Ways  15.0-20.0%   neutral      1   20.000000
13  Both Ways  15.0-20.0%  surprise      1   20.000000
14  Both Ways  20.0-25.0%  surprise      1  100.000000
1

In [29]:
# Also create an alternative version with average confidence score per bin
heatmap_score_data = df_merged.groupby(['sketch_id', 'time_bin'], observed=True).agg({
    'confidence': 'mean',
    'emotion': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'neutral'  # most common emotion
}).reset_index()
heatmap_score_data.columns = ['sketch_id', 'time_bin', 'avg_confidence', 'most_common_emotion']

# Add metadata
heatmap_score_export = heatmap_score_data.merge(
    sketch_order[['sketch_id', 'name', 'season', 'episode', 'sketch_order']],
    on='sketch_id'
)
if 'category' in df_merged.columns:
    heatmap_score_export = heatmap_score_export.merge(sketch_categories, on='sketch_id', how='left')

# Add time bin numeric values
heatmap_score_export['time_bin_start'] = heatmap_score_export['time_bin'].astype(str).str.split('-').str[0].str.rstrip('%').astype(float)
heatmap_score_export['time_bin_end'] = heatmap_score_export['time_bin'].astype(str).str.split('-').str[1].str.rstrip('%').astype(float)
heatmap_score_export['time_bin_center'] = (heatmap_score_export['time_bin_start'] + heatmap_score_export['time_bin_end']) / 2

heatmap_score_export = heatmap_score_export.sort_values(['sketch_order', 'time_bin_start']).reset_index(drop=True)

print("Alternative version with confidence scores:")
print(heatmap_score_export[['name', 'time_bin', 'most_common_emotion', 'avg_confidence']].head(15))

Alternative version with confidence scores:
         name    time_bin most_common_emotion  avg_confidence
0   Both Ways    0.0-5.0%               anger        0.583908
1   Both Ways   5.0-10.0%               anger        0.688675
2   Both Ways  10.0-15.0%               happy        0.782999
3   Both Ways  15.0-20.0%                 sad        0.497361
4   Both Ways  20.0-25.0%            surprise        0.800216
5   Both Ways  25.0-30.0%             disgust        0.578273
6   Both Ways  30.0-35.0%                 sad        0.553286
7   Both Ways  35.0-40.0%               happy        0.661433
8   Both Ways  40.0-45.0%             disgust        0.575155
9   Both Ways  50.0-55.0%             neutral        0.399828
10  Both Ways  55.0-60.0%                fear        0.451276
11  Both Ways  60.0-65.0%             neutral        0.655202
12  Both Ways  65.0-70.0%               happy        0.877965
13  Both Ways  70.0-75.0%               happy        0.839132
14  Both Ways  75.0-80.0% 

In [30]:
# Create comprehensive exports with multiple versions
# Version 1: ALL emotions per time bin
heatmap_all_final = heatmap_all_emotions[[
    'sketch_id', 'name', 'sketch_order', 'season', 'episode',
    'time_bin', 'time_bin_start', 'time_bin_end', 'time_bin_center',
    'emotion', 'count', 'total_detections', 'percentage'
]].copy()

if 'category' in heatmap_all_emotions.columns:
    heatmap_all_final['category'] = heatmap_all_emotions['category']
    if 'category2' in heatmap_all_emotions.columns:
        heatmap_all_final['category2'] = heatmap_all_emotions['category2']

# Version 2: Dominant emotion only
heatmap_final = heatmap_export[[
    'sketch_id', 'name', 'sketch_order', 'season', 'episode',
    'time_bin', 'time_bin_start', 'time_bin_end', 'time_bin_center',
    'emotion', 'count', 'total_detections', 'dominance_ratio'
]].copy()

if 'category' in heatmap_export.columns:
    heatmap_final['category'] = heatmap_export['category']
    if 'category2' in heatmap_export.columns:
        heatmap_final['category2'] = heatmap_export['category2']

# Version 3: Score-based version
heatmap_score_final = heatmap_score_export[[
    'sketch_id', 'name', 'sketch_order', 'season', 'episode',
    'time_bin', 'time_bin_start', 'time_bin_end', 'time_bin_center',
    'most_common_emotion', 'avg_confidence'
]].copy()

if 'category' in heatmap_score_export.columns:
    heatmap_score_final['category'] = heatmap_score_export['category']
    if 'category2' in heatmap_score_export.columns:
        heatmap_score_final['category2'] = heatmap_score_export['category2']

# Calculate emotion diversity metrics per sketch
emotion_diversity = heatmap_all_emotions.groupby('sketch_id').agg({
    'emotion': 'nunique',  # number of unique emotions
    'count': 'sum'  # total detections
}).reset_index()
emotion_diversity.columns = ['sketch_id', 'num_unique_emotions', 'total_detections']
emotion_diversity = emotion_diversity.merge(
    sketch_order[['sketch_id', 'name', 'season', 'episode', 'sketch_order']],
    on='sketch_id'
)

print("Final heatmap datasets prepared:")
print(f"  - ALL emotions version: {len(heatmap_all_final)} rows")
print(f"  - Dominant emotion version: {len(heatmap_final)} rows")
print(f"  - Score-based version: {len(heatmap_score_final)} rows")
print(f"\nUnique emotions across all sketches: {heatmap_all_final['emotion'].nunique()}")
print(f"Emotions: {sorted(heatmap_all_final['emotion'].unique())}")
print(f"\nEmotion diversity per sketch:")
print(f"  Average unique emotions per sketch: {emotion_diversity['num_unique_emotions'].mean():.2f}")
print(f"  Range: {emotion_diversity['num_unique_emotions'].min()} - {emotion_diversity['num_unique_emotions'].max()} unique emotions")

Final heatmap datasets prepared:
  - ALL emotions version: 6189 rows
  - Dominant emotion version: 1662 rows
  - Score-based version: 1662 rows

Unique emotions across all sketches: 7
Emotions: ['anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

Emotion diversity per sketch:
  Average unique emotions per sketch: 6.77
  Range: 3 - 7 unique emotions


## Data Structure: Emotions in Order of Appearance per Sketch

Create a structured data format that organizes emotions chronologically for each sketch, making it easy to visualize the emotional arc over time.

In [None]:
# Ensure data is sorted by sketch_id and timestamp for chronological order
df_sorted = df_merged.sort_values(['sketch_id', 'timestamp', 'character_id']).reset_index(drop=True)

# Get all unique emotions for reference
all_emotions = sorted(df_sorted['emotion'].unique())
print(f"Total unique emotions: {len(all_emotions)}")
print(f"All emotions: {all_emotions}")
print(f"\nColumns in df_sorted: {list(df_sorted.columns)}")

Total unique emotions: 7
All emotions: ['anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

Columns in df_sorted: ['episode_key', 'sketch_id', 'frame_num_in_episode', 'character_id', 'bbox_x1', 'bbox_y1', 'bbox_x2', 'bbox_y2', 'det_score', 'emotion', 'confidence', 'timestamp', 'season', 'episode', 'id', 'name', 'start', 'end', 'category', 'category2', 'song', 'description', 'duration_x', 'relative_position', 'position_bin', 'duration_y', 'duration', 'time_percent', 'time_bin', 'sketch_order']


In [32]:
# Create the main data structure: emotions in order of appearance per sketch
emotions_by_sketch = {}

# Process each sketch
for sketch_id in sorted(df_sorted['sketch_id'].unique()):
    sketch_data = df_sorted[df_sorted['sketch_id'] == sketch_id].copy()
    
    # Get sketch metadata (should be same for all rows of a sketch)
    first_row = sketch_data.iloc[0]
    
    # Create ordered list of emotions with their positions
    emotion_sequence = []
    for idx, row in sketch_data.iterrows():
        emotion_sequence.append({
            'timestamp': float(row['timestamp']),
            'frame_num': int(row['frame_num_in_episode']),
            'character_id': int(row['character_id']),
            'emotion': row['emotion'],
            'confidence': float(row['confidence']),
            'det_score': float(row['det_score'])
        })
    
    # Store in dictionary
    emotions_by_sketch[sketch_id] = {
        'sketch_id': int(sketch_id),
        'sketch_name': first_row.get('name', ''),
        'season': int(first_row.get('season', 0)) if pd.notna(first_row.get('season', 0)) else 0,
        'episode': int(first_row.get('episode', 0)) if pd.notna(first_row.get('episode', 0)) else 0,
        'category': first_row.get('category', ''),
        'category2': first_row.get('category2', ''),
        'start_time': first_row.get('start', ''),
        'end_time': first_row.get('end', ''),
        'total_detections': len(emotion_sequence),
        'emotion_sequence': emotion_sequence,  # Ordered list of emotions
        'emotion_counts': sketch_data['emotion'].value_counts().to_dict(),  # Count of each emotion
        'unique_emotions': sorted(sketch_data['emotion'].unique().tolist())  # Unique emotions in this sketch
    }

print(f"Created data structure for {len(emotions_by_sketch)} sketches")
# Get first available sketch ID for example
first_sketch_id = min(emotions_by_sketch.keys()) if emotions_by_sketch else None
if first_sketch_id:
    print(f"\nExample structure for sketch {first_sketch_id}:")
    example_keys = ['sketch_id', 'sketch_name', 'season', 'episode', 'category', 'total_detections', 'unique_emotions']
    example_dict = {k: emotions_by_sketch[first_sketch_id][k] for k in example_keys}
    print(json.dumps(example_dict, indent=2))
    print(f"\nFirst 5 emotions in sequence:")
    for i, emo in enumerate(emotions_by_sketch[first_sketch_id]['emotion_sequence'][:5]):
        print(f"  {i+1}. Timestamp {emo['timestamp']:.2f}s, Character {emo['character_id']}: {emo['emotion']} (confidence: {emo['confidence']:.3f})")
else:
    print("No sketches found in data")

Created data structure for 86 sketches

Example structure for sketch 1:
{
  "sketch_id": 1,
  "sketch_name": "Both Ways",
  "season": 1,
  "episode": 1,
  "category": "office",
  "total_detections": 86,
  "unique_emotions": [
    "anger",
    "disgust",
    "fear",
    "happy",
    "neutral",
    "sad",
    "surprise"
  ]
}

First 5 emotions in sequence:
  1. Timestamp 8.26s, Character 0: happy (confidence: 0.618)
  2. Timestamp 9.01s, Character 0: happy (confidence: 0.696)
  3. Timestamp 9.01s, Character 1: anger (confidence: 0.466)
  4. Timestamp 9.76s, Character 1: anger (confidence: 0.687)
  5. Timestamp 10.51s, Character 0: anger (confidence: 0.750)


In [33]:
# Create a normalized position-based structure for easier charting
emotions_by_sketch_normalized = {}

for sketch_id, data in emotions_by_sketch.items():
    emotion_sequence_normalized = []
    total = data['total_detections']
    
    # Get min and max timestamps for normalization
    timestamps = [e['timestamp'] for e in data['emotion_sequence']]
    min_ts = min(timestamps) if timestamps else 0
    max_ts = max(timestamps) if timestamps else 1
    ts_range = max_ts - min_ts if max_ts > min_ts else 1
    
    for emo in data['emotion_sequence']:
        normalized_position = (emo['timestamp'] - min_ts) / ts_range  # 0.0 to 1.0
        emotion_sequence_normalized.append({
            'normalized_position': float(normalized_position),
            'timestamp': emo['timestamp'],
            'frame_num': emo['frame_num'],
            'character_id': emo['character_id'],
            'emotion': emo['emotion'],
            'confidence': emo['confidence']
        })
    
    emotions_by_sketch_normalized[sketch_id] = {
        **{k: v for k, v in data.items() if k != 'emotion_sequence'},
        'emotion_sequence_normalized': emotion_sequence_normalized
    }

print(f"Created normalized position structure for {len(emotions_by_sketch_normalized)} sketches")
# Get first available sketch ID for example
first_sketch_id = min(emotions_by_sketch_normalized.keys()) if emotions_by_sketch_normalized else None
if first_sketch_id:
    print(f"\nExample normalized sequence for sketch {first_sketch_id} (first 5):")
    for emo in emotions_by_sketch_normalized[first_sketch_id]['emotion_sequence_normalized'][:5]:
        print(f"  Position {emo['normalized_position']:.3f} (timestamp {emo['timestamp']:.2f}s): Character {emo['character_id']} - {emo['emotion']}")
else:
    print("No sketches found in normalized data")

Created normalized position structure for 86 sketches

Example normalized sequence for sketch 1 (first 5):
  Position 0.000 (timestamp 8.26s): Character 0 - happy
  Position 0.009 (timestamp 9.01s): Character 0 - happy
  Position 0.009 (timestamp 9.01s): Character 1 - anger
  Position 0.018 (timestamp 9.76s): Character 1 - anger
  Position 0.026 (timestamp 10.51s): Character 0 - anger


In [34]:
# Create a long-format DataFrame for easier plotting
plotting_data = []

for sketch_id, data in emotions_by_sketch.items():
    timestamps = [e['timestamp'] for e in data['emotion_sequence']]
    min_ts = min(timestamps) if timestamps else 0
    max_ts = max(timestamps) if timestamps else 1
    ts_range = max_ts - min_ts if max_ts > min_ts else 1
    
    for emo in data['emotion_sequence']:
        normalized_position = (emo['timestamp'] - min_ts) / ts_range
        plotting_data.append({
            'sketch_id': data['sketch_id'],
            'sketch_name': data['sketch_name'],
            'season': data['season'],
            'episode': data['episode'],
            'category': data['category'],
            'category2': data['category2'],
            'timestamp': emo['timestamp'],
            'frame_num': emo['frame_num'],
            'normalized_position': normalized_position,
            'character_id': emo['character_id'],
            'emotion': emo['emotion'],
            'confidence': emo['confidence']
        })

df_emotions_plotting = pd.DataFrame(plotting_data)

print(f"Plotting DataFrame shape: {df_emotions_plotting.shape}")
print(f"\nFirst 10 rows:")
df_emotions_plotting.head(10)

Plotting DataFrame shape: (20603, 12)

First 10 rows:


Unnamed: 0,sketch_id,sketch_name,season,episode,category,category2,timestamp,frame_num,normalized_position,character_id,emotion,confidence
0,1,Both Ways,1,1,office,,8.25825,198,0.0,0,happy,0.617962
1,1,Both Ways,1,1,office,,9.009,216,0.008772,0,happy,0.696091
2,1,Both Ways,1,1,office,,9.009,216,0.008772,1,anger,0.465669
3,1,Both Ways,1,1,office,,9.75975,234,0.017544,1,anger,0.686593
4,1,Both Ways,1,1,office,,10.5105,252,0.026316,0,anger,0.749711
5,1,Both Ways,1,1,office,,10.5105,252,0.026316,1,anger,0.501236
6,1,Both Ways,1,1,office,,11.26125,270,0.035088,0,sad,0.303702
7,1,Both Ways,1,1,office,,12.012,288,0.04386,0,sad,0.67354
8,1,Both Ways,1,1,office,,12.012,288,0.04386,1,anger,0.560665
9,1,Both Ways,1,1,office,,12.76275,306,0.052632,0,sad,0.7369


In [35]:
# Create a summary DataFrame for easier analysis
sketch_summaries = []

for sketch_id, data in emotions_by_sketch.items():
    # Create a simple list of emotions in order
    emotion_list = [e['emotion'] for e in data['emotion_sequence']]
    
    sketch_summaries.append({
        'sketch_id': data['sketch_id'],
        'sketch_name': data['sketch_name'],
        'season': data['season'],
        'episode': data['episode'],
        'category': data['category'],
        'category2': data['category2'],
        'total_detections': data['total_detections'],
        'num_unique_emotions': len(data['unique_emotions']),
        'emotion_sequence': emotion_list,  # List of emotions in order
        'emotion_string': ' -> '.join(emotion_list),  # String representation for quick viewing
        'emotion_counts': data['emotion_counts']
    })

df_sketch_emotions_summary = pd.DataFrame(sketch_summaries)

print(f"Summary DataFrame shape: {df_sketch_emotions_summary.shape}")
print(f"\nFirst few sketches:")
df_sketch_emotions_summary[['sketch_id', 'sketch_name', 'season', 'episode', 'total_detections', 'num_unique_emotions', 'emotion_string']].head(10)

Summary DataFrame shape: (86, 11)

First few sketches:


Unnamed: 0,sketch_id,sketch_name,season,episode,total_detections,num_unique_emotions,emotion_string
0,1,Both Ways,1,1,86,7,happy -> happy -> anger -> anger -> anger -> a...
1,2,Has This Ever Happened to You?,1,1,126,7,neutral -> disgust -> disgust -> surprise -> h...
2,3,Baby of the Year,1,1,291,7,sad -> neutral -> sad -> sad -> disgust -> ang...
3,4,Instagram,1,1,139,6,happy -> happy -> happy -> happy -> happy -> h...
4,5,Gift Receipt,1,1,517,7,sad -> sad -> neutral -> happy -> happy -> hap...
5,6,Biker Guy,1,2,152,7,happy -> happy -> sad -> sad -> neutral -> sad...
6,7,River Mountain High,1,2,174,6,happy -> happy -> happy -> happy -> happy -> h...
7,8,Wilson's Toupees,1,2,157,7,surprise -> happy -> happy -> happy -> happy -...
8,9,Pink Bag,1,2,323,7,happy -> happy -> happy -> surprise -> sad -> ...
9,10,River Mountain High,1,2,92,7,disgust -> surprise -> neutral -> neutral -> s...


In [36]:
# Display statistics about the data structure
print("=" * 60)
print("DATA STRUCTURE STATISTICS")
print("=" * 60)

print(f"\nTotal sketches: {len(emotions_by_sketch)}")
print(f"Total emotion detections: {len(df_emotions_plotting)}")
print(f"Total unique emotions: {len(all_emotions)}")
print(f"\nAll {len(all_emotions)} emotions:")
for i, emo in enumerate(all_emotions, 1):
    print(f"  {i:2d}. {emo}")

if 'season' in df_emotions_plotting.columns:
    print(f"\n\nSketches by season:")
    print(df_emotions_plotting.groupby('season')['sketch_id'].nunique())

    print(f"\n\nSketches by episode:")
    episode_counts = df_emotions_plotting.groupby(['season', 'episode'])['sketch_id'].nunique()
    print(episode_counts)

print(f"\n\nAverage detections per sketch:")
detections_per_sketch = df_emotions_plotting.groupby('sketch_id').size()
print(f"  Mean: {detections_per_sketch.mean():.1f}")
print(f"  Median: {detections_per_sketch.median():.1f}")
print(f"  Min: {detections_per_sketch.min()}")
print(f"  Max: {detections_per_sketch.max()}")

print(f"\n\nEmotion distribution across all sketches:")
emotion_counts = df_emotions_plotting['emotion'].value_counts()
print(emotion_counts)

DATA STRUCTURE STATISTICS

Total sketches: 86
Total emotion detections: 20603
Total unique emotions: 7

All 7 emotions:
   1. anger
   2. disgust
   3. fear
   4. happy
   5. neutral
   6. sad
   7. surprise


Sketches by season:
season
1    31
2    28
3    27
Name: sketch_id, dtype: int64


Sketches by episode:
season  episode
1       1          5
        2          6
        3          5
        4          4
        5          5
        6          6
2       1          5
        2          5
        3          5
        4          3
        5          5
        6          5
3       1          5
        2          5
        3          5
        4          5
        5          3
        6          4
Name: sketch_id, dtype: int64


Average detections per sketch:
  Mean: 239.6
  Median: 211.5
  Min: 13
  Max: 677


Emotion distribution across all sketches:
emotion
neutral     5688
sad         4748
happy       4726
surprise    1750
anger       1559
disgust     1492
fear         640
Name: c

In [None]:
# Save the data structures for later use
# Helper function to convert numpy types to native Python types for JSON serialization
def convert_to_json_serializable(obj):
    """Recursively convert numpy types to native Python types"""
    if isinstance(obj, dict):
        result = {}
        for k, v in obj.items():
            if isinstance(k, (np.integer, np.int64, np.int32)):
                key = int(k)
            else:
                key = k
            result[key] = convert_to_json_serializable(v)
        return result
    elif isinstance(obj, list):
        return [convert_to_json_serializable(item) for item in obj]
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif pd.isna(obj):
        return None
    else:
        return obj

# 1. Full structured data (JSON)
emotions_by_sketch_json = convert_to_json_serializable(emotions_by_sketch)
with open('facial_emotions_by_sketch_structured.json', 'w') as f:
    json.dump(emotions_by_sketch_json, f, indent=2)

# 2. Normalized version (JSON)
emotions_by_sketch_normalized_json = convert_to_json_serializable(emotions_by_sketch_normalized)
with open('facial_emotions_by_sketch_normalized.json', 'w') as f:
    json.dump(emotions_by_sketch_normalized_json, f, indent=2)

# 3. Plotting DataFrame (CSV)
df_emotions_plotting.to_csv('facial_emotions_plotting_data.csv', index=False)

# 4. Summary DataFrame (CSV)
df_sketch_emotions_export = df_sketch_emotions_summary.copy()
df_sketch_emotions_export['emotion_sequence'] = df_sketch_emotions_export['emotion_sequence'].apply(lambda x: '|'.join(x))
df_sketch_emotions_export.to_csv('facial_emotions_by_sketch_summary.csv', index=False)

# 5. Export heatmap data
heatmap_all_final.to_csv('facial_emotions_heatmap_all.csv', index=False)
heatmap_final.to_csv('facial_emotions_heatmap_dominant.csv', index=False)
heatmap_score_final.to_csv('facial_emotions_heatmap_scores.csv', index=False)

print("Data structures saved:")
print("  ✓ facial_emotions_by_sketch_structured.json: Full structured data")
print("  ✓ facial_emotions_by_sketch_normalized.json: Normalized position data")
print("  ✓ facial_emotions_plotting_data.csv: Long-format DataFrame for plotting")
print("  ✓ facial_emotions_by_sketch_summary.csv: Summary DataFrame")
print("  ✓ facial_emotions_heatmap_all.csv: All emotions heatmap data")
print("  ✓ facial_emotions_heatmap_dominant.csv: Dominant emotions heatmap data")
print("  ✓ facial_emotions_heatmap_scores.csv: Score-based heatmap data")

Data structures saved:
  ✓ facial_emotions_by_sketch_structured.json: Full structured data
  ✓ facial_emotions_by_sketch_normalized.json: Normalized position data
  ✓ facial_emotions_plotting_data.csv: Long-format DataFrame for plotting
  ✓ facial_emotions_by_sketch_summary.csv: Summary DataFrame
  ✓ facial_emotions_heatmap_all.csv: All emotions heatmap data
  ✓ facial_emotions_heatmap_dominant.csv: Dominant emotions heatmap data
  ✓ facial_emotions_heatmap_scores.csv: Score-based heatmap data
