In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
sys.path.append("..")
    
from utils import visual_tools as visualTools
from utils import gt_and_modeling_dfs as prepare_df

import pandas as pd

# Preprocessing Episodes

> Train-test split

> **Choice of split:**\
> There are 4 main characters that we need to identify, so the split time for a given episode is chosen so that every character has a roughly equal number of appearances in both splits.

In [2]:
# Frames per second to save; set according to processing requirements.
FPS_TO_SAVE = 25

# Per-episode configuration, keyed by episode name. Each entry holds:
#   path                  - location of the raw video file
#   train_split_timestamp - timestamp at which the episode is cut into train / test
#   ground_truth_path     - location of the episode's ground-truth spreadsheet
EPISODES = {
    "Muppets-02-01-01": dict(
        path="../data/raw/Muppets-02-01-01.avi",
        train_split_timestamp="19:30",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_01.xlsx",
    ),
    "Muppets-02-04-04": dict(
        path="../data/raw/Muppets-02-04-04.avi",
        train_split_timestamp="19:52",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_04.xlsx",
    ),
    "Muppets-03-04-03": dict(
        path="../data/raw/Muppets-03-04-03.avi",
        train_split_timestamp="19:54",
        ground_truth_path="../data/muppets-gt-2025wt/Ground_Truth_New_03.xlsx",
    ),
}

## Extract frames from all episodes

In [None]:
# Run frame extraction for every configured episode and report how many
# frames each call saved. `frames_dir` and `info` are returned by the helper
# but are not used further in this cell.
for episode_name, ep in EPISODES.items():
    print(f"\n=== Processing {episode_name} ===")

    video_path = ep["path"]
    frames_dir, total_saved, info = visualTools.extract_frames(
        video_file=video_path,
        episode_name=episode_name,
        fps_to_save=FPS_TO_SAVE,
    )

    print(f"[Frames] {episode_name}: saved={total_saved}")


=== Processing Muppets-02-01-01 ===
[frames ok] saved=38682 frames in ../data/processed/video/Muppets-02-01-01-frames
[Frames] Muppets-02-01-01: saved=38682

=== Processing Muppets-02-04-04 ===
[frames ok] saved=38707 frames in ../data/processed/video/Muppets-02-04-04-frames
[Frames] Muppets-02-04-04: saved=38707

=== Processing Muppets-03-04-03 ===
[frames ok] saved=38499 frames in ../data/processed/video/Muppets-03-04-03-frames
[Frames] Muppets-03-04-03: saved=38499


## Combine GT for all episodes

In [4]:
# Build one ground-truth frame covering every episode in EPISODES.
all_ep_gt_df = prepare_df.all_ep_gt(EPISODES)

# Quick sanity check: first and last rows, then the row count per "Video" id.
display(all_ep_gt_df.head(), all_ep_gt_df.tail())
print(all_ep_gt_df["Video"].value_counts())

Consolidated GT: 115885 rows, 10 columns


Unnamed: 0,Video,Frame_number,Timestamp,Kermit,Pigs,Miss Piggy,Cook,StatlerWaldorf,Rowlf the Dog,Fozzie Bear
0,211,0,00:00.00,0,0,0,0,0,0,0
1,211,1,00:00.04,0,0,0,0,0,0,0
2,211,2,00:00.08,0,0,0,0,0,0,0
3,211,3,00:00.12,0,0,0,0,0,0,0
4,211,4,00:00.16,0,0,0,0,0,0,0


Unnamed: 0,Video,Frame_number,Timestamp,Kermit,Pigs,Miss Piggy,Cook,StatlerWaldorf,Rowlf the Dog,Fozzie Bear
115880,343,38493,25:39.72,0,0,0,0,0,0,0
115881,343,38494,25:39.76,0,0,0,0,0,0,0
115882,343,38495,25:39.80,0,0,0,0,0,0,0
115883,343,38496,25:39.84,0,0,0,0,0,0,0
115884,343,38497,25:39.88,0,0,0,0,0,0,0


Video
244    38706
211    38681
343    38498
Name: count, dtype: int64


# Build Feature Space 
Dataframe with Visual Features and SIM1 characters 

In [7]:
# Characters whose presence is used as the label columns for SIM1.
SIM1_CHARACTERS = ["Kermit", "StatlerWaldorf", "Fozzie Bear"]

# Names of the visual feature groups passed to the feature-space builder.
SIM1_VISUAL_FEATURES = ["dominant_color", "green_mask", "edge_magnitude"]

# Bookkeeping columns (identifiers, not model inputs).
META_FEATURES = ["Video", "frame", "Frame_number", "Timestamp"]

# Episode name -> numeric id used in the ground truth "Video" column.
EPISODE_NAME_TO_VIDEO_ID = {
    "Muppets-02-01-01": 211,
    "Muppets-02-04-04": 244,
    "Muppets-03-04-03": 343,
}

In [None]:
# Build the SIM1 visual feature space from the consolidated ground truth.
# The helper returns the feature frame plus a collection of skipped frames.
feature_space_visual_sim1, skipped = prepare_df.build_feature_space_df(
    feature_extractor_fn=visualTools.extract_visual_features_for_frame,
    feature_list=SIM1_VISUAL_FEATURES,
    gt_df=all_ep_gt_df,
    characters=SIM1_CHARACTERS,
    video_name_to_gt=EPISODE_NAME_TO_VIDEO_ID
)

# Only show the first few skipped entries to keep the output short.
print("Skipped frames:", skipped[:5])

# The feature space spans every episode, so the summary must not use
# `episode_name`: that is a loop variable left over from the frame-extraction
# cell, which mislabels this output with the last episode's name and raises
# NameError when that cell has not been run in the current kernel.
print(
    "[Features] all episodes: "
    f"visual feature space shape={feature_space_visual_sim1.shape}"
)

[Feature space] SIM1 saved (115885, 12) -> ../data/processed/feature_spaces/visual_sim1_feature_space.csv
Skipped frames: []
[Features] Muppets-03-04-03: visual feature space shape=(115885, 12)


In [46]:
# Preview the first rows of the assembled SIM1 visual feature space.
feature_space_visual_sim1.head()

Unnamed: 0,Video,Frame_number,Timestamp,frame,dom_H,dom_S,dom_V,green_frac,edge_mean,Kermit,StatlerWaldorf,Fozzie Bear
0,211,0,00:00.00,frame0.jpg,0,0,0,0.0,0.0,0,0,0
1,211,1,00:00.04,frame1.jpg,0,0,0,0.0,0.0,0,0,0
2,211,2,00:00.08,frame2.jpg,0,0,0,0.0,0.0,0,0,0
3,211,3,00:00.12,frame3.jpg,0,0,0,0.0,0.0,0,0,0
4,211,4,00:00.16,frame4.jpg,0,0,0,0.0,0.0,0,0,0


# Split Feature Space for Modeling
## train-test splits

In [8]:
# Reload the persisted feature space so the split can run without redoing the
# expensive feature extraction. The path must match the file the feature-space
# build step reports writing ("visual_sim1_feature_space.csv"); the previous
# "visual_sim1.csv" is not produced by this notebook, so a clean run would
# read a stale or missing file.
FEATURE_SPACE_CSV = "../data/processed/feature_spaces/visual_sim1_feature_space.csv"
feature_df = pd.read_csv(FEATURE_SPACE_CSV)

# Split every episode at its configured `train_split_timestamp`.
train_df, test_df = prepare_df.split_feature_space_df(
    feature_df=feature_df,
    EPISODES=EPISODES,
    EPISODE_NAME_TO_VIDEO_ID=EPISODE_NAME_TO_VIDEO_ID
)

# The split must partition the feature space: no rows lost, none duplicated.
assert len(train_df) + len(test_df) == len(feature_df), (
    "train/test split does not cover the feature space exactly"
)

[split] Muppets-02-01-01 | Video=211 | train=29251, test=9430
[split] Muppets-02-04-04 | Video=244 | train=29801, test=8905
[split] Muppets-03-04-03 | Video=343 | train=29851, test=8647
[FINAL SPLIT] train=(88903, 12), test=(26982, 12)


### Making dataframes for model processing

In [None]:
# `LABEL_COLS` was never defined in this notebook, so this cell raised
# NameError on a fresh kernel. The labels are the SIM1 character columns,
# which is what the targets below already select.
LABEL_COLS = list(SIM1_CHARACTERS)

# Columns that must not be fed to the model as inputs: labels plus metadata.
NON_FEATURE_COLS = LABEL_COLS + META_FEATURES

# Inputs: everything that is neither a label nor a metadata column.
# Targets: one column per SIM1 character.
X_train = train_df.drop(columns=NON_FEATURE_COLS)
y_train = train_df[LABEL_COLS]

X_test  = test_df.drop(columns=NON_FEATURE_COLS)
y_test  = test_df[LABEL_COLS]

# Model training