In [23]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import os

### Data Processing

In [2]:
# Root folder holding the exported experiment CSVs. The CV_PROJECT_DIR
# environment variable overrides the default, so the notebook can be re-run
# from another machine or checkout without editing every path.
PROJECT_DIR = Path(
    os.environ.get(
        "CV_PROJECT_DIR",
        "/Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project",
    )
)

# df_val: one table of run results (later joined on the hyper-parameter columns)
# df_bal: per-checkpoint, per-class balance statistics
# df_epo: source of the "best_epoch" column
df_val = pd.read_csv(PROJECT_DIR / "result.csv")
df_bal = pd.read_csv(PROJECT_DIR / "checkpoint_class_balance.csv")
df_epo = pd.read_csv(PROJECT_DIR / "wandb_results.csv")

In [4]:
# Per-class columns that are discarded before rows are collapsed per checkpoint.
cols_to_drop = ["class", "true_count", "true_pct", "pred_count", "pred_pct"]


def _sum_of_squares(values):
    """Return the sum of the squared entries of ``values``."""
    return (values**2).sum()


# One row per checkpoint: "%Delta" becomes the sum of its squared values within
# the checkpoint, "%Best_val_accuracy" becomes the mean within the checkpoint.
_rows_by_checkpoint = df_bal.drop(columns=cols_to_drop).groupby("checkpoint")
df_bal_sum = _rows_by_checkpoint.agg(
    {"%Delta": _sum_of_squares, "%Best_val_accuracy": "mean"}
).reset_index()

def parse_checkpoint(ckpt: str) -> pd.Series:
    """Split a checkpoint file name into its hyper-parameter fields.

    Any directory prefix and a trailing ``.pt`` are discarded, then the name
    is cut on underscores:

      - last token              -> ``num_unfrozen_layers`` (parsed as int)
      - second-to-last token    -> ``frame_sampling``
      - tokens after the first and before the frame-sampling token, re-joined
        with underscores        -> ``balance_mode``

    Example: ``"timesformer_2min_flip_center_2.pt"`` yields
    ``num_unfrozen_layers=2``, ``frame_sampling="center"``,
    ``balance_mode="2min_flip"``.
    """
    # Only the final path component carries the encoded settings.
    filename = Path(ckpt).name
    stem = filename[:-3] if filename.endswith(".pt") else filename

    tokens = stem.split("_")
    fields = {
        "num_unfrozen_layers": int(tokens[-1]),
        "frame_sampling": tokens[-2],
        # balance modes may themselves contain underscores (e.g. "2min_flip")
        "balance_mode": "_".join(tokens[1:-2]),
    }
    return pd.Series(fields)


# Expand every checkpoint name into its three hyper-parameter columns
# (one output row per checkpoint, same index as df_bal_sum) ...
parsed = df_bal_sum["checkpoint"].apply(parse_checkpoint)
# ... and append those columns to the right of the per-checkpoint summary.
df_bal_sum = pd.concat(objs=[df_bal_sum, parsed], axis="columns")

In [5]:
def _to_numeric_if_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert *all* columns that can be numeric into numeric; the rest are left
# untouched. The explicit try/except replaces ``errors="ignore"``, which pandas
# deprecates and which produced a warning on every run of this cell.
df_val = df_val.apply(_to_numeric_if_possible)

# Rescale accuracy from a 0-1 fraction to percent. The max() guard makes the
# cell idempotent: once the values are percentages, re-running the cell no
# longer multiplies them by 100 again.
if "%Val_accuracy" in df_val.columns and df_val["%Val_accuracy"].max() <= 1.0:
    df_val["%Val_accuracy"] = df_val["%Val_accuracy"] * 100

  df_val = df_val.apply(pd.to_numeric, errors="ignore")


In [8]:
# Hyper-parameter columns used as the join key for both merges below.
keys = ["num_unfrozen_layers", "frame_sampling", "balance_mode"]

# Left join: every row of df_val is kept; columns of df_bal_sum whose names
# collide with df_val's receive the "_bal" suffix.
df_val_bal = pd.merge(
    df_val,
    df_bal_sum,
    how="left",
    on=keys,
    suffixes=("", "_bal"),
)

# Bring in only "best_epoch" from df_epo, matched on the same key columns.
df_all = pd.merge(
    df_val_bal,
    df_epo[["best_epoch", *keys]],
    how="left",
    on=keys,
)

In [9]:
# Preview the first rows of the merged table (df_val joined with df_bal_sum and best_epoch).
df_all.head()

Unnamed: 0,Name,num_unfrozen_layers,frame_sampling,balance_mode,State,Notes,User,Tags,Created,Runtime,...,num_workers,epoch,train/acc,train/loss,%Val_accuracy,val_loss,checkpoint,%Delta,%Best_val_accuracy,best_epoch
0,eager-frog-48,6,even,max_full,finished,-,,,2025-12-11T05:23:51.000Z,533,...,4,8,1.0,0.000137,96.969697,0.10487,timesformer_max_full_even_6.pt,11.403009,97.350993,5
1,confused-lion-22,2,even,min,finished,-,,,2025-12-11T02:44:54.000Z,229,...,4,8,1.0,0.001337,92.929293,0.165791,timesformer_min_even_2.pt,49.997807,94.701987,6
2,lemon-sun-23,4,even,min,finished,-,,,2025-12-11T02:48:44.000Z,233,...,4,8,1.0,0.002324,89.89899,0.204896,timesformer_min_even_4.pt,15.788781,94.039735,7
3,fresh-cosmos-29,4,first,2min_flip,finished,-,,,2025-12-11T03:12:40.000Z,349,...,4,8,1.0,0.000496,89.89899,0.208327,timesformer_2min_flip_first_4.pt,106.135696,92.715232,6
4,driven-waterfall-11,4,even,none,finished,-,,,2025-12-11T01:52:55.000Z,387,...,4,8,1.0,0.000269,92.929293,0.218351,timesformer_none_even_4.pt,49.997807,94.701987,4


## Summary

In [11]:
def summarize(df, group_cols):
    """
    Average the tracked metrics within each group.

    Parameters
    ----------
    df : DataFrame
        Table containing the columns "val_loss", "%Val_accuracy" and "%Delta"
        in addition to the grouping columns (e.g. df_all).
    group_cols : list of str
        Column names to group by, for instance ["num_unfrozen_layers"] or
        ["frame_sampling", "balance_mode"].

    Returns
    -------
    DataFrame
        One row per group. The grouping columns come first, followed by
        "val_loss_mean", "%Val_accuracy_mean" and "%Delta_mean".
    """
    tracked_metrics = ("val_loss", "%Val_accuracy", "%Delta")
    stats_per_metric = {metric: ["mean"] for metric in tracked_metrics}

    grouped_stats = df.groupby(group_cols).agg(stats_per_metric)

    # Aggregating with lists yields (metric, statistic) column tuples;
    # collapse each one into a single "metric_statistic" label.
    flat_labels = []
    for label in grouped_stats.columns:
        if isinstance(label, tuple):
            flat_labels.append("_".join(label).strip())
        else:
            flat_labels.append(label)
    grouped_stats.columns = flat_labels

    return grouped_stats.reset_index()


In [12]:
# Mean val_loss, %Val_accuracy and %Delta for each num_unfrozen_layers value.
summarize(df_all, ["num_unfrozen_layers"])

Unnamed: 0,num_unfrozen_layers,val_loss_mean,%Val_accuracy_mean,%Delta_mean
0,2,0.411073,86.30752,192.486684
1,4,0.400091,86.55303,142.537608
2,6,0.564702,84.016637,115.784395


In [14]:
# Mean val_loss, %Val_accuracy and %Delta for each frame_sampling value.
summarize(df_all, ["frame_sampling"])

Unnamed: 0,frame_sampling,val_loss_mean,%Val_accuracy_mean,%Delta_mean
0,center,0.406195,85.942761,250.354517
1,even,0.266423,91.630592,48.306152
2,first,0.428262,85.26936,164.612663
3,random,0.74287,79.176379,158.292652


In [15]:
# Mean val_loss, %Val_accuracy and %Delta for each balance_mode value.
summarize(df_all, ["balance_mode"])

Unnamed: 0,balance_mode,val_loss_mean,%Val_accuracy_mean,%Delta_mean
0,2min_flip,0.478758,85.353535,137.201585
1,max_full,0.495536,85.392385,158.225179
2,min,0.386762,85.858586,170.314168
3,none,0.469465,85.858586,140.47003


In [None]:
# No shell command is needed here: the previous `!mkdir` had no target
# directory and would fail when the notebook is run top to bottom. The image
# folder is created by os.makedirs(IMG_DIR, exist_ok=True) inside plot_box_plot.

In [25]:
# Folder that receives the saved figures: "<project root>/images". The project
# root defaults to the original absolute path and can be overridden through the
# CV_PROJECT_DIR environment variable. IMG_DIR stays a plain string.
IMG_DIR = os.path.join(
    os.environ.get(
        "CV_PROJECT_DIR",
        "/Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project",
    ),
    "images",
)

def prettify(name: str):
    """Turn an underscore-separated name into a space-separated, title-cased label."""
    words = name.split("_")
    return " ".join(words).title()

# Display order of the categories of each grouping column (read by plot_box_plot).
ORDER_MAP = dict(
    num_unfrozen_layers=[2, 4, 6],
    frame_sampling=["center", "even", "first", "random"],
    balance_mode=["2min_flip", "max_full", "min", "none"],
)

def plot_box_plot(df, col1, col2):
    """
    Draw a boxplot of ``col2`` grouped by ``col1`` and save it as a PNG.

      - x-axis: groups in col1, in ORDER_MAP order when col1 has an entry
        there; values not listed in ORDER_MAP follow in order of appearance
      - y-axis: values in col2 (NaNs dropped per group)
      - mean shown as royal blue diamond
      - no grid

    The figure is written to ``IMG_DIR/boxplot_{col1}_{col2}.png`` (the folder
    is created if missing) and then closed. Nothing is returned.
    """
    # Build ONE group order and derive both the data and the tick labels from
    # it. Previously the data followed df[col1].unique() while the labels
    # followed ORDER_MAP, so boxes could sit above the wrong label.
    present = list(df[col1].dropna().unique())
    preferred = ORDER_MAP.get(col1, [])
    order = [g for g in preferred if g in present]
    order += [g for g in present if g not in preferred]

    grouped = [df.loc[df[col1] == g, col2].dropna() for g in order]

    fig, ax = plt.subplots(figsize=(8, 5))

    bp = ax.boxplot(
        grouped,
        patch_artist=True,
        showmeans=True,
        meanprops=dict(
            marker='D',         # diamond
            markerfacecolor='royalblue',
            markeredgecolor='black',
            markersize=8
        ),
        medianprops=dict(color="black"),
    )
    for box in bp['boxes']:
        box.set(facecolor='gray', alpha=0.8)

    # Tick labels are set explicitly instead of through boxplot's ``labels``
    # keyword, which newer Matplotlib releases deprecate (renamed
    # ``tick_labels``). Boxes are drawn at positions 1..N by default.
    ax.set_xticks(range(1, len(order) + 1))
    ax.set_xticklabels([str(g) for g in order])

    xlabel = prettify(col1)
    ylabel = prettify(col2)
    title  = f"Boxplot of {ylabel} by {xlabel}"

    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(False)

    # --- SAVE IMAGE ---
    os.makedirs(IMG_DIR, exist_ok=True)
    filename = f"boxplot_{col1}_{col2}.png"
    filepath = os.path.join(IMG_DIR, filename)

    fig.tight_layout()
    fig.savefig(filepath, dpi=300, bbox_inches="tight")
    # Close explicitly so figures do not accumulate when called in a loop.
    plt.close(fig)

    print(f"Saved boxplot to: {filepath}")


In [26]:
# Save one boxplot per (grouping column, metric) pair.
metric_columns = ["%Val_accuracy", "%Delta", "%Best_val_accuracy", "val_loss", "best_epoch"]

for col1 in keys:
    for col2 in metric_columns:
        plot_box_plot(df_all, col1, col2)

  bp = plt.boxplot(
  bp = plt.boxplot(


Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_num_unfrozen_layers_%Val_accuracy.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_num_unfrozen_layers_%Delta.png


  bp = plt.boxplot(
  bp = plt.boxplot(


Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_num_unfrozen_layers_%Best_val_accuracy.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_num_unfrozen_layers_val_loss.png


  bp = plt.boxplot(
  bp = plt.boxplot(


Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_num_unfrozen_layers_best_epoch.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_frame_sampling_%Val_accuracy.png


  bp = plt.boxplot(
  bp = plt.boxplot(


Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_frame_sampling_%Delta.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_frame_sampling_%Best_val_accuracy.png


  bp = plt.boxplot(
  bp = plt.boxplot(


Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_frame_sampling_val_loss.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_frame_sampling_best_epoch.png


  bp = plt.boxplot(
  bp = plt.boxplot(


Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_balance_mode_%Val_accuracy.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_balance_mode_%Delta.png


  bp = plt.boxplot(
  bp = plt.boxplot(


Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_balance_mode_%Best_val_accuracy.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_balance_mode_val_loss.png
Saved boxplot to: /Data/masayo.tomita/CV/CSC_51073_EP-Computer-Vision-Final-Project/images/boxplot_balance_mode_best_epoch.png


  bp = plt.boxplot(
