In [13]:
import cv2 as cv
from scipy.stats import skew
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern, hog
from skimage.measure import shannon_entropy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from joblib import Parallel, delayed
from cuml.svm import SVC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import time
import os
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Default Configuration
# Shared defaults for feature extraction. VideoExtractorFeature falls back to
# this dict when it is constructed without a config, and run_experiment starts
# from a copy of it before applying per-experiment overrides.
CONFIG = {
    'sampling': {
        # Frame selection strategy. _process_video treats any value other than
        # 'uniform' or 'random' as dense sampling.
        'strategy': 'uniform',  # Options: 'uniform', 'dense', 'random'
        'n_frames': 16,         # For uniform/random
        'frame_skip': 5,        # For dense
    },
    # Target size handed to cv.resize in _preprocess_frame.
    'resize_dim': (224, 224),
    'preprocess': {
        # When truthy, _preprocess_frame applies a 5x5 Gaussian blur before resizing.
        'denoise': True,
        # NOTE(review): no code visible in this notebook reads this key - confirm
        # whether pixel scaling is still intended.
        'normalize_pixel': True, # Pixel value scaling 0-1
    },
    # 'standard' selects StandardScaler; any other value selects MinMaxScaler.
    'normalization': 'minmax', # Options: 'minmax', 'standard'
    # Forwarded to joblib.Parallel(n_jobs=...) in process_dataset.
    'n_jobs': -1,

    # Radius and number of sampling points passed to local_binary_pattern.
    'lbp_radius': 3,
    'lbp_points': 8,

    # Arguments for cv.getGaborKernel. The key is spelled 'lamda' and is read
    # under that exact name in VideoExtractorFeature.__init__.
    'gabor': {
        'ksize': 31, # Increased for better texture capture
        'sigma': 4.0,
        'theta': 0,
        'lamda': 10.0,
        'gamma': 0.5,
        'phi': 0
    },

    'contour': {
        # Number of largest contours described per frame (missing ones are zero-filled).
        'count' : 3,
    },

    # NOTE(review): not read by any code visible in this notebook; the optical
    # flow features are computed with cv.calcOpticalFlowFarneback instead.
    'lucas_kanade': {
        'max_corners': 20,
        'quality_level': 0.01,
        'min_distance': 10,
        'block_size': 7
    },
}


In [15]:
class TemporalFeatureExtractor:
    """Rolling-window statistics over a stream of per-frame feature dicts.

    The extractor keeps the last ``window_len`` feature dicts and, on every
    ``update``, summarises each numeric feature inside that window.

    Args:
        window_len: Maximum number of frames kept in the window.
        trend_feature: Name of the feature whose linear trend over the window
            is reported as ``motion_slope``. Defaults to
            ``'motion_avg_intensity'`` (the previously hard-coded name).
    """

    def __init__(self, window_len, trend_feature='motion_avg_intensity'):
        self.window_len = window_len
        self.trend_feature = trend_feature
        self.feature_buffer = []

    def update(self, frame_features: dict):
        """Push one frame's features and return the refreshed window statistics."""
        self.feature_buffer.append(frame_features)
        # Drop the oldest entry once the window is over capacity.
        if len(self.feature_buffer) > self.window_len:
            self.feature_buffer.pop(0)
        return self._get_temporal_features()

    def _get_temporal_features(self):
        """Summarise the current window.

        Returns:
            dict with, for every numeric feature ``f`` in the window,
            ``f_temp_mean``, ``f_temp_std`` (population std, ddof=0) and
            ``f_delta_mean`` (mean frame-to-frame difference, 0.0 when the
            window holds a single frame), plus ``motion_slope``. An empty dict
            is returned when the window is empty.
        """
        features = {}
        if not self.feature_buffer:
            return features

        df = pd.DataFrame(self.feature_buffer)

        # Only compute temporal stats for numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        has_history = len(self.feature_buffer) > 1

        for col in numeric_cols:
            features[f"{col}_temp_mean"] = df[col].mean()
            features[f"{col}_temp_std"] = df[col].std(ddof=0)
            # diff() needs at least two frames to yield a non-NaN value.
            features[f"{col}_delta_mean"] = df[col].diff().mean() if has_history else 0.0

        # Linear trend of the chosen feature; needs at least three frames.
        slope = 0.0
        if len(self.feature_buffer) > 2 and self.trend_feature in df.columns:
            try:
                slope = np.polyfit(np.arange(len(df)), df[self.trend_feature], 1)[0]
            except (TypeError, ValueError, np.linalg.LinAlgError):
                # Non-numeric or degenerate data: report "no trend" rather than
                # failing the whole video, but let unrelated errors propagate.
                slope = 0.0
        features["motion_slope"] = slope

        return features


In [16]:
class VideoExtractorFeature:
    def __init__(self, config=None):
        """Store the configuration and build the scaler and the Gabor kernel.

        Args:
            config: Configuration dict; the module-level CONFIG is used when
                this is None or empty.
        """
        self.config = config or CONFIG

        # 'standard' -> StandardScaler, anything else -> MinMaxScaler.
        wants_standard = self.config.get('normalization') == 'standard'
        self.scaler = StandardScaler() if wants_standard else MinMaxScaler()

        # OpenCV expects an integer kernel size and float parameters, so cast
        # every value explicitly.
        gabor_cfg = self.config['gabor']
        kernel_side = int(gabor_cfg['ksize'])
        sigma, theta, lambd, gamma, psi = (
            float(gabor_cfg[key]) for key in ('sigma', 'theta', 'lamda', 'gamma', 'phi')
        )
        self.gabor_kernel = cv.getGaborKernel(
            (kernel_side, kernel_side),
            sigma,
            theta,
            lambd,
            gamma,
            psi,
            ktype=cv.CV_32F
        )

    def _preprocess_frame(self, frame):
        """Apply denoising, resizing and normalization."""
        if frame is None:
            return None

        # Denoise
        if self.config['preprocess'].get('denoise'):
            frame = cv.GaussianBlur(frame, (5, 5), 0)

        # Resize
        frame = cv.resize(frame, self.config['resize_dim'], interpolation=cv.INTER_AREA)

        return frame

    def _get_frame_color_features(self, frame):
        hsv_frame = cv.cvtColor(frame, cv.COLOR_BGR2HSV)
        rgb_frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)

        features = {}
        # RGB Histograms
        for i, colour in enumerate(['red', 'blue', 'green']):
            channel = rgb_frame[:, :, i]
            hist, _ = np.histogram(channel.ravel(), bins=10, range=(0, 256))
            hist = hist.astype('float')
            hist /= (hist.sum() + 1e-7)
            for j in range(len(hist)):
                features[f'color_{colour}_{j}'] = hist[j]

        # HSV Stats
        for i, column_name in enumerate(['h', 's', 'v']):
            channel = hsv_frame[:, :, i]
            mean = np.mean(channel)
            std = np.std(channel)

            features[f'moments_mean_{column_name}'] = mean
            features[f'moments_std_{column_name}'] = std

            if std > 1e-6:
                skew_val = skew(channel.flatten())
                features[f'moments_skew_{column_name}'] = 0 if np.isnan(skew_val) else skew_val
            else:
                features[f'moments_skew_{column_name}'] = 0

        avg_rgb  = np.mean(rgb_frame, axis=(0, 1))
        features['avg_color_r'] = avg_rgb[0]
        features['avg_color_g'] = avg_rgb[1]
        features['avg_color_b'] = avg_rgb[2]
        return features

    def _get_frame_glcm_features(self, grey_frame):
        """Summarise grey-level co-occurrence texture for one greyscale frame.

        A symmetric, normalised GLCM is built for distances 1 and 3 at 0 and
        90 degrees. Each Haralick property is then reduced to its mean and
        standard deviation across those four distance/angle combinations.
        ``glcm_entropy`` is the Shannon entropy of the frame itself.
        """
        # graycomatrix needs an integer image.
        quantised = grey_frame.astype(np.uint8)
        cooccurrence = graycomatrix(
            quantised,
            distances=[1, 3],
            angles=[0, np.pi/2],
            levels=256,
            symmetric=True,
            normed=True,
        )

        features = {}
        for prop in ('contrast', 'dissimilarity', 'homogeneity', 'correlation', 'energy'):
            per_offset = graycoprops(cooccurrence, prop).ravel()
            features[f'glcm_{prop}_mean'] = np.mean(per_offset)
            features[f'glcm_{prop}_std'] = np.std(per_offset)

        features['glcm_entropy'] = shannon_entropy(grey_frame)
        return features

    def _lbp_features(self, grey_frame):
        """Return the normalised histogram of uniform LBP codes as ``lbp_{i}``.

        With ``P = config['lbp_points']`` the histogram has ``P + 2`` bins over
        the range [0, P + 2).
        """
        n_points = self.config['lbp_points']
        codes = local_binary_pattern(grey_frame, n_points, self.config['lbp_radius'], method='uniform')

        n_bins = n_points + 2
        counts, _ = np.histogram(codes.ravel(), bins=n_bins, range=(0, n_bins))
        # The epsilon keeps the division defined for an empty image.
        shares = counts.astype('float') / (counts.sum() + 1e-7)

        return {f'lbp_{i}': share for i, share in enumerate(shares)}

    def _get_gabor_features(self, grey_frame):
        """Filter the frame with the precomputed Gabor kernel and return the
        mean and standard deviation of the float32 response."""
        response = cv.filter2D(grey_frame, cv.CV_32F, self.gabor_kernel)
        return {
            'gabor_mean': np.mean(response),
            'gabor_std': np.std(response),
        }

    def _get_canny_features(self, grey_frame):
        sigma = 0.33
        v = np.median(grey_frame)
        lower = int(max(0, (1.0 - sigma) * v))
        upper = int(min(255, (1.0 + sigma) * v))
        edges = cv.Canny(grey_frame, lower, upper)

        # Edge density
        edge_density = np.sum(edges > 0) / (edges.shape[0] * edges.shape[1])
        features = {'canny_edge_density': edge_density}
        return features

    def _get_contour_features(self, grey_frame):
        """Describe the largest external contours of the thresholded frame.

        The frame is binarised at 127 and its external contours are ranked by
        area. For each of the first ``config['contour']['count']`` ranks the
        area and circularity (4*pi*area / perimeter^2) are stored; ranks with
        no contour get zeros so the feature vector length is fixed.
        """
        _, binary = cv.threshold(grey_frame, 127, 255, cv.THRESH_BINARY)
        contours, _ = cv.findContours(binary, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
        largest_first = sorted(contours, key=cv.contourArea, reverse=True)

        features = {}
        for rank in range(self.config['contour']['count']):
            area, circularity = 0, 0
            if rank < len(largest_first):
                outline = largest_first[rank]
                area = cv.contourArea(outline)
                # A zero perimeter is replaced by a tiny value to avoid dividing by zero.
                perimeter = cv.arcLength(outline, True) or 1e-7
                circularity = 4 * np.pi * (area / (perimeter * perimeter))

            features[f'contour_{rank}_area'] = area
            features[f'contour_{rank}_circularity'] = circularity
        return features

    def _get_hog_features(self, grey_frame):
        """Return mean, std and max of the HOG descriptor of a 64x64 thumbnail.

        The frame is shrunk first so the descriptor stays small; only summary
        statistics of the descriptor are kept.
        """
        thumbnail = cv.resize(grey_frame, (64, 64))
        descriptor = hog(
            thumbnail,
            orientations=9,
            pixels_per_cell=(16, 16),
            cells_per_block=(2, 2),
            block_norm='L2-Hys',
        )
        return {
            'hog_mean': np.mean(descriptor),
            'hog_std': np.std(descriptor),
            'hog_max': np.max(descriptor),
        }

    def _get_optical_flow_features(self, prev_grey, curr_grey):
        """Summarise Farneback dense optical flow between two greyscale frames.

        Returns the mean and std of the flow magnitude, the mean flow angle,
        and a normalised 5-bin histogram of the magnitude over [0, 10].
        """
        flow = cv.calcOpticalFlowFarneback(prev_grey, curr_grey, None, 0.5, 3, 15, 3, 5, 1.2, 0)
        magnitude, angle = cv.cartToPolar(flow[..., 0], flow[..., 1])

        features = {
            'flow_mag_mean': np.mean(magnitude),
            'flow_mag_std': np.std(magnitude),
            'flow_ang_mean': np.mean(angle),
        }

        counts, _ = np.histogram(magnitude, bins=5, range=(0, 10))
        # The epsilon keeps the division defined when no magnitude falls in range.
        shares = counts.astype(float) / (counts.sum() + 1e-7)
        for bin_index, share in enumerate(shares):
            features[f'flow_mag_hist_{bin_index}'] = share

        return features

    def _process_video(self, row: dict):
        video_id = row['index']
        video_path = row.get('clip_path', '')
        if not video_path:
            return []

        cap = cv.VideoCapture(video_path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret: break
            frames.append(frame)
        cap.release()

        if not frames:
            return []

        total_frames = len(frames)
        selected_indices = []
        strategy = self.config['sampling']['strategy']

        if strategy == 'uniform':
            n_frames = self.config['sampling']['n_frames']
            if total_frames <= n_frames:
                selected_indices = list(range(total_frames))
            else:
                selected_indices = np.linspace(0, total_frames - 1, n_frames).astype(int)
        elif strategy == 'random':
            n_frames = self.config['sampling']['n_frames']
            if total_frames <= n_frames:
                selected_indices = list(range(total_frames))
            else:
                indices = np.random.choice(total_frames, n_frames, replace=False)
                selected_indices = np.sort(indices)
        else: # dense
            skip = self.config['sampling']['frame_skip']
            selected_indices = range(0, total_frames, skip)

        video_features = []
        prev_grey_frame = None
        # Should create new instance per video to avoid state bleed
        temporal_extractor = TemporalFeatureExtractor(5)

        for idx in selected_indices:
            frame = frames[idx]
            frame = self._preprocess_frame(frame)
            grey_frame = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)

            frame_feat = {'index': video_id}

            # Spatial Features
            frame_feat.update(self._get_frame_color_features(frame))
            frame_feat.update(self._get_frame_glcm_features(grey_frame))
            frame_feat.update(self._lbp_features(grey_frame))
            frame_feat.update(self._get_gabor_features(grey_frame))
            frame_feat.update(self._get_canny_features(grey_frame))
            frame_feat.update(self._get_contour_features(grey_frame))
            frame_feat.update(self._get_hog_features(grey_frame))

            # Temporal/Motion Features
            if prev_grey_frame is not None:
                motion_feat = self._get_optical_flow_features(prev_grey_frame, grey_frame)
                temporal_stats = temporal_extractor.update(motion_feat)
                frame_feat.update(motion_feat)
                frame_feat.update(temporal_stats)
            else:
                # First frame, zero motion
                motion_feat = self._get_optical_flow_features(grey_frame, grey_frame) # Zero flow basically
                # Since _get_optical_flow calculates flow between two frames, for first frame we can pass SAME frame -> 0 flow
                temporal_stats = temporal_extractor.update(motion_feat)
                frame_feat.update(motion_feat)
                frame_feat.update(temporal_stats)

            video_features.append(frame_feat)
            prev_grey_frame = grey_frame

        return video_features

    def process_dataset(self, df: pd.DataFrame, is_test=False):
        print(f"Processing {len(df)} videos with {self.config['n_jobs']} jobs...")
        rows = df.reset_index().to_dict('records')

        # Using joblib backend 'threading' might be safer for OpenCV which releases GIL?
        # But 'loky' (default) is safer for process isolation.
        nested_results = Parallel(n_jobs=self.config['n_jobs'])(delayed(self._process_video)(row) for row in rows)

        flatten_results = [item for sub_list in nested_results if sub_list for item in sub_list]
        feature_df = pd.DataFrame(flatten_results)

        feature_names = [col for col in feature_df.columns if col not in ['index', 'encoded_label']]
        # Fill NaNs
        feature_df[feature_names] = feature_df[feature_names].fillna(0)
        feature_df[feature_names] = feature_df[feature_names].replace([np.inf, -np.inf], 0)

        return feature_df

    def aggregate_features(self, feature_df: pd.DataFrame):
        """Aggregate frame-level features to video-level features."""
        if feature_df.empty:
            return pd.DataFrame()

        # Group by video index
        # We compute mean, std, min, max for each numerical feature
        agg_funcs = ['mean', 'std', 'min', 'max']

        # Drop non-numeric for aggregation
        numeric_cols = feature_df.select_dtypes(include=[np.number]).columns.tolist()
        if 'index' in numeric_cols: numeric_cols.remove('index')
        if 'encoded_label' in numeric_cols: numeric_cols.remove('encoded_label')

        agg_dict = {col: agg_funcs for col in numeric_cols}

        # We need 'index' to group
        grouped = feature_df.groupby('index')

        agg_df = grouped[numeric_cols].agg(agg_funcs)

        # Flatten MultiIndex columns
        agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]

        return agg_df

    def fit_transform_scaler(self, df, is_test=False):
        # Helper to scale features
        feature_names = [col for col in df.columns if col not in ['index', 'encoded_label']]

        # Check if scalar is initialized (it is in init)

        if is_test:
            # Handle unseen columns in test?
            # Ensure columns match scaler?
            # For now assume consistent feature extraction
            df[feature_names] = self.scaler.transform(df[feature_names])
        else:
            df[feature_names] = self.scaler.fit_transform(df[feature_names])
        return df


In [17]:
# Setup Directories
# OUTPUT_DIR is where plot_confusion_matrix and plot_roc_curve save their figures.
# It is created when this cell runs; exist_ok=True makes re-running the cell harmless.
OUTPUT_DIR = "results_classical"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_data(splits_dir="./dataset/splits"):
    """Load the train / validation / test split tables.

    Args:
        splits_dir: Directory containing ``train.csv``, ``validation.csv`` and
            ``test.csv``. Defaults to the location used by this notebook.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames, each indexed by the CSV's
        ``index`` column.
    """
    def _read_split(file_name):
        return pd.read_csv(os.path.join(splits_dir, file_name), index_col='index')

    return _read_split("train.csv"), _read_split("validation.csv"), _read_split("test.csv")


In [18]:
def train_svm_optuna(X_train, y_train, X_val, y_val, trials=20, seed=None):
    """Tune SVC hyper-parameters with Optuna, maximising validation accuracy.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used to score each trial.
        trials: Number of Optuna trials.
        seed: Seed for the TPE sampler. None (the default) keeps the previous
            unseeded behaviour; pass an int for a reproducible search.

    Returns:
        dict of the best parameters found (``C``, ``gamma``, ``kernel``).
    """
    def objective(trial):
        params = {
            'C': trial.suggest_float('C', 1e-2, 1e2, log=True),
            'gamma': trial.suggest_float('gamma', 1e-3, 1e1, log=True),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
        }

        # probability=True matches how the final model is built from these params.
        clf = SVC(**params, probability=True)
        clf.fit(X_train, y_train)

        return accuracy_score(y_val, clf.predict(X_val))

    print("Optimizing SVM...")
    # TPESampler is Optuna's default single-objective sampler; only the seed is new.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=trials)

    print("Best params (SVM):", study.best_params)
    return study.best_params


In [19]:
def train_rf_optuna(X_train, y_train, X_val, y_val, trials=20, seed=None):
    """Tune RandomForestClassifier hyper-parameters with Optuna.

    Each trial fits a forest (random_state=42) on the training data and is
    scored by accuracy on the validation data.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used to score each trial.
        trials: Number of Optuna trials.
        seed: Seed for the TPE sampler. None (the default) keeps the previous
            unseeded behaviour; pass an int for a reproducible search.

    Returns:
        dict of the best parameters found (``n_estimators``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf``).
    """
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }

        clf = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
        clf.fit(X_train, y_train)

        return accuracy_score(y_val, clf.predict(X_val))

    print("Optimizing Random Forest...")
    # TPESampler is Optuna's default single-objective sampler; only the seed is new.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=trials)

    print("Best params (RF):", study.best_params)
    return study.best_params


In [20]:
def train_knn_optuna(X_train, y_train, X_val, y_val, trials=20, seed=None):
    """Tune KNeighborsClassifier hyper-parameters with Optuna.

    Each trial is scored by accuracy on the validation data.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used to score each trial.
        trials: Number of Optuna trials.
        seed: Seed for the TPE sampler. None (the default) keeps the previous
            unseeded behaviour; pass an int for a reproducible search.

    Returns:
        dict of the best parameters found (``n_neighbors``, ``weights``,
        ``metric``).
    """
    def objective(trial):
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 3, 20),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
            'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan']),
        }

        clf = KNeighborsClassifier(**params, n_jobs=-1)
        clf.fit(X_train, y_train)

        return accuracy_score(y_val, clf.predict(X_val))

    print("Optimizing KNN...")
    # TPESampler is Optuna's default single-objective sampler; only the seed is new.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=trials)

    print("Best params (KNN):", study.best_params)
    return study.best_params


In [21]:
def plot_confusion_matrix(y_true, y_pred, labels, title, filename):
    """Save an annotated confusion-matrix heatmap to ``OUTPUT_DIR/filename``.

    ``labels`` is used for the tick labels on both axes. The figure is closed
    after saving, so nothing is displayed inline.
    """
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

    fig.savefig(os.path.join(OUTPUT_DIR, filename))
    plt.close(fig)


In [22]:
def plot_roc_curve(clf, X_test, y_test, label_encoder, filename):
    """Save one-vs-rest ROC curves for ``clf`` to ``OUTPUT_DIR/filename``.

    Scores come from ``predict_proba`` when it works, otherwise from
    ``decision_function``. If neither is usable a message is printed and no
    figure is written.

    Args:
        clf: Fitted classifier.
        X_test: Feature matrix to score.
        y_test: Integer-encoded true labels (0 .. n_classes-1).
        label_encoder: Fitted encoder whose ``classes_`` give the class names.
        filename: Output file name inside OUTPUT_DIR.
    """
    from sklearn.preprocessing import label_binarize

    # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
    try:
        y_score = clf.predict_proba(X_test)
    except Exception:
        try:
            y_score = clf.decision_function(X_test)
        except Exception:
            print("Model does not support probability/decision function. Skipping ROC.")
            return

    n_classes = len(label_encoder.classes_)

    # Binarize output
    y_test_bin = label_binarize(y_test, classes=range(n_classes))

    if n_classes == 2:
        # label_binarize yields a single column for two classes; expand it to
        # one column per class so the per-class indexing below is valid.
        y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
        if getattr(y_score, 'ndim', 2) == 1:
            # A binary decision_function is 1-D (score of the positive class).
            y_score = np.column_stack([-y_score, y_score])

    plt.figure()
    colors = ['blue', 'red', 'green']
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        # Cycle through the palette so more than three classes do not raise IndexError.
        plt.plot(fpr, tpr, color=colors[i % len(colors)], lw=2,
                 label=f'Class {label_encoder.classes_[i]} (area = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(OUTPUT_DIR, filename))
    plt.close()


In [23]:
def run_experiment(name, config_update):
    """Run one feature-extraction + classification experiment end to end.

    Steps: build the config, extract and scale frame-level features for the
    train and validation splits, train a frame-level SVM evaluated by majority
    vote per video, then train SVM / Random Forest / KNN on per-video
    aggregated features (SVM and KNN on a 95%-variance PCA projection).
    Confusion matrices and ROC curves are written to OUTPUT_DIR.

    All reported accuracies are measured on the validation split, which is
    also the split used for hyper-parameter search; the test split is loaded
    but not evaluated here.

    Args:
        name: Experiment name, used in log lines and output file names.
        config_update: Overrides for CONFIG. Dict-valued sections (for example
            ``'sampling'``) are merged key by key into the defaults, so a
            partial override keeps the remaining default keys.

    Returns:
        dict with ``frame_voting_acc``, ``video_agg_acc_svm``,
        ``video_agg_acc_rf`` and ``video_agg_acc_knn``.
    """
    print(f"\n--- Running Experiment: {name} ---")

    # Build the experiment config. Nested sections are copied so CONFIG is never
    # shared or mutated, and merged one level deep so that overriding e.g. only
    # 'strategy' does not drop 'n_frames' / 'frame_skip'.
    cfg = {key: (dict(value) if isinstance(value, dict) else value) for key, value in CONFIG.items()}
    for key, value in config_update.items():
        if isinstance(value, dict) and isinstance(cfg.get(key), dict):
            cfg[key].update(value)
        else:
            cfg[key] = value

    extractor = VideoExtractorFeature(cfg)

    # The test split is loaded for parity with load_data() but is not used below.
    train_df, val_df, _test_df = load_data()

    # Process
    t0 = time.time()
    print("Extracting features...")
    X_train_frames = extractor.process_dataset(train_df)
    X_val_frames = extractor.process_dataset(val_df)

    # Scaling: fit on train frames, apply the fitted scaler to validation frames.
    X_train_frames = extractor.fit_transform_scaler(X_train_frames)
    X_val_frames = extractor.fit_transform_scaler(X_val_frames, is_test=True)

    print(f"Extraction time: {time.time()-t0:.2f}s")

    # Attach labels: the frame tables carry the video id in 'index', which is
    # also the index of the split tables.
    train_merged = pd.merge(train_df.reset_index(), X_train_frames, on='index')
    val_merged = pd.merge(val_df.reset_index(), X_val_frames, on='index')

    # Encode labels; fit on train + validation to avoid unseen-label errors.
    le = LabelEncoder()
    all_labels = pd.concat([train_merged['label'], val_merged['label']]).unique()
    le.fit(all_labels)

    y_train_enc = le.transform(train_merged['label'])
    y_val_enc = le.transform(val_merged['label'])

    # Mode 1: Frame Level Classification + Voting
    print("Training Frame-Level Model...")
    feature_cols = [c for c in X_train_frames.columns if c not in ['index', 'encoded_label']]

    X_train_f = train_merged[feature_cols].values
    X_val_f = val_merged[feature_cols].values

    best_params = train_svm_optuna(X_train_f, y_train_enc, X_val_f, y_val_enc, trials=10)

    best_clf = SVC(**best_params, probability=True)
    best_clf.fit(X_train_f, y_train_enc)

    # Evaluate Video Level (Voting)
    val_preds_frames = best_clf.predict(X_val_f)
    val_merged['pred'] = val_preds_frames

    y_true_vid = []
    y_pred_vid = []

    for vid, group in val_merged.groupby('index'):
        y_true_vid.append(le.transform([group['label'].iloc[0]])[0])
        # Majority vote over the frame predictions of this video.
        mode_pred = group['pred'].mode().iloc[0]
        y_pred_vid.append(mode_pred)

    acc = accuracy_score(y_true_vid, y_pred_vid)
    print(f"Video Level Accuracy (Voting): {acc:.4f}")

    plot_confusion_matrix(y_true_vid, y_pred_vid, le.classes_, f"Confusion Matrix - {name} (Voting)", f"cm_{name}_voting.png")

    # Mode 2: Video Level Aggregation
    print("Training Video-Level Aggregated Model...")

    # Aggregate the already scaled frame features to one row per video.
    X_train_vid_df = extractor.aggregate_features(X_train_frames)
    X_val_vid_df = extractor.aggregate_features(X_val_frames)

    # Aggregation drops the labels; both tables are indexed by video id, so
    # join on the index to bring them back.
    train_vid_merged = pd.merge(train_df, X_train_vid_df, left_index=True, right_index=True)
    val_vid_merged = pd.merge(val_df, X_val_vid_df, left_index=True, right_index=True)

    feat_cols_vid = [c for c in train_vid_merged.columns if c not in train_df.columns]

    X_train_v = train_vid_merged[feat_cols_vid].values
    y_train_v = le.transform(train_vid_merged['label'])

    X_val_v = val_vid_merged[feat_cols_vid].values
    y_val_v = le.transform(val_vid_merged['label'])

    # PCA (Optional - req 4.4)
    pca = PCA(n_components=0.95) # Keep 95% variance
    X_train_v_pca = pca.fit_transform(X_train_v)
    X_val_v_pca = pca.transform(X_val_v)
    print(f"PCA reduced dim from {X_train_v.shape[1]} to {X_train_v_pca.shape[1]}")

    # 1. SVM Video
    best_params_vid_svm = train_svm_optuna(X_train_v_pca, y_train_v, X_val_v_pca, y_val_v, trials=10)
    best_clf_vid_svm = SVC(**best_params_vid_svm, probability=True)
    best_clf_vid_svm.fit(X_train_v_pca, y_train_v)
    vid_preds_svm = best_clf_vid_svm.predict(X_val_v_pca)
    acc_vid_svm = accuracy_score(y_val_v, vid_preds_svm)
    print(f"Video Level Accuracy (SVM + PCA): {acc_vid_svm:.4f}")
    plot_confusion_matrix(y_val_v, vid_preds_svm, le.classes_, f"Confusion Matrix - {name} (SVM)", f"cm_{name}_svm.png")
    plot_roc_curve(best_clf_vid_svm, X_val_v_pca, y_val_v, le, f"roc_{name}_svm.png")

    # 2. Random Forest Video (trained on the un-projected aggregated features)
    best_params_vid_rf = train_rf_optuna(X_train_v, y_train_v, X_val_v, y_val_v, trials=10)
    best_clf_vid_rf = RandomForestClassifier(**best_params_vid_rf, random_state=42)
    best_clf_vid_rf.fit(X_train_v, y_train_v)
    vid_preds_rf = best_clf_vid_rf.predict(X_val_v)
    acc_vid_rf = accuracy_score(y_val_v, vid_preds_rf)
    print(f"Video Level Accuracy (RF): {acc_vid_rf:.4f}")
    plot_confusion_matrix(y_val_v, vid_preds_rf, le.classes_, f"Confusion Matrix - {name} (RF)", f"cm_{name}_rf.png")
    plot_roc_curve(best_clf_vid_rf, X_val_v, y_val_v, le, f"roc_{name}_rf.png")

    # 3. KNN Video
    best_params_vid_knn = train_knn_optuna(X_train_v_pca, y_train_v, X_val_v_pca, y_val_v, trials=10)
    best_clf_vid_knn = KNeighborsClassifier(**best_params_vid_knn)
    best_clf_vid_knn.fit(X_train_v_pca, y_train_v)
    vid_preds_knn = best_clf_vid_knn.predict(X_val_v_pca)
    acc_vid_knn = accuracy_score(y_val_v, vid_preds_knn)
    print(f"Video Level Accuracy (KNN + PCA): {acc_vid_knn:.4f}")
    plot_confusion_matrix(y_val_v, vid_preds_knn, le.classes_, f"Confusion Matrix - {name} (KNN)", f"cm_{name}_knn.png")
    plot_roc_curve(best_clf_vid_knn, X_val_v_pca, y_val_v, le, f"roc_{name}_knn.png")

    return {
        'frame_voting_acc': acc,
        'video_agg_acc_svm': acc_vid_svm,
        'video_agg_acc_rf': acc_vid_rf,
        'video_agg_acc_knn': acc_vid_knn,
    }


In [24]:
    # Driver cell: run the three experiments and print a side-by-side summary.
    # Any failure is reported with its traceback instead of aborting the cell.
    try:
        print("Starting main...")
        # 1. Baseline: Uniform Sampling, MinMax
        res_baseline = run_experiment('baseline_uniform_minmax', {
            'sampling': {'strategy': 'uniform', 'n_frames': 16},
            'normalization': 'minmax'
        })

        # 2. Improved: Uniform, StandardScaler (Req 3)
        res_std = run_experiment('uniform_stdscaler', {
            'sampling': {'strategy': 'uniform', 'n_frames': 16},
            'normalization': 'standard'
        })

        # 3. Improved: Dense Sampling (Req 1), standard scaler
        res_dense = run_experiment('dense_stdscaler', {
            'sampling': {'strategy': 'dense', 'frame_skip': 5},
            'normalization': 'standard'
        })

        print("\n--- Summary ---")
        metric_captions = (
            ('frame_voting_acc', 'Voting'),
            ('video_agg_acc_svm', 'SVM Agg'),
            ('video_agg_acc_rf', 'RF Agg'),
            ('video_agg_acc_knn', 'KNN Agg'),
        )
        experiment_results = (
            ('Baseline', res_baseline),
            ('StdScaler', res_std),
            ('Dense', res_dense),
        )
        for position, (tag, scores) in enumerate(experiment_results):
            for row, (metric_key, caption) in enumerate(metric_captions):
                # A blank line separates each experiment's group from the previous one.
                lead = "\n" if position and row == 0 else ""
                print(f"{lead}{tag} ({caption}):", scores[metric_key])
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"CRITICAL ERROR: {e}")


Starting main...

--- Running Experiment: baseline_uniform_minmax ---
Extracting features...
Processing 240 videos with -1 jobs...
Processing 30 videos with -1 jobs...


[32m[I 2026-02-08 20:43:37,706][0m A new study created in memory with name: no-name-7fa2b97c-842e-4b34-8a8b-bd72dce84afb[0m


Extraction time: 29.14s
Training Frame-Level Model...
Optimizing SVM...


[32m[I 2026-02-08 20:43:38,891][0m Trial 0 finished with value: 0.98125 and parameters: {'C': 44.029503622045624, 'gamma': 0.7706949835469655, 'kernel': 'linear'}. Best is trial 0 with value: 0.98125.[0m
[32m[I 2026-02-08 20:43:39,609][0m Trial 1 finished with value: 0.8458333333333333 and parameters: {'C': 1.8244183815173216, 'gamma': 0.0010256799247576804, 'kernel': 'poly'}. Best is trial 0 with value: 0.98125.[0m
[32m[I 2026-02-08 20:43:40,331][0m Trial 2 finished with value: 0.8458333333333333 and parameters: {'C': 7.559321515388865, 'gamma': 0.001143871696222311, 'kernel': 'poly'}. Best is trial 0 with value: 0.98125.[0m
[32m[I 2026-02-08 20:43:40,983][0m Trial 3 finished with value: 0.9770833333333333 and parameters: {'C': 2.9289251423493536, 'gamma': 2.36813083743141, 'kernel': 'poly'}. Best is trial 0 with value: 0.98125.[0m
[32m[I 2026-02-08 20:43:41,672][0m Trial 4 finished with value: 0.6416666666666667 and parameters: {'C': 0.0231998194362148, 'gamma': 4.32329

Best params (SVM): {'C': 5.505601269665177, 'gamma': 0.08123494588590749, 'kernel': 'rbf'}


[32m[I 2026-02-08 20:43:46,185][0m A new study created in memory with name: no-name-4991524f-e9e6-4d3e-9849-ded09a71e344[0m


Video Level Accuracy (Voting): 1.0000
Training Video-Level Aggregated Model...
PCA reduced dim from 432 to 35
Optimizing SVM...


[32m[I 2026-02-08 20:43:46,377][0m Trial 0 finished with value: 1.0 and parameters: {'C': 6.974262538406011, 'gamma': 0.07412541031124908, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:43:46,517][0m Trial 1 finished with value: 0.7666666666666667 and parameters: {'C': 0.02607327420032932, 'gamma': 0.022360605962535272, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:43:46,673][0m Trial 2 finished with value: 0.9666666666666667 and parameters: {'C': 42.390353146900054, 'gamma': 0.18994826444447813, 'kernel': 'poly'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:43:46,808][0m Trial 3 finished with value: 0.43333333333333335 and parameters: {'C': 0.04214716899433626, 'gamma': 0.0141841325171663, 'kernel': 'poly'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:43:46,996][0m Trial 4 finished with value: 1.0 and parameters: {'C': 3.9072090703104605, 'gamma': 0.6107776959075499, 'kernel': 'rbf'}. Best

Best params (SVM): {'C': 6.974262538406011, 'gamma': 0.07412541031124908, 'kernel': 'rbf'}
Video Level Accuracy (SVM + PCA): 1.0000


[32m[I 2026-02-08 20:43:48,268][0m A new study created in memory with name: no-name-a0ce79f4-d6ca-4b76-8a75-dae844da0f1e[0m


Optimizing Random Forest...


[32m[I 2026-02-08 20:43:48,569][0m Trial 0 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 80, 'max_depth': 21, 'min_samples_split': 3, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.9333333333333333.[0m
[32m[I 2026-02-08 20:43:48,966][0m Trial 1 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 120, 'max_depth': 46, 'min_samples_split': 12, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.9666666666666667.[0m
[32m[I 2026-02-08 20:43:49,823][0m Trial 2 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 231, 'max_depth': 25, 'min_samples_split': 11, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.9666666666666667.[0m
[32m[I 2026-02-08 20:43:50,121][0m Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 95, 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 3 with value: 1.0.[0m
[32m[I 2026-02-08 20:43:50,828][0m Trial 4 finished with value: 1.0 

Best params (RF): {'n_estimators': 95, 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 1}
Video Level Accuracy (RF): 1.0000


[32m[I 2026-02-08 20:43:55,120][0m A new study created in memory with name: no-name-6bd62aaf-227e-4c80-9d4a-638ee898b9e0[0m
[32m[I 2026-02-08 20:43:55,155][0m Trial 0 finished with value: 0.8 and parameters: {'n_neighbors': 9, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.8.[0m
[32m[I 2026-02-08 20:43:55,158][0m Trial 1 finished with value: 0.8 and parameters: {'n_neighbors': 8, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.8.[0m
[32m[I 2026-02-08 20:43:55,161][0m Trial 2 finished with value: 0.6666666666666666 and parameters: {'n_neighbors': 18, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.8.[0m
[32m[I 2026-02-08 20:43:55,163][0m Trial 3 finished with value: 0.9 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 3 with value: 0.9.[0m
[32m[I 2026-02-08 20:43:55,165][0m Trial 4 finished with value: 0.7 and parameters: {'n_neighbors': 15

Optimizing KNN...
Best params (KNN): {'n_neighbors': 10, 'weights': 'distance', 'metric': 'euclidean'}
Video Level Accuracy (KNN + PCA): 0.9667

--- Running Experiment: uniform_stdscaler ---
Extracting features...
Processing 240 videos with -1 jobs...
Processing 30 videos with -1 jobs...


[32m[I 2026-02-08 20:44:22,145][0m A new study created in memory with name: no-name-668bce2d-c755-4d88-8b91-a5083bea56b6[0m


Extraction time: 26.82s
Training Frame-Level Model...
Optimizing SVM...


[32m[I 2026-02-08 20:44:23,660][0m Trial 0 finished with value: 0.99375 and parameters: {'C': 0.11035255258703651, 'gamma': 0.03099734567774172, 'kernel': 'linear'}. Best is trial 0 with value: 0.99375.[0m
[32m[I 2026-02-08 20:44:25,949][0m Trial 1 finished with value: 0.65625 and parameters: {'C': 1.270019263670051, 'gamma': 0.550146633344877, 'kernel': 'rbf'}. Best is trial 0 with value: 0.99375.[0m
[32m[I 2026-02-08 20:44:27,610][0m Trial 2 finished with value: 0.9854166666666667 and parameters: {'C': 0.9407938750760856, 'gamma': 0.01501702396074752, 'kernel': 'linear'}. Best is trial 0 with value: 0.99375.[0m
[32m[I 2026-02-08 20:44:28,367][0m Trial 3 finished with value: 0.9583333333333334 and parameters: {'C': 0.3723430817230827, 'gamma': 0.16052394228218742, 'kernel': 'poly'}. Best is trial 0 with value: 0.99375.[0m
[32m[I 2026-02-08 20:44:29,939][0m Trial 4 finished with value: 0.9854166666666667 and parameters: {'C': 1.6577526271739336, 'gamma': 0.002479438445240

Best params (SVM): {'C': 0.41735587062508644, 'gamma': 0.0030620730806854386, 'kernel': 'rbf'}


[32m[I 2026-02-08 20:44:34,644][0m A new study created in memory with name: no-name-bdd1ad8c-f807-4021-8255-9891f650ca2e[0m


Video Level Accuracy (Voting): 1.0000
Training Video-Level Aggregated Model...
PCA reduced dim from 432 to 37
Optimizing SVM...


[32m[I 2026-02-08 20:44:34,820][0m Trial 0 finished with value: 0.9 and parameters: {'C': 0.47966050043947905, 'gamma': 0.02347808187799514, 'kernel': 'poly'}. Best is trial 0 with value: 0.9.[0m
[32m[I 2026-02-08 20:44:34,985][0m Trial 1 finished with value: 0.36666666666666664 and parameters: {'C': 0.720377167595496, 'gamma': 2.723643626525303, 'kernel': 'rbf'}. Best is trial 0 with value: 0.9.[0m
[32m[I 2026-02-08 20:44:35,168][0m Trial 2 finished with value: 1.0 and parameters: {'C': 4.156994223359954, 'gamma': 0.003604620044945133, 'kernel': 'linear'}. Best is trial 2 with value: 1.0.[0m
[32m[I 2026-02-08 20:44:35,337][0m Trial 3 finished with value: 0.9333333333333333 and parameters: {'C': 0.04619808587870525, 'gamma': 1.7547191573531762, 'kernel': 'linear'}. Best is trial 2 with value: 1.0.[0m
[32m[I 2026-02-08 20:44:35,540][0m Trial 4 finished with value: 0.9 and parameters: {'C': 8.69866238707858, 'gamma': 2.3701084193639392, 'kernel': 'poly'}. Best is trial 2 wi

Best params (SVM): {'C': 4.156994223359954, 'gamma': 0.003604620044945133, 'kernel': 'linear'}
Video Level Accuracy (SVM + PCA): 1.0000


[32m[I 2026-02-08 20:44:36,886][0m A new study created in memory with name: no-name-5b774466-f5f1-41b6-9f01-78952b10d167[0m


Optimizing Random Forest...


[32m[I 2026-02-08 20:44:37,343][0m Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 220, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:44:37,641][0m Trial 1 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 240, 'max_depth': 24, 'min_samples_split': 11, 'min_samples_leaf': 9}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:44:37,935][0m Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 197, 'max_depth': 48, 'min_samples_split': 11, 'min_samples_leaf': 2}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:44:38,560][0m Trial 3 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 231, 'max_depth': 49, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:44:39,160][0m Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 163, 'max_depth': 13, 'mi

Best params (RF): {'n_estimators': 220, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 3}


[32m[I 2026-02-08 20:44:42,135][0m A new study created in memory with name: no-name-c7ea3b8a-3796-44b1-b34f-c217f44537c7[0m
[32m[I 2026-02-08 20:44:42,142][0m Trial 0 finished with value: 0.9 and parameters: {'n_neighbors': 20, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 0.9.[0m
[32m[I 2026-02-08 20:44:42,149][0m Trial 1 finished with value: 0.8 and parameters: {'n_neighbors': 12, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.9.[0m


Video Level Accuracy (RF): 1.0000
Optimizing KNN...


[32m[I 2026-02-08 20:44:42,154][0m Trial 2 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 6, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 0.9.[0m
[32m[I 2026-02-08 20:44:42,157][0m Trial 3 finished with value: 0.7666666666666667 and parameters: {'n_neighbors': 8, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.9.[0m
[32m[I 2026-02-08 20:44:42,159][0m Trial 4 finished with value: 0.8333333333333334 and parameters: {'n_neighbors': 7, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.9.[0m
[32m[I 2026-02-08 20:44:42,192][0m Trial 5 finished with value: 0.7666666666666667 and parameters: {'n_neighbors': 9, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.9.[0m
[32m[I 2026-02-08 20:44:42,221][0m Trial 6 finished with value: 0.8 and parameters: {'n_neighbors': 4, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.9.[0m

Best params (KNN): {'n_neighbors': 20, 'weights': 'distance', 'metric': 'euclidean'}
Video Level Accuracy (KNN + PCA): 0.9000

--- Running Experiment: dense_stdscaler ---
Extracting features...
Processing 240 videos with -1 jobs...
Processing 30 videos with -1 jobs...


[32m[I 2026-02-08 20:45:59,777][0m A new study created in memory with name: no-name-ac1292ac-49a3-47d7-b6e0-c50aacde2738[0m


Extraction time: 77.36s
Training Frame-Level Model...
Optimizing SVM...


[32m[I 2026-02-08 20:46:01,321][0m Trial 0 finished with value: 0.9612948627726953 and parameters: {'C': 12.11206695934873, 'gamma': 0.0316436842047391, 'kernel': 'poly'}. Best is trial 0 with value: 0.9612948627726953.[0m
[32m[I 2026-02-08 20:46:04,219][0m Trial 1 finished with value: 0.9929627023223082 and parameters: {'C': 38.281765241332984, 'gamma': 0.0010256273253940558, 'kernel': 'linear'}. Best is trial 1 with value: 0.9929627023223082.[0m
[32m[I 2026-02-08 20:46:06,489][0m Trial 2 finished with value: 0.9992962702322308 and parameters: {'C': 0.07605609004555121, 'gamma': 0.7378643657469214, 'kernel': 'linear'}. Best is trial 2 with value: 0.9992962702322308.[0m
[32m[I 2026-02-08 20:46:08,631][0m Trial 3 finished with value: 0.9225897255453905 and parameters: {'C': 1.416346363395067, 'gamma': 0.005494169282312147, 'kernel': 'poly'}. Best is trial 2 with value: 0.9992962702322308.[0m
[32m[I 2026-02-08 20:46:15,727][0m Trial 4 finished with value: 0.6108374384236454

Best params (SVM): {'C': 0.07605609004555121, 'gamma': 0.7378643657469214, 'kernel': 'linear'}


[32m[I 2026-02-08 20:46:30,153][0m A new study created in memory with name: no-name-0d626eaf-8755-4f81-ac5d-1b816ebec0ba[0m


Video Level Accuracy (Voting): 1.0000
Training Video-Level Aggregated Model...
PCA reduced dim from 432 to 34
Optimizing SVM...


[32m[I 2026-02-08 20:46:30,338][0m Trial 0 finished with value: 1.0 and parameters: {'C': 0.015251804610922239, 'gamma': 2.822311800001481, 'kernel': 'linear'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:46:30,565][0m Trial 1 finished with value: 0.9666666666666667 and parameters: {'C': 0.43263152676611355, 'gamma': 0.010467246837874847, 'kernel': 'linear'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:46:30,731][0m Trial 2 finished with value: 0.4666666666666667 and parameters: {'C': 0.05498753127030705, 'gamma': 0.17908670403869134, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:46:30,912][0m Trial 3 finished with value: 0.9666666666666667 and parameters: {'C': 1.937462811803805, 'gamma': 2.156385707821332, 'kernel': 'linear'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-08 20:46:31,107][0m Trial 4 finished with value: 0.9666666666666667 and parameters: {'C': 0.567651227745078, 'gamma': 0.016006110406772123, '

Best params (SVM): {'C': 0.015251804610922239, 'gamma': 2.822311800001481, 'kernel': 'linear'}
Video Level Accuracy (SVM + PCA): 1.0000


[32m[I 2026-02-08 20:46:32,278][0m A new study created in memory with name: no-name-94646ed6-5d64-42e6-8a11-41f041eb5fab[0m


Optimizing Random Forest...


[32m[I 2026-02-08 20:46:32,653][0m Trial 0 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 116, 'max_depth': 8, 'min_samples_split': 11, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.9666666666666667.[0m
[32m[I 2026-02-08 20:46:33,117][0m Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 133, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 1 with value: 1.0.[0m
[32m[I 2026-02-08 20:46:34,119][0m Trial 2 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 264, 'max_depth': 21, 'min_samples_split': 8, 'min_samples_leaf': 8}. Best is trial 1 with value: 1.0.[0m
[32m[I 2026-02-08 20:46:34,834][0m Trial 3 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 215, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 1.0.[0m
[32m[I 2026-02-08 20:46:35,571][0m Trial 4 finished with value: 0.9666666666666667 and parameters: 

Best params (RF): {'n_estimators': 133, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 1}


[32m[I 2026-02-08 20:46:37,800][0m A new study created in memory with name: no-name-6e0994fb-efe9-494d-9e8c-97533a0d2714[0m
[32m[I 2026-02-08 20:46:37,803][0m Trial 0 finished with value: 0.8333333333333334 and parameters: {'n_neighbors': 3, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.8333333333333334.[0m
[32m[I 2026-02-08 20:46:37,806][0m Trial 1 finished with value: 0.8 and parameters: {'n_neighbors': 4, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.8333333333333334.[0m
[32m[I 2026-02-08 20:46:37,808][0m Trial 2 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 14, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 2 with value: 0.8666666666666667.[0m
[32m[I 2026-02-08 20:46:37,810][0m Trial 3 finished with value: 0.8 and parameters: {'n_neighbors': 9, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 2 with value: 0.8666666666666667.[0m
[32m[I 2026-02-08 20:46:37,8

Video Level Accuracy (RF): 1.0000
Optimizing KNN...
Best params (KNN): {'n_neighbors': 5, 'weights': 'distance', 'metric': 'manhattan'}
Video Level Accuracy (KNN + PCA): 0.9000

--- Summary ---
Baseline (Voting): 1.0
Baseline (SVM Agg): 1.0
Baseline (RF Agg): 1.0
Baseline (KNN Agg): 0.9666666666666667

StdScaler (Voting): 1.0
StdScaler (SVM Agg): 1.0
StdScaler (RF Agg): 1.0
StdScaler (KNN Agg): 0.9

Dense (Voting): 1.0
Dense (SVM Agg): 1.0
Dense (RF Agg): 1.0
Dense (KNN Agg): 0.9


# Analysis

## 1. Performance Comparison

We evaluated three experimental configurations using different sampling strategies and normalization techniques. The models compared include Support Vector Machine (SVM), Random Forest (RF), and K-Nearest Neighbors (KNN) using both Frame-Level Voting and Video-Level Aggregation strategies.

### Accuracy Summary

| Experiment | Frame Voting (SVM) | Video Agg (SVM) | Video Agg (RF) | Video Agg (KNN) |
| :--- | :---: | :---: | :---: | :---: |
| **Baseline** (Uniform, MinMax) | **1.0000** | **1.0000** | **1.0000** | 0.9667 |
| **Improved 1** (Uniform, StdScaler) | **1.0000** | **1.0000** | **1.0000** | 0.9000 |
| **Improved 2** (Dense, StdScaler) | **1.0000** | **1.0000** | **1.0000** | 0.9000 |

**Observations:**
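- **Perfect Classification:** The SVM and Random Forest models achieved a perfect accuracy of **100%** across all experimental setups on the validation set. This suggests that the extracted features are highly discriminative for the three classes (Diving, Drumming, Juggling balls). Note, however, that the validation set contains only 30 videos and the Optuna trial scores appear to be computed on that same set, so these figures are likely optimistic and should be confirmed on a held-out test set.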
- **KNN Performance:** KNN slightly lagged behind, achieving 90-96.7% accuracy. It performed best with MinMax scaling (96.7%) compared to StandardScaler (90%), indicating sensitivity to the scaling method.
- **Robustness:** The high performance indicates that the classical feature engineering pipeline (combining Spatial and Temporal features) is robust for this specific dataset.

### Visual Analysis

#### Confusion Matrices
The confusion matrices for the best performing models (SVM/RF) show clear diagonal dominance with zero misclassifications.
![Confusion Matrix Baseline SVM](results_classical/cm_baseline_uniform_minmax_svm.png)

#### ROC Curves
The ROC curves confirm the perfect separation with Area Under Curve (AUC) of 1.00 for all classes in SVM and RF models.
![ROC Baseline SVM](results_classical/roc_baseline_uniform_minmax_svm.png)

## 2. Computational Analysis

### Training & Extraction Time
Feature extraction is the most computationally expensive part of the classical pipeline.

| Experiment | Extraction Time (s) | Relative Cost |
| :--- | :---: | :---: |
| **Baseline** (Uniform, 16 frames) | 29.14s | 1.0x |
| **StdScaler** (Uniform, 16 frames) | 26.82s | ~0.9x |
| **Dense** (Stride 5) | **77.36s** | **~2.7x** |

**Key Findings:**
- **Dense Sampling Cost:** Dense sampling (processing every 5th frame) took about **2.7x longer** than uniform sampling with 16 frames (77.36s vs 29.14s). Since accuracy did not improve (already at 100%), the extra computational cost of dense sampling is unjustified for this dataset.
- **Normalization Impact:** The choice of scaler (MinMax vs Standard) had negligible impact on extraction time.

### Model Complexity (Best Parameters)
- **SVM:**
    - Baseline: `C=6.97`, `RBF` kernel.
    - Dense: `C=0.015`, `Linear` kernel.
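    - *Insight:* The Dense run selected a simpler Linear kernel, possibly because aggregating more frames per video smooths the video-level feature statistics. Note, however, that the Uniform + StdScaler run also selected a Linear kernel (`C=4.16`), so the change of scaler, rather than the sampling density, may be the real cause.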
- **Random Forest:**
    - Baseline: `95 estimators`, `depth 28`.
    - Dense: `133 estimators`, `depth 7`.
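    - *Insight:* Dense sampling resulted in a much lower selected `max_depth` (7 vs 28), suggesting that with more frames, the features became more robust, requiring less complex decision boundaries. Since `max_depth` is only an upper bound and several trials tied at 100% accuracy, this should be read as indicative rather than conclusive.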
- **Dimensionality Reduction (PCA):**
    - Original Feature Count: 432
    - Reduced Feature Count (95% Variance): ~35
    - *Insight:* **92% of the feature space was redundant.** PCA successfully compressed the information, speeding up the classifier training without loss of accuracy.

## 3. Feature Analysis

### Extracted Features
The pipeline extracted a rich set of 432 features per video (before PCA), comprising:
- **Spatial:** Color Histograms, GLCM (Texture), LBP (Pattern), Gabor (Texture), Canny (Edge), HOG (Shape).
- **Temporal:** Optical Flow Statistics, Motion Trajectory.

### Representation Learning (PCA)
The drastic reduction from 432 to ~35 components while maintaining 100% accuracy suggests that the classes remain well separated, and in the StdScaler runs even linearly separable, in the lower-dimensional PCA subspace.
- **Diving:** Likely characterized by specific motion patterns (vertical flow) and background colors (pool blue).
- **Drumming:** Characterized by repetitive localized motion and specific object textures.
- **Juggling:** Complex erratic motion patterns.

## 4. Trade-off Analysis

### Accuracy vs. Computational Cost
- **Winner:** **Baseline (Uniform Sampling + MinMax)**.
- **Reasoning:** It achieves the same perfect accuracy (100%) as the expensive Dense strategy but is roughly **2.7x faster** in feature extraction (29.14s vs 77.36s). The computational overhead of dense sampling yields no return on investment for this specific classification task.

### Data Efficiency
- The models performed perfectly even with sparse uniform sampling (16 frames per video). This suggests high data efficiency—the key discriminative features are global and temporal, not requiring a dense frame-by-frame analysis.

### Interpretability vs. Performance
- **Random Forest** offers the best balance. It achieved 100% accuracy and provides feature importance (interpretable), whereas SVM (especially with RBF kernel) is a black box.
- The **Video-Level Aggregation** approach (using statistical pooling of frame features) is more interpretable and faster to train than Frame-Level Voting, as it significantly reduces the number of training samples for the final classifier.

## Conclusion
For the task of classifying Diving, Drumming, and Juggling balls:
1.  **Classical methods are sufficient:** Deep learning may be overkill given that classical feature extraction + SVM/RF achieves 100% accuracy.
2.  **Uniform Sampling is optimal:** Dense sampling nearly triples the extraction cost (~2.7x) without any performance gain.
3.  **Recommendation:** Use the **Random Forest** classifier with **Uniform Sampling**. It is fast, accurate (100%), and offers interpretability.
