In [None]:
import librosa
import numpy as np
import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [3]:
# --- 1. Audio file list ---
audio_files = [
    "../data/raw/video/Muppets-02-01-01.wav",
    "../data/raw/video/Muppets-02-04-04.wav",
    "../data/raw/video/Muppets-03-04-03.wav"
]

# --- 2. Frame and feature settings ---
SR = 22050 
HOP_LENGTH = int(0.040 * SR) # 40 ms hop size
N_FFT = 2048 
TARGET_COLUMNS = ['Kermit', 'StatlerWaldorf', 'Fozzie Bear'] # Add this here

In [27]:
# --- PROJECT CONSTANTS ---
TARGET_COLUMNS = ['Kermit', 'StatlerWaldorf', 'Fozzie Bear'] 
SR = 22050 
HOP_LENGTH = int(0.040 * SR) # 40 ms hop size
N_FFT = 2048 # FFT window size

# 1. Audio file list 
audio_files = [
    "../data/raw/video/Muppets-03-04-03.wav",
    "../data/raw/video/Muppets-02-04-04.wav",
    "../data/raw/video/Muppets-02-01-01.wav"
]

# 2. Function for frame-level feature extraction 
def extract_features_from_file(audio_path, sr, hop_length, n_fft):
    """Loads audio and extracts frame-level MFCCs, F0, and Spectral Centroid."""
    
    if not os.path.exists(audio_path):
        print(f"‚ùå File not found: {audio_path}. Skipping.")
        return None

    print(f"\nüé¨ Processing file: {os.path.basename(audio_path)}")
    
    # Load audio data
    audio_data, _ = librosa.load(audio_path, sr=sr)
    
    # Feature extraction 
    
    # MFCCs (13 coefficients + 1st and 2nd derivatives = 39 features)
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length)
    mfccs_delta = librosa.feature.delta(mfccs, order=1)
    mfccs_delta2 = librosa.feature.delta(mfccs, order=2)
    mfccs_full = np.concatenate((mfccs, mfccs_delta, mfccs_delta2), axis=0)
    
    # Spectral Centroid (1 feature)
    spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=sr, n_fft=n_fft, hop_length=hop_length)[0]
    
    # F0 (Pitch) (1 feature)
    f0, voiced_flag, _ = librosa.pyin(
        y=audio_data, sr=sr, 
        fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C5'), 
        hop_length=hop_length,
    )

    # --- 3. Concatenate all frame-level features (Total 41 features) ---
    
    # check that all arrays have the same number of frames
    min_frames = min(mfccs_full.shape[1], spectral_centroids.shape[0], f0.shape[0])

    # Combine into a single matrix (features are along the rows)
    all_features_frame_level = np.concatenate([
        mfccs_full[:, :min_frames],
        spectral_centroids[:min_frames][np.newaxis, :], 
        f0[:min_frames][np.newaxis, :]                  
    ], axis=0) 

    print(f"‚úÖ Success. Total frames: {min_frames}")
    print(f"   Feature dimensionality: {all_features_frame_level.shape}") # (41, N_frames)
    
    return {
        'episode': os.path.basename(audio_path),
        'features': all_features_frame_level,
        'voiced_flag': voiced_flag[:min_frames]
    }

# 4. Run processing and save results
all_episodes_data = []

for audio_file_path in audio_files:
    result = extract_features_from_file(audio_file_path, SR, HOP_LENGTH, N_FFT)
    if result is not None:
        all_episodes_data.append(result)

print("\n--- All frame-level features successfully extracted! ---")


üé¨ Processing file: Muppets-03-04-03.wav
‚úÖ Success. Total frames: 38500
   Feature dimensionality: (41, 38500)

üé¨ Processing file: Muppets-02-04-04.wav
‚úÖ Success. Total frames: 38708
   Feature dimensionality: (41, 38708)

üé¨ Processing file: Muppets-02-01-01.wav
‚úÖ Success. Total frames: 38683
   Feature dimensionality: (41, 38683)

--- All frame-level features successfully extracted! ---


In [None]:
#pip install pandas 
#pip install openpyxl

In [28]:
# Load Ground Truth (GT) Data ---

gt_files = [
    "/Users/iana/projects/dicom-dev/SMProject/data/muppets-gt-2025wt/Ground_Truth_New_01.xlsx",
    "/Users/iana/projects/dicom-dev/SMProject/data/muppets-gt-2025wt/Ground_Truth_New_03.xlsx",
    "/Users/iana/projects/dicom-dev/SMProject/data/muppets-gt-2025wt/Ground_Truth_New_04.xlsx"
]

all_gt_data = []

for file_path in gt_files:
    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        # add to the list for concatenation
        all_gt_data.append(df)
        print(f"‚úÖ Loaded GT: {os.path.basename(file_path)}")
    else:
        print(f"‚ùå GT file not found: {os.path.basename(file_path)}")

gt_df = pd.concat(all_gt_data, ignore_index=True)

print(f"\nTotal frames in combined GT: {len(gt_df)}")

‚úÖ Loaded GT: Ground_Truth_New_01.xlsx
‚úÖ Loaded GT: Ground_Truth_New_03.xlsx
‚úÖ Loaded GT: Ground_Truth_New_04.xlsx

Total frames in combined GT: 115885


# Splitting to train and test

In [None]:
#  SETTINGS AND CONSTANTS 
WINDOW_SIZE = 50      # Window size in frames (50 frames * 40ms/frame = 2.0 seconds)
HOP_SIZE = 10         # Hop size in frames (10 frames * 40ms/frame = 0.4 seconds)
SR = 22050
TOTAL_FEATURE_COUNT = 41 # (MFCCs(39) + Centroid(1) + F0(1))
TARGET_COLUMNS = ['Kermit', 'StatlerWaldorf', 'Fozzie Bear']
TEST_EPISODE_ID = 'Muppets-02-01-01'
TEST_TIME_START_SECONDS = 725 # 12 minutes 5 seconds cus this is the part of the video where all the caracters were visible and anough scenes 

#  1. AGGREGATION AND SLIDING WINDOW 
aggregated_dataset = []
episode_start_frame = 0 

for episode_data in all_episodes_data:
    features = episode_data['features'].T # (N_frames, 41)
    n_frames_episode = features.shape[0]
    
    # Slice frame-level labels for the current episode
    labels_episode = gt_df[TARGET_COLUMNS].values[episode_start_frame : episode_start_frame + n_frames_episode]
    episode_id = episode_data['episode'].replace('.wav', '')
    
    min_length = min(n_frames_episode, len(labels_episode))
    features = features[:min_length, :]
    labels_episode = labels_episode[:min_length, :]

    # Sliding Window Loop
    for i in range(0, min_length - WINDOW_SIZE + 1, HOP_SIZE):
        window_features = features[i : i + WINDOW_SIZE, :]
        window_labels = labels_episode[i : i + WINDOW_SIZE, :]
        
        # Feature Aggregation (mu and sigma) ---
        mu = np.nanmean(window_features, axis=0)
        sigma = np.nanstd(window_features, axis=0)
        
        # Concatenate mean and standard deviation (82 features total)
        feature_vector = np.concatenate([mu, sigma]) 
        
        # Label Aggregation (Majority Voting) 
        presence_counts = np.sum(window_labels, axis=0)
        dominant_index = np.argmax(presence_counts)
        dominant_count = presence_counts[dominant_index]
        
        if dominant_count > (WINDOW_SIZE / 2):
            final_label = TARGET_COLUMNS[dominant_index]
        else:
            final_label = "None/Background"
            
        # Calculate Start Time (in seconds)
        T_start = (episode_start_frame + i) * 0.040 # Time start of the frame
        
    
        aggregated_dataset.append({
            'features': feature_vector,
            'label': final_label,
            'episode_id': episode_id,         
            'time_start': T_start             
        })
        
    episode_start_frame += n_frames_episode

print(f"\nCreated {len(aggregated_dataset)} aggregated event vectors.")

# ----------------------------------------------------------------------
# Step 2: Final dataframe creation and Train/Test splits
# ----------------------------------------------------------------------

# 1. DataFrame
final_df = pd.DataFrame(aggregated_dataset)

# 2. Expand feature vector into separate columns
feature_names = [f'{stat}_{i}' for stat in ['mu', 'sigma'] for i in range(TOTAL_FEATURE_COUNT)]
feature_df = final_df['features'].apply(pd.Series)
feature_df.columns = feature_names

# Concatenate features with metadata
final_df = pd.concat([final_df.drop('features', axis=1), feature_df], axis=1)


# 3. Time-based Split condition
test_condition = (final_df['episode_id'].str.contains(TEST_EPISODE_ID)) & \
                 (final_df['time_start'] >= TEST_TIME_START_SECONDS)

X = final_df.drop(['label', 'episode_id', 'time_start'], axis=1) # Features
y = final_df['label'] 

X_test = X[test_condition]
y_test = y[test_condition]

X_train = X[~test_condition] 
y_train = y[~test_condition]

print(f"\n--- Train/Test Split Summary ---")
print(f"Training set (Train): {len(X_train)} samples")
print(f"Test set (Test): {len(X_test)} samples")
print(f"Test proportion: {len(X_test) / len(final_df):.1%} of total data.")

  mu = np.nanmean(window_features, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  mu = np.nanmean(window_features, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  mu = np.nanmean(window_features, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,



Created 11575 aggregated event vectors.

--- Train/Test Split Summary ---
Training set (Train): 7712 samples
Test set (Test): 3863 samples
Test proportion: 33.4% of total data.


In [31]:
# Checking the class distribution in the splits
class_counts = y_train.value_counts()
total_train = len(y_train)

print("\nüìä Class Distribution (Train Set):")
print(class_counts)
print("\nPercentage Distribution:")
print((class_counts / total_train) * 100)


üìä Class Distribution (Train Set):
label
None/Background    4599
Kermit             2055
Fozzie Bear         791
StatlerWaldorf      267
Name: count, dtype: int64

Percentage Distribution:
label
None/Background    59.634336
Kermit             26.646784
Fozzie Bear        10.256743
StatlerWaldorf      3.462137
Name: count, dtype: float64


In [30]:
# Test split class distribution

class_counts_test = y_test.value_counts()
total_test = len(y_test)

print("\nüìä Class Distribution in the TEST Set:")
print(class_counts_test)
print("\nPercentage Distribution:")
print((class_counts_test / total_test) * 100)


üìä Class Distribution in the TEST Set:
label
None/Background    2433
Kermit             1214
Fozzie Bear         109
StatlerWaldorf      107
Name: count, dtype: int64

Percentage Distribution:
label
None/Background    62.982138
Kermit             31.426353
Fozzie Bear         2.821641
StatlerWaldorf      2.769868
Name: count, dtype: float64


In [33]:
# Step 1: Imputation (Filling NaN values)

# 1. Initialize and FIT Imputer ONLY on X_train
# Strategy: 'mean' (using the mean of each column from the training set)
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train) 

# 2. APPLY Imputer to X_train and X_test
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test.columns, index=X_test.index)

print("1) NaN Imputation completed.")

1) NaN Imputation completed.


In [34]:
# Step 2: Normalization (Feature Scaling)

# 1. Initialize and FIT Scaler ONLY on X_train_imputed
# This fits the scaler to calculate mean (mu) and standard deviation (sigma) only from the training data.
scaler = StandardScaler()
scaler.fit(X_train_imputed) 

# 2. APPLY Scaler to both train and test sets
# The test data is scaled using the mu and sigma derived ONLY from the training data (no data leakage).
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("2) Normalization completed. Data is ready for training.")

2) Normalization completed. Data is ready for training.


In [35]:
# Since the class where characters are not present in the video (None/Background) is the dominant
# we need to do imbalance compensation while training models

# Training SVC with class imbalance compensation (Linear Kernel) 
# class_weight='balanced' automatically adjusts weights inversely proportional to class frequencies.
svc_model = SVC(kernel='linear', C=1, class_weight='balanced', random_state=42) 
svc_model.fit(X_train_scaled, y_train)
y_pred_svc = svc_model.predict(X_test_scaled)
print("\n--- SVC Results (Class Weighted) ---")
print(classification_report(y_test, y_pred_svc, zero_division=0))

# Training Random Forest with class imbalance compensation 
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
print("\n--- Random Forest Results (Class Weighted) ---")
print(classification_report(y_test, y_pred_rf, zero_division=0))

# Training k-NN (Unweighted) 
# k-NN does not natively support class weighting.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn_model.fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)
print("\n--- k-NN Results (Unweighted) ---")
print(classification_report(y_test, y_pred_knn, zero_division=0))


--- SVC Results (Class Weighted) ---
                 precision    recall  f1-score   support

    Fozzie Bear       0.06      0.79      0.12       109
         Kermit       0.39      0.43      0.41      1214
None/Background       0.60      0.16      0.25      2433
 StatlerWaldorf       0.01      0.06      0.02       107

       accuracy                           0.26      3863
      macro avg       0.26      0.36      0.20      3863
   weighted avg       0.50      0.26      0.29      3863


--- Random Forest Results (Class Weighted) ---
                 precision    recall  f1-score   support

    Fozzie Bear       0.00      0.00      0.00       109
         Kermit       0.36      0.04      0.07      1214
None/Background       0.63      0.97      0.76      2433
 StatlerWaldorf       0.00      0.00      0.00       107

       accuracy                           0.62      3863
      macro avg       0.25      0.25      0.21      3863
   weighted avg       0.51      0.62      0.50      38

In [None]:
# SVC Hyperparameter Tuning: Grid Definition 
# C: Regularization parameter. Higher C means a stricter model (aims to increase Precision which is important in our case).
# gamma: RBF kernel coefficient.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1, 1],
    'kernel': ['rbf']
}

# Base SVC model (keeping class_weight='balanced')
svc_base = SVC(class_weight='balanced', random_state=42)

# Initialize GridSearchCV
# Scoring metric is F1-weighted to account for class importance in imbalanced data.
grid_search = GridSearchCV(
    estimator=svc_base, 
    param_grid=param_grid, 
    scoring='f1_weighted',
    cv=3,                  
    verbose=2,             
    n_jobs=-1              
)

print("\n--- Starting GridSearchCV for SVC Tuning ---")

# Train Grid Search (using previously scaled training data)
grid_search.fit(X_train_scaled, y_train)

# Reporting Tuning Results 
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_

print("\n Tuning completed.")
print(f"Best parameters based on F1-weighted: {best_params}")
print(f"Best F1-weighted score (CV): {best_score:.4f}")

# Evaluate the best model on the unseen test set
y_pred_tuned = best_model.predict(X_test_scaled)

print("\n--- Tuned SVC (RBF) Results on Test Set ---")
print(classification_report(y_test, y_pred_tuned, zero_division=0))


--- Starting GridSearchCV for SVC Tuning ---
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   3.8s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   3.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   3.8s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   3.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   3.9s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   3.9s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   4.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   3.9s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   4.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   4.3s
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time=   2.8s
[C

We're done with the audio phase. Our best audio model (SVC) is good at finding the characters (high Recall), but it often makes mistakes (low Precision). This shows the limitation of using audio features alone.

## What's Next?

To truly boost accuracy, we need to bring in visual information.
1) Get the Visual Features (the face data your partner extracted).
2) Combine these features with our current audio features.
3) Retrain the Model using this combined dataset.

This should dramatically improve our ability to accurately determine who is speaking. Get ready for Feature Fusion.