In [30]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [31]:
# Load the final cleaned dataset
df = pd.read_parquet("en_merged_df.parquet")

# Splitting Data on File Level

In [32]:
df["speaker_id"] = df["filename"].apply(lambda x: x.split("/wav/")[0])

In [33]:
unique_speakers = df["speaker_id"].unique()

In [34]:
train_speakers, temp_speakers = train_test_split(unique_speakers, test_size=0.3, random_state=42)
val_speakers, test_speakers = train_test_split(temp_speakers, test_size=0.5, random_state=42)

In [35]:
def assign_split(speaker_id):
    if speaker_id in train_speakers:
        return "train"
    elif speaker_id in val_speakers:
        return "val"
    elif speaker_id in test_speakers:
        return "test"
    else:
        return "unknown"  # safety net

df["split"] = df["speaker_id"].apply(assign_split)

In [36]:
df["split"].value_counts()

split
train    31464
val       7076
test      6755
Name: count, dtype: int64

In [37]:
df

Unnamed: 0,filename,mean_freq_kHz,std_freq_kHz,median_freq_kHz,first_quantile_kHz,third_quantile_kHz,iqr_kHz,skewness,kurtosis,mode_freq_kHz,peak_freq_kHz,sp_entropy,flatness,centroid_kHz,modindx,gender,age_range,speaker_id,split
0,robin-20070310-vf12/wav/vf12-34.wav,3.999961,2.309424,3.999961,1.999981,5.999942,3.999961,4.289625,22.129724,0.596192,0.596192,13.797732,0.131873,1.104101,2.316162,male,adult,robin-20070310-vf12,train
1,robin-20070310-vf12/wav/vf12-29.wav,4.000000,2.309454,4.000000,2.000000,6.000000,4.000000,4.468912,24.387798,0.518989,0.518989,13.671813,0.145211,1.173674,2.266816,male,adult,robin-20070310-vf12,train
2,robin-20070310-vf12/wav/vf12-24.wav,4.000000,2.309442,4.000000,2.000000,6.000000,4.000000,4.768208,28.682903,0.522362,0.522362,13.983390,0.154861,1.182493,2.340302,male,adult,robin-20070310-vf12,train
3,robin-20070310-vf12/wav/vf12-06.wav,4.000000,2.309460,4.000000,2.000000,6.000000,4.000000,4.288241,22.928845,0.165990,0.165990,13.383058,0.113975,1.056596,2.309039,male,adult,robin-20070310-vf12,train
4,robin-20070310-vf12/wav/vf12-38.wav,3.999964,2.309422,3.999964,1.999982,5.999946,3.999964,4.437734,23.512740,0.327474,0.327474,14.031024,0.154637,1.212154,2.255088,male,adult,robin-20070310-vf12,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45290,Luis-20130226-peg/wav/b0325.wav,4.000000,2.309476,4.000000,2.000000,6.000000,4.000000,8.821168,99.341927,0.127484,0.127484,12.227013,0.186738,0.926109,3.981581,male,adult,Luis-20130226-peg,val
45291,Luis-20130226-peg/wav/b0323.wav,4.000000,2.309447,4.000000,2.000000,6.000000,4.000000,7.781451,82.411050,0.121120,0.121120,12.875872,0.149606,0.784108,3.738265,male,adult,Luis-20130226-peg,val
45292,Luis-20130226-peg/wav/b0329.wav,4.000000,2.309484,4.000000,2.000000,6.000000,4.000000,8.237866,97.468529,0.143143,0.143143,12.128605,0.123457,0.790771,3.505875,male,adult,Luis-20130226-peg,val
45293,Luis-20130226-peg/wav/b0327.wav,4.000000,2.309497,4.000000,2.000000,6.000000,4.000000,8.133196,93.356766,0.117333,0.117333,11.977746,0.152190,0.854203,3.538266,male,adult,Luis-20130226-peg,val


# Handling Categorical Variables

In [38]:
le = LabelEncoder()
df["gender_encoded"] = le.fit_transform(df["gender"])

# Confirm encoding
print("Label encoding map:", dict(zip(le.classes_, le.transform(le.classes_))))

Label encoding map: {'female': 0, 'male': 1}


In [39]:
feature_cols = [
    "mean_freq_kHz",     # Central tendency
    "std_freq_kHz",      # Spread of frequencies
    "skewness",          # Shape of spectrum
    "kurtosis",          # Shape of spectrum
    "mode_freq_kHz",     # Dominant frequency
    "sp_entropy",        # Spectral entropy
    "flatness",          # Spectral flatness (tonal vs. noise-like)
    "centroid_kHz",      # Spectral centroid (perceived brightness)
    "modindx"            # Modulation index (amplitude variation)
    #"age_range" 
]


In [40]:
X_train = df[df["split"] == "train"][feature_cols]
y_train = df[df["split"] == "train"]["gender_encoded"]

X_val = df[df["split"] == "val"][feature_cols]
y_val = df[df["split"] == "val"]["gender_encoded"]

X_test = df[df["split"] == "test"][feature_cols]
y_test = df[df["split"] == "test"]["gender_encoded"]

# ML

In [41]:

models = {
    "Dummy (Most Frequent)": DummyClassifier(strategy="most_frequent"),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Elastic Net (Logistic)": LogisticRegression(penalty="elasticnet", l1_ratio=0.5, solver="saga", max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Support Vector Machine": SVC(),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

results = []

# Training, validation, and test evaluation loop WITHOUT imbalance handling
for name, model in models.items():
    print(f"\n=== {name} ===")

    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", model)
    ])

    # Fit on training data
    pipeline.fit(X_train, y_train)

    # Train evaluation
    train_preds = pipeline.predict(X_train)
    train_acc = accuracy_score(y_train, train_preds)
    print(f"Train Accuracy: {train_acc:.4f}")
    print("Train Classification Report:")
    print(classification_report(y_train, train_preds, target_names=le.classes_))

    # Validation evaluation
    val_preds = pipeline.predict(X_val)
    val_acc = accuracy_score(y_val, val_preds)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print("Validation Classification Report:")
    print(classification_report(y_val, val_preds, target_names=le.classes_))

    # Test evaluation
    test_preds = pipeline.predict(X_test)
    test_acc = accuracy_score(y_test, test_preds)
    print(f"Test Accuracy: {test_acc:.4f}")
    print("Test Classification Report:")
    print(classification_report(y_test, test_preds, target_names=le.classes_))

    # Store results for summary
    results.append({
        "Model": name,
        "Train Accuracy": train_acc,
        "Validation Accuracy": val_acc,
        "Test Accuracy": test_acc
    })

# Show summary table
results_df = pd.DataFrame(results)
print("\n=== Summary Table ===")
print(results_df.sort_values(by="Test Accuracy", ascending=False).reset_index(drop=True))



=== Dummy (Most Frequent) ===
Train Accuracy: 0.9302
Train Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00      2195
        male       0.93      1.00      0.96     29269

    accuracy                           0.93     31464
   macro avg       0.47      0.50      0.48     31464
weighted avg       0.87      0.93      0.90     31464

Validation Accuracy: 0.9275
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       513
        male       0.93      1.00      0.96      6563

    accuracy                           0.93      7076
   macro avg       0.46      0.50      0.48      7076
weighted avg       0.86      0.93      0.89      7076

Test Accuracy: 0.9513
Test Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       329
        male       0.95      1.00      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Train Accuracy: 0.9302
Train Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00      2195
        male       0.93      1.00      0.96     29269

    accuracy                           0.93     31464
   macro avg       0.47      0.50      0.48     31464
weighted avg       0.87      0.93      0.90     31464

Validation Accuracy: 0.9275
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       513
        male       0.93      1.00      0.96      6563

    accuracy                           0.93      7076
   macro avg       0.46      0.50      0.48      7076
weighted avg       0.86      0.93      0.89      7076

Test Accuracy: 0.9513
Test Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       329
        male       0.95      1.00      0.98      6426

    accuracy      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Accuracy: 0.9423
Train Classification Report:
              precision    recall  f1-score   support

      female       0.73      0.27      0.40      2195
        male       0.95      0.99      0.97     29269

    accuracy                           0.94     31464
   macro avg       0.84      0.63      0.68     31464
weighted avg       0.93      0.94      0.93     31464

Validation Accuracy: 0.9199
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.34      0.11      0.17       513
        male       0.93      0.98      0.96      6563

    accuracy                           0.92      7076
   macro avg       0.64      0.55      0.56      7076
weighted avg       0.89      0.92      0.90      7076

Test Accuracy: 0.9405
Test Classification Report:
              precision    recall  f1-score   support

      female       0.27      0.13      0.17       329
        male       0.96      0.98      0.97      6426

    accuracy      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Accuracy: 1.0000
Train Classification Report:
              precision    recall  f1-score   support

      female       1.00      1.00      1.00      2195
        male       1.00      1.00      1.00     29269

    accuracy                           1.00     31464
   macro avg       1.00      1.00      1.00     31464
weighted avg       1.00      1.00      1.00     31464

Validation Accuracy: 0.8916
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.25      0.26      0.25       513
        male       0.94      0.94      0.94      6563

    accuracy                           0.89      7076
   macro avg       0.60      0.60      0.60      7076
weighted avg       0.89      0.89      0.89      7076

Test Accuracy: 0.9072
Test Classification Report:
              precision    recall  f1-score   support

      female       0.19      0.28      0.22       329
        male       0.96      0.94      0.95      6426

    accuracy      



Train Accuracy: 0.9303
Train Classification Report:
              precision    recall  f1-score   support

      female       1.00      0.00      0.00      2195
        male       0.93      1.00      0.96     29269

    accuracy                           0.93     31464
   macro avg       0.97      0.50      0.48     31464
weighted avg       0.94      0.93      0.90     31464

Validation Accuracy: 0.9275
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       513
        male       0.93      1.00      0.96      6563

    accuracy                           0.93      7076
   macro avg       0.46      0.50      0.48      7076
weighted avg       0.86      0.93      0.89      7076

Test Accuracy: 0.9513
Test Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       329
        male       0.95      1.00      0.98      6426

    accuracy      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Accuracy: 0.9305
Train Classification Report:
              precision    recall  f1-score   support

      female       1.00      0.00      0.01      2195
        male       0.93      1.00      0.96     29269

    accuracy                           0.93     31464
   macro avg       0.97      0.50      0.49     31464
weighted avg       0.94      0.93      0.90     31464

Validation Accuracy: 0.9275
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       513
        male       0.93      1.00      0.96      6563

    accuracy                           0.93      7076
   macro avg       0.46      0.50      0.48      7076
weighted avg       0.86      0.93      0.89      7076



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Accuracy: 0.9513
Test Classification Report:
              precision    recall  f1-score   support

      female       0.00      0.00      0.00       329
        male       0.95      1.00      0.98      6426

    accuracy                           0.95      6755
   macro avg       0.48      0.50      0.49      6755
weighted avg       0.90      0.95      0.93      6755


=== MLP Classifier ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Accuracy: 0.9333
Train Classification Report:
              precision    recall  f1-score   support

      female       0.80      0.06      0.11      2195
        male       0.93      1.00      0.97     29269

    accuracy                           0.93     31464
   macro avg       0.87      0.53      0.54     31464
weighted avg       0.92      0.93      0.91     31464

Validation Accuracy: 0.9302
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.69      0.07      0.12       513
        male       0.93      1.00      0.96      6563

    accuracy                           0.93      7076
   macro avg       0.81      0.53      0.54      7076
weighted avg       0.91      0.93      0.90      7076

Test Accuracy: 0.9535
Test Classification Report:
              precision    recall  f1-score   support

      female       0.71      0.08      0.14       329
        male       0.95      1.00      0.98      6426

    accuracy      