In [1]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
df = pd.read_parquet("en_merged_df.parquet")

In [3]:
df["speaker_id"] = df["filename"].apply(lambda x: x.split("/wav/")[0])

In [4]:
unique_speakers = df["speaker_id"].unique()

In [5]:
train_speakers, temp_speakers = train_test_split(unique_speakers, test_size=0.3, random_state=42)
val_speakers, test_speakers = train_test_split(temp_speakers, test_size=0.5, random_state=42)

In [6]:
def assign_split(speaker_id):
    if speaker_id in train_speakers:
        return "train"
    elif speaker_id in val_speakers:
        return "val"
    elif speaker_id in test_speakers:
        return "test"
    else:
        return "unknown"  # safety net

df["split"] = df["speaker_id"].apply(assign_split)

In [7]:
df["split"].value_counts()

split
train    31464
val       7076
test      6755
Name: count, dtype: int64

In [8]:
le = LabelEncoder()
df["gender_encoded"] = le.fit_transform(df["gender"])

# Confirm encoding
print("Label encoding map:", dict(zip(le.classes_, le.transform(le.classes_))))

Label encoding map: {'female': 0, 'male': 1}


In [9]:
feature_cols = [
    "mean_freq_kHz",     # Central tendency
    "std_freq_kHz",      # Spread of frequencies
    "skewness",          # Shape of spectrum
    "kurtosis",          # Shape of spectrum
    "mode_freq_kHz",     # Dominant frequency
    "sp_entropy",        # Spectral entropy
    "flatness",          # Spectral flatness (tonal vs. noise-like)
    "centroid_kHz",      # Spectral centroid (perceived brightness)
    "modindx"            # Modulation index (amplitude variation)
    #"age_range" if you one-hot encode it
]


In [10]:
X_train = df[df["split"] == "train"][feature_cols]
y_train = df[df["split"] == "train"]["gender_encoded"]

X_val = df[df["split"] == "val"][feature_cols]
y_val = df[df["split"] == "val"]["gender_encoded"]

X_test = df[df["split"] == "test"][feature_cols]
y_test = df[df["split"] == "test"]["gender_encoded"]

# ML

# Oversampling

In [14]:
import optuna
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Feature matrix and labels assumed already defined: X_train, y_train, X_val, y_val, X_test, y_test

def objective(trial):
    # Define the hyperparameter search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": 42,
    }

    # Define pipeline
    pipeline = Pipeline([
        ("oversample", RandomOverSampler(random_state=42)),
        ("scaler", StandardScaler()),
        ("classifier", XGBClassifier(**params))
    ])

    # Cross-validation on training data
    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="accuracy", n_jobs=-1)
    return np.mean(scores)


  from .autonotebook import tqdm as notebook_tqdm


In [15]:

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best hyperparameters found:")
print(study.best_params)


[I 2025-03-23 15:16:43,805] A new study created in memory with name: no-name-977a6d19-d183-495a-9f66-4f96c7eeef1d
[I 2025-03-23 15:16:46,178] Trial 0 finished with value: 0.8733473175692855 and parameters: {'n_estimators': 74, 'max_depth': 15, 'learning_rate': 0.24553776217940362, 'subsample': 0.693171679624428, 'colsample_bytree': 0.8141935037619459, 'gamma': 4.777468765090615, 'reg_alpha': 0.45584490024889035, 'reg_lambda': 0.6908523421294549}. Best is trial 0 with value: 0.8733473175692855.
[I 2025-03-23 15:16:49,085] Trial 1 finished with value: 0.9089117721840835 and parameters: {'n_estimators': 288, 'max_depth': 8, 'learning_rate': 0.24812746824277687, 'subsample': 0.6774482756941634, 'colsample_bytree': 0.6403688808145922, 'gamma': 0.7065138513189645, 'reg_alpha': 0.2669619712668986, 'reg_lambda': 0.4851307865222062}. Best is trial 1 with value: 0.9089117721840835.
[I 2025-03-23 15:16:51,750] Trial 2 finished with value: 0.8820556826849734 and parameters: {'n_estimators': 105, '

Best hyperparameters found:
{'n_estimators': 200, 'max_depth': 14, 'learning_rate': 0.20968947421630618, 'subsample': 0.5875872912700031, 'colsample_bytree': 0.5431407113954335, 'gamma': 0.025317219856460224, 'reg_alpha': 0.6928266846555541, 'reg_lambda': 0.004074430324537054}


In [16]:
# Retrain model using best params
best_xgb = XGBClassifier(**study.best_params)

pipeline = Pipeline([
    ("oversample", RandomOverSampler(random_state=42)),
    ("scaler", StandardScaler()),
    ("classifier", best_xgb)
])

pipeline.fit(X_train, y_train)

In [17]:

# Evaluation on train
train_preds = pipeline.predict(X_train)
print("\nTrain Accuracy:", accuracy_score(y_train, train_preds))
print("Train Classification Report:")
print(classification_report(y_train, train_preds, target_names=le.classes_))

# Evaluation on validation
val_preds = pipeline.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Classification Report:")
print(classification_report(y_val, val_preds, target_names=le.classes_))

# Evaluation on test
test_preds = pipeline.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, test_preds))
print("Test Classification Report:")
print(classification_report(y_test, test_preds, target_names=le.classes_))


Train Accuracy: 1.0
Train Classification Report:
              precision    recall  f1-score   support

      female       1.00      1.00      1.00      2195
        male       1.00      1.00      1.00     29269

    accuracy                           1.00     31464
   macro avg       1.00      1.00      1.00     31464
weighted avg       1.00      1.00      1.00     31464


Validation Accuracy: 0.9208592425098926
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.41      0.22      0.29       513
        male       0.94      0.98      0.96      6563

    accuracy                           0.92      7076
   macro avg       0.68      0.60      0.62      7076
weighted avg       0.90      0.92      0.91      7076


Test Accuracy: 0.93960029607698
Test Classification Report:
              precision    recall  f1-score   support

      female       0.34      0.25      0.29       329
        male       0.96      0.97      0.97      64

In [11]:
import optuna
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Feature matrix and labels assumed already defined: X_train, y_train, X_val, y_val, X_test, y_test

def objective(trial):
    # Define the hyperparameter search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": 42,
    }

    # Define pipeline
    pipeline = Pipeline([
        ("oversample", RandomOverSampler(random_state=42)),
        ("scaler", StandardScaler()),
        ("classifier", XGBClassifier(**params))
    ])

    # Cross-validation with macro F1-score
    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro", n_jobs=-1)
    return np.mean(scores)


  from .autonotebook import tqdm as notebook_tqdm


In [12]:

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best hyperparameters found:")
print(study.best_params)

# Retrain model using best params
best_xgb = XGBClassifier(**study.best_params)

pipeline = Pipeline([
    ("oversample", RandomOverSampler(random_state=42)),
    ("scaler", StandardScaler()),
    ("classifier", best_xgb)
])

pipeline.fit(X_train, y_train)


[I 2025-03-23 18:30:13,235] A new study created in memory with name: no-name-375eb856-5a4e-4ba7-9f90-029b1bce7eda
[I 2025-03-23 18:30:16,048] Trial 0 finished with value: 0.6300419037812647 and parameters: {'n_estimators': 120, 'max_depth': 12, 'learning_rate': 0.14003696735860643, 'subsample': 0.6280217750498023, 'colsample_bytree': 0.565042147225761, 'gamma': 0.05079585439047685, 'reg_alpha': 0.5875571912132664, 'reg_lambda': 0.7652296895035527}. Best is trial 0 with value: 0.6300419037812647.
[I 2025-03-23 18:30:17,927] Trial 1 finished with value: 0.6366308035712703 and parameters: {'n_estimators': 199, 'max_depth': 15, 'learning_rate': 0.2342269360374533, 'subsample': 0.9570504254506798, 'colsample_bytree': 0.9631565271224696, 'gamma': 0.7105404038336471, 'reg_alpha': 0.9811339920832082, 'reg_lambda': 0.9405300948662416}. Best is trial 1 with value: 0.6366308035712703.
[I 2025-03-23 18:30:19,333] Trial 2 finished with value: 0.5580935525503166 and parameters: {'n_estimators': 109,

Best hyperparameters found:
{'n_estimators': 219, 'max_depth': 12, 'learning_rate': 0.041063183344853746, 'subsample': 0.7773114382407827, 'colsample_bytree': 0.7776023846777059, 'gamma': 1.8007643554939714, 'reg_alpha': 0.5595368794599336, 'reg_lambda': 0.03884561928189753}


In [13]:

# Evaluation on train
train_preds = pipeline.predict(X_train)
print("\nTrain Accuracy:", accuracy_score(y_train, train_preds))
print("Train Classification Report:")
print(classification_report(y_train, train_preds, target_names=le.classes_))

# Evaluation on validation
val_preds = pipeline.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Classification Report:")
print(classification_report(y_val, val_preds, target_names=le.classes_))

# Evaluation on test
test_preds = pipeline.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, test_preds))
print("Test Classification Report:")
print(classification_report(y_test, test_preds, target_names=le.classes_))

# Optional: print macro F1 explicitly
print("\nMacro F1 (Test):", f1_score(y_test, test_preds, average="macro"))



Train Accuracy: 0.9682176455631833
Train Classification Report:
              precision    recall  f1-score   support

      female       0.69      1.00      0.81      2195
        male       1.00      0.97      0.98     29269

    accuracy                           0.97     31464
   macro avg       0.84      0.98      0.90     31464
weighted avg       0.98      0.97      0.97     31464


Validation Accuracy: 0.8975409836065574
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.33      0.41      0.37       513
        male       0.95      0.94      0.94      6563

    accuracy                           0.90      7076
   macro avg       0.64      0.67      0.66      7076
weighted avg       0.91      0.90      0.90      7076


Test Accuracy: 0.8991857883049593
Test Classification Report:
              precision    recall  f1-score   support

      female       0.20      0.36      0.26       329
        male       0.97      0.93 

# SMOTE Tomek

In [11]:
import optuna
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Feature matrix and labels assumed already defined:
# X_train, y_train, X_val, y_val, X_test, y_test
# le is your LabelEncoder

def objective(trial):
    # Hyperparameter search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": 42,
    }

    # SMOTETomek pipeline
    pipeline = Pipeline([
        ("smotetomek", SMOTETomek(random_state=42)),
        ("scaler", StandardScaler()),
        ("classifier", XGBClassifier(**params))
    ])

    # Cross-validation with macro F1-score
    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro", n_jobs=-1)
    return np.mean(scores)

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best hyperparameters found:")
print(study.best_params)

# Retrain best model using SMOTETomek
best_xgb = XGBClassifier(**study.best_params)

pipeline = Pipeline([
    ("smotetomek", SMOTETomek(random_state=42)),
    ("scaler", StandardScaler()),
    ("classifier", best_xgb)
])

pipeline.fit(X_train, y_train)

# Train evaluation
train_preds = pipeline.predict(X_train)
print("\nTrain Accuracy:", accuracy_score(y_train, train_preds))
print("Train Classification Report:")
print(classification_report(y_train, train_preds, target_names=le.classes_))

# Validation evaluation
val_preds = pipeline.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Classification Report:")
print(classification_report(y_val, val_preds, target_names=le.classes_))

# Test evaluation
test_preds = pipeline.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, test_preds))
print("Test Classification Report:")
print(classification_report(y_test, test_preds, target_names=le.classes_))

# Optional: macro F1
print("\nMacro F1 (Test):", f1_score(y_test, test_preds, average="macro"))


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-03-23 18:40:39,584] A new study created in memory with name: no-name-8d17216c-650a-40e6-948b-c8ec1221df7b
[I 2025-03-23 18:40:42,129] Trial 0 finished with value: 0.5893669832017372 and parameters: {'n_estimators': 279, 'max_depth': 6, 'learning_rate': 0.2793784816890019, 'subsample': 0.6873841608589786, 'colsample_bytree': 0.5675150867406161, 'gamma': 4.604096287922444, 'reg_alpha': 0.05214510541027151, 'reg_lambda': 0.14707906288053774}. Best is trial 0 with value: 0.5893669832017372.
[I 2025-03-23 18:40:43,982] Trial 1 finished with value: 0.5430859302684873 and parameters: {'n_estimators': 260, 'max_depth': 3, 'learning_rate': 0.040535356339987214, 'subsample': 0.6181570259559241, 'colsample_bytree': 0.6359427436804008, 'gamma': 3.412670404670781, 'reg_alpha': 0.012722848065426007, 'reg_lambda': 0.9303799029346164}. Best is trial 0 with value: 0.5893669832017372.
[I 2025-03-23 18:40:46,211] Trial 2 finished with value: 0.605

Best hyperparameters found:
{'n_estimators': 171, 'max_depth': 11, 'learning_rate': 0.0672235976504656, 'subsample': 0.7776496707686625, 'colsample_bytree': 0.9104667535566999, 'gamma': 0.42006826902189576, 'reg_alpha': 0.44297534982757286, 'reg_lambda': 0.6251173660319584}

Train Accuracy: 0.9573798627002288
Train Classification Report:
              precision    recall  f1-score   support

      female       0.64      0.88      0.74      2195
        male       0.99      0.96      0.98     29269

    accuracy                           0.96     31464
   macro avg       0.82      0.92      0.86     31464
weighted avg       0.97      0.96      0.96     31464


Validation Accuracy: 0.8794516676088185
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.26      0.37      0.31       513
        male       0.95      0.92      0.93      6563

    accuracy                           0.88      7076
   macro avg       0.61      0.64      0

In [11]:
import optuna
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Feature matrix and labels assumed already defined:
# X_train, y_train, X_val, y_val, X_test, y_test
# le is your LabelEncoder

def objective(trial):
    # Hyperparameter search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": 42,
    }

    # Undersampling pipeline
    pipeline = Pipeline([
        ("undersample", RandomUnderSampler(random_state=42)),
        ("scaler", StandardScaler()),
        ("classifier", XGBClassifier(**params))
    ])

    # Cross-validation with macro F1-score
    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro", n_jobs=-1)
    return np.mean(scores)

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best hyperparameters found:")
print(study.best_params)

# Retrain best model using undersampling
best_xgb = XGBClassifier(**study.best_params)

pipeline = Pipeline([
    ("undersample", RandomUnderSampler(random_state=42)),
    ("scaler", StandardScaler()),
    ("classifier", best_xgb)
])

pipeline.fit(X_train, y_train)

# Train evaluation
train_preds = pipeline.predict(X_train)
print("\nTrain Accuracy:", accuracy_score(y_train, train_preds))
print("Train Classification Report:")
print(classification_report(y_train, train_preds, target_names=le.classes_))

# Validation evaluation
val_preds = pipeline.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Classification Report:")
print(classification_report(y_val, val_preds, target_names=le.classes_))

# Test evaluation
test_preds = pipeline.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, test_preds))
print("Test Classification Report:")
print(classification_report(y_test, test_preds, target_names=le.classes_))

# Optional: macro F1
print("\nMacro F1 (Test):", f1_score(y_test, test_preds, average="macro"))


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-03-23 18:44:11,579] A new study created in memory with name: no-name-ad33511c-00f1-465a-9362-b26f3e574cd8
[I 2025-03-23 18:44:13,849] Trial 0 finished with value: 0.5499533208534247 and parameters: {'n_estimators': 197, 'max_depth': 11, 'learning_rate': 0.021385885872180346, 'subsample': 0.7841320100611187, 'colsample_bytree': 0.5561040583425645, 'gamma': 0.4953497868877882, 'reg_alpha': 0.2069674041610874, 'reg_lambda': 0.7141427837604206}. Best is trial 0 with value: 0.5499533208534247.
[I 2025-03-23 18:44:15,051] Trial 1 finished with value: 0.5365480152128894 and parameters: {'n_estimators': 253, 'max_depth': 7, 'learning_rate': 0.25245749492262276, 'subsample': 0.6306544746551641, 'colsample_bytree': 0.9404999454622225, 'gamma': 2.615859683069916, 'reg_alpha': 0.902302203647613, 'reg_lambda': 0.7911005975045402}. Best is trial 0 with value: 0.5499533208534247.
[I 2025-03-23 18:44:16,309] Trial 2 finished with value: 0.52476

Best hyperparameters found:
{'n_estimators': 172, 'max_depth': 14, 'learning_rate': 0.14282121923508737, 'subsample': 0.9514454740102135, 'colsample_bytree': 0.8839137369991612, 'gamma': 0.5338527537212201, 'reg_alpha': 0.2261312332256606, 'reg_lambda': 0.7536984725385162}

Train Accuracy: 0.7852148487159929
Train Classification Report:
              precision    recall  f1-score   support

      female       0.24      0.99      0.39      2195
        male       1.00      0.77      0.87     29269

    accuracy                           0.79     31464
   macro avg       0.62      0.88      0.63     31464
weighted avg       0.95      0.79      0.84     31464


Validation Accuracy: 0.747880158281515
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.19      0.74      0.30       513
        male       0.97      0.75      0.85      6563

    accuracy                           0.75      7076
   macro avg       0.58      0.74      0.5

# ADASYN

In [11]:
import optuna
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Feature matrix and labels assumed already defined:
# X_train, y_train, X_val, y_val, X_test, y_test
# le is your LabelEncoder

def objective(trial):
    # Hyperparameter search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": 42,
    }

    # ADASYN pipeline
    pipeline = Pipeline([
        ("adasyn", ADASYN(random_state=42)),
        ("scaler", StandardScaler()),
        ("classifier", XGBClassifier(**params))
    ])

    # Cross-validation with macro F1-score
    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro", n_jobs=-1)
    return np.mean(scores)

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best hyperparameters found:")
print(study.best_params)

# Retrain best model using ADASYN
best_xgb = XGBClassifier(**study.best_params)

pipeline = Pipeline([
    ("adasyn", ADASYN(random_state=42)),
    ("scaler", StandardScaler()),
    ("classifier", best_xgb)
])

pipeline.fit(X_train, y_train)

# Train evaluation
train_preds = pipeline.predict(X_train)
print("\nTrain Accuracy:", accuracy_score(y_train, train_preds))
print("Train Classification Report:")
print(classification_report(y_train, train_preds, target_names=le.classes_))

# Validation evaluation
val_preds = pipeline.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Classification Report:")
print(classification_report(y_val, val_preds, target_names=le.classes_))

# Test evaluation
test_preds = pipeline.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, test_preds))
print("Test Classification Report:")
print(classification_report(y_test, test_preds, target_names=le.classes_))

# Optional: macro F1
print("\nMacro F1 (Test):", f1_score(y_test, test_preds, average="macro"))


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-03-23 18:46:44,079] A new study created in memory with name: no-name-2fed700d-d6e5-462a-8ed1-1dd6a254589c
[I 2025-03-23 18:46:46,684] Trial 0 finished with value: 0.5860564230968628 and parameters: {'n_estimators': 177, 'max_depth': 8, 'learning_rate': 0.033430871288612926, 'subsample': 0.9363104114307427, 'colsample_bytree': 0.8582312941765564, 'gamma': 4.7140421837254385, 'reg_alpha': 0.08197601562490686, 'reg_lambda': 0.5778276816136512}. Best is trial 0 with value: 0.5860564230968628.
[I 2025-03-23 18:46:48,377] Trial 1 finished with value: 0.6013871645483103 and parameters: {'n_estimators': 55, 'max_depth': 10, 'learning_rate': 0.18581238988896498, 'subsample': 0.57816473222934, 'colsample_bytree': 0.9535551881460389, 'gamma': 2.645358140364684, 'reg_alpha': 0.9710705422716666, 'reg_lambda': 0.5628858840231029}. Best is trial 1 with value: 0.6013871645483103.
[I 2025-03-23 18:46:50,948] Trial 2 finished with value: 0.605490

Best hyperparameters found:
{'n_estimators': 220, 'max_depth': 15, 'learning_rate': 0.10226274219175374, 'subsample': 0.7742182613522712, 'colsample_bytree': 0.9187814128596935, 'gamma': 1.6642034183673025, 'reg_alpha': 0.778555268795029, 'reg_lambda': 0.7815503139665414}

Train Accuracy: 0.9627192982456141
Train Classification Report:
              precision    recall  f1-score   support

      female       0.69      0.85      0.76      2195
        male       0.99      0.97      0.98     29269

    accuracy                           0.96     31464
   macro avg       0.84      0.91      0.87     31464
weighted avg       0.97      0.96      0.96     31464


Validation Accuracy: 0.8852459016393442
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.28      0.38      0.32       513
        male       0.95      0.93      0.94      6563

    accuracy                           0.89      7076
   macro avg       0.62      0.65      0.6

# Undersampling

In [11]:
import optuna
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Assumed already defined: X_train, y_train, X_val, y_val, X_test, y_test, le

def objective(trial):
    # Hyperparameter space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": 42,
    }

    # Undersample pipeline (slightly above minority class size)
    pipeline = Pipeline([
        ("undersample", RandomUnderSampler(sampling_strategy=0.6, random_state=42)),
        ("scaler", StandardScaler()),
        ("classifier", XGBClassifier(**params))
    ])

    # Cross-validation with macro F1
    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1_macro", n_jobs=-1)
    return np.mean(scores)

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best hyperparameters found:")
print(study.best_params)

# Retrain model using best parameters and undersampling
best_xgb = XGBClassifier(**study.best_params)

pipeline = Pipeline([
    ("undersample", RandomUnderSampler(sampling_strategy=1, random_state=42)),
    ("scaler", StandardScaler()),
    ("classifier", best_xgb)
])

pipeline.fit(X_train, y_train)

# Train evaluation
train_preds = pipeline.predict(X_train)
print("\nTrain Accuracy:", accuracy_score(y_train, train_preds))
print("Train Classification Report:")
print(classification_report(y_train, train_preds, target_names=le.classes_))

# Validation evaluation
val_preds = pipeline.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Classification Report:")
print(classification_report(y_val, val_preds, target_names=le.classes_))

# Test evaluation
test_preds = pipeline.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, test_preds))
print("Test Classification Report:")
print(classification_report(y_test, test_preds, target_names=le.classes_))

# Optional: macro F1
print("\nMacro F1 (Test):", f1_score(y_test, test_preds, average="macro"))


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-03-23 19:09:54,599] A new study created in memory with name: no-name-ed2d6ba4-7606-49ab-a5d6-56dc0aa02957
[I 2025-03-23 19:09:56,160] Trial 0 finished with value: 0.5897591943982867 and parameters: {'n_estimators': 113, 'max_depth': 7, 'learning_rate': 0.24707740121121868, 'subsample': 0.664521997223514, 'colsample_bytree': 0.6072266695973763, 'gamma': 1.769620816129796, 'reg_alpha': 0.6999452822116049, 'reg_lambda': 0.5374051000201677}. Best is trial 0 with value: 0.5897591943982867.
[I 2025-03-23 19:09:57,628] Trial 1 finished with value: 0.6076228308899801 and parameters: {'n_estimators': 165, 'max_depth': 14, 'learning_rate': 0.028263281347568094, 'subsample': 0.610810273656256, 'colsample_bytree': 0.7950244806193645, 'gamma': 1.5154159653151544, 'reg_alpha': 0.8089002351957171, 'reg_lambda': 0.14028847329816763}. Best is trial 1 with value: 0.6076228308899801.
[I 2025-03-23 19:09:58,697] Trial 2 finished with value: 0.60428

Best hyperparameters found:
{'n_estimators': 233, 'max_depth': 11, 'learning_rate': 0.04135110417580229, 'subsample': 0.8752717161995178, 'colsample_bytree': 0.7959389275350619, 'gamma': 4.992764818676129, 'reg_alpha': 0.7626912434823421, 'reg_lambda': 0.17087345520938263}

Train Accuracy: 0.7566742944317315
Train Classification Report:
              precision    recall  f1-score   support

      female       0.20      0.82      0.32      2195
        male       0.98      0.75      0.85     29269

    accuracy                           0.76     31464
   macro avg       0.59      0.78      0.59     31464
weighted avg       0.93      0.76      0.81     31464


Validation Accuracy: 0.7498586772187676
Validation Classification Report:
              precision    recall  f1-score   support

      female       0.18      0.71      0.29       513
        male       0.97      0.75      0.85      6563

    accuracy                           0.75      7076
   macro avg       0.58      0.73      0.