# Classifying Audio Data using AST and Logistic Regression

[ADD BREIF INTRODUCTION TO THE PRIJECT]

## Step 1 : Get the Data from the Recanovo Project

Importing necessary packages and libraries

In [1]:
import functools
from pathlib import Path


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    log_loss,
    recall_score,
)
from sklearn.model_selection import (
    cross_val_predict,
    StratifiedKFold,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#self note: removed the skopt line below so it'll work, if it gives errors
from skopt import BayesSearchCV
# import torch
# import torchaudio
# from tqdm.notebook import tqdm


def to_prob(metric):
    @functools.wraps(metric)
    def metric_that_takes_prob(y_actual, y_pred, sample_weight=None):
        return metric(y_actual, y_pred.argmax(1), sample_weight=sample_weight)

    return metric_that_takes_prob


metrics = {
    "accuracy": to_prob(accuracy_score),
    "balanced_accuracy": to_prob(balanced_accuracy_score),
    "unweighted_f1": to_prob(functools.partial(f1_score, average="macro")),
    "UAR": to_prob(functools.partial(recall_score, average="macro")),
    "logloss": log_loss,
}

The train test split is made so at least one whole session is in the test set


In [2]:
#create a dataframe to hold the data
df = pd.read_csv('new_train_test.csv')
#check labels
display(df.Label.value_counts())

Label
selftalk                  1885
frustrated                1536
delighted                 1272
dysregulated               704
social                     634
request                    419
affectionate               129
yes                        123
laughter                    93
dysregulation-sick          74
happy                       61
help                        24
more                        22
protest                     21
bathroom                    20
dysregulation-bathroom      18
no                          12
glee                         8
laugh                        8
tablet                       7
hunger                       4
greeting                     3
Name: count, dtype: int64

In [3]:
#Isolate the data for participant #05
df_P05 = df.loc[df.Participant == "P05"]
#check labels
display(df_P05.Label.value_counts())

Label
selftalk        286
frustrated      283
delighted       235
dysregulated    116
happy            61
help             24
laughter         13
request           6
hunger            4
Name: count, dtype: int64

In [4]:
label_counts = df_P05['Label'].value_counts()
labels_to_keep = label_counts[label_counts >= 30].index

# Filter the DataFrame to keep only the rows with these labels
df_P05_dropped = df_P05[df_P05['Label'].isin(labels_to_keep)]
df_P05_dropped['Label'].value_counts()

Label
selftalk        286
frustrated      283
delighted       235
dysregulated    116
happy            61
Name: count, dtype: int64

A dataframe is isolated to contain only the training data for participant 05

In [5]:
#create a dataframe that contains training data for participant 05
train_P05 = df_P05_dropped.loc[df_P05_dropped.is_test == 0]

In [6]:
#temporary cell: checking the dataframe
#print(train_P05)
print(train_P05.Label.value_counts())

Label
frustrated      208
selftalk        199
delighted       146
dysregulated     74
happy            46
Name: count, dtype: int64


In [7]:
train_P05.head()

Unnamed: 0.1,Unnamed: 0,Filename,Participant,Label,is_test,Session
2790,2790,200929_0101_00-02-06.53--00-02-07.10.wav,P05,happy,0,200929_0101
2791,2791,200929_2203_00-00-59.95--00-01-03.62.wav,P05,happy,0,200929_2203
2792,2792,200929_0101_00-06-53.65--00-06-54.43c.wav,P05,happy,0,200929_0101
2793,2793,200929_0101_00-06-52.47--00-06-52.91c.wav,P05,happy,0,200929_0101
2794,2794,200929_2203_00-06-05.52--00-06-06.36.wav,P05,happy,0,200929_2203


# Step 4 : Use Logistic Regression for the Classification

In [8]:
X = np.loadtxt('X0_ast_numpy')

In [9]:
y = np.loadtxt('y_jr')

In [10]:
# seeing which feature makes the most sense
# checking accuracy for feature set 0 : X
est = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        max_iter=10**6,
    ),
)
opt = BayesSearchCV(
    est,
    {
        "logisticregression__C": (5e-3, 1, "log-uniform"),
    },
    n_iter=20,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=12345),
    scoring="accuracy",
)
opt.fit(
    X.reshape(len(X), -1),
    y,
)
print(opt.best_params_)
print("Best accuracy:", opt.best_score_)

OrderedDict({'logisticregression__C': 0.005})
Best accuracy: 0.3149912203687445


In [11]:
train_P05["session"] = train_P05.Filename.apply(
    lambda name: name.split("-")[0][:-3]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_P05["session"] = train_P05.Filename.apply(


In [12]:
session_weight = (
    (1 / train_P05.session.value_counts())
    .clip(None, 0.1)
    .loc[train_P05.session]
).values
session_and_label_weight = (
    1
    / pd.Series(session_weight, train_P05.index)
    .groupby(train_P05.Label)
    .sum()
).loc[train_P05.Label].values * session_weight

In [13]:
est = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        C=opt.best_params_["logisticregression__C"],
        max_iter=10**6,
    ),
)
oos_pred_prob = cross_val_predict(
    est,
    X.reshape(len(X), -1),
    y,
    cv=StratifiedKFold(
        n_splits=10,
        shuffle=True,
        random_state=1234,  # Using different seed to avoid over-fitting parameter
    ),
    method="predict_proba",
    params={"logisticregression__sample_weight": session_weight},
)
oos_pred = oos_pred_prob.argmax(1)

display(
    pd.Series(
        {name: metric(y, oos_pred_prob) for name, metric in metrics.items()},
        name="no_weight",
    ).round(3)
)
display(
    pd.Series(
        {
            name: metric(y, oos_pred_prob, sample_weight=session_weight)
            for name, metric in metrics.items()
        },
        name="session_weight",
    ).round(3)
)
display(
    pd.Series(
        {
            name: metric(
                y, oos_pred_prob, sample_weight=session_and_label_weight
            )
            for name, metric in metrics.items()
        },
        name="session_and_label_weight",
    ).round(3)
)

accuracy             0.290
balanced_accuracy    0.239
unweighted_f1        0.164
UAR                  0.239
logloss              1.484
Name: no_weight, dtype: float64

accuracy             0.363
balanced_accuracy    0.237
unweighted_f1        0.187
UAR                  0.237
logloss              1.372
Name: session_weight, dtype: float64

accuracy             0.237
balanced_accuracy    0.237
unweighted_f1        0.156
UAR                  0.237
logloss              1.667
Name: session_and_label_weight, dtype: float64

In [14]:
from skopt.space import Categorical
from sklearn.ensemble import RandomForestClassifier

In [15]:
est = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

# Define the hyperparameter grid for RandomForestClassifier
opt = BayesSearchCV(
    est,
    {
        "randomforestclassifier__n_estimators": [800, 1000, 1200],  # Number of trees in the forest
        "randomforestclassifier__max_depth": [3,5,7],  # Maximum depth of the trees
    },
    n_iter=20,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=12345),
    scoring="accuracy",
)

opt.fit(
    X.reshape(len(X), -1),
    y,
)
print(opt.best_params_)
print("Best accuracy:", opt.best_score_)



OrderedDict({'randomforestclassifier__max_depth': 3, 'randomforestclassifier__n_estimators': 1200})
Best accuracy: 0.3818261633011414


In [16]:
est = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        n_estimators=opt.best_params_.get("randomforestclassifier__n_estimators", 100),
        max_depth=opt.best_params_.get("randomforestclassifier__max_depth", None),
        random_state=12345  # You can include a random state for reproducibility
    ),
)

# Cross-validation prediction probabilities
oos_pred_prob = cross_val_predict(
    est,
    X.reshape(len(X), -1),
    y,
    cv=StratifiedKFold(
        n_splits=10,
        shuffle=True,
        random_state=1234,  # Using a different seed to avoid overfitting parameter
    ),
    method="predict_proba",
    fit_params={"randomforestclassifier__sample_weight": session_weight},
)

# Get the predicted classes from the probabilities
oos_pred = oos_pred_prob.argmax(1)

# Display performance metrics
display(
    pd.Series(
        {name: metric(y, oos_pred_prob) for name, metric in metrics.items()},
        name="no_weight",
    ).round(3)
)
display(
    pd.Series(
        {
            name: metric(y, oos_pred_prob, sample_weight=session_weight)
            for name, metric in metrics.items()
        },
        name="session_weight",
    ).round(3)
)
display(
    pd.Series(
        {
            name: metric(
                y, oos_pred_prob, sample_weight=session_and_label_weight
            )
            for name, metric in metrics.items()
        },
        name="session_and_label_weight",
    ).round(3)
)



accuracy             0.294
balanced_accuracy    0.252
unweighted_f1        0.171
UAR                  0.252
logloss              1.475
Name: no_weight, dtype: float64

accuracy             0.371
balanced_accuracy    0.251
unweighted_f1        0.196
UAR                  0.251
logloss              1.368
Name: session_weight, dtype: float64

accuracy             0.251
balanced_accuracy    0.251
unweighted_f1        0.166
UAR                  0.251
logloss              1.667
Name: session_and_label_weight, dtype: float64

In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [18]:
# Create the pipeline with XGBClassifier
estxgbast = make_pipeline(
    StandardScaler(),
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')  # XGBoost-specific parameters
)

# Define the hyperparameter grid for XGBClassifier
param_gridxgbast = {
    "xgbclassifier__n_estimators": [800, 1000, 1200],  # Number of boosting rounds
    "xgbclassifier__max_depth": [3, 5, 7],  # Maximum depth of the tree
    "xgbclassifier__learning_rate": [0.01, 0.1, 0.2],  # Step size shrinkage
}

# Set up the RandomizedSearchCV
optxgb = GridSearchCV(
    estxgbast,
    param_grid=param_gridxgbast,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=12345),
    scoring="accuracy",
    n_jobs=-1
)

# Now use `opt` for hyperparameter tuning
optxgb.fit(
    X.reshape(len(X), -1),
    y,
)

Parameters: { "use_label_encoder" } are not used.



In [21]:
print(optxgb.best_params_)
print("Best accuracy:", optxgb.best_score_)

{'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 7, 'xgbclassifier__n_estimators': 800}
Best accuracy: 0.35500438981562776


In [22]:
est = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        n_estimators=optxgb.best_params_["xgbclassifier__n_estimators"],  # Example of using best params
        max_depth=optxgb.best_params_["xgbclassifier__max_depth"],
        learning_rate=optxgb.best_params_["xgbclassifier__learning_rate"]
    ),
)

# Cross-validation predictions with probability estimates
oos_pred_prob = cross_val_predict(
    est,
    X.reshape(len(X), -1),
    y,
    cv=StratifiedKFold(
        n_splits=10,
        shuffle=True,
        random_state=1234,  # Using different seed to avoid over-fitting parameter
    ),
    method="predict_proba",
    fit_params={"xgbclassifier__sample_weight": session_weight},
)

# Get the predicted class by taking the argmax of the predicted probabilities
oos_pred = oos_pred_prob.argmax(1)

# Display metrics without sample weight
display(
    pd.Series(
        {name: metric(y, oos_pred_prob) for name, metric in metrics.items()},
        name="no_weight",
    ).round(3)
)

# Display metrics with session weight
display(
    pd.Series(
        {
            name: metric(y, oos_pred_prob, sample_weight=session_weight)
            for name, metric in metrics.items()
        },
        name="session_weight",
    ).round(3)
)

# Display metrics with session and label weight
display(
    pd.Series(
        {
            name: metric(
                y, oos_pred_prob, sample_weight=session_and_label_weight
            )
            for name, metric in metrics.items()
        },
        name="session_and_label_weight",
    ).round(3)
)



accuracy             0.309
balanced_accuracy    0.280
unweighted_f1        0.255
UAR                  0.280
logloss              1.603
Name: no_weight, dtype: float64

accuracy             0.316
balanced_accuracy    0.277
unweighted_f1        0.262
UAR                  0.277
logloss              1.575
Name: session_weight, dtype: float64

accuracy             0.277
balanced_accuracy    0.277
unweighted_f1        0.251
UAR                  0.277
logloss              1.792
Name: session_and_label_weight, dtype: float64