In [50]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio



import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [51]:
pip install numpy==1.21

Note: you may need to restart the kernel to use updated packages.


In [52]:
metadata=pd.read_csv('BVC_Voice_Bio_Public.csv')
metadata.drop(metadata.columns[metadata.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
metadata

Unnamed: 0,New_ID,Sex,Age,Ethnicity
0,4001,'Male',19,'Igbo'
1,4002,'Male',23,'Igbo'
2,4003,'Female',18,'Ikwerre'
3,4004,'Male',23,'Annang'
4,4005,'Female',17,'Igbo'
...,...,...,...,...
555,4993,'Male',18,'Igede'
556,4994,'Male',17,'Okirika'
557,4995,'Male',22,'Igbo'
558,4998,'Female',23,'Igbo'


# prepare the data 

In [53]:
import os
import pandas as pd

new_df = pd.DataFrame(columns=['ID', 'AudioPath', 'Sex'])

cdir = 'one_sentence/one_sentence/'

for new_id, Sex in zip(metadata['New_ID'], metadata['Sex']):
    audio_filenames = os.listdir(cdir)
    
    audio_filename = next((filename for filename in audio_filenames if str(new_id) in filename), None)

    if audio_filename is not None:
        audio_path = os.path.join(cdir, audio_filename)
        new_df = new_df.append({'ID': new_id, 'AudioPath': audio_path, 'Sex': Sex}, ignore_index=True)

print(new_df)


       ID                                   AudioPath Age
0    4001  one_sentence/one_sentence/S_01_4001_VE.wav  19
1    4003  one_sentence/one_sentence/S_01_4003_VE.wav  18
2    4004  one_sentence/one_sentence/S_01_4004_VE.wav  23
3    4006  one_sentence/one_sentence/S_01_4006_VE.wav  26
4    4007  one_sentence/one_sentence/S_01_4007_VE.wav  29
..    ...                                         ...  ..
330  4990  one_sentence/one_sentence/S_01_4990_VE.wav  20
331  4991  one_sentence/one_sentence/S_01_4991_VE.wav  17
332  4994  one_sentence/one_sentence/S_01_4994_VE.wav  17
333  4998  one_sentence/one_sentence/S_01_4998_VE.wav  23
334  4999  one_sentence/one_sentence/S_01_4999_VE.wav  18

[335 rows x 3 columns]


In [55]:
new_df

Unnamed: 0,ID,AudioPath,Age
0,4001,one_sentence/one_sentence/S_01_4001_VE.wav,19
1,4003,one_sentence/one_sentence/S_01_4003_VE.wav,18
2,4004,one_sentence/one_sentence/S_01_4004_VE.wav,23
3,4006,one_sentence/one_sentence/S_01_4006_VE.wav,26
4,4007,one_sentence/one_sentence/S_01_4007_VE.wav,29
...,...,...,...
330,4990,one_sentence/one_sentence/S_01_4990_VE.wav,20
331,4991,one_sentence/one_sentence/S_01_4991_VE.wav,17
332,4994,one_sentence/one_sentence/S_01_4994_VE.wav,17
333,4998,one_sentence/one_sentence/S_01_4998_VE.wav,23


# using mfcc to extract features without data augmentation 

In [56]:
df = pd.DataFrame(columns=['feature'])

counter=0
for index,path in enumerate(new_df.AudioPath):
    X, sample_rate = librosa.load(path)
    
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),axis=0)
    df.loc[counter] = [mfccs]
    counter=counter+1   

print(len(df))
df.head()

335


Unnamed: 0,feature
0,"[-18.175413, -18.762264, -20.23277, -19.647663..."
1,"[-20.833927, -21.177696, -24.323574, -25.75362..."
2,"[-23.72864, -23.236788, -24.520565, -27.393017..."
3,"[-26.813719, -26.398699, -27.694794, -28.77627..."
4,"[-17.551044, -14.641887, -13.771718, -14.99221..."


In [57]:
df = pd.concat([new_df,pd.DataFrame(df['feature'].values.tolist())],axis=1)
df[:5]

Unnamed: 0,ID,AudioPath,Age,0,1,2,3,4,5,6,...,334,335,336,337,338,339,340,341,342,343
0,4001,one_sentence/one_sentence/S_01_4001_VE.wav,19,-18.175413,-18.762264,-20.232771,-19.647663,-18.840992,-19.883558,-19.45681,...,,,,,,,,,,
1,4003,one_sentence/one_sentence/S_01_4003_VE.wav,18,-20.833927,-21.177696,-24.323574,-25.753622,-26.207659,-25.526674,-23.309019,...,,,,,,,,,,
2,4004,one_sentence/one_sentence/S_01_4004_VE.wav,23,-23.72864,-23.236788,-24.520565,-27.393017,-27.991961,-27.391615,-24.116264,...,,,,,,,,,,
3,4006,one_sentence/one_sentence/S_01_4006_VE.wav,26,-26.813719,-26.398699,-27.694794,-28.776274,-28.260632,-26.799578,-26.709593,...,,,,,,,,,,
4,4007,one_sentence/one_sentence/S_01_4007_VE.wav,29,-17.551044,-14.641887,-13.771718,-14.992212,-14.953177,-15.66591,-15.971464,...,,,,,,,,,,


In [58]:
df=df.fillna(0)
print(df.shape)
df[:5]

(335, 347)


Unnamed: 0,ID,AudioPath,Age,0,1,2,3,4,5,6,...,334,335,336,337,338,339,340,341,342,343
0,4001,one_sentence/one_sentence/S_01_4001_VE.wav,19,-18.175413,-18.762264,-20.232771,-19.647663,-18.840992,-19.883558,-19.45681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4003,one_sentence/one_sentence/S_01_4003_VE.wav,18,-20.833927,-21.177696,-24.323574,-25.753622,-26.207659,-25.526674,-23.309019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4004,one_sentence/one_sentence/S_01_4004_VE.wav,23,-23.72864,-23.236788,-24.520565,-27.393017,-27.991961,-27.391615,-24.116264,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4006,one_sentence/one_sentence/S_01_4006_VE.wav,26,-26.813719,-26.398699,-27.694794,-28.776274,-28.260632,-26.799578,-26.709593,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4007,one_sentence/one_sentence/S_01_4007_VE.wav,29,-17.551044,-14.641887,-13.771718,-14.992212,-14.953177,-15.66591,-15.971464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
df.drop(['AudioPath', 'ID'], axis='columns', inplace=True)
df

Unnamed: 0,Age,0,1,2,3,4,5,6,7,8,...,334,335,336,337,338,339,340,341,342,343
0,19,-18.175413,-18.762264,-20.232771,-19.647663,-18.840992,-19.883558,-19.456810,-16.310200,-16.304497,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18,-20.833927,-21.177696,-24.323574,-25.753622,-26.207659,-25.526674,-23.309019,-21.799061,-22.417704,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,23,-23.728640,-23.236788,-24.520565,-27.393017,-27.991961,-27.391615,-24.116264,-22.924234,-21.550385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26,-26.813719,-26.398699,-27.694794,-28.776274,-28.260632,-26.799578,-26.709593,-27.302380,-27.737309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,29,-17.551044,-14.641887,-13.771718,-14.992212,-14.953177,-15.665910,-15.971464,-15.392863,-15.014594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,20,-17.885078,-19.087559,-20.567562,-20.407108,-20.240883,-20.666222,-21.352287,-21.591372,-21.577139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
331,17,-27.856340,-25.090708,-24.633238,-23.830137,-22.626913,-23.280348,-24.800947,-25.598581,-27.327532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332,17,-33.140629,-30.330515,-29.802738,-29.563713,-29.638454,-30.491890,-29.830095,-29.024952,-28.524021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
333,23,-20.658417,-18.282227,-11.193456,-8.359219,-9.174542,-10.339983,-10.806499,-10.634888,-10.348563,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

y = df['Age']
X = df.loc[:, 1:]


# Feature Normalization 

In [61]:

mean = np.mean(X, axis=0)
std = np.std(X, axis=0)

X = (X - mean)/std
X = (X - mean)/std

y

0      19
1      18
2      23
3      26
4      29
       ..
330    20
331    17
332    17
333    23
334    18
Name: Age, Length: 335, dtype: int64

# Lasso Madel

In [62]:
from sklearn.linear_model import Lasso
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #we get different train and test sets across different executions and the shuffling process is out of control

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 46.44735434015077


In [63]:
from sklearn.metrics import r2_score, mean_squared_error
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

R-squared score: 0.015861671664365362


# Regularization Parameter (alpha): The default value for the regularization parameter in scikit-learn's Lasso model is 1.0. This parameter controls the strength of regularization, where higher values of alpha result in more coefficients being set to zero.

# Maximum Iterations (max_iter): The default value for the maximum number of iterations in scikit-learn's Lasso model is 1000. This parameter specifies the maximum number of iterations the solver will perform to converge to a solution.

# Convergence Tolerance (tol): The default value for the convergence tolerance in scikit-learn's Lasso model is 0.0001. It represents the tolerance for the stopping criteria of the solver. If the change in the objective function or coefficients falls below this value, the solver is considered to have converged.

# Intercept (fit_intercept): The default value for the fit_intercept parameter in scikit-learn's Lasso model is True. This parameter determines whether to include an intercept term (bias) in the model. If set to True, an intercept will be included, and if set to False, no intercept will be included.

# Normalize: The default value for the normalize parameter in scikit-learn's Lasso model is False. This parameter determines whether to normalize the predictor variables before fitting the model. If set to True, the variables will be scaled to have zero mean and unit variance.