In [1]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio



import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
metadata=pd.read_csv('BVC_Voice_Bio_Public.csv')
metadata.drop(metadata.columns[metadata.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
metadata

Unnamed: 0,New_ID,Sex,Age,Ethnicity
0,4001,'Male',19,'Igbo'
1,4002,'Male',23,'Igbo'
2,4003,'Female',18,'Ikwerre'
3,4004,'Male',23,'Annang'
4,4005,'Female',17,'Igbo'
...,...,...,...,...
555,4993,'Male',18,'Igede'
556,4994,'Male',17,'Okirika'
557,4995,'Male',22,'Igbo'
558,4998,'Female',23,'Igbo'


# prepare the data 

In [3]:
import os
import pandas as pd

new_df = pd.DataFrame(columns=['ID', 'AudioPath', 'Sex'])

cdir = 'one_sentence/one_sentence/'

for new_id, Sex in zip(metadata['New_ID'], metadata['Sex']):
    audio_filenames = os.listdir(cdir)
    
    audio_filename = next((filename for filename in audio_filenames if str(new_id) in filename), None)

    if audio_filename is not None:
        audio_path = os.path.join(cdir, audio_filename)
        new_df = new_df.append({'ID': new_id, 'AudioPath': audio_path, 'Sex': Sex}, ignore_index=True)

print(new_df)


       ID                                   AudioPath       Sex
0    4001  one_sentence/one_sentence/S_01_4001_VE.wav    'Male'
1    4003  one_sentence/one_sentence/S_01_4003_VE.wav  'Female'
2    4004  one_sentence/one_sentence/S_01_4004_VE.wav    'Male'
3    4006  one_sentence/one_sentence/S_01_4006_VE.wav  'Female'
4    4007  one_sentence/one_sentence/S_01_4007_VE.wav  'Female'
..    ...                                         ...       ...
330  4990  one_sentence/one_sentence/S_01_4990_VE.wav    'Male'
331  4991  one_sentence/one_sentence/S_01_4991_VE.wav    'Male'
332  4994  one_sentence/one_sentence/S_01_4994_VE.wav    'Male'
333  4998  one_sentence/one_sentence/S_01_4998_VE.wav  'Female'
334  4999  one_sentence/one_sentence/S_01_4999_VE.wav    'Male'

[335 rows x 3 columns]


In [4]:
new_df

Unnamed: 0,ID,AudioPath,Sex
0,4001,one_sentence/one_sentence/S_01_4001_VE.wav,'Male'
1,4003,one_sentence/one_sentence/S_01_4003_VE.wav,'Female'
2,4004,one_sentence/one_sentence/S_01_4004_VE.wav,'Male'
3,4006,one_sentence/one_sentence/S_01_4006_VE.wav,'Female'
4,4007,one_sentence/one_sentence/S_01_4007_VE.wav,'Female'
...,...,...,...
330,4990,one_sentence/one_sentence/S_01_4990_VE.wav,'Male'
331,4991,one_sentence/one_sentence/S_01_4991_VE.wav,'Male'
332,4994,one_sentence/one_sentence/S_01_4994_VE.wav,'Male'
333,4998,one_sentence/one_sentence/S_01_4998_VE.wav,'Female'


# Data Augmentation

In [5]:
import numpy as np
import librosa

def noise(data):
    noise_amp = 0.035 * np.random.uniform() * np.amax(data)
    data = data + noise_amp * np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high=5) * 1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

path = np.array(new_df.AudioPath)[1]
data, sample_rate = librosa.load(path)


# feature extraction 

In [6]:
def extract_features(data):
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))  

    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) 

    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))  


    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) 


    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) 

    return result

def get_features(path):
    data, sample_rate = librosa.load(path)

    res1 = extract_features(data)
    result = np.array(res1)

    noise_data = noise(data)
    res2 = extract_features(noise_data)
    result = np.vstack((result, res2)) 


    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch)
    result = np.vstack((result, res3)) 
    
    return result

In [7]:
X, Y = [], []
for path, Sex in zip(new_df.AudioPath, new_df.Sex):
    feature = get_features(path)
    for ele in feature:
        X.append(ele)
        Y.append(Sex)

In [8]:
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('features.csv', index=False)
Features.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.13495,0.541973,0.590975,0.629498,0.55434,0.484941,0.414414,0.411719,0.403977,0.492537,...,0.003427,0.002742,0.003128,0.003828,0.004409,0.004212,0.00311,0.001608,0.000147,'Male'
1,0.175533,0.578523,0.639114,0.678436,0.612612,0.548129,0.442708,0.423216,0.429283,0.533114,...,0.004231,0.003812,0.004222,0.004921,0.005547,0.005148,0.004072,0.002583,0.001063,'Male'
2,0.13032,0.49511,0.560829,0.628878,0.613587,0.476975,0.448526,0.39705,0.402275,0.385674,...,0.00043,0.000798,0.001017,0.000631,0.000811,0.001076,0.001058,0.000414,3.2e-05,'Male'
3,0.064779,0.426302,0.385016,0.460398,0.454087,0.422845,0.381049,0.470981,0.601168,0.606481,...,0.001227,0.000926,0.001135,0.001839,0.002517,0.002276,0.00299,0.001816,5.1e-05,'Female'
4,0.20459,0.599852,0.620378,0.666363,0.646233,0.624822,0.550103,0.626995,0.691869,0.772933,...,0.005289,0.004908,0.005456,0.006172,0.006987,0.006534,0.007363,0.006129,0.004326,'Female'
5,0.067325,0.459782,0.379343,0.348571,0.472985,0.425403,0.353252,0.336104,0.47213,0.635292,...,0.000247,0.000256,0.000197,0.000216,0.000232,0.000382,0.000569,0.000315,3.4e-05,'Female'
6,0.041983,0.567305,0.574385,0.65084,0.693372,0.661344,0.641463,0.572052,0.527214,0.517609,...,0.001186,0.001116,0.000773,0.000975,0.000828,0.000624,0.000616,0.00028,2.1e-05,'Male'
7,0.171539,0.634088,0.635058,0.718379,0.760411,0.734627,0.696014,0.620386,0.594683,0.576823,...,0.005276,0.005266,0.004857,0.004799,0.00481,0.004607,0.004748,0.00424,0.004024,'Male'
8,0.051621,0.574841,0.535947,0.580629,0.677304,0.657368,0.613852,0.586084,0.573262,0.518847,...,0.000261,0.000194,0.000316,0.000206,0.000191,0.000256,0.000164,6.4e-05,7e-06,'Male'
9,0.08845,0.390351,0.365893,0.387294,0.39402,0.416864,0.520996,0.577196,0.467659,0.477839,...,0.001285,0.001302,0.001491,0.00139,0.001221,0.001086,0.001176,0.000814,4e-05,'Female'


In [9]:


X = Features.iloc[: ,:-1].values
Y = Features['labels'].values



In [10]:
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray() #'A' -> [1, 0, 0]



# Lasso Model

In [11]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

Mean Squared Error: 0.11323727045073509
R-squared score: 0.3878798000125172


# Regularization Parameter (alpha): The default value for the regularization parameter in scikit-learn's Lasso model is 1.0. This parameter controls the strength of regularization, where higher values of alpha result in more coefficients being set to zero.

# Maximum Iterations (max_iter): The default value for the maximum number of iterations in scikit-learn's Lasso model is 1000. This parameter specifies the maximum number of iterations the solver will perform to converge to a solution.

# Convergence Tolerance (tol): The default value for the convergence tolerance in scikit-learn's Lasso model is 0.0001. It represents the tolerance for the stopping criteria of the solver. If the change in the objective function or coefficients falls below this value, the solver is considered to have converged.

# Intercept (fit_intercept): The default value for the fit_intercept parameter in scikit-learn's Lasso model is True. This parameter determines whether to include an intercept term (bias) in the model. If set to True, an intercept will be included, and if set to False, no intercept will be included.

# Normalize: The default value for the normalize parameter in scikit-learn's Lasso model is False. This parameter determines whether to normalize the predictor variables before fitting the model. If set to True, the variables will be scaled to have zero mean and unit variance.