In [1]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio



import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
metadata=pd.read_csv('BVC_Voice_Bio_Public.csv')
metadata.drop(metadata.columns[metadata.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
metadata

Unnamed: 0,New_ID,Sex,Age,Ethnicity
0,4001,'Male',19,'Igbo'
1,4002,'Male',23,'Igbo'
2,4003,'Female',18,'Ikwerre'
3,4004,'Male',23,'Annang'
4,4005,'Female',17,'Igbo'
...,...,...,...,...
555,4993,'Male',18,'Igede'
556,4994,'Male',17,'Okirika'
557,4995,'Male',22,'Igbo'
558,4998,'Female',23,'Igbo'


# prepare the data

In [4]:
import os
import pandas as pd

new_df = pd.DataFrame(columns=['ID', 'AudioPath', 'Age'])

cdir = 'one_sentence/one_sentence/'

for new_id, age in zip(metadata['New_ID'], metadata['Age']):
    audio_filenames = os.listdir(cdir)    
    matching = [filename for filename in audio_filenames if str(new_id) in filename]

    for audio_filename in matching:
        audio_path = os.path.join(cdir, audio_filename)
        new_df = new_df.append({'ID': new_id, 'AudioPath': audio_path, 'Age': age}, ignore_index=True)

print(new_df)


       ID                                   AudioPath Age
0    4001  one_sentence/one_sentence/S_01_4001_VE.wav  19
1    4001  one_sentence/one_sentence/S_01_4001_VV.wav  19
2    4003  one_sentence/one_sentence/S_01_4003_VE.wav  18
3    4003  one_sentence/one_sentence/S_01_4003_VV.wav  18
4    4004  one_sentence/one_sentence/S_01_4004_VE.wav  23
..    ...                                         ...  ..
666  4994  one_sentence/one_sentence/S_01_4994_VV.wav  17
667  4998  one_sentence/one_sentence/S_01_4998_VE.wav  23
668  4998  one_sentence/one_sentence/S_01_4998_VV.wav  23
669  4999  one_sentence/one_sentence/S_01_4999_VE.wav  18
670  4999  one_sentence/one_sentence/S_01_4999_VV.wav  18

[671 rows x 3 columns]


In [5]:
new_df

Unnamed: 0,ID,AudioPath,Age
0,4001,one_sentence/one_sentence/S_01_4001_VE.wav,19
1,4001,one_sentence/one_sentence/S_01_4001_VV.wav,19
2,4003,one_sentence/one_sentence/S_01_4003_VE.wav,18
3,4003,one_sentence/one_sentence/S_01_4003_VV.wav,18
4,4004,one_sentence/one_sentence/S_01_4004_VE.wav,23
...,...,...,...
666,4994,one_sentence/one_sentence/S_01_4994_VV.wav,17
667,4998,one_sentence/one_sentence/S_01_4998_VE.wav,23
668,4998,one_sentence/one_sentence/S_01_4998_VV.wav,23
669,4999,one_sentence/one_sentence/S_01_4999_VE.wav,18


# data augmentation

In [6]:
import numpy as np
import librosa

def noise(data):
    noise_amp = 0.035 * np.random.uniform() * np.amax(data)
    data = data + noise_amp * np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high=5) * 1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

path = np.array(new_df.AudioPath)[1]
data, sample_rate = librosa.load(path)


In [7]:
pip install numpy==1.21

Note: you may need to restart the kernel to use updated packages.


# feature extraction 

In [10]:
def extract_features(data):
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))  


    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) 

    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) 

    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms))  


    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) 

    return result

def get_features(path):
    data, sample_rate = librosa.load(path)

    res1 = extract_features(data)
    result = np.array(res1)

    noise_data = noise(data)
    res2 = extract_features(noise_data)
    result = np.vstack((result, res2)) 


    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch)
    result = np.vstack((result, res3)) 
    
    return result

In [11]:
X, Y = [], []
for path, age in zip(new_df.AudioPath, new_df.Age):
    feature = get_features(path)
    for ele in feature:
        X.append(ele)
        Y.append(age)

In [None]:
#x_train, x_test, y_train, y_test = train_test_split(np.array(X), np.array(Y), test_size=0.1)

In [None]:
#print((x_train.shape, y_train.shape, x_test.shape, y_test.shape))

In [12]:
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('features.csv', index=False)
Features.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.104374,0.569376,0.562032,0.568345,0.496963,0.461219,0.464172,0.452773,0.439066,0.516222,...,0.002484,0.001969,0.002226,0.002719,0.003111,0.002974,0.00221,0.00115,0.000105,19
1,0.212879,0.665941,0.686811,0.707503,0.685711,0.643418,0.577971,0.540947,0.515371,0.553503,...,0.008048,0.007957,0.00773,0.008416,0.008862,0.008842,0.007603,0.006857,0.005643,19
2,0.109827,0.601371,0.58778,0.547757,0.538568,0.442235,0.459177,0.455613,0.419329,0.430577,...,0.000324,0.000429,0.000379,0.000414,0.000491,0.000723,0.000643,0.000315,2.4e-05,19
3,0.069498,0.474179,0.435311,0.415795,0.420492,0.472643,0.447137,0.423127,0.425767,0.453755,...,0.000271,0.000258,0.000279,0.000258,0.000237,0.000166,0.000136,7.5e-05,7e-06,19
4,0.077985,0.476042,0.4526,0.441642,0.443776,0.495412,0.456759,0.436256,0.445709,0.491835,...,0.000311,0.000308,0.000319,0.000293,0.000277,0.000216,0.000178,0.000115,5.1e-05,19
5,0.081919,0.487925,0.440715,0.441453,0.430629,0.448919,0.45371,0.440819,0.427653,0.459485,...,7.2e-05,5.9e-05,5.8e-05,5.4e-05,5.8e-05,5.1e-05,4.7e-05,1.9e-05,2e-06,19
6,0.068149,0.481145,0.407402,0.460841,0.453245,0.433526,0.426108,0.460561,0.534319,0.540014,...,0.000857,0.000668,0.000816,0.001311,0.001767,0.001597,0.002069,0.001252,3.6e-05,18
7,0.244029,0.654938,0.622569,0.665952,0.67628,0.692889,0.635666,0.604391,0.658673,0.691993,...,0.006694,0.006404,0.006639,0.006752,0.007209,0.007367,0.007561,0.007038,0.0058,18
8,0.07882,0.481371,0.429861,0.497227,0.453096,0.399709,0.427882,0.47067,0.549646,0.53616,...,0.000226,0.000193,0.000156,0.000153,0.00016,0.000411,0.000288,0.000225,1.8e-05,18
9,0.060297,0.545225,0.503764,0.471546,0.553613,0.615457,0.633614,0.546773,0.462461,0.497818,...,0.000238,0.000211,0.000262,0.000296,0.000233,0.000128,7.5e-05,6.9e-05,1e-05,18


In [14]:


X = Features.iloc[: ,:-1].values
Y = Features['labels'].values



In [None]:
Y

# Lasso model

In [15]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

Mean Squared Error: 46.28903236006011
R-squared score: 0.12317008176758115


# Regularization Parameter (alpha): The default value for the regularization parameter in scikit-learn's Lasso model is 1.0. This parameter controls the strength of regularization, where higher values of alpha result in more coefficients being set to zero.

# Maximum Iterations (max_iter): The default value for the maximum number of iterations in scikit-learn's Lasso model is 1000. This parameter specifies the maximum number of iterations the solver will perform to converge to a solution.

# Convergence Tolerance (tol): The default value for the convergence tolerance in scikit-learn's Lasso model is 0.0001. It represents the tolerance for the stopping criteria of the solver. If the change in the objective function or coefficients falls below this value, the solver is considered to have converged.

# Intercept (fit_intercept): The default value for the fit_intercept parameter in scikit-learn's Lasso model is True. This parameter determines whether to include an intercept term (bias) in the model. If set to True, an intercept will be included, and if set to False, no intercept will be included.

# Normalize: The default value for the normalize parameter in scikit-learn's Lasso model is False. This parameter determines whether to normalize the predictor variables before fitting the model. If set to True, the variables will be scaled to have zero mean and unit variance.