# Data Conversion

This notebook transforms our soundfiles into a numerical dataset for our models.

In [None]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

In [None]:
#Imports
import numpy as np
import pandas as pd
import librosa
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm

## Preparation of the Overview Dataframe

In [13]:
def prepare_overview(list_of_files):
    """
    This function takes a list of files and creates a joint dataframe for audio file conversion
    """
    overview = None
    for file_path in list_of_files:
        with open(file_path, "r") as file:
            if overview is None:
                overview = pd.read_csv(file, sep="\t")
            else:
                pd.concat([overview,pd.read_csv(file, sep="\t")], ignore_index=True)
    return overview

In [None]:
def preprocess_overview(overview):
    """
    This function takes a dataframe and preprocesses it for audio file conversion
    """
    # we only need files with a gender label
    overview=overview.dropna(subset=["gender"])
    #irrelevant columns for our analysis
    overview=overview.drop(columns=["variant", "segment", "sentence_id", "client_id", "up_votes", "down_votes", "locale"])
    #dropping all files that are not simply male or female
    overview=overview[(overview["gender"]=="female_feminine")|(overview["gender"]=="male_masculine")]
    #changing the path to reflect the location of the audio files
    overview["path"]=overview["path"].apply(lambda x: f"./clips/{x}")
    return overview

In [17]:
list_of_files = ["other_18.tsv", "other_19.tsv", "other_20.tsv", "other_21.tsv", "validated_18.tsv", "validated_19.tsv", "validated_20.tsv", "validated_21.tsv"]
overview=prepare_overview(list_of_files)
overview=preprocess_overview(overview)
overview.head()

Unnamed: 0,path,sentence,sentence_domain,age,gender,accents,locale
20,./clips/common_voice_en_40187693.mp3,Cohousing cultivates a culture of sharing and ...,,fifties,male_masculine,Canadian English,en
21,./clips/common_voice_en_40187694.mp3,She is allegedly an agent of an organization c...,,fifties,male_masculine,Canadian English,en
22,./clips/common_voice_en_40187695.mp3,Artume appeared as a recurring character in Ma...,,fifties,male_masculine,Canadian English,en
23,./clips/common_voice_en_40187696.mp3,In practice however the resistive element vari...,,fifties,male_masculine,Canadian English,en
24,./clips/common_voice_en_40187697.mp3,He then led the life of a wandering hermit.,,fifties,male_masculine,Canadian English,en


In [18]:
overview.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12084 entries, 20 to 26692
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   path             12084 non-null  object
 1   sentence         12084 non-null  object
 2   sentence_domain  204 non-null    object
 3   age              11989 non-null  object
 4   gender           12084 non-null  object
 5   accents          10821 non-null  object
 6   locale           12084 non-null  object
dtypes: object(7)
memory usage: 755.2+ KB


In [19]:
overview.describe(include="all")

Unnamed: 0,path,sentence,sentence_domain,age,gender,accents,locale
count,12084,12084,204,11989,12084,10821,12084
unique,12084,12055,24,6,2,25,1
top,./clips/common_voice_en_40862767.mp3,It is unknown at this time if any new products...,general,fourties,female_feminine,Scottish English,en
freq,1,2,140,4083,6500,3792,12084


In [23]:
overview.groupby("gender").size()

gender
female_feminine    6500
male_masculine     5584
dtype: int64

## Conversion of the Audiofiles

In [38]:
def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path, sr=None)

        # MFCC
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs, axis=1)
        mfccs_std = np.std(mfccs, axis=1)

        # Chroma
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_mean = np.mean(chroma, axis=1)
        chroma_std = np.std(chroma, axis=1)

        # Spectral features
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        spec_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)

        # Zero Crossing Rate
        zcr = librosa.feature.zero_crossing_rate(y)

        # RMSE
        rmse = librosa.feature.rms(y=y)

        features = np.concatenate([
            mfccs_mean, mfccs_std,
            chroma_mean, chroma_std,
            np.mean(spec_centroid, axis=1), np.std(spec_centroid, axis=1),
            np.mean(spec_bw, axis=1), np.std(spec_bw, axis=1),
            np.mean(spec_contrast, axis=1), np.std(spec_contrast, axis=1),
            np.mean(spec_rolloff, axis=1), np.std(spec_rolloff, axis=1),
            np.mean(zcr, axis=1), np.std(zcr, axis=1),
            np.mean(rmse, axis=1), np.std(rmse, axis=1)
        ])

        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def extract_features_from_series(file_paths):
    features_list = []
    for path in tqdm(file_paths, desc="Extracting features"):
        features = extract_features(path)
        features_list.append(features)

    # Building proper column names
    columns = []
    columns += [f"mfcc_{i+1:02d}_mean" for i in range(20)]
    columns += [f"mfcc_{i+1:02d}_std" for i in range(20)]
    columns += [f"chroma_{i+1:02d}_mean" for i in range(12)]
    columns += [f"chroma_{i+1:02d}_std" for i in range(12)]
    columns += ["spec_centroid_mean", "spec_centroid_std"]
    columns += ["spec_bandwidth_mean", "spec_bandwidth_std"]
    columns += [f"spec_contrast_band_{i+1}_mean" for i in range(7)]
    columns += [f"spec_contrast_band_{i+1}_std" for i in range(7)]
    columns += ["spec_rolloff_mean", "spec_rolloff_std"]
    columns += ["zcr_mean", "zcr_std"]
    columns += ["rmse_mean", "rmse_std"]


    features_df = pd.DataFrame(features_list, columns=columns)
    
    # Optional: Add file path to keep track
    features_df['file_path'] = file_paths.values
    
    return features_df



In [8]:
extract_features(test_path)

array([-3.62970856e+02,  8.06915283e+01, -8.02076876e-01,  1.39045610e+01,
       -2.23727441e+00,  3.73489261e+00, -1.50086775e+01,  4.65095282e+00,
       -6.69358301e+00,  1.27157092e-01, -1.46965942e+01, -1.36738396e+01,
       -9.22727048e-01, -3.79592180e+00, -1.87453949e+00, -5.26400661e+00,
        1.13411152e+00, -8.43582821e+00, -4.60211372e+00, -5.24977398e+00,
        1.15385429e+02,  6.98976746e+01,  2.97405167e+01,  3.69195747e+01,
        2.72258949e+01,  2.29122715e+01,  2.07468204e+01,  1.41433172e+01,
        1.66579151e+01,  1.23701954e+01,  1.67707367e+01,  1.27385578e+01,
        1.02734222e+01,  1.07126684e+01,  9.32983017e+00,  9.83325768e+00,
        1.02610426e+01,  1.05172005e+01,  1.03063889e+01,  9.94580078e+00,
        4.13493007e-01,  3.78831506e-01,  3.57891530e-01,  2.95197576e-01,
        2.40167186e-01,  2.43760347e-01,  2.34083891e-01,  2.45221466e-01,
        2.97325909e-01,  3.41924489e-01,  4.20348167e-01,  4.19052422e-01,
        3.50895882e-01,  

In [None]:
test_path=overview["path"].iloc[1]
extract_features(test_path)

array([-4.17263275e+02,  9.59954529e+01,  1.20223665e+01,  1.66581841e+01,
       -1.03789682e+01,  4.74267244e+00, -3.67165780e+00,  1.52179384e+00,
       -1.67389584e+01, -2.92925525e+00, -3.28474402e+00, -8.25840569e+00,
       -7.26611996e+00, -6.01704216e+00, -2.53125310e+00, -9.17570400e+00,
       -8.57575607e+00, -1.61557808e+01, -3.43558502e+00, -6.81621313e+00,
        1.05240646e+02,  7.43216171e+01,  3.49467812e+01,  3.44769287e+01,
        3.47207222e+01,  2.95527687e+01,  1.85495415e+01,  1.45847635e+01,
        1.57257624e+01,  1.42567053e+01,  1.48636837e+01,  1.12375584e+01,
        9.18695641e+00,  8.70439243e+00,  8.65102100e+00,  8.10237503e+00,
        9.44598293e+00,  9.37942696e+00,  1.04770756e+01,  7.32606792e+00,
        4.87466007e-01,  4.62529749e-01,  3.99397373e-01,  4.09033239e-01,
        4.04384792e-01,  4.61868405e-01,  4.57070053e-01,  3.95538002e-01,
        3.99244994e-01,  4.51175988e-01,  4.39263731e-01,  4.18804467e-01,
        3.05663884e-01,  

In [39]:
extract_features_from_series(overview["path"].iloc[0:10])

Extracting features: 100%|██████████| 10/10 [00:02<00:00,  3.36it/s]


Unnamed: 0,mfcc_01_mean,mfcc_02_mean,mfcc_03_mean,mfcc_04_mean,mfcc_05_mean,mfcc_06_mean,mfcc_07_mean,mfcc_08_mean,mfcc_09_mean,mfcc_10_mean,...,spec_contrast_band_5_std,spec_contrast_band_6_std,spec_contrast_band_7_std,spec_rolloff_mean,spec_rolloff_std,zcr_mean,zcr_std,rmse_mean,rmse_std,file_path
0,-445.498474,104.745689,-4.465846,23.129927,-8.573237,-1.268834,-9.369425,1.640033,2.187018,-2.149446,...,4.715905,9.575807,12.351898,4026.16782,2490.066878,0.112659,0.10089,0.018489,0.022452,./clips/common_voice_en_40865481.mp3
1,-364.696869,121.879822,-7.309391,25.674622,-2.411059,-3.44179,-4.54372,3.402653,-3.743834,-3.174486,...,5.322201,9.041376,7.188519,3889.284049,2266.098812,0.106307,0.087624,0.047765,0.054042,./clips/common_voice_en_40865482.mp3
2,-368.627716,124.229691,-21.27103,22.779219,11.785179,-2.904757,-6.195994,6.487331,-3.25847,-6.093594,...,4.38005,10.544969,6.287266,3647.123471,2168.339703,0.098639,0.081719,0.043612,0.048221,./clips/common_voice_en_40865483.mp3
3,-393.304443,128.466187,-12.981902,29.2012,-3.168825,-5.908168,-6.666719,11.620709,-1.459203,-4.840008,...,5.394829,10.689913,6.769686,4019.53125,2316.598327,0.108081,0.082338,0.032594,0.033312,./clips/common_voice_en_40865484.mp3
4,-417.168152,127.932961,-3.63296,22.386129,1.824513,-8.6879,-7.400895,11.621496,3.207134,-5.357733,...,4.151822,11.127904,8.289181,3783.578431,2346.418892,0.096015,0.079085,0.034302,0.040524,./clips/common_voice_en_40865485.mp3
5,-426.043457,87.810509,-4.700737,11.842738,13.548295,-1.723409,-2.226623,12.243686,-12.377252,-1.92855,...,6.090735,11.881918,4.647541,4328.49026,2680.954255,0.079652,0.080997,0.030861,0.048145,./clips/common_voice_en_40865654.mp3
6,-396.758423,104.185867,-11.018373,10.071074,15.728835,-0.28548,-5.212489,6.488687,-16.446341,-0.234883,...,8.407872,13.135753,4.621261,4288.146322,2575.791313,0.077533,0.070417,0.043904,0.056282,./clips/common_voice_en_40865655.mp3
7,-380.937744,101.963188,-15.807178,10.143242,16.529417,-0.13418,-12.777008,10.536057,-11.671973,-4.904925,...,5.711856,11.650043,4.132404,4089.751766,2376.368495,0.076756,0.069944,0.054208,0.068454,./clips/common_voice_en_40865656.mp3
8,-421.63562,106.769043,-10.700211,8.268701,13.30379,6.041292,-7.146387,1.268414,-19.776569,1.611451,...,10.254061,13.050201,4.790775,4394.829893,2734.148816,0.089505,0.089704,0.039509,0.044943,./clips/common_voice_en_40865657.mp3
9,-440.549469,85.503616,-9.739224,24.764322,21.730711,0.955629,-11.130703,9.073335,-9.960368,0.661415,...,4.535277,12.052024,3.846377,4655.954174,2328.314851,0.085088,0.0788,0.028039,0.038292,./clips/common_voice_en_40865658.mp3
