# Data Conversion

This notebook transforms our soundfiles into a numerical dataset for our models.

In [1]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

In [2]:
#Imports
import numpy as np
import pandas as pd
import librosa
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm
import concurrent.futures

## Preparation of the Overview Dataframe

In [18]:
def prepare_overview(list_of_files):
    """
    This function takes a list of files and creates a joint dataframe for audio file conversion
    """
    overview = None
    for file_path in list_of_files:
        with open(file_path, "r") as file:
            if overview is None:
                overview = pd.read_csv(file, sep="\t")
            else:
                overview = pd.concat([overview, pd.read_csv(file, sep="\t")], ignore_index=True)
    return overview

In [22]:
def preprocess_overview(overview, folder_path):
    """
    This function takes a dataframe and preprocesses it for audio file conversion
    """
    # we only need files with a gender label
    overview=overview.dropna(subset=["gender"])
    #irrelevant columns for our analysis
    overview=overview.drop(columns=["variant", "segment", "sentence_id", "client_id", "up_votes", "down_votes", "locale"])
    #dropping all files that are not simply male or female
    overview=overview[(overview["gender"]=="female_feminine")|(overview["gender"]=="male_masculine")]
    #changing the path to reflect the location of the audio files
    overview["path"]=overview["path"].apply(lambda x: f"{folder_path}/{x}")
    return overview.reset_index(drop=True)

In [23]:
list_of_files = ["other_18.tsv", "other_19.tsv", "other_20.tsv", "other_21.tsv", "validated_18.tsv", "validated_19.tsv", "validated_20.tsv", "validated_21.tsv"]
overview=prepare_overview(list_of_files)
overview.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94691 entries, 0 to 94690
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   client_id        94691 non-null  object 
 1   path             94691 non-null  object 
 2   sentence_id      94691 non-null  object 
 3   sentence         94691 non-null  object 
 4   sentence_domain  727 non-null    object 
 5   up_votes         94691 non-null  int64  
 6   down_votes       94691 non-null  int64  
 7   age              68529 non-null  object 
 8   gender           46549 non-null  object 
 9   accents          58440 non-null  object 
 10  variant          0 non-null      float64
 11  locale           94691 non-null  object 
 12  segment          0 non-null      float64
dtypes: float64(2), int64(2), object(9)
memory usage: 9.4+ MB


In [24]:
# On my device, all audio files are in the same "clips" folder. Please change the path if your files are in a different location.
overview=preprocess_overview(overview, "./clips")
overview.head()

Unnamed: 0,path,sentence,sentence_domain,age,gender,accents
0,./clips/common_voice_en_40187693.mp3,Cohousing cultivates a culture of sharing and ...,,fifties,male_masculine,Canadian English
1,./clips/common_voice_en_40187694.mp3,She is allegedly an agent of an organization c...,,fifties,male_masculine,Canadian English
2,./clips/common_voice_en_40187695.mp3,Artume appeared as a recurring character in Ma...,,fifties,male_masculine,Canadian English
3,./clips/common_voice_en_40187696.mp3,In practice however the resistive element vari...,,fifties,male_masculine,Canadian English
4,./clips/common_voice_en_40187697.mp3,He then led the life of a wandering hermit.,,fifties,male_masculine,Canadian English


In [26]:
overview.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45358 entries, 0 to 45357
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   path             45358 non-null  object
 1   sentence         45358 non-null  object
 2   sentence_domain  384 non-null    object
 3   age              44929 non-null  object
 4   gender           45358 non-null  object
 5   accents          38582 non-null  object
dtypes: object(6)
memory usage: 2.1+ MB


In [27]:
overview.describe(include="all")

Unnamed: 0,path,sentence,sentence_domain,age,gender,accents
count,45358,45358,384,44929,45358,38582
unique,45358,45272,30,7,2,44
top,./clips/common_voice_en_42221538.mp3,The test is unofficial and just what it will p...,general,fourties,female_feminine,Scottish English
freq,1,3,241,15300,28975,14275


In [28]:
overview.groupby("gender").size()

gender
female_feminine    28975
male_masculine     16383
dtype: int64

We have a slight class imbalance that we may want to remove/have to deal with within the model building.

## Conversion of the Audiofiles

In [29]:
def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path, sr=None, mono=True)

        # Feature extraction
        # MFCC (Mel-frequency cepstral coefficients) gives the timbre of the audio
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        
        # Chroma feature gives the harmonic content of the audio
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        
        # Spectral features
        # Spectral centroid gives the center of mass of the spectrum
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        # Spectral bandwidth gives the width of the spectrum
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        # Spectral contrast gives the difference in amplitude between peaks and valleys in the sound spectrum
        spec_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        # Spectral rolloff gives the frequency below which a certain percentage of the total spectral energy is contained
        spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        
        # Zero crossing rate gives the rate at which the signal changes sign
        zcr = librosa.feature.zero_crossing_rate(y)
        # Root mean square energy gives the energy of the signal (i.e., the loudness)
        rmse = librosa.feature.rms(y=y)

        # Combine features into a single feature vector
        features = np.hstack([
            np.mean(mfccs, axis=1), np.std(mfccs, axis=1),
            np.mean(chroma, axis=1), np.std(chroma, axis=1),
            np.mean(spec_centroid), np.std(spec_centroid),
            np.mean(spec_bw), np.std(spec_bw),
            np.mean(spec_contrast, axis=1), np.std(spec_contrast, axis=1),
            np.mean(spec_rolloff), np.std(spec_rolloff),
            np.mean(zcr), np.std(zcr),
            np.mean(rmse), np.std(rmse)
        ])

        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [30]:
def build_columns():
    columns = []
    columns += [f"mfcc_{i+1:02d}_mean" for i in range(20)]
    columns += [f"mfcc_{i+1:02d}_std" for i in range(20)]
    columns += [f"chroma_{i+1:02d}_mean" for i in range(12)]
    columns += [f"chroma_{i+1:02d}_std" for i in range(12)]
    columns += ["spec_centroid_mean", "spec_centroid_std"]
    columns += ["spec_bandwidth_mean", "spec_bandwidth_std"]
    columns += [f"spec_contrast_band_{i+1}_mean" for i in range(7)]
    columns += [f"spec_contrast_band_{i+1}_std" for i in range(7)]
    columns += ["spec_rolloff_mean", "spec_rolloff_std"]
    columns += ["zcr_mean", "zcr_std"]
    columns += ["rmse_mean", "rmse_std"]
    return columns

In [31]:
def extract_features_from_dataframe(df_paths, output_csv_path, parallel=True):
    features_list = []
    valid_indices = []

    paths = df_paths['path'].tolist()

    # Use ThreadPoolExecutor for parallel processing
    # This helps to speed up the feature extraction process
    if parallel:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = {executor.submit(extract_features, path): idx for idx, path in enumerate(paths)}
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(paths), desc="Extracting features"):
                result = future.result()
                if result is not None:
                    features_list.append(result)
                    valid_indices.append(futures[future])
    else:
        for idx, path in tqdm(enumerate(paths), total=len(paths), desc="Extracting features"):
            result = extract_features(path)
            if result is not None:
                features_list.append(result)
                valid_indices.append(idx)

    # Build features DataFrame
    columns = build_columns()
    features_df = pd.DataFrame(features_list, columns=columns)
    
    # Match features to original DataFrame
    merged_df = df_paths.iloc[valid_indices].reset_index(drop=True)
    final_df = pd.concat([merged_df, features_df], axis=1)

    # Save to CSV
    final_df.to_csv(output_csv_path, index=False)
    print(f"Saved extracted features to {output_csv_path}")

    return final_df

In [12]:
extract_features_from_dataframe(overview, "data.csv", parallel=True).head()

Extracting features: 100%|██████████| 12084/12084 [11:02<00:00, 18.23it/s]


Saved extracted features to data.csv


Unnamed: 0,path,sentence,sentence_domain,age,gender,accents,mfcc_01_mean,mfcc_02_mean,mfcc_03_mean,mfcc_04_mean,...,spec_contrast_band_4_std,spec_contrast_band_5_std,spec_contrast_band_6_std,spec_contrast_band_7_std,spec_rolloff_mean,spec_rolloff_std,zcr_mean,zcr_std,rmse_mean,rmse_std
0,./clips/common_voice_en_40187697.mp3,He then led the life of a wandering hermit.,,fifties,male_masculine,Canadian English,-494.58429,125.826019,18.260662,23.508959,...,4.600072,4.264892,7.531223,4.263827,4458.521793,2792.566125,0.061128,0.059886,0.016811,0.020808
1,./clips/common_voice_en_40187705.mp3,Any new copies were initialized with a success...,,fifties,male_masculine,Canadian English,-449.487457,119.852509,7.75342,30.211107,...,4.427589,3.37492,8.790948,4.794475,4491.61981,2971.032386,0.078209,0.080991,0.021588,0.023391
2,./clips/common_voice_en_40187695.mp3,Artume appeared as a recurring character in Ma...,,fifties,male_masculine,Canadian English,-503.006805,131.359589,15.189734,18.775719,...,4.263854,6.058417,6.529373,3.953812,4289.709689,2843.069699,0.067131,0.057595,0.01311,0.014007
3,./clips/common_voice_en_40187696.mp3,In practice however the resistive element vari...,,fifties,male_masculine,Canadian English,-473.256073,116.954979,12.820942,26.763842,...,4.252196,4.170021,5.473282,3.982934,4656.431159,2868.39905,0.082116,0.086288,0.016562,0.01828
4,./clips/common_voice_en_40187693.mp3,Cohousing cultivates a culture of sharing and ...,,fifties,male_masculine,Canadian English,-466.040344,111.497826,7.834814,21.756237,...,4.760368,4.768118,6.406526,3.485832,4421.248209,2540.725526,0.072797,0.060684,0.015802,0.019355
