# Data Conversion

This notebook transforms our soundfiles into a numerical dataset for our models.

In [None]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

ERROR: Could not find a version that satisfies the requirement concurrent.futures (from versions: none)
ERROR: No matching distribution found for concurrent.futures


In [25]:
#Imports
import numpy as np
import pandas as pd
import librosa
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm
import concurrent.futures

## Preparation of the Overview Dataframe

In [13]:
def prepare_overview(list_of_files):
    """
    This function takes a list of files and creates a joint dataframe for audio file conversion
    """
    overview = None
    for file_path in list_of_files:
        with open(file_path, "r") as file:
            if overview is None:
                overview = pd.read_csv(file, sep="\t")
            else:
                pd.concat([overview,pd.read_csv(file, sep="\t")], ignore_index=True)
    return overview

In [None]:
def preprocess_overview(overview):
    """
    This function takes a dataframe and preprocesses it for audio file conversion
    """
    # we only need files with a gender label
    overview=overview.dropna(subset=["gender"])
    #irrelevant columns for our analysis
    overview=overview.drop(columns=["variant", "segment", "sentence_id", "client_id", "up_votes", "down_votes", "locale"])
    #dropping all files that are not simply male or female
    overview=overview[(overview["gender"]=="female_feminine")|(overview["gender"]=="male_masculine")]
    #changing the path to reflect the location of the audio files
    overview["path"]=overview["path"].apply(lambda x: f"./clips/{x}")
    return overview

In [17]:
list_of_files = ["other_18.tsv", "other_19.tsv", "other_20.tsv", "other_21.tsv", "validated_18.tsv", "validated_19.tsv", "validated_20.tsv", "validated_21.tsv"]
overview=prepare_overview(list_of_files)
overview=preprocess_overview(overview)
overview.head()

Unnamed: 0,path,sentence,sentence_domain,age,gender,accents,locale
20,./clips/common_voice_en_40187693.mp3,Cohousing cultivates a culture of sharing and ...,,fifties,male_masculine,Canadian English,en
21,./clips/common_voice_en_40187694.mp3,She is allegedly an agent of an organization c...,,fifties,male_masculine,Canadian English,en
22,./clips/common_voice_en_40187695.mp3,Artume appeared as a recurring character in Ma...,,fifties,male_masculine,Canadian English,en
23,./clips/common_voice_en_40187696.mp3,In practice however the resistive element vari...,,fifties,male_masculine,Canadian English,en
24,./clips/common_voice_en_40187697.mp3,He then led the life of a wandering hermit.,,fifties,male_masculine,Canadian English,en


In [18]:
overview.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12084 entries, 20 to 26692
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   path             12084 non-null  object
 1   sentence         12084 non-null  object
 2   sentence_domain  204 non-null    object
 3   age              11989 non-null  object
 4   gender           12084 non-null  object
 5   accents          10821 non-null  object
 6   locale           12084 non-null  object
dtypes: object(7)
memory usage: 755.2+ KB


In [19]:
overview.describe(include="all")

Unnamed: 0,path,sentence,sentence_domain,age,gender,accents,locale
count,12084,12084,204,11989,12084,10821,12084
unique,12084,12055,24,6,2,25,1
top,./clips/common_voice_en_40862767.mp3,It is unknown at this time if any new products...,general,fourties,female_feminine,Scottish English,en
freq,1,2,140,4083,6500,3792,12084


In [23]:
overview.groupby("gender").size()

gender
female_feminine    6500
male_masculine     5584
dtype: int64

## Conversion of the Audiofiles

In [26]:
def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path, sr=None, mono=True)

        # Feature extraction
        # MFCC (Mel-frequency cepstral coefficients) gives the timbre of the audio
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        
        # Chroma feature gives the harmonic content of the audio
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        
        # Spectral features
        # Spectral centroid gives the center of mass of the spectrum
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        # Spectral bandwidth gives the width of the spectrum
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        # Spectral contrast gives the difference in amplitude between peaks and valleys in the sound spectrum
        spec_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        # Spectral rolloff gives the frequency below which a certain percentage of the total spectral energy is contained
        spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        
        # Zero crossing rate gives the rate at which the signal changes sign
        zcr = librosa.feature.zero_crossing_rate(y)
        # Root mean square energy gives the energy of the signal (i.e., the loudness)
        rmse = librosa.feature.rms(y=y)

        # Combine features into a single feature vector
        features = np.hstack([
            np.mean(mfccs, axis=1), np.std(mfccs, axis=1),
            np.mean(chroma, axis=1), np.std(chroma, axis=1),
            np.mean(spec_centroid), np.std(spec_centroid),
            np.mean(spec_bw), np.std(spec_bw),
            np.mean(spec_contrast, axis=1), np.std(spec_contrast, axis=1),
            np.mean(spec_rolloff), np.std(spec_rolloff),
            np.mean(zcr), np.std(zcr),
            np.mean(rmse), np.std(rmse)
        ])

        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [27]:
def build_columns():
    columns = []
    columns += [f"mfcc_{i+1:02d}_mean" for i in range(20)]
    columns += [f"mfcc_{i+1:02d}_std" for i in range(20)]
    columns += [f"chroma_{i+1:02d}_mean" for i in range(12)]
    columns += [f"chroma_{i+1:02d}_std" for i in range(12)]
    columns += ["spec_centroid_mean", "spec_centroid_std"]
    columns += ["spec_bandwidth_mean", "spec_bandwidth_std"]
    columns += [f"spec_contrast_band_{i+1}_mean" for i in range(7)]
    columns += [f"spec_contrast_band_{i+1}_std" for i in range(7)]
    columns += ["spec_rolloff_mean", "spec_rolloff_std"]
    columns += ["zcr_mean", "zcr_std"]
    columns += ["rmse_mean", "rmse_std"]
    return columns

In [28]:
def extract_features_from_dataframe(df_paths, output_csv_path, parallel=True):
    features_list = []
    valid_indices = []

    paths = df_paths['path'].tolist()

    # Use ThreadPoolExecutor for parallel processing
    # This helps to speed up the feature extraction process
    if parallel:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = {executor.submit(extract_features, path): idx for idx, path in enumerate(paths)}
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(paths), desc="Extracting features"):
                result = future.result()
                if result is not None:
                    features_list.append(result)
                    valid_indices.append(futures[future])
    else:
        for idx, path in tqdm(enumerate(paths), total=len(paths), desc="Extracting features"):
            result = extract_features(path)
            if result is not None:
                features_list.append(result)
                valid_indices.append(idx)

    # Build features DataFrame
    columns = build_columns()
    features_df = pd.DataFrame(features_list, columns=columns)
    
    # Match features to original DataFrame
    merged_df = df_paths.iloc[valid_indices].reset_index(drop=True)
    final_df = pd.concat([merged_df, features_df], axis=1)

    # Save to CSV
    final_df.to_csv(output_csv_path, index=False)
    print(f"Saved extracted features to {output_csv_path}")

    return final_df

In [None]:
extract_features_from_dataframe(overview, "data.csv", parallel=True).head()