# Data Conversion

This notebook converts our audio files into a numerical feature dataset that our models can be trained on.

In [1]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

In [2]:
#Imports
import concurrent.futures
import os

import librosa
import numpy as np
import pandas as pd
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm

## Preparation of the Overview Dataframe

In [3]:
def prepare_overview(list_of_files):
    """
    Read several tab-separated metadata files and stack them into one joint dataframe.

    Parameters
    ----------
    list_of_files : iterable of str or path-like
        Paths of the TSV files to read. They are read in the given order.

    Returns
    -------
    pandas.DataFrame or None
        The rows of all files below each other with a fresh 0..n-1 index,
        or None if ``list_of_files`` is empty.
    """
    # Read every file exactly once and concatenate a single time afterwards.
    # Concatenating inside the loop re-copies all previously read rows on every iteration.
    # The encoding is given explicitly so that parsing does not depend on the platform default.
    frames = [pd.read_csv(file_path, sep="\t", encoding="utf-8") for file_path in list_of_files]
    if not frames:
        # Nothing to read: keep the "no overview" result of an empty input.
        return None
    return pd.concat(frames, ignore_index=True)

In [17]:
def fix_gender(gender):
    """
    Rename the short gender labels to their long form.

    "male" becomes "male_masculine" and "female" becomes "female_feminine".
    Every other value is returned unchanged.
    """
    renames = (
        ("male", "male_masculine"),
        ("female", "female_feminine"),
    )
    for short_label, long_label in renames:
        if gender == short_label:
            return long_label
    return gender

In [18]:
def preprocess_overview(overview, folder_path):
    """
    Reduce the joint metadata dataframe to what is needed for audio file conversion.

    The steps are, in order:
    1. drop rows without a gender label,
    2. drop the columns that are not used in the analysis,
    3. unify the gender labels with ``fix_gender``,
    4. keep only rows labelled "female_feminine" or "male_masculine",
    5. prefix every entry of ``path`` with ``folder_path``.

    A new dataframe with a fresh 0..n-1 index is returned; ``overview`` itself is left untouched.
    """
    unused_columns = [
        "variant", "segment", "sentence_id", "client_id",
        "up_votes", "down_votes", "locale",
    ]

    # Steps 1 + 2: only labelled rows, only relevant columns
    labelled = overview.dropna(subset=["gender"]).drop(columns=unused_columns)

    # Step 3: unify the labels before filtering so that both spellings are kept
    relabelled = labelled.assign(gender=labelled["gender"].apply(fix_gender))

    # Step 4: restrict to the two classes we model
    is_female = relabelled["gender"].eq("female_feminine")
    is_male = relabelled["gender"].eq("male_masculine")
    two_classes = relabelled[is_female | is_male]

    # Step 5: make the path point to the location of the audio files
    located = two_classes.assign(
        path=two_classes["path"].apply(lambda x: f"{folder_path}/{x}")
    )
    return located.reset_index(drop=True)

In [None]:
# Metadata files (tab-separated) of the two batches we convert separately.
# The numeric suffixes presumably are dataset release numbers (18-21 and 13-17) -- TODO confirm.
list_of_files = ["other_18.tsv", "other_19.tsv", "other_20.tsv", "other_21.tsv", "validated_18.tsv", "validated_19.tsv", "validated_20.tsv", "validated_21.tsv"]
list_of_files_2 = ["other_13.tsv", "validated_13.tsv", "other_14.tsv", "validated_14.tsv", "other_15.tsv", "validated_15.tsv", "other_16.tsv", "validated_16.tsv", "other_17.tsv", "validated_17.tsv"]
# Only the second batch is loaded in this run; the first batch is kept commented out.
#overview = prepare_overview(list_of_files)
overview_2=prepare_overview(list_of_files_2)
# Column overview incl. non-null counts, to see how many rows carry a gender label
overview_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206671 entries, 0 to 206670
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   client_id        206671 non-null  object 
 1   path             206671 non-null  object 
 2   sentence         206671 non-null  object 
 3   up_votes         206671 non-null  int64  
 4   down_votes       206671 non-null  int64  
 5   age              156522 non-null  object 
 6   gender           155649 non-null  object 
 7   accents          155305 non-null  object 
 8   variant          0 non-null       float64
 9   locale           206671 non-null  object 
 10  segment          0 non-null       float64
 11  sentence_id      42971 non-null   object 
 12  sentence_domain  103 non-null     object 
dtypes: float64(2), int64(2), object(9)
memory usage: 20.5+ MB


In [None]:
# Number of clips per gender label in the raw metadata (before the labels are unified)
overview_2.groupby("gender").size()

gender
female             63490
female_feminine     5097
male               63353
male_masculine     21747
non-binary             5
other               1919
transgender           38
dtype: int64

In [None]:
# On my device, all audio files are in the same "clips" folder. Please change the path if your files are in a different location.
# NOTE: overview_2 is overwritten with its preprocessed version here, so this cell cannot
# be re-run on its own -- re-run the loading cell above first.
#overview = preprocess_overview(overview, "./clips")
overview_2=preprocess_overview(overview_2, "./clips")
overview_2.head()

Unnamed: 0,path,sentence,age,gender,accents,sentence_domain
0,./clips/common_voice_en_36530278.mp3,"Every year in spring, a unicycle race is held ...",twenties,female_feminine,United States English,
1,./clips/common_voice_en_36530279.mp3,"For all the promise, there was very little ach...",twenties,female_feminine,United States English,
2,./clips/common_voice_en_36530280.mp3,"Not long afterward, Diamond met Rick Rubin.",twenties,female_feminine,United States English,
3,./clips/common_voice_en_36530282.mp3,"After passing Bachelor of Science, any student...",twenties,female_feminine,United States English,
4,./clips/common_voice_en_36530283.mp3,"He scored in the match, played at Old Trafford.",twenties,female_feminine,United States English,


In [None]:
# Summary of every remaining column, including the non-numeric ones (count, unique, top, freq)
overview_2.describe(include="all")

Unnamed: 0,path,sentence,age,gender,accents,sentence_domain
count,153687,153687,153299,153687,135147,66
unique,153687,152682,9,2,184,5
top,./clips/common_voice_en_40110434.mp3,No one may go to the side of the hut where the...,twenties,male_masculine,"Southern African (South Africa, Zimbabwe, Nami...",general
freq,1,3,56624,85100,42057,41


In [None]:
# Number of clips per gender label after preprocessing (only the two unified labels remain)
overview_2.groupby("gender").size()

gender
female_feminine    68587
male_masculine     85100
dtype: int64

In [None]:
# Number of clips per age group (rows without an age label are not counted by groupby)
overview_2.groupby("age").size()

age
eighties        25
fifties      18632
fourties     11969
nineties       153
seventies      871
sixties       8444
teens         7737
thirties     48844
twenties     56624
dtype: int64

The classes are slightly imbalanced (for gender: 85,100 male vs. 68,587 female clips). We may want to rebalance the data, or otherwise account for the imbalance when building the models.

## Conversion of the Audio Files

In [25]:
def extract_features(file_path):
    """
    Convert one audio file into a flat vector of per-clip summary statistics.

    The file is loaded as mono with ``sr=None`` (librosa then keeps the file's own
    sampling rate). Several frame-level features are computed and each one is
    summarised by its mean followed by its standard deviation. The order of the
    entries is: MFCC, chroma, spectral centroid, spectral bandwidth, spectral
    contrast, spectral rolloff, zero crossing rate, RMS energy.

    Returns
    -------
    numpy.ndarray or None
        The stacked feature vector, or None if anything went wrong while loading
        or processing the file (the error is printed in that case).
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None, mono=True)

        # Each entry is (frame-level feature, axis to aggregate over).
        # axis=1 keeps one statistic per coefficient/band; axis=None collapses
        # the feature to a single statistic for the whole clip.
        frame_features = [
            # MFCC (Mel-frequency cepstral coefficients): timbre of the audio
            (librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=20), 1),
            # Chroma: harmonic content of the audio
            (librosa.feature.chroma_stft(y=signal, sr=sample_rate), 1),
            # Spectral centroid: center of mass of the spectrum
            (librosa.feature.spectral_centroid(y=signal, sr=sample_rate), None),
            # Spectral bandwidth: width of the spectrum
            (librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate), None),
            # Spectral contrast: amplitude difference between peaks and valleys of the spectrum
            (librosa.feature.spectral_contrast(y=signal, sr=sample_rate), 1),
            # Spectral rolloff: frequency below which a certain share of the spectral energy lies
            (librosa.feature.spectral_rolloff(y=signal, sr=sample_rate), None),
            # Zero crossing rate: how often the signal changes sign
            (librosa.feature.zero_crossing_rate(signal), None),
            # Root mean square energy: energy of the signal (i.e., the loudness)
            (librosa.feature.rms(y=signal), None),
        ]

        # Mean first, then standard deviation, feature by feature
        summary = []
        for values, axis in frame_features:
            summary.append(np.mean(values, axis=axis))
            summary.append(np.std(values, axis=axis))

        return np.hstack(summary)
    except Exception as error:
        # Best effort: a broken file must not stop the conversion of all other files
        print(f"Error processing {file_path}: {error}")
        return None


In [26]:
def build_columns():
    """
    Build the list of column names for the extracted feature vector.

    For every feature the "_mean" names come first, followed by the "_std" names.
    The features appear in this order: 20 MFCCs, 12 chroma bins, spectral centroid,
    spectral bandwidth, 7 spectral contrast bands, spectral rolloff, zero crossing
    rate, RMS energy.

    Returns
    -------
    list of str
        A new list with 88 column names.
    """
    def numbered(template, count):
        # All "mean" names for 1..count first, then all "std" names
        names = []
        for stat in ("mean", "std"):
            names.extend(template.format(number, stat) for number in range(1, count + 1))
        return names

    def mean_std(prefix):
        # Name pair for a feature that is summarised by a single mean and std
        return [f"{prefix}_mean", f"{prefix}_std"]

    return (
        numbered("mfcc_{:02d}_{}", 20)
        + numbered("chroma_{:02d}_{}", 12)
        + mean_std("spec_centroid")
        + mean_std("spec_bandwidth")
        + numbered("spec_contrast_band_{}_{}", 7)
        + mean_std("spec_rolloff")
        + mean_std("zcr")
        + mean_std("rmse")
    )

In [27]:
def extract_features_from_dataframe(df_paths, output_csv_path, parallel=True, max_workers=8):
    """
    Extract the audio features for every file listed in ``df_paths`` and save them as CSV.

    Parameters
    ----------
    df_paths : pandas.DataFrame
        Metadata with a ``path`` column that holds the location of each audio file.
    output_csv_path : str, path-like or file-like
        Target of the CSV export. If it is a path, its folder is created when missing.
    parallel : bool, default True
        If True, the files are processed by a thread pool, otherwise one after another.
    max_workers : int, default 8
        Number of threads of the pool. Only used if ``parallel`` is True.

    Returns
    -------
    pandas.DataFrame
        The metadata of all successfully processed files next to their features.
        Files for which ``extract_features`` returned None are left out. The rows
        keep the order of ``df_paths`` in both the parallel and the sequential mode.

    Raises
    ------
    KeyError
        If ``df_paths`` has no ``path`` column.
    """
    # Create the output folder up front. The extraction can run for hours, so a missing
    # folder must not make the export fail only after all the work has been done.
    if isinstance(output_csv_path, (str, os.PathLike)):
        output_dir = os.path.dirname(os.fspath(output_csv_path))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    paths = df_paths['path'].tolist()

    # One slot per input row. Writing each result to the slot of its row keeps the
    # output in input order, no matter in which order the threads finish.
    results = [None] * len(paths)

    # Use ThreadPoolExecutor for parallel processing
    # This helps to speed up the feature extraction process
    if parallel:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(extract_features, path): idx for idx, path in enumerate(paths)}
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(paths), desc="Extracting features"):
                results[futures[future]] = future.result()
    else:
        for idx, path in tqdm(enumerate(paths), total=len(paths), desc="Extracting features"):
            results[idx] = extract_features(path)

    # Keep only the files that could be processed
    valid_indices = [idx for idx, result in enumerate(results) if result is not None]
    features_list = [results[idx] for idx in valid_indices]

    # Build features DataFrame
    columns = build_columns()
    features_df = pd.DataFrame(features_list, columns=columns)

    # Match features to original DataFrame (positional, hence the index reset)
    merged_df = df_paths.iloc[valid_indices].reset_index(drop=True)
    final_df = pd.concat([merged_df, features_df], axis=1)

    # Save to CSV
    final_df.to_csv(output_csv_path, index=False)
    print(f"Saved extracted features to {output_csv_path}")

    return final_df

In [None]:
# Run the feature extraction for the second batch and store the result as CSV.
# This is the expensive step: the recorded run below took about 2.5 hours for 153,687 clips.
#df=extract_features_from_dataframe(overview, "./data/data_1.csv", parallel=True)
df=extract_features_from_dataframe(overview_2, "./data/data_2.csv", parallel=True)
# Summary statistics of the numeric feature columns as a quick sanity check
df.describe()

  return pitch_tuning(
Extracting features: 100%|██████████| 153687/153687 [2:29:43<00:00, 17.11it/s]  


Saved extracted features to ./data/data_2.csv


Unnamed: 0,mfcc_01_mean,mfcc_02_mean,mfcc_03_mean,mfcc_04_mean,mfcc_05_mean,mfcc_06_mean,mfcc_07_mean,mfcc_08_mean,mfcc_09_mean,mfcc_10_mean,...,spec_contrast_band_4_std,spec_contrast_band_5_std,spec_contrast_band_6_std,spec_contrast_band_7_std,spec_rolloff_mean,spec_rolloff_std,zcr_mean,zcr_std,rmse_mean,rmse_std
count,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,...,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0,153687.0
mean,-423.78551,102.472514,-6.831261,25.223064,2.653685,2.20584,-8.246838,-7.198746,-7.167236,-7.834317,...,5.087988,5.142466,8.38017,7.935775,3852.102317,2301.920853,0.099244,0.086102,0.045506,0.048407
std,72.663292,23.937435,18.979706,14.314564,13.207778,12.381509,8.86579,11.223413,9.070593,7.244752,...,0.885814,1.199569,2.313839,3.358012,1264.732632,620.724959,0.033269,0.025126,0.0262,0.02648
min,-1131.371216,-10.022559,-179.35405,-59.260406,-117.882278,-51.119732,-72.913689,-71.733627,-47.036198,-43.166531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-449.265427,85.414848,-18.753922,15.658614,-5.042217,-6.483629,-13.44541,-15.101054,-13.789881,-12.40662,...,4.472532,4.371707,6.950669,5.499864,3053.851044,1934.289527,0.077862,0.073118,0.030327,0.032584
50%,-414.052704,101.648323,-6.484858,26.06233,3.68327,1.978006,-8.190969,-7.33642,-7.019958,-7.96458,...,5.005613,5.008768,8.319714,7.145524,3711.697319,2283.2981,0.097099,0.088005,0.040157,0.042631
75%,-381.784637,118.712063,5.197469,35.023731,11.247384,10.594362,-2.515469,0.872001,-0.400692,-3.290712,...,5.609456,5.79247,9.884053,9.088214,4554.485841,2714.518369,0.117734,0.101255,0.055869,0.059515
max,-130.888443,243.761856,69.662437,115.599419,54.558987,60.683899,41.940166,63.374218,34.061249,36.054985,...,10.252549,14.99241,21.077323,26.842926,10163.976648,4424.978768,0.475348,0.211934,0.370823,0.382496
