# Data Conversion

This notebook transforms our soundfiles into a numerical dataset for our models.

In [95]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

In [96]:
#Imports
import numpy as np
import pandas as pd
import librosa
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm
import concurrent.futures
import tarfile
import os

## Preparation of the Overview Dataframe

In [97]:
def prepare_overview(list_of_files, folder_path=None):
    """
    This function takes a list of files and creates a joint dataframe for audio file conversion
    """
    overview = None
    for file in list_of_files:
        file_path = f"{folder_path}/{file}" if folder_path else file
        with open(file_path, "r") as file:
            if overview is None:
                overview = pd.read_csv(file, sep="\t", dtype={'sentence_domain': str}, low_memory=False)
            else:
                overview = pd.concat([overview, pd.read_csv(file, sep="\t", dtype={'sentence_domain': str}, low_memory=False)], ignore_index=True)
    return overview

In [98]:
def fix_gender(gender):
    if gender == "male":
        return "male_masculine"
    elif gender == "female":
        return "female_feminine"
    else:
        return gender

In [99]:
def preprocess_overview(overview, f_path):
    """
    This function takes a dataframe and preprocesses it for audio file conversion
    """
    # we only need files with a gender label
    overview = overview.dropna(subset=["gender"])
    # irrelevant columns for our analysis
    overview = overview.drop(columns=["variant", "segment", "sentence_id", "up_votes", "down_votes"])
    # fixing gender labels
    overview["gender"] = overview["gender"].apply(fix_gender)
    # dropping all files that are not simply male or female
    overview = overview[(overview["gender"] == "female_feminine") | (overview["gender"] == "male_masculine")]
    # limiting to a maximum of 5 random clips per client_id
    overview = overview.groupby("client_id").apply(lambda group: group.sample(n=min(len(group), 5), random_state=42)).reset_index(drop=True)
    # changing the path to reflect the location of the audio files
    overview["path"] = overview["path"].apply(lambda x: f"{f_path}/{x}")
    return overview.reset_index(drop=True)

In [130]:
list_of_files = ["validated.tsv","other.tsv"]
folder_path_danish="./data/cv-corpus-21.0-2025-03-14/da"
folder_path_swedish="./data/cv-corpus-21.0-2025-03-14/sv-SE"
folder_path_german="./data/cv-corpus-21.0-2025-03-14/de"
folder_path_french="./data/cv-corpus-21.0-2025-03-14/fr"
folder_path_spanish="./data/cv-corpus-21.0-2025-03-14/es"
folder_path_english="./data/cv-corpus-21.0-2025-03-14/en"
folder_path=folder_path_english
overview = prepare_overview(list_of_files,folder_path)
overview.info()
overview.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2202177 entries, 0 to 2202176
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   client_id        object 
 1   path             object 
 2   sentence_id      object 
 3   sentence         object 
 4   sentence_domain  object 
 5   up_votes         int64  
 6   down_votes       int64  
 7   age              object 
 8   gender           object 
 9   accents          object 
 10  variant          float64
 11  locale           object 
 12  segment          object 
dtypes: float64(1), int64(2), object(10)
memory usage: 218.4+ MB


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
count,2202177,2202177,2202177,2202177,824,2202177.0,2202177.0,1444447,1385521,1222442,0.0,2202177,45302
unique,90551,2202177,1408021,1403811,40,,,9,5,973,,1,1
top,372293e65cdab88771e028a4351651ab2eff64438ddafc...,common_voice_en_42693885.mp3,27c03788cf35f93e8360abf39d54da29f64de2842f4d5b...,yes,general,,,twenties,male_masculine,United States English,,en,Benchmark
freq,58069,1,3335,3335,467,,,537149,1001084,507256,,2202177,45302
mean,,,,,,2.051621,0.1634033,,,,,,
std,,,,,,3.664834,0.6419752,,,,,,
min,,,,,,0.0,0.0,,,,,,
25%,,,,,,2.0,0.0,,,,,,
50%,,,,,,2.0,0.0,,,,,,
75%,,,,,,2.0,0.0,,,,,,


In [131]:
overview.groupby("gender").size()

gender
do_not_wish_to_say        836
female_feminine        383206
male_masculine        1001084
non-binary                302
transgender                93
dtype: int64

In [132]:
len(overview.dropna(subset=["gender"])["client_id"].unique())

23033

In [134]:
# On my device, all audio files are in the same "clips" folder. Please change the path if your files are in a different location.
overview=preprocess_overview(overview, "clips")
overview.head()

KeyError: "['variant', 'segment', 'sentence_id', 'up_votes', 'down_votes'] not found in axis"

In [135]:
overview.describe(include="all")

Unnamed: 0,client_id,path,sentence,sentence_domain,age,gender,accents,locale
count,106233,106233,106233,20,105523,106233,71472,106233
unique,23002,106233,74392,8,9,2,348,1
top,fff79a004761f155408c00b531f86e59b59623acc9c5cd...,clips/common_voice_en_17920916.mp3,Hey,general,twenties,male_masculine,United States English,en
freq,5,1,379,9,45855,82771,35715,106233


In [136]:
overview.groupby("gender").size()

gender
female_feminine    23462
male_masculine     82771
dtype: int64

In [137]:
overview.groupby("age").size()

age
eighties       212
fifties       6889
fourties     11325
nineties        19
seventies     1509
sixties       4018
teens        13181
thirties     22515
twenties     45855
dtype: int64

We have a slight class imbalance that we may want to remove/have to deal with within the model building.

## Extraction of Audiofiles

This step was necessary as the unpacking of the full tar files for EN & ES took too long.

In [139]:
def extract_selected_files_flat(tar_path, file_paths, output_dir):
    """
    Extract specific files from a tar archive into a flat output_dir.
    Flattens the directory structure, extracting only the file name.
    """
    os.makedirs(output_dir, exist_ok=True)

    successful = []
    failed = []

    with tarfile.open(tar_path, "r") as tar:
        for raw_path in tqdm(file_paths, desc="Extracting"):
            # Normalize path to forward slashes
            path_in_tar = raw_path.replace("\\", "/")
            try:
                member = tar.getmember(path_in_tar)
                extracted_file = tar.extractfile(member)

                if extracted_file is None:
                    failed.append(path_in_tar)
                    continue

                # Get only the file name from the path (robust)
                file_name = path_in_tar.split("/")[-1]
                out_path = os.path.join(output_dir, "clips", file_name)

                with open(out_path, "wb") as f:
                    f.write(extracted_file.read())

                if os.path.isfile(out_path):
                    successful.append(path_in_tar)
                else:
                    failed.append(path_in_tar)

            except KeyError:
                failed.append(path_in_tar)

    print(f"Extracted {len(successful)} files.")
    if failed:
        print(f"Failed to extract {len(failed)} files.")
    return successful, failed



def ensure_files_unpacked(tar_path, tar_internal_path, file_names, unpacked_dir):
    """
    Ensure that selected files are unpacked in the given directory. 
    If any are missing, extract them from the tar archive.

    Parameters:
    - tar_path (str): Path to the .tar archive.
    - tar_internal_path (str): Folder path inside the tar archive (e.g., "data/").
    - file_names (list[str]): List of file names (not full paths) to check/extract.
    - unpacked_dir (str): Directory where files should be unpacked.
    """

    missing_files = []
    for file_name in tqdm(file_names, desc="Checking existing files", unit="file"):
        target_path = os.path.join(unpacked_dir, file_name)
        if not os.path.isfile(target_path):
            missing_files.append(tar_internal_path+"/"+file_name)

    if missing_files:
        print(f"{len(missing_files)} files missing. Extracting...")
        return extract_selected_files_flat(tar_path, missing_files, unpacked_dir)
    else:
        print("All files are already present.")


In [None]:
tar_path="./data/cv-corpus-21.0-2025-03-14-en.tar"
tar_internal_path="cv-corpus-21.0-2025-03-14/en"
successful, failed=ensure_files_unpacked(tar_path,tar_internal_path, overview["path"],folder_path)

Checking existing files: 100%|██████████| 106233/106233 [00:21<00:00, 4965.55file/s]


36306 files missing. Extracting...


Extracting:  25%|██▍       | 9015/36306 [1:31:47<4:00:49,  1.89it/s] 

## Conversion of the Audiofiles

In [109]:
def extract_features(file_path, alternate_path=None):
    try:
        # Use alternate_path if file_path is empty or invalid
        if not file_path or not os.path.isfile(file_path):
            if alternate_path and os.path.isfile(alternate_path):
                file_path = alternate_path
            else:
                raise FileNotFoundError(f"Neither file_path '{file_path}' nor alternate_path '{alternate_path}' is valid.")

        y, sr = librosa.load(file_path, sr=None, mono=True)

        # Feature extraction
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        spec_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        rmse = librosa.feature.rms(y=y)

        # Combine features into a single feature vector
        features = np.hstack([
            np.mean(mfccs, axis=1), np.std(mfccs, axis=1),
            np.mean(chroma, axis=1), np.std(chroma, axis=1),
            np.mean(spec_centroid), np.std(spec_centroid),
            np.mean(spec_bw), np.std(spec_bw),
            np.mean(spec_contrast, axis=1), np.std(spec_contrast, axis=1),
            np.mean(spec_rolloff), np.std(spec_rolloff),
            np.mean(zcr), np.std(zcr),
            np.mean(rmse), np.std(rmse)
        ])

        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [110]:
def build_columns():
    columns = []
    columns += [f"mfcc_{i+1:02d}_mean" for i in range(20)]
    columns += [f"mfcc_{i+1:02d}_std" for i in range(20)]
    columns += [f"chroma_{i+1:02d}_mean" for i in range(12)]
    columns += [f"chroma_{i+1:02d}_std" for i in range(12)]
    columns += ["spec_centroid_mean", "spec_centroid_std"]
    columns += ["spec_bandwidth_mean", "spec_bandwidth_std"]
    columns += [f"spec_contrast_band_{i+1}_mean" for i in range(7)]
    columns += [f"spec_contrast_band_{i+1}_std" for i in range(7)]
    columns += ["spec_rolloff_mean", "spec_rolloff_std"]
    columns += ["zcr_mean", "zcr_std"]
    columns += ["rmse_mean", "rmse_std"]
    return columns

In [111]:
def extract_features_from_dataframe(output_csv_path,df, folder_path,  alternate_folder_path=None, parallel=True):
    features_list = []
    valid_indices = []

    paths = [os.path.join(folder_path, name) for name in df["path"]]
    alternate_paths = [os.path.join(alternate_folder_path, name) for name in df["path"]] if alternate_folder_path else [None] * len(paths)

    # Use ThreadPoolExecutor for parallel processing
    # This helps to speed up the feature extraction process
    if parallel:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = {executor.submit(extract_features, path, alt_path): idx for idx, (path, alt_path) in enumerate(zip(paths, alternate_paths))}
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(paths), desc="Extracting features"):
                result = future.result()
                if result is not None:
                    features_list.append(result)
                    valid_indices.append(futures[future])
    else:
        for idx, (path, alt_path) in tqdm(enumerate(zip(paths, alternate_paths)), total=len(paths), desc="Extracting features"):
            result = extract_features(path, alt_path)
            if result is not None:
                features_list.append(result)
                valid_indices.append(idx)

    # Build features DataFrame
    columns = build_columns()
    features_df = pd.DataFrame(features_list, columns=columns)
    
    # Match features to original DataFrame
    merged_df = df.iloc[valid_indices].reset_index(drop=True)
    final_df = pd.concat([merged_df, features_df], axis=1)

    # Save to CSV
    final_df.to_csv(output_csv_path, index=False)
    print(f"Saved extracted features to {output_csv_path}")

    return final_df

In [126]:
df=extract_features_from_dataframe("./data/data_fr_5.csv", overview,folder_path, parallel=True)
# "./data/cv-corpus-21.0-2025-03-14/en/cv-corpus-21.0-2025-03-14/en/"
df.describe()

  return pitch_tuning(
Extracting features: 100%|██████████| 22520/22520 [18:11<00:00, 20.63it/s]


Saved extracted features to ./data/data_fr_5.csv


Unnamed: 0,mfcc_01_mean,mfcc_02_mean,mfcc_03_mean,mfcc_04_mean,mfcc_05_mean,mfcc_06_mean,mfcc_07_mean,mfcc_08_mean,mfcc_09_mean,mfcc_10_mean,...,spec_contrast_band_4_std,spec_contrast_band_5_std,spec_contrast_band_6_std,spec_contrast_band_7_std,spec_rolloff_mean,spec_rolloff_std,zcr_mean,zcr_std,rmse_mean,rmse_std
count,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,...,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0,22520.0
mean,-435.585915,100.124407,-0.262641,26.177692,8.901965,8.377105,-2.511029,0.709186,-4.137997,-2.914024,...,4.574114,4.658827,6.695377,6.435758,5158.199362,2912.247946,0.08561,0.07233,0.048332,0.055202
std,92.861914,26.407719,18.53052,14.803878,12.991278,13.111117,9.624064,9.085843,7.663946,6.625369,...,0.76538,0.871687,1.992876,3.085829,1911.621782,1017.818417,0.041168,0.029096,0.036787,0.034673
min,-1131.370972,-1.7164,-111.400131,-39.174934,-63.750172,-53.291653,-48.539371,-50.302868,-48.334457,-35.734669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-481.685211,83.676954,-11.931955,16.408361,0.727697,-0.917267,-8.495069,-5.183105,-9.140188,-7.201171,...,4.050382,4.07303,5.334677,4.595167,3765.518694,2099.685502,0.05647,0.051729,0.0226,0.029201
50%,-420.651428,100.9963,0.748238,26.222112,9.141967,8.204284,-2.103633,0.583287,-3.962007,-2.652189,...,4.480195,4.528772,6.497241,5.575383,4830.938399,2827.59546,0.077585,0.068228,0.042466,0.052296
75%,-376.020691,117.590273,12.200906,35.850368,17.467128,17.743413,3.986356,6.811297,1.037476,1.540046,...,4.998048,5.099118,7.819917,7.092344,6330.291905,3618.662676,0.106756,0.089837,0.065647,0.074852
max,-59.257011,242.315399,79.641441,94.552254,67.496872,59.000866,34.421612,38.607941,28.311029,29.611216,...,9.533967,10.681277,20.779267,24.503157,13157.848697,6462.594356,0.383748,0.25395,0.558873,0.293232
