# Data Conversion

This notebook converts our sound files into a numerical feature dataset for our models.

In [25]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

In [26]:
#Imports
import concurrent.futures
import os
import shutil
import tarfile

import librosa
import numpy as np
import pandas as pd
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm

## Preparation of the Overview Dataframe

In [27]:
def prepare_overview(list_of_files, folder_path=None):
    """
    Read several tab-separated metadata files and stack them into one dataframe.

    Parameters:
    - list_of_files (list[str]): File names (or full paths when folder_path is None).
    - folder_path (str | None): Folder that is prefixed to every file name.

    Returns:
    - pandas.DataFrame with the rows of all files in the given order and a
      fresh 0..n-1 index, or None when list_of_files is empty.
    """
    frames = []
    for file_name in list_of_files:
        file_path = f"{folder_path}/{file_name}" if folder_path else file_name
        # Let pandas open the file itself and decode it as UTF-8. Opening it
        # with open(file_path, "r") uses the platform default encoding, which
        # garbles (or fails on) non-ASCII sentences on Windows.
        frames.append(
            pd.read_csv(
                file_path,
                sep="\t",
                dtype={'sentence_domain': str},
                low_memory=False,
                encoding="utf-8",
            )
        )

    if not frames:
        return None
    # Concatenate once instead of re-copying the growing frame for every file.
    return pd.concat(frames, ignore_index=True)

In [28]:
def fix_gender(gender):
    """
    Map the legacy labels "male" / "female" to "male_masculine" /
    "female_feminine"; every other value is returned unchanged.
    """
    legacy_to_current = (
        ("male", "male_masculine"),
        ("female", "female_feminine"),
    )
    for legacy_label, current_label in legacy_to_current:
        if gender == legacy_label:
            return current_label
    return gender

In [29]:
def preprocess_overview(overview, f_path):
    """
    Reduce the combined metadata table to one labelled clip per speaker.

    Rows without a gender label are removed, unused columns are dropped, the
    legacy gender labels are mapped with fix_gender, only "female_feminine" /
    "male_masculine" rows are kept, one clip is drawn per client_id (fixed
    seed 27) and every clip name in "path" is prefixed with f_path.

    Returns a new dataframe with a fresh 0..n-1 index.
    """
    unused_columns = ["variant", "segment", "sentence_id", "up_votes", "down_votes"]

    # labelled rows only, without the columns the analysis never reads
    labelled = overview.dropna(subset=["gender"]).drop(columns=unused_columns)
    # bring the legacy labels in line with the current ones
    labelled = labelled.assign(gender=labelled["gender"].apply(fix_gender))

    # keep nothing but the two classes we model
    is_female = labelled["gender"] == "female_feminine"
    is_male = labelled["gender"] == "male_masculine"
    two_classes = labelled[is_female | is_male]

    # one randomly drawn clip per speaker
    one_per_speaker = two_classes.groupby("client_id").sample(n=1, random_state=27)

    # point "path" at the folder that holds the audio files
    located = one_per_speaker.assign(
        path=one_per_speaker["path"].apply(lambda clip_name: f"{f_path}/{clip_name}")
    )
    return located.reset_index(drop=True)

In [30]:
# Metadata tables that are combined into the overview
list_of_files = ["validated.tsv", "other.tsv"]

# Location of each language inside the Common Voice corpus folder
corpus_root = "./data/cv-corpus-21.0-2025-03-14"
folder_path_danish = f"{corpus_root}/da"
folder_path_swedish = f"{corpus_root}/sv-SE"
folder_path_german = f"{corpus_root}/de"
folder_path_french = f"{corpus_root}/fr"
folder_path_spanish = f"{corpus_root}/es"
folder_path_english = f"{corpus_root}/en"

# Language processed in this run
folder_path = folder_path_english

overview = prepare_overview(list_of_files, folder_path)
overview.info()
overview.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2202177 entries, 0 to 2202176
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   client_id        object 
 1   path             object 
 2   sentence_id      object 
 3   sentence         object 
 4   sentence_domain  object 
 5   up_votes         int64  
 6   down_votes       int64  
 7   age              object 
 8   gender           object 
 9   accents          object 
 10  variant          float64
 11  locale           object 
 12  segment          object 
dtypes: float64(1), int64(2), object(10)
memory usage: 218.4+ MB


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
count,2202177,2202177,2202177,2202177,824,2202177.0,2202177.0,1444447,1385521,1222442,0.0,2202177,45302
unique,90551,2202177,1408021,1403811,40,,,9,5,973,,1,1
top,372293e65cdab88771e028a4351651ab2eff64438ddafc...,common_voice_en_42693885.mp3,27c03788cf35f93e8360abf39d54da29f64de2842f4d5b...,yes,general,,,twenties,male_masculine,United States English,,en,Benchmark
freq,58069,1,3335,3335,467,,,537149,1001084,507256,,2202177,45302
mean,,,,,,2.051621,0.1634033,,,,,,
std,,,,,,3.664834,0.6419752,,,,,,
min,,,,,,0.0,0.0,,,,,,
25%,,,,,,2.0,0.0,,,,,,
50%,,,,,,2.0,0.0,,,,,,
75%,,,,,,2.0,0.0,,,,,,


In [31]:
# Number of clips per gender label in the raw metadata (rows without a label are not counted).
overview.groupby("gender").size()

gender
do_not_wish_to_say        836
female_feminine        383206
male_masculine        1001084
non-binary                302
transgender                93
dtype: int64

In [32]:
# Number of distinct speakers that have at least one clip with a gender label
overview.dropna(subset=["gender"])["client_id"].nunique(dropna=False)

23033

In [33]:
# On my device, all audio files are in the same "clips" folder. Please change the path if your files are in a different location.
# NOTE: this rebinds `overview` to the preprocessed table, so the cell cannot be re-run
# on its own; re-run the loading cell above first to get the raw table back.
overview=preprocess_overview(overview, "clips")
# Show the first rows ordered by clip path as a quick sanity check.
overview.sort_values(by="path").head()

Unnamed: 0,client_id,path,sentence,sentence_domain,age,gender,accents,locale
416,04960d53cc851eeb6d93f21a09e09ab36fe16943acb226...,clips/common_voice_en_100042.mp3,I should say not!,,fourties,male_masculine,United States English,en
19901,dd827ca548f469685d7d21bb88f8727594248ef02059b3...,clips/common_voice_en_100169.mp3,I wired her that I wasn't coming.,,twenties,female_feminine,United States English,en
4606,332c62f2e047781ad18df4e7f60fdea68aac9519024f9d...,clips/common_voice_en_100350.mp3,My laptop has crashed.,,twenties,male_masculine,Hong Kong English,en
19959,de2afcd83f933c3e522155d695166bcfcb771cc6ffe431...,clips/common_voice_en_100375.mp3,You've got the brain of a pancake.,,twenties,male_masculine,,en
4294,2ff5127cfa30a1007f613c512819497823d2fd9cd2ff30...,clips/common_voice_en_1004470.mp3,"The combination of trumpets, violins and a dru...",,teens,male_masculine,United States English,en


In [34]:
# Summary of the preprocessed table (one clip per speaker).
overview.describe(include="all")

Unnamed: 0,client_id,path,sentence,sentence_domain,age,gender,accents,locale
count,23002,23002,23002,6,22846,23002,15234,23002
unique,23002,23002,18944,6,9,2,348,1
top,fffb0df85bfb76b2e60cab81ba7716332beb7c776c4062...,clips/common_voice_en_17920916.mp3,five,technology_robotics,twenties,male_masculine,United States English,en
freq,1,1,89,1,10013,18019,7575,23002


In [35]:
# Class distribution after preprocessing: number of selected clips per gender.
overview.groupby("gender").size()

gender
female_feminine     4983
male_masculine     18019
dtype: int64

In [36]:
# Number of selected clips per age group (rows without an age label are not counted).
overview.groupby("age").size()

age
eighties        46
fifties       1462
fourties      2420
nineties         4
seventies      327
sixties        842
teens         2891
thirties      4841
twenties     10013
dtype: int64

The classes are clearly imbalanced (18,019 male vs. 4,983 female clips, roughly 78% / 22%). We will either have to rebalance the data or account for the imbalance during model building.

## Extraction of Audiofiles

Instead of unpacking the full tar archives, we extract only the clips selected above. This step was necessary because unpacking the full archives for EN and ES took too long.

In [37]:
def extract_selected_files_flat(tar_path, file_paths, output_dir):
    """
    Extract specific files from a tar archive into a flat output_dir.
    - Ignores internal folder structure from the tar (only the file name is kept).
    - Normalizes backslashes in the requested paths to forward slashes.
    - Streams every file to disk and only gives it its final name once it has
      been written completely.

    Parameters:
    - tar_path (str): Path to the tar archive.
    - file_paths (iterable[str]): Member paths inside the archive.
    - output_dir (str): Directory that receives the files (created if missing).

    Returns:
    - (successful, failed): two lists with the normalized member paths that
      were extracted / could not be extracted.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Normalize tar internal paths
    normalized_paths = [p.replace("\\", "/") for p in file_paths]

    successful = []
    failed = []

    with tarfile.open(tar_path, "r") as tar:
        for path_in_tar in tqdm(normalized_paths, desc="Extracting"):
            try:
                member = tar.getmember(path_in_tar)
                extracted_file = tar.extractfile(member)
            except KeyError:
                # Member (or the target of a link member) is not in the archive
                failed.append(path_in_tar)
                continue

            # extractfile returns None for members that are not files or links
            if extracted_file is None:
                failed.append(path_in_tar)
                continue

            # Extract just the file name
            file_name = os.path.basename(path_in_tar)
            out_path = os.path.join(output_dir, file_name)
            partial_path = out_path + ".part"

            # Write to a temporary name first: if the copy is interrupted, no
            # truncated file is left under the final name, where a later
            # "does the file exist?" check would mistake it for a complete one.
            try:
                with extracted_file, open(partial_path, "wb") as f:
                    # Copy in chunks instead of loading the whole clip into memory
                    shutil.copyfileobj(extracted_file, f)
                os.replace(partial_path, out_path)
            except BaseException:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise

            # Confirm success
            if os.path.isfile(out_path):
                successful.append(path_in_tar)
            else:
                failed.append(path_in_tar)

    print(f"Extracted {len(successful)} files.")
    if failed:
        print(f"Failed to extract {len(failed)} files.")
    return successful, failed


def ensure_files_unpacked(tar_path, tar_internal_path, file_names, unpacked_dir):
    """
    Ensure that selected files are unpacked in the given directory.
    If any are missing, extract them from the tar archive.

    A file counts as present when os.path.join(unpacked_dir, file_name) is an
    existing file. Missing files are extracted with extract_selected_files_flat
    into the directory of exactly that path, so a second call finds them.

    Parameters:
    - tar_path (str): Path to the .tar archive.
    - tar_internal_path (str): Folder path inside the tar archive (e.g., "data/").
      A trailing "/" is ignored.
    - file_names (iterable[str]): File names relative to both unpacked_dir and
      tar_internal_path (may contain sub-folders such as "clips/x.mp3").
    - unpacked_dir (str): Directory where files should be unpacked.
    """
    prefix = tar_internal_path.rstrip("/")

    # Missing archive members, grouped by the directory they have to end up in
    missing_by_dir = {}
    example_shown = False
    for file_name in tqdm(file_names, desc="Checking existing files", unit="file"):
        target_path = os.path.join(unpacked_dir, file_name)
        if os.path.isfile(target_path):
            continue

        member_path = f"{prefix}/{file_name}" if prefix else file_name
        if not example_shown:
            # Print the first missing file once, to make path problems visible
            example_shown = True
            print(target_path)
            print(member_path)
        missing_by_dir.setdefault(os.path.dirname(target_path), []).append(member_path)

    missing_count = sum(len(members) for members in missing_by_dir.values())
    if missing_count:
        print(f"{missing_count} files missing. Extracting...")
        for output_dir, member_paths in missing_by_dir.items():
            extract_selected_files_flat(tar_path, member_paths, output_dir)
    else:
        print("All files are already present.")


In [38]:
# Archive of the English corpus and the folder inside it that contains the "clips" directory.
tar_path="./data/cv-corpus-21.0-2025-03-14-en.tar"
tar_internal_path="cv-corpus-21.0-2025-03-14/en"
# Check every selected clip below folder_path and extract the missing ones from the archive.
ensure_files_unpacked(tar_path,tar_internal_path, overview["path"],folder_path)

Checking existing files:   2%|▏         | 481/23002 [00:00<00:09, 2475.50file/s]

./data/cv-corpus-21.0-2025-03-14/en\clips/common_voice_en_90906.mp3
cv-corpus-21.0-2025-03-14/en/clips/common_voice_en_90906.mp3


Checking existing files: 100%|██████████| 23002/23002 [00:06<00:00, 3353.35file/s]


8822 files missing. Extracting...


Extracting: 100%|██████████| 8822/8822 [14:49<00:00,  9.91it/s]   


Extracted 8822 files.


## Conversion of the Audiofiles

In [40]:
def extract_features(file_path, alternate_path=None):
    """
    Summarise one audio file as a flat numeric feature vector.

    The file is read from file_path, or from alternate_path when file_path is
    empty or not an existing file. MFCCs, chroma, spectral centroid, spectral
    bandwidth, spectral contrast, spectral rolloff, zero-crossing rate and RMS
    energy are computed with librosa; each is reduced to its mean and standard
    deviation over time (per row for MFCC, chroma and contrast; over all
    values otherwise) and the results are concatenated in that order.

    Returns the feature vector, or None when anything goes wrong (the error is
    printed instead of raised).
    """
    try:
        # Fall back to alternate_path when the primary path cannot be used
        primary_usable = bool(file_path) and os.path.isfile(file_path)
        if not primary_usable:
            fallback_usable = bool(alternate_path) and os.path.isfile(alternate_path)
            if not fallback_usable:
                raise FileNotFoundError(f"Neither file_path '{file_path}' nor alternate_path '{alternate_path}' is valid.")
            file_path = alternate_path

        # sr=None keeps the sample rate of the file; mono=True mixes down to one channel
        signal, sample_rate = librosa.load(file_path, sr=None, mono=True)

        # Frame-wise features
        mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=20)
        chroma = librosa.feature.chroma_stft(y=signal, sr=sample_rate)
        spec_centroid = librosa.feature.spectral_centroid(y=signal, sr=sample_rate)
        spec_bw = librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate)
        spec_contrast = librosa.feature.spectral_contrast(y=signal, sr=sample_rate)
        spec_rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sample_rate)
        zcr = librosa.feature.zero_crossing_rate(signal)
        rmse = librosa.feature.rms(y=signal)

        # (feature, summarise each row separately?) in output order
        summary_plan = (
            (mfccs, True),
            (chroma, True),
            (spec_centroid, False),
            (spec_bw, False),
            (spec_contrast, True),
            (spec_rolloff, False),
            (zcr, False),
            (rmse, False),
        )

        # Mean followed by standard deviation for every feature
        pieces = []
        for values, per_row in summary_plan:
            axis = 1 if per_row else None
            pieces.append(np.mean(values, axis=axis))
            pieces.append(np.std(values, axis=axis))

        return np.hstack(pieces)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [41]:
def build_columns():
    """
    Return the 88 column names of the feature vector, in the order in which
    extract_features concatenates its values (mean block, then std block, for
    every feature).
    """
    stats = ("mean", "std")

    def numbered(template, count):
        # all "mean" names first, then all "std" names
        return [template.format(i=i, stat=stat) for stat in stats for i in range(1, count + 1)]

    def single(prefix):
        return [f"{prefix}_{stat}" for stat in stats]

    return (
        numbered("mfcc_{i:02d}_{stat}", 20)
        + numbered("chroma_{i:02d}_{stat}", 12)
        + single("spec_centroid")
        + single("spec_bandwidth")
        + numbered("spec_contrast_band_{i}_{stat}", 7)
        + single("spec_rolloff")
        + single("zcr")
        + single("rmse")
    )

In [42]:
def extract_features_from_dataframe(output_csv_path,df, folder_path,  alternate_folder_path=None, parallel=True, max_workers=8):
    """
    Extract the audio features of every clip listed in df["path"], attach them
    to the metadata and save the result as CSV.

    Parameters:
    - output_csv_path (str): Destination of the CSV file; its folder is created if missing.
    - df (pandas.DataFrame): Metadata with a "path" column (relative clip paths).
    - folder_path (str): Folder the clip paths are relative to.
    - alternate_folder_path (str | None): Second folder that is tried for a
      clip when it is not found below folder_path.
    - parallel (bool): Extract with a thread pool (True) or one clip at a time (False).
    - max_workers (int): Number of threads used when parallel is True.

    Returns:
    - pandas.DataFrame: the rows of df whose clip could be processed, in their
      original order, followed by the feature columns of build_columns().
      Clips for which extract_features returned None are left out.
    """
    paths = [os.path.join(folder_path, name) for name in df["path"]]
    if alternate_folder_path:
        alternate_paths = [os.path.join(alternate_folder_path, name) for name in df["path"]]
    else:
        alternate_paths = [None] * len(paths)

    if parallel:
        # Executor.map yields the results in input order. Collecting them in
        # completion order instead would shuffle the rows differently on every run.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(tqdm(
                executor.map(extract_features, paths, alternate_paths),
                total=len(paths),
                desc="Extracting features",
            ))
    else:
        results = [
            extract_features(path, alt_path)
            for path, alt_path in tqdm(zip(paths, alternate_paths), total=len(paths), desc="Extracting features")
        ]

    # Positions of the clips that could be processed
    valid_indices = [idx for idx, result in enumerate(results) if result is not None]
    features_list = [results[idx] for idx in valid_indices]

    # Build features DataFrame
    columns = build_columns()
    features_df = pd.DataFrame(features_list, columns=columns)

    # Match features to original DataFrame (positional, so any index of df works)
    merged_df = df.iloc[valid_indices].reset_index(drop=True)
    final_df = pd.concat([merged_df, features_df], axis=1)

    # Save to CSV; make sure the target folder exists so that a long extraction
    # is not lost to a missing directory
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    final_df.to_csv(output_csv_path, index=False)
    print(f"Saved extracted features to {output_csv_path}")

    return final_df

In [43]:
# Extract the features of all selected clips and save them to ./data/data_en.csv.
# The fourth argument is a fallback folder that is tried for clips not found below folder_path.
df=extract_features_from_dataframe("./data/data_en.csv", overview,folder_path, "./data/cv-corpus-21.0-2025-03-14/en/cv-corpus-21.0-2025-03-14/en/", parallel=True)
# Summary statistics of the numeric columns as a quick sanity check.
df.describe()

  return pitch_tuning(
Extracting features: 100%|██████████| 23002/23002 [20:27<00:00, 18.74it/s]


Saved extracted features to ./data/data_en.csv


Unnamed: 0,mfcc_01_mean,mfcc_02_mean,mfcc_03_mean,mfcc_04_mean,mfcc_05_mean,mfcc_06_mean,mfcc_07_mean,mfcc_08_mean,mfcc_09_mean,mfcc_10_mean,...,spec_contrast_band_4_std,spec_contrast_band_5_std,spec_contrast_band_6_std,spec_contrast_band_7_std,spec_rolloff_mean,spec_rolloff_std,zcr_mean,zcr_std,rmse_mean,rmse_std
count,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,...,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0,23002.0
mean,-434.501995,108.755445,-1.058766,22.340231,7.01632,7.731335,-3.432029,-0.984119,-5.099465,-4.12776,...,4.657649,4.935513,6.953861,6.48582,4866.416106,2888.016472,0.083486,0.072164,0.046331,0.052412
std,93.416798,26.99684,19.239209,15.034002,13.1469,13.236696,9.936667,9.453819,8.179515,7.147226,...,0.773756,0.954049,2.115196,2.78215,1759.716621,985.443786,0.038956,0.028903,0.035065,0.033258
min,-1131.370972,0.0,-143.090866,-55.587875,-87.163132,-46.115959,-59.346558,-71.733627,-48.559692,-42.872486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-481.588402,91.741686,-12.97286,12.255238,-1.211499,-1.582539,-9.706611,-7.024237,-10.369785,-8.819454,...,4.118929,4.284649,5.493028,4.83608,3602.197478,2125.174208,0.055663,0.051493,0.021572,0.026813
50%,-418.977036,109.261738,-0.156317,22.100967,7.401886,7.642539,-2.971916,-0.891852,-4.905287,-4.096699,...,4.570078,4.809503,6.764456,5.842784,4581.160961,2816.779948,0.076716,0.068785,0.040537,0.048954
75%,-374.433937,126.670219,11.754762,32.122928,15.624801,16.872774,3.220054,5.112296,0.452343,0.604988,...,5.104939,5.47387,8.241973,7.166099,5877.642517,3560.317985,0.103696,0.090305,0.063421,0.072397
max,-37.618797,214.820938,69.936562,107.869583,65.791161,64.013359,33.583839,44.255611,31.633854,36.067753,...,9.714327,12.379227,18.671953,26.842926,16027.782207,6368.249321,0.626398,0.284356,0.463531,0.288863
