# Data Conversion

This notebook transforms our soundfiles into a numerical dataset for our models.

In [25]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

In [26]:
#Imports
import numpy as np
import pandas as pd
import librosa
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm
import concurrent.futures
import tarfile
import os

## Preparation of the Overview Dataframe

In [27]:
def prepare_overview(list_of_files, folder_path=None):
    """
    This function takes a list of files and creates a joint dataframe for audio file conversion
    """
    overview = None
    for file in list_of_files:
        file_path = f"{folder_path}/{file}" if folder_path else file
        with open(file_path, "r") as file:
            if overview is None:
                overview = pd.read_csv(file, sep="\t", dtype={'sentence_domain': str}, low_memory=False)
            else:
                overview = pd.concat([overview, pd.read_csv(file, sep="\t", dtype={'sentence_domain': str}, low_memory=False)], ignore_index=True)
    return overview

In [28]:
def fix_gender(gender):
    if gender == "male":
        return "male_masculine"
    elif gender == "female":
        return "female_feminine"
    else:
        return gender

In [50]:
def preprocess_overview(overview, f_path):
    """
    This function takes a dataframe and preprocesses it for audio file conversion
    """
    # we only need files with a gender label
    overview = overview.dropna(subset=["gender"])
    # irrelevant columns for our analysis
    overview = overview.drop(columns=["variant", "segment", "sentence_id", "up_votes", "down_votes"])
    # fixing gender labels
    overview["gender"] = overview["gender"].apply(fix_gender)
    # dropping all files that are not simply male or female
    overview = overview[(overview["gender"] == "female_feminine") | (overview["gender"] == "male_masculine")]
    # limiting to a maximum of 5 random clips per client_id
    overview = overview.groupby("client_id").apply(lambda group: group.sample(n=min(len(group), 5), random_state=42)).reset_index(drop=True)
    # changing the path to reflect the location of the audio files
    overview["path"] = overview["path"].apply(lambda x: f"{f_path}/{x}")
    return overview.reset_index(drop=True)

In [58]:
list_of_files = ["validated.tsv","other.tsv"]
folder_path_danish="./data/cv-corpus-21.0-2025-03-14/da"
folder_path_swedish="./data/cv-corpus-21.0-2025-03-14/sv-SE"
folder_path_german="./data/cv-corpus-21.0-2025-03-14/de"
folder_path_french="./data/cv-corpus-21.0-2025-03-14/fr"
folder_path_spanish="./data/cv-corpus-21.0-2025-03-14/es"
folder_path_english="./data/cv-corpus-21.0-2025-03-14/en"
folder_path=folder_path_spanish
overview = prepare_overview(list_of_files,folder_path)
overview.info()
overview.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1569569 entries, 0 to 1569568
Data columns (total 13 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   client_id        1569569 non-null  object 
 1   path             1569569 non-null  object 
 2   sentence_id      1569569 non-null  object 
 3   sentence         1569566 non-null  object 
 4   sentence_domain  105 non-null      object 
 5   up_votes         1569569 non-null  int64  
 6   down_votes       1569569 non-null  int64  
 7   age              1377285 non-null  object 
 8   gender           1371150 non-null  object 
 9   accents          1204289 non-null  object 
 10  variant          0 non-null        float64
 11  locale           1569569 non-null  object 
 12  segment          63496 non-null    object 
dtypes: float64(1), int64(2), object(10)
memory usage: 155.7+ MB


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
count,1569569,1569569,1569569,1569566,105,1569569.0,1569569.0,1377285,1371150,1204289,0.0,1569569,63496
unique,25685,1569569,1036205,1036050,19,,,9,3,187,,1,1
top,a97730f86fa90560ae105669364412a9ad393b32839d01...,common_voice_es_42688245.mp3,ad51886652f88e4d616d47ac4a2c80861588b150feae4c...,siete,general,,,twenties,male_masculine,México,,es,Benchmark
freq,146108,1,4659,4659,24,,,836579,872916,833169,,1569569,63496
mean,,,,,,0.6457817,0.05464175,,,,,,
std,,,,,,1.048609,0.2622993,,,,,,
min,,,,,,0.0,0.0,,,,,,
25%,,,,,,0.0,0.0,,,,,,
50%,,,,,,0.0,0.0,,,,,,
75%,,,,,,2.0,0.0,,,,,,


In [59]:
overview.groupby("gender").size()

gender
do_not_wish_to_say         5
female_feminine       498229
male_masculine        872916
dtype: int64

In [60]:
len(overview.dropna(subset=["gender"])["client_id"].unique())

6234

In [61]:
# On my device, all audio files are in the same "clips" folder. Please change the path if your files are in a different location.
overview=preprocess_overview(overview, "clips")
overview.sort_values(by="path").head()

  overview = overview.groupby("client_id").apply(lambda group: group.sample(n=min(len(group), 5), random_state=42)).reset_index(drop=True)


Unnamed: 0,client_id,path,sentence,sentence_domain,age,gender,accents,locale
10712,5d616fbeb235ea174cc73794b8f3ef28be4d414a941d42...,clips/common_voice_es_18306566.mp3,"se lo daremos , señor ... y después la corona ...",,thirties,male_masculine,"España: Norte peninsular (Asturias, Castilla y...",es
10711,5d616fbeb235ea174cc73794b8f3ef28be4d414a941d42...,clips/common_voice_es_18306604.mp3,"salieron nueve mercantes , cuatro pesqueros de...",,thirties,male_masculine,"España: Norte peninsular (Asturias, Castilla y...",es
10713,5d616fbeb235ea174cc73794b8f3ef28be4d414a941d42...,clips/common_voice_es_18307339.mp3,"los bosques de Campeche ,",,thirties,male_masculine,"España: Norte peninsular (Asturias, Castilla y...",es
16377,8d9937b88227497ab43d9eebdea32b61100f53e7c749e3...,clips/common_voice_es_18307761.mp3,"y calló , tal vez esperando una disculpa amant...",,fifties,male_masculine,,es
16375,8d9937b88227497ab43d9eebdea32b61100f53e7c749e3...,clips/common_voice_es_18307940.mp3,pero yo hijos no te voy a dar .,,fifties,male_masculine,,es


In [62]:
overview.describe(include="all")

Unnamed: 0,client_id,path,sentence,sentence_domain,age,gender,accents,locale
count,29789,29789,29789,2,29666,29789,22434,29789
unique,6233,29789,24241,2,9,2,101,1
top,fffd6064784fc9c4162357a3a91e9e167b48bbf572da1c...,clips/common_voice_es_19141587.mp3,tres,"finance,language_fundamentals",twenties,male_masculine,México,es
freq,5,1,406,1,13120,21195,5599,29789


In [63]:
overview.groupby("gender").size()

gender
female_feminine     8594
male_masculine     21195
dtype: int64

In [64]:
overview.groupby("age").size()

age
eighties        18
fifties       2354
fourties      4292
nineties         5
seventies      101
sixties        715
teens         3248
thirties      5813
twenties     13120
dtype: int64

We have a slight class imbalance that we may want to remove/have to deal with within the model building.

## Extraction of Audiofiles

This step was necessary as the unpacking of the full tar files for EN & ES took too long.

In [56]:
def extract_selected_files_flat(tar_path, file_paths, output_dir):
    """
    Extract specific files from a tar archive into a flat output_dir.
    Flattens the directory structure, extracting only the file name.
    """
    os.makedirs(output_dir, exist_ok=True)

    successful = []
    failed = []

    with tarfile.open(tar_path, "r") as tar:
        for raw_path in tqdm(file_paths, desc="Extracting"):
            # Normalize path to forward slashes
            path_in_tar = raw_path.replace("\\", "/")
            try:
                member = tar.getmember(path_in_tar)
                extracted_file = tar.extractfile(member)

                if extracted_file is None:
                    failed.append(path_in_tar)
                    continue

                # Get only the file name from the path (robust)
                file_name = path_in_tar.split("/")[-1]
                out_path = os.path.join(output_dir, file_name)

                with open(out_path, "wb") as f:
                    f.write(extracted_file.read())

                if os.path.isfile(out_path):
                    successful.append(path_in_tar)
                else:
                    failed.append(path_in_tar)

            except KeyError:
                failed.append(path_in_tar)

    print(f"Extracted {len(successful)} files.")
    if failed:
        print(f"Failed to extract {len(failed)} files.")
    return successful, failed



def ensure_files_unpacked(tar_path, tar_internal_path, file_names, unpacked_dir):
    """
    Ensure that selected files are unpacked in the given directory. 
    If any are missing, extract them from the tar archive.

    Parameters:
    - tar_path (str): Path to the .tar archive.
    - tar_internal_path (str): Folder path inside the tar archive (e.g., "data/").
    - file_names (list[str]): List of file names (not full paths) to check/extract.
    - unpacked_dir (str): Directory where files should be unpacked.
    """

    missing_files = []
    once=True
    for file_name in tqdm(file_names, desc="Checking existing files", unit="file"):
        target_path = os.path.join(unpacked_dir, file_name)
        if not os.path.isfile(target_path):
            if once:
                once=False
                print(target_path)
                print(tar_internal_path+"/"+file_name)
            missing_files.append(tar_internal_path+"/"+file_name)

    if missing_files:
        print(f"{len(missing_files)} files missing. Extracting...")
        extract_selected_files(tar_path, missing_files, unpacked_dir)
    else:
        print("All files are already present.")


In [65]:
tar_path="./data/cv-corpus-21.0-2025-03-14-es.tar"
tar_internal_path="cv-corpus-21.0-2025-03-14/es"
ensure_files_unpacked(tar_path,tar_internal_path, overview["path"],folder_path)

Checking existing files:   1%|▏         | 438/29789 [00:00<00:13, 2203.44file/s]

./data/cv-corpus-21.0-2025-03-14/es\clips/common_voice_es_34950484.mp3
cv-corpus-21.0-2025-03-14/es/clips/common_voice_es_34950484.mp3


Checking existing files: 100%|██████████| 29789/29789 [00:09<00:00, 3163.18file/s]


2617 files missing. Extracting...


Extracting: 100%|██████████| 2617/2617 [07:20<00:00,  5.95it/s]  


Extracted 2617 files.


## Conversion of the Audiofiles

In [40]:
def extract_features(file_path, alternate_path=None):
    try:
        # Use alternate_path if file_path is empty or invalid
        if not file_path or not os.path.isfile(file_path):
            if alternate_path and os.path.isfile(alternate_path):
                file_path = alternate_path
            else:
                raise FileNotFoundError(f"Neither file_path '{file_path}' nor alternate_path '{alternate_path}' is valid.")

        y, sr = librosa.load(file_path, sr=None, mono=True)

        # Feature extraction
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        spec_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        rmse = librosa.feature.rms(y=y)

        # Combine features into a single feature vector
        features = np.hstack([
            np.mean(mfccs, axis=1), np.std(mfccs, axis=1),
            np.mean(chroma, axis=1), np.std(chroma, axis=1),
            np.mean(spec_centroid), np.std(spec_centroid),
            np.mean(spec_bw), np.std(spec_bw),
            np.mean(spec_contrast, axis=1), np.std(spec_contrast, axis=1),
            np.mean(spec_rolloff), np.std(spec_rolloff),
            np.mean(zcr), np.std(zcr),
            np.mean(rmse), np.std(rmse)
        ])

        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [41]:
def build_columns():
    columns = []
    columns += [f"mfcc_{i+1:02d}_mean" for i in range(20)]
    columns += [f"mfcc_{i+1:02d}_std" for i in range(20)]
    columns += [f"chroma_{i+1:02d}_mean" for i in range(12)]
    columns += [f"chroma_{i+1:02d}_std" for i in range(12)]
    columns += ["spec_centroid_mean", "spec_centroid_std"]
    columns += ["spec_bandwidth_mean", "spec_bandwidth_std"]
    columns += [f"spec_contrast_band_{i+1}_mean" for i in range(7)]
    columns += [f"spec_contrast_band_{i+1}_std" for i in range(7)]
    columns += ["spec_rolloff_mean", "spec_rolloff_std"]
    columns += ["zcr_mean", "zcr_std"]
    columns += ["rmse_mean", "rmse_std"]
    return columns

In [42]:
def extract_features_from_dataframe(output_csv_path,df, folder_path,  alternate_folder_path=None, parallel=True):
    features_list = []
    valid_indices = []

    paths = [os.path.join(folder_path, name) for name in df["path"]]
    alternate_paths = [os.path.join(alternate_folder_path, name) for name in df["path"]] if alternate_folder_path else [None] * len(paths)

    # Use ThreadPoolExecutor for parallel processing
    # This helps to speed up the feature extraction process
    if parallel:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = {executor.submit(extract_features, path, alt_path): idx for idx, (path, alt_path) in enumerate(zip(paths, alternate_paths))}
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(paths), desc="Extracting features"):
                result = future.result()
                if result is not None:
                    features_list.append(result)
                    valid_indices.append(futures[future])
    else:
        for idx, (path, alt_path) in tqdm(enumerate(zip(paths, alternate_paths)), total=len(paths), desc="Extracting features"):
            result = extract_features(path, alt_path)
            if result is not None:
                features_list.append(result)
                valid_indices.append(idx)

    # Build features DataFrame
    columns = build_columns()
    features_df = pd.DataFrame(features_list, columns=columns)
    
    # Match features to original DataFrame
    merged_df = df.iloc[valid_indices].reset_index(drop=True)
    final_df = pd.concat([merged_df, features_df], axis=1)

    # Save to CSV
    final_df.to_csv(output_csv_path, index=False)
    print(f"Saved extracted features to {output_csv_path}")

    return final_df

In [None]:
df=extract_features_from_dataframe("./data/data_es_5.csv", overview,folder_path, "./data/cv-corpus-21.0-2025-03-14/es/cv-corpus-21.0-2025-03-14/es/", parallel=True)
df.describe()

Extracting features:   0%|          | 117/29789 [00:14<48:27, 10.21it/s]  