# Data Conversion

This notebook transforms our sound files into a numerical dataset for our models.

In [1]:
#Potentially needed installations
#!pip install librosa
# !pip install playsound
# !pip install tqdm

In [2]:
#Imports
import numpy as np
import pandas as pd
import librosa
#from playsound import playsound
#This is used to show a progress bar in the terminal. Helpful as the conversion can take a while.
from tqdm import tqdm
import concurrent.futures

## Preparation of the Overview Dataframe

In [3]:
def prepare_overview(list_of_files, folder_path=None):
    """
    Read several tab-separated metadata files and stack them into one dataframe.

    Parameters
    ----------
    list_of_files : iterable of str
        Names of the TSV files to read. When ``folder_path`` is not given,
        each entry is used as the path itself.
    folder_path : str, optional
        Directory that is prepended to every entry as ``f"{folder_path}/{name}"``.

    Returns
    -------
    pandas.DataFrame or None
        The rows of all files in input order with a fresh ``0..n-1`` index,
        or ``None`` when ``list_of_files`` is empty.
    """
    frames = []
    for file_name in list_of_files:
        file_path = f"{folder_path}/{file_name}" if folder_path else file_name
        # The metadata contains non-ASCII text (e.g. Swedish sentences), so do not
        # depend on the platform default encoding. Assumes the files are UTF-8.
        with open(file_path, "r", encoding="utf-8") as handle:
            frames.append(pd.read_csv(handle, sep="\t"))

    if not frames:
        return None
    # Concatenate once instead of re-copying the growing frame on every iteration.
    return pd.concat(frames, ignore_index=True)

In [4]:
def fix_gender(gender):
    """
    Translate the short gender labels into their long form.

    "male" becomes "male_masculine" and "female" becomes "female_feminine";
    every other value is handed back unchanged.
    """
    renames = (
        ("male", "male_masculine"),
        ("female", "female_feminine"),
    )
    for short_label, long_label in renames:
        if gender == short_label:
            return long_label
    return gender

In [5]:
def preprocess_overview(overview, folder_path, random_state=27):
    """
    Reduce the metadata to one labelled clip per speaker, ready for audio conversion.

    Steps, in order:
    1. drop rows without a gender label,
    2. drop metadata columns that are not used in the analysis,
    3. rename the short labels "male"/"female" to "male_masculine"/"female_feminine",
    4. keep only rows labelled "female_feminine" or "male_masculine",
    5. randomly pick one row per ``client_id``,
    6. prefix every ``path`` entry with ``folder_path``.

    The input dataframe is not modified.

    Parameters
    ----------
    overview : pandas.DataFrame
        Metadata with at least the columns ``client_id``, ``path`` and ``gender``.
    folder_path : str
        Folder that contains the audio files; prepended as ``f"{folder_path}/{path}"``.
    random_state : int, optional
        Seed for the per-speaker sampling. The default (27) reproduces the
        previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per ``client_id`` with a fresh ``0..n-1`` index.
    """
    gender_renames = {"male": "male_masculine", "female": "female_feminine"}
    unused_columns = ["variant", "segment", "sentence_id", "up_votes", "down_votes"]

    # errors="ignore": the unused columns are optional, so a metadata file that
    # lacks some of them must not abort preprocessing with a KeyError.
    labelled = overview.dropna(subset=["gender"]).drop(columns=unused_columns, errors="ignore")
    labelled = labelled.assign(
        gender=labelled["gender"].map(lambda label: gender_renames.get(label, label))
    )

    # Only the two binary labels are kept; everything else is discarded.
    binary = labelled[labelled["gender"].isin(list(gender_renames.values()))]

    # One randomly chosen clip per speaker.
    one_per_client = binary.groupby("client_id").sample(n=1, random_state=random_state)

    # Point the path at the actual location of the audio files.
    located = one_per_client.assign(
        path=one_per_client["path"].map(lambda name: f"{folder_path}/{name}")
    )
    return located.reset_index(drop=True)

In [19]:
# Metadata files to combine; both are read from the selected language folder.
list_of_files = ["validated.tsv","other.tsv"]
# Language folders of the cv-corpus-21.0 release (Danish and Swedish).
folder_path_danish="./data/cv-corpus-21.0-2025-03-14/da"
folder_path_swedish="./data/cv-corpus-21.0-2025-03-14/sv-SE"
# Select the language to process here; later cells read `folder_path`.
folder_path=folder_path_swedish
# Combined, still unfiltered metadata of all listed files.
overview = prepare_overview(list_of_files,folder_path)
# Column dtypes / non-null counts, followed by summary statistics of every column.
overview.info()
overview.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48286 entries, 0 to 48285
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   client_id        48286 non-null  object 
 1   path             48286 non-null  object 
 2   sentence_id      48286 non-null  object 
 3   sentence         48286 non-null  object 
 4   sentence_domain  1 non-null      object 
 5   up_votes         48286 non-null  int64  
 6   down_votes       48286 non-null  int64  
 7   age              39471 non-null  object 
 8   gender           38493 non-null  object 
 9   accents          2025 non-null   object 
 10  variant          0 non-null      float64
 11  locale           48286 non-null  object 
 12  segment          34 non-null     object 
dtypes: float64(1), int64(2), object(10)
memory usage: 4.8+ MB


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
count,48286,48286,48286,48286,1,48286.0,48286.0,39471,38493,2025,0.0,48286,34
unique,854,48286,22485,22482,1,,,8,2,33,,1,1
top,672229f2d103c33ac3790d10c9b32b3b36a111b47c1af0...,common_voice_sv-SE_42574105.mp3,2f29fd368abca5fa5f68cdf4e49e58759784cf0be3c403...,En bred fågel.,media_entertainment,,,fourties,male_masculine,Skånska,,sv-SE,Benchmark
freq,12100,1,8,8,1,,,18419,22661,664,,48286,34
mean,,,,,,1.855445,0.066707,,,,,,
std,,,,,,0.855139,0.270849,,,,,,
min,,,,,,0.0,0.0,,,,,,
25%,,,,,,2.0,0.0,,,,,,
50%,,,,,,2.0,0.0,,,,,,
75%,,,,,,2.0,0.0,,,,,,


In [20]:
# Number of rows per gender label in the combined metadata (rows without a label are not counted).
overview.groupby("gender").size()

gender
female_feminine    15832
male_masculine     22661
dtype: int64

In [None]:
# The audio files are expected in the "clips" subfolder of the language folder. Please change the path if your files are in a different location.
# NOTE: this cell rebinds `overview` to the preprocessed frame, so re-run the loading cell above before running this cell again.
overview=preprocess_overview(overview, folder_path+"/clips")
overview.head()

Unnamed: 0,client_id,path,sentence,sentence_domain,age,gender,accents,locale
0,010246d26e7eab48c8e2cec624120c7e61b54499ffd01d...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,"Gräset är för högt nu, det måste slås.",,twenties,male_masculine,,sv-SE
1,02a0c47555a29d08a1d5b5751e5f52a3dc2661f8b11cd5...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,Hon måste ha ändrat dem efter att ritningarna ...,,twenties,female_feminine,,sv-SE
2,0420c3d5c2c4d5172363dece116475da807b9c81a52005...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,Vad tänker ni göra?,,thirties,female_feminine,,sv-SE
3,045631584eeb1219b8c91fc059a83d704c6f813d1b2906...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,Jag ska göra allt jag kan för att hitta adressen.,,thirties,male_masculine,,sv-SE
4,0477ecb563e9e78844854fdbeaa7d711c82922b4560e55...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,"Om insjöfåglarnas folk vill lyda mitt råd, bör...",,fifties,male_masculine,,sv-SE
...,...,...,...,...,...,...,...,...
240,f1e639097ee4059ff2717fdf9a0a2d38031fa0b55d5ff4...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,Det är nog salvia.,,twenties,male_masculine,,sv-SE
241,f5b6b0249fea41111094edd9c17830ecb849c8e4235bd7...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,Vi har valt koncentration framför småhem.,,twenties,male_masculine,,sv-SE
242,f65ab2934d6232d0463b5f1b35f3c37d8dd8942c5872e2...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,Han vände sig långsamt om.,,thirties,male_masculine,,sv-SE
243,f720c999dd862e5c76b482177d565b2ff833daefff434a...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,Det låter inte så ergonomiskt?,,seventies,male_masculine,,sv-SE


In [22]:
# Summary statistics of every column after preprocessing.
overview.describe(include="all")

Unnamed: 0,client_id,path,sentence,sentence_domain,age,gender,accents,locale
count,245,245,245,0.0,244,245,19,245
unique,245,245,245,0.0,7,2,16,1
top,010246d26e7eab48c8e2cec624120c7e61b54499ffd01d...,./data/cv-corpus-21.0-2025-03-14/sv-SE/clips/c...,"Gräset är för högt nu, det måste slås.",,twenties,male_masculine,Stockholmska,sv-SE
freq,1,1,1,,92,213,3,245


In [23]:
# Number of rows per gender label after preprocessing (one row per client_id).
overview.groupby("gender").size()

gender
female_feminine     32
male_masculine     213
dtype: int64

In [24]:
# Number of rows per age group after preprocessing (rows without an age label are not counted).
overview.groupby("age").size()

age
fifties      15
fourties     41
seventies     1
sixties       5
teens        13
thirties     77
twenties     92
dtype: int64

We have a pronounced class imbalance (32 female vs. 213 male speakers after sampling one clip per speaker) that we will have to remove or account for when building the models.

## Conversion of the Audio Files

In [12]:
def extract_features(file_path):
    """
    Turn one audio file into a flat vector of summary statistics.

    The file is loaded as mono at its native sampling rate. Several librosa
    features are computed and each is summarised by its mean and standard
    deviation: per coefficient/band for MFCC (20 coefficients), chroma and
    spectral contrast, and as a single overall value for spectral centroid,
    spectral bandwidth, spectral rolloff, zero crossing rate and RMS energy.

    The order of the entries is: MFCC, chroma, spectral centroid, spectral
    bandwidth, spectral contrast, spectral rolloff, zero crossing rate, RMS
    energy, each as mean(s) followed by std(s). It is meant to line up with
    the names produced by ``build_columns()``.

    Returns the feature vector as a 1-D numpy array, or ``None`` if anything
    goes wrong; in that case the error is printed and not raised.
    """

    def per_row_stats(matrix):
        # one mean and one std per coefficient / band
        return [np.mean(matrix, axis=1), np.std(matrix, axis=1)]

    def overall_stats(matrix):
        # a single mean and a single std over the whole feature
        return [np.mean(matrix), np.std(matrix)]

    try:
        signal, rate = librosa.load(file_path, sr=None, mono=True)

        # Timbre: Mel-frequency cepstral coefficients
        timbre = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=20)
        # Harmonic content
        harmony = librosa.feature.chroma_stft(y=signal, sr=rate)
        # Centre of mass of the spectrum
        centroid = librosa.feature.spectral_centroid(y=signal, sr=rate)
        # Width of the spectrum
        bandwidth = librosa.feature.spectral_bandwidth(y=signal, sr=rate)
        # Amplitude difference between spectral peaks and valleys
        contrast = librosa.feature.spectral_contrast(y=signal, sr=rate)
        # Frequency below which a fixed share of the spectral energy lies
        rolloff = librosa.feature.spectral_rolloff(y=signal, sr=rate)
        # Rate of sign changes of the signal
        crossings = librosa.feature.zero_crossing_rate(signal)
        # Root mean square energy, i.e. loudness
        energy = librosa.feature.rms(y=signal)

        pieces = (
            per_row_stats(timbre)
            + per_row_stats(harmony)
            + overall_stats(centroid)
            + overall_stats(bandwidth)
            + per_row_stats(contrast)
            + overall_stats(rolloff)
            + overall_stats(crossings)
            + overall_stats(energy)
        )
        return np.hstack(pieces)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [13]:
def build_columns():
    """
    Build the ordered list of feature column names.

    Every feature family contributes its "_mean" names first and its "_std"
    names second. Families with a count get one numbered name per
    coefficient/band (numbering starts at 1); the others get a single name.

    Returns a list of 88 strings.
    """
    # (name stem, number of coefficients/bands or None for a single value)
    layout = (
        ("mfcc_{:02d}", 20),
        ("chroma_{:02d}", 12),
        ("spec_centroid", None),
        ("spec_bandwidth", None),
        ("spec_contrast_band_{}", 7),
        ("spec_rolloff", None),
        ("zcr", None),
        ("rmse", None),
    )

    names = []
    for stem, count in layout:
        for stat in ("mean", "std"):
            if count is None:
                names.append(f"{stem}_{stat}")
            else:
                names.extend(f"{stem.format(number)}_{stat}" for number in range(1, count + 1))
    return names

In [14]:
def extract_features_from_dataframe(df_paths, output_csv_path, parallel=True, max_workers=8):
    """
    Extract audio features for every file listed in ``df_paths`` and save them as CSV.

    Each entry of the ``path`` column is passed to ``extract_features``. Files
    for which it returns ``None`` are left out. The remaining metadata rows are
    joined column-wise with their feature vectors, in the order of ``df_paths``
    for both the parallel and the sequential mode.

    Parameters
    ----------
    df_paths : pandas.DataFrame
        Metadata with a ``path`` column holding the audio file locations.
    output_csv_path : str
        Where the resulting table is written (without the index).
    parallel : bool, optional
        Use a thread pool for the extraction (default) or run sequentially.
    max_workers : int, optional
        Size of the thread pool when ``parallel`` is true. Defaults to 8,
        the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The metadata columns followed by the columns named by ``build_columns()``.
    """
    paths = df_paths['path'].tolist()
    progress = {"total": len(paths), "desc": "Extracting features"}

    if parallel:
        # executor.map yields results in submission order, so the output rows
        # keep the order of df_paths regardless of which file finishes first.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(tqdm(executor.map(extract_features, paths), **progress))
    else:
        results = [extract_features(path) for path in tqdm(paths, **progress)]

    # Positions of the files that could be processed.
    valid_indices = [idx for idx, result in enumerate(results) if result is not None]
    features_list = [results[idx] for idx in valid_indices]

    # Build features DataFrame
    features_df = pd.DataFrame(features_list, columns=build_columns())

    # Match features to the original rows; both frames get a 0..n-1 index so
    # the column-wise concat lines them up row by row.
    merged_df = df_paths.iloc[valid_indices].reset_index(drop=True)
    final_df = pd.concat([merged_df, features_df], axis=1)

    # Save to CSV
    final_df.to_csv(output_csv_path, index=False)
    print(f"Saved extracted features to {output_csv_path}")

    return final_df

In [25]:
# Extract the features for every selected clip sequentially (parallel=False) and write the result to ./data/data_se.csv.
df=extract_features_from_dataframe(overview, "./data/data_se.csv", parallel=False)
# Summary statistics of the numeric columns of the returned frame.
df.describe()

  return pitch_tuning(
Extracting features: 100%|██████████| 245/245 [00:47<00:00,  5.19it/s]


Saved extracted features to ./data/data_se.csv


Unnamed: 0,mfcc_01_mean,mfcc_02_mean,mfcc_03_mean,mfcc_04_mean,mfcc_05_mean,mfcc_06_mean,mfcc_07_mean,mfcc_08_mean,mfcc_09_mean,mfcc_10_mean,...,spec_contrast_band_4_std,spec_contrast_band_5_std,spec_contrast_band_6_std,spec_contrast_band_7_std,spec_rolloff_mean,spec_rolloff_std,zcr_mean,zcr_std,rmse_mean,rmse_std
count,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,...,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0
mean,-452.220802,98.660524,-1.876204,21.23384,6.12811,5.085606,-4.891212,-1.20886,-4.736151,-4.363436,...,4.570319,4.502011,7.615262,7.026284,4777.279586,2734.629812,0.094204,0.077195,0.039283,0.050243
std,90.666106,23.141665,17.122657,13.251331,11.370246,11.254093,9.083674,9.572155,7.907547,6.796765,...,0.796661,0.954505,2.035804,3.441151,1604.807796,862.612096,0.040878,0.029458,0.023915,0.028008
min,-1131.370972,0.0,-62.593697,-13.387156,-21.280552,-22.198843,-36.158852,-31.255199,-27.254759,-22.483143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-485.240143,82.349358,-11.968564,12.145467,-0.893539,-2.428717,-10.297105,-7.070948,-9.744137,-9.116899,...,4.050233,3.866607,6.303526,5.029064,3673.023897,2179.858731,0.065305,0.05495,0.021505,0.029021
50%,-445.388824,98.806313,-0.381378,19.927856,5.367857,4.130139,-4.218212,-0.588958,-4.248955,-4.998203,...,4.526757,4.320981,7.566176,6.072749,4585.884713,2680.029354,0.089545,0.076289,0.037238,0.049391
75%,-395.285767,114.393898,9.927809,29.082209,12.890342,12.708355,0.972159,5.697218,-0.25748,-0.319549,...,5.054587,4.975202,8.904204,7.43287,5629.971591,3204.306687,0.116161,0.097328,0.053098,0.068849
max,-239.587967,151.355484,37.995781,68.850983,42.085464,42.22192,22.767946,21.387754,22.969196,15.493899,...,6.989169,7.571701,14.75326,22.192086,10674.049613,5501.591895,0.21533,0.15918,0.118088,0.127666
