In [1]:
import os
import re
import numpy as np
import pandas as pd

# Extract Features from README

In [2]:
def extract_metadata_from_readme(readme_path):
    """Extract speaker metadata fields from a README file.

    The file is searched (case-insensitively) for the labels ``Gender:``,
    ``Age Range:``, ``Language:`` and ``Pronunciation dialect:``. Only the
    first occurrence of each label is used, and the value must sit on the
    same line as its label.

    Args:
        readme_path: Path of the README to parse. It is read as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        dict: Keys ``"gender"``, ``"age_range"``, ``"language"`` and
        ``"dialect"``. Each value is the stripped text found after the label,
        or ``None`` when the label is absent or has no value on its line.
        ``gender`` and ``language`` keep only the first word; ``age_range``
        and ``dialect`` keep the rest of the line.
    """
    # ``[^\S\r\n]*`` matches whitespace but never a line break. The previous
    # ``\s*`` also swallowed newlines, so an empty field such as "Gender: "
    # silently captured the first word of the *next* line.
    patterns = {
        "gender": r"Gender:[^\S\r\n]*(\w+)",
        "age_range": r"Age Range:[^\S\r\n]*([^\n\r]+)",
        "language": r"Language:[^\S\r\n]*(\w+)",
        "dialect": r"Pronunciation dialect:[^\S\r\n]*([^\n\r]+)",
    }

    with open(readme_path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()

    metadata = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, content, re.IGNORECASE)
        value = match.group(1).strip() if match else ""
        # A whitespace-only capture carries no information; report it as missing.
        metadata[field] = value or None

    return metadata


# Build the Metadata DataFrame and Merge It with the Features

In [3]:
def build_metadata_dataframe(extract_dir):
    """Build one metadata row per ``.wav`` file found under ``extract_dir``.

    The tree is walked looking for directories named ``etc`` that contain a
    ``README``. The README is parsed with ``extract_metadata_from_readme`` and
    its (lower-cased) fields are attached to every ``.wav`` file in the sibling
    ``wav`` directory.

    Args:
        extract_dir: Root directory to walk.

    Returns:
        pandas.DataFrame: One row per wav file, with a ``filename`` column
        (path relative to ``extract_dir``) plus one column per metadata field.
        The frame is empty, with no columns, when nothing is found.
    """
    rows = []

    for root, _dirs, files in os.walk(extract_dir):
        etc_dir = os.path.normpath(root)

        # Compare the directory *name*: a substring test on the whole path would
        # also accept any path that merely contains the letters "etc".
        if os.path.basename(etc_dir) != "etc" or "README" not in files:
            continue

        # The recordings live next to etc/. Building the sibling path avoids
        # str.replace, which rewrites every "/etc" occurrence in the path and
        # assumes "/" as the separator.
        wav_dir = os.path.join(os.path.dirname(etc_dir), "wav")
        if not os.path.isdir(wav_dir):
            continue

        metadata = extract_metadata_from_readme(os.path.join(root, "README"))
        # Lower-case once per README instead of once per wav file.
        normalized = {
            key: value.lower() if isinstance(value, str) else value
            for key, value in metadata.items()
        }

        for wav_file in os.listdir(wav_dir):
            if wav_file.endswith(".wav"):
                full_path = os.path.join(wav_dir, wav_file)
                rows.append({
                    "filename": os.path.relpath(full_path, extract_dir),
                    **normalized,
                })

    return pd.DataFrame(rows)


In [4]:
# Load the stored audio feature table and build the README-derived metadata
# table. Both paths are relative to the notebook's working directory.
features_df = pd.read_parquet("voxforge_data/audio_features.parquet")
metadata_df = build_metadata_dataframe("voxforge_data/extracted")

In [5]:
# Display the loaded feature table (pandas shows only the first and last rows).
features_df

Unnamed: 0,filename,mean_freq_kHz,std_freq_kHz,median_freq_kHz,first_quantile_kHz,third_quantile_kHz,iqr_kHz,skewness,kurtosis,mode_freq_kHz,peak_freq_kHz,sp_entropy,flatness,centroid_kHz,modindx
0,robin-20070310-vf12/wav/vf12-34.wav,3.999961,2.309424,3.999961,1.999981,5.999942,3.999961,4.289625,22.129724,0.596192,0.596192,13.797732,0.131873,1.104101,2.316162
1,robin-20070310-vf12/wav/vf12-29.wav,4.000000,2.309454,4.000000,2.000000,6.000000,4.000000,4.468912,24.387798,0.518989,0.518989,13.671813,0.145211,1.173674,2.266816
2,robin-20070310-vf12/wav/vf12-24.wav,4.000000,2.309442,4.000000,2.000000,6.000000,4.000000,4.768208,28.682903,0.522362,0.522362,13.983390,0.154861,1.182493,2.340302
3,robin-20070310-vf12/wav/vf12-06.wav,4.000000,2.309460,4.000000,2.000000,6.000000,4.000000,4.288241,22.928845,0.165990,0.165990,13.383058,0.113975,1.056596,2.309039
4,robin-20070310-vf12/wav/vf12-38.wav,3.999964,2.309422,3.999964,1.999982,5.999946,3.999964,4.437734,23.512740,0.327474,0.327474,14.031024,0.154637,1.212154,2.255088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69033,Luis-20130226-peg/wav/b0325.wav,4.000000,2.309476,4.000000,2.000000,6.000000,4.000000,8.821168,99.341927,0.127484,0.127484,12.227013,0.186738,0.926109,3.981581
69034,Luis-20130226-peg/wav/b0323.wav,4.000000,2.309447,4.000000,2.000000,6.000000,4.000000,7.781451,82.411050,0.121120,0.121120,12.875872,0.149606,0.784108,3.738265
69035,Luis-20130226-peg/wav/b0329.wav,4.000000,2.309484,4.000000,2.000000,6.000000,4.000000,8.237866,97.468529,0.143143,0.143143,12.128605,0.123457,0.790771,3.505875
69036,Luis-20130226-peg/wav/b0327.wav,4.000000,2.309497,4.000000,2.000000,6.000000,4.000000,8.133196,93.356766,0.117333,0.117333,11.977746,0.152190,0.854203,3.538266


In [6]:
# Display the metadata table; note the raw, not-yet-normalised label values.
metadata_df

Unnamed: 0,filename,gender,age_range,language,dialect
0,robin-20070310-vf12/wav/vf12-34.wav,male,adult;,,british english (mother tongue is dutch).
1,robin-20070310-vf12/wav/vf12-29.wav,male,adult;,,british english (mother tongue is dutch).
2,robin-20070310-vf12/wav/vf12-24.wav,male,adult;,,british english (mother tongue is dutch).
3,robin-20070310-vf12/wav/vf12-06.wav,male,adult;,,british english (mother tongue is dutch).
4,robin-20070310-vf12/wav/vf12-38.wav,male,adult;,,british english (mother tongue is dutch).
...,...,...,...,...,...
83653,Luis-20130226-peg/wav/b0325.wav,male,adult,en,european english
83654,Luis-20130226-peg/wav/b0323.wav,male,adult,en,european english
83655,Luis-20130226-peg/wav/b0329.wav,male,adult,en,european english
83656,Luis-20130226-peg/wav/b0327.wav,male,adult,en,european english


In [7]:
# Inner join on "filename": a row is kept only when the same filename appears
# in both frames.
merged_df = features_df.merge(metadata_df, on="filename", how="inner")

In [8]:
# Display the joined table (feature columns followed by the metadata columns).
merged_df

Unnamed: 0,filename,mean_freq_kHz,std_freq_kHz,median_freq_kHz,first_quantile_kHz,third_quantile_kHz,iqr_kHz,skewness,kurtosis,mode_freq_kHz,peak_freq_kHz,sp_entropy,flatness,centroid_kHz,modindx,gender,age_range,language,dialect
0,robin-20070310-vf12/wav/vf12-34.wav,3.999961,2.309424,3.999961,1.999981,5.999942,3.999961,4.289625,22.129724,0.596192,0.596192,13.797732,0.131873,1.104101,2.316162,male,adult;,,british english (mother tongue is dutch).
1,robin-20070310-vf12/wav/vf12-29.wav,4.000000,2.309454,4.000000,2.000000,6.000000,4.000000,4.468912,24.387798,0.518989,0.518989,13.671813,0.145211,1.173674,2.266816,male,adult;,,british english (mother tongue is dutch).
2,robin-20070310-vf12/wav/vf12-24.wav,4.000000,2.309442,4.000000,2.000000,6.000000,4.000000,4.768208,28.682903,0.522362,0.522362,13.983390,0.154861,1.182493,2.340302,male,adult;,,british english (mother tongue is dutch).
3,robin-20070310-vf12/wav/vf12-06.wav,4.000000,2.309460,4.000000,2.000000,6.000000,4.000000,4.288241,22.928845,0.165990,0.165990,13.383058,0.113975,1.056596,2.309039,male,adult;,,british english (mother tongue is dutch).
4,robin-20070310-vf12/wav/vf12-38.wav,3.999964,2.309422,3.999964,1.999982,5.999946,3.999964,4.437734,23.512740,0.327474,0.327474,14.031024,0.154637,1.212154,2.255088,male,adult;,,british english (mother tongue is dutch).
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66081,Luis-20130226-peg/wav/b0325.wav,4.000000,2.309476,4.000000,2.000000,6.000000,4.000000,8.821168,99.341927,0.127484,0.127484,12.227013,0.186738,0.926109,3.981581,male,adult,en,european english
66082,Luis-20130226-peg/wav/b0323.wav,4.000000,2.309447,4.000000,2.000000,6.000000,4.000000,7.781451,82.411050,0.121120,0.121120,12.875872,0.149606,0.784108,3.738265,male,adult,en,european english
66083,Luis-20130226-peg/wav/b0329.wav,4.000000,2.309484,4.000000,2.000000,6.000000,4.000000,8.237866,97.468529,0.143143,0.143143,12.128605,0.123457,0.790771,3.505875,male,adult,en,european english
66084,Luis-20130226-peg/wav/b0327.wav,4.000000,2.309497,4.000000,2.000000,6.000000,4.000000,8.133196,93.356766,0.117333,0.117333,11.977746,0.152190,0.854203,3.538266,male,adult,en,european english


In [9]:
# List the distinct raw age_range labels to see which spellings need remapping.
merged_df['age_range'].unique()

array(['adult;', 'adult', 'youth', 'please select', '[adult];', 'senior',
       'unknown', 'erwachsener', 'senior;', 'adulto', None, 'jeune',
       'adult (born in 1983)', '[youth];', 'male', 'youth;'], dtype=object)

# Remapping Age and Gender

In [10]:
# Normalise the free-text age labels to adult / youth / senior / unknown.
# Lower-casing, removing ";" and trimming run first, so the lookup table only
# needs the already-cleaned spellings (no ";" variants, no duplicate keys).
merged_df["age_range"] = (
    merged_df["age_range"]
    .str.lower()
    .str.replace(";", "", regex=False)
    .str.strip()
    .replace({
        "erwachsener": "adult",
        "adulto": "adult",
        "adulte": "adult",
        "adult (born in 1983)": "adult",
        "[adult]": "adult",
        "[youth]": "youth",
        "jeune": "youth",
        "please select": "unknown",
        "male": "unknown",  # a gender word that ended up in the age field
    })
    # Rows with no age label at all (None/NaN) are also reported as unknown.
    .fillna("unknown")
)


In [11]:
# Check the label distribution after the remapping above.
merged_df["age_range"].value_counts()

age_range
adult      51394
unknown     7037
youth       6282
senior      1373
Name: count, dtype: int64

In [12]:
# List the distinct language values parsed from the READMEs.
merged_df['language'].unique()

array([None, 'en'], dtype=object)

In [13]:
# List the distinct raw gender labels to see which spellings need remapping.
merged_df['gender'].unique()

array(['male', 'female', 'please', None, 'unknown', 'männlich',
       'masculino', 'masculin', 'adult', 'weiblich', 'make'], dtype=object)

In [14]:
# Every accepted spelling, grouped by the canonical label it maps to
# ("make" is presumably a typo for "male").
gender_map = {
    **dict.fromkeys(['male', 'make', 'männlich', 'masculino', 'masculin'], 'male'),
    **dict.fromkeys(['female', 'weiblich'], 'female'),
}

# Labels that are not in the map come back as NaN.
merged_df['gender'] = merged_df['gender'].map(gender_map)

# Keep only the rows that ended up with one of the two canonical labels.
merged_df = merged_df.loc[merged_df['gender'].isin(['male', 'female'])]

In [15]:
# Keep rows whose dialect text contains "english" (case-insensitive); rows with
# a missing dialect are excluded via na=False.
en_merged_df = merged_df[merged_df['dialect'].str.contains('english', case=False, na=False)]

In [16]:
# Remove the dialect and language columns in a single call, then show the result.
en_merged_df = en_merged_df.drop(columns=['dialect', 'language'])
en_merged_df

Unnamed: 0,filename,mean_freq_kHz,std_freq_kHz,median_freq_kHz,first_quantile_kHz,third_quantile_kHz,iqr_kHz,skewness,kurtosis,mode_freq_kHz,peak_freq_kHz,sp_entropy,flatness,centroid_kHz,modindx,gender,age_range
0,robin-20070310-vf12/wav/vf12-34.wav,3.999961,2.309424,3.999961,1.999981,5.999942,3.999961,4.289625,22.129724,0.596192,0.596192,13.797732,0.131873,1.104101,2.316162,male,adult
1,robin-20070310-vf12/wav/vf12-29.wav,4.000000,2.309454,4.000000,2.000000,6.000000,4.000000,4.468912,24.387798,0.518989,0.518989,13.671813,0.145211,1.173674,2.266816,male,adult
2,robin-20070310-vf12/wav/vf12-24.wav,4.000000,2.309442,4.000000,2.000000,6.000000,4.000000,4.768208,28.682903,0.522362,0.522362,13.983390,0.154861,1.182493,2.340302,male,adult
3,robin-20070310-vf12/wav/vf12-06.wav,4.000000,2.309460,4.000000,2.000000,6.000000,4.000000,4.288241,22.928845,0.165990,0.165990,13.383058,0.113975,1.056596,2.309039,male,adult
4,robin-20070310-vf12/wav/vf12-38.wav,3.999964,2.309422,3.999964,1.999982,5.999946,3.999964,4.437734,23.512740,0.327474,0.327474,14.031024,0.154637,1.212154,2.255088,male,adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66081,Luis-20130226-peg/wav/b0325.wav,4.000000,2.309476,4.000000,2.000000,6.000000,4.000000,8.821168,99.341927,0.127484,0.127484,12.227013,0.186738,0.926109,3.981581,male,adult
66082,Luis-20130226-peg/wav/b0323.wav,4.000000,2.309447,4.000000,2.000000,6.000000,4.000000,7.781451,82.411050,0.121120,0.121120,12.875872,0.149606,0.784108,3.738265,male,adult
66083,Luis-20130226-peg/wav/b0329.wav,4.000000,2.309484,4.000000,2.000000,6.000000,4.000000,8.237866,97.468529,0.143143,0.143143,12.128605,0.123457,0.790771,3.505875,male,adult
66084,Luis-20130226-peg/wav/b0327.wav,4.000000,2.309497,4.000000,2.000000,6.000000,4.000000,8.133196,93.356766,0.117333,0.117333,11.977746,0.152190,0.854203,3.538266,male,adult


# Dropping Non-English Dialect

In [17]:
# Discard rows with any missing value, then keep the first row per filename.
en_merged_df = (
    en_merged_df
    .dropna()
    .drop_duplicates(subset=["filename"])
)

In [18]:
# Display the final English-only table after dropping missing values and duplicates.
en_merged_df

Unnamed: 0,filename,mean_freq_kHz,std_freq_kHz,median_freq_kHz,first_quantile_kHz,third_quantile_kHz,iqr_kHz,skewness,kurtosis,mode_freq_kHz,peak_freq_kHz,sp_entropy,flatness,centroid_kHz,modindx,gender,age_range
0,robin-20070310-vf12/wav/vf12-34.wav,3.999961,2.309424,3.999961,1.999981,5.999942,3.999961,4.289625,22.129724,0.596192,0.596192,13.797732,0.131873,1.104101,2.316162,male,adult
1,robin-20070310-vf12/wav/vf12-29.wav,4.000000,2.309454,4.000000,2.000000,6.000000,4.000000,4.468912,24.387798,0.518989,0.518989,13.671813,0.145211,1.173674,2.266816,male,adult
2,robin-20070310-vf12/wav/vf12-24.wav,4.000000,2.309442,4.000000,2.000000,6.000000,4.000000,4.768208,28.682903,0.522362,0.522362,13.983390,0.154861,1.182493,2.340302,male,adult
3,robin-20070310-vf12/wav/vf12-06.wav,4.000000,2.309460,4.000000,2.000000,6.000000,4.000000,4.288241,22.928845,0.165990,0.165990,13.383058,0.113975,1.056596,2.309039,male,adult
4,robin-20070310-vf12/wav/vf12-38.wav,3.999964,2.309422,3.999964,1.999982,5.999946,3.999964,4.437734,23.512740,0.327474,0.327474,14.031024,0.154637,1.212154,2.255088,male,adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66081,Luis-20130226-peg/wav/b0325.wav,4.000000,2.309476,4.000000,2.000000,6.000000,4.000000,8.821168,99.341927,0.127484,0.127484,12.227013,0.186738,0.926109,3.981581,male,adult
66082,Luis-20130226-peg/wav/b0323.wav,4.000000,2.309447,4.000000,2.000000,6.000000,4.000000,7.781451,82.411050,0.121120,0.121120,12.875872,0.149606,0.784108,3.738265,male,adult
66083,Luis-20130226-peg/wav/b0329.wav,4.000000,2.309484,4.000000,2.000000,6.000000,4.000000,8.237866,97.468529,0.143143,0.143143,12.128605,0.123457,0.790771,3.505875,male,adult
66084,Luis-20130226-peg/wav/b0327.wav,4.000000,2.309497,4.000000,2.000000,6.000000,4.000000,8.133196,93.356766,0.117333,0.117333,11.977746,0.152190,0.854203,3.538266,male,adult


# Save DF as Parquet

In [19]:
# NOTE(review): the save step is currently disabled, so any output shown under
# this cell comes from an earlier run. The message below names
# "datung/en_merged_df.parquet" while to_parquet writes "en_merged_df.parquet"
# in the working directory; reconcile the two before re-enabling.
# en_merged_df.to_parquet("en_merged_df.parquet", index=False)
# print("✅ Saved en_merged_df to datung/en_merged_df.parquet")

✅ Saved en_merged_df to datung/en_merged_df.parquet
