In [3]:
import pandas as pd
from pathlib import Path

In [20]:
# Inside the container, this points to the mounted dataset
data_path = Path("../data/raw/common_voice/validated.tsv")
df = pd.read_csv(data_path, sep="\t")
print("Initial shape:",df.shape)

Initial shape: (249, 13)


In [21]:
df.head()

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,116398939d6be70fc5fb532924a130c0adf286ac283499...,common_voice_en_41923025.mp3,f5a2a431746c5229ab696ba0e1a518fe7b26e208ff3b84...,"He was born at Wichenford, in Worcestershire, ...",,2,0,thirties,,United States English,,en,
1,24a4da2e8f053a45a0715849c222a40a4b0da9872efb2e...,common_voice_en_42356358.mp3,f6f009587d8812c147af1cc05079e1fcd8120c8a98cdf8...,The Portuguese division was overrun and withdr...,,2,0,teens,,United States English,,en,
2,30849595699bc853c3810a78448acede46888b4e2d0809...,common_voice_en_42165090.mp3,f69afa5e77812e8be0085c874d2a9767323c78ffb43ba6...,Her health by this stage was also poor.,,2,0,,,,,en,
3,42d53f34c1bc50f7a7c4ed1765a8d1ffeaf5cd441513cc...,common_voice_en_41921729.mp3,f5739acbefdbd3aac990792966fac4d40dcb39eb8dfa21...,His sporting interests outside of cricket incl...,,2,0,nineties,,England English,,en,
4,436b9e1f9da710d74eb01209f8f269bee70e93cadf2053...,common_voice_en_42528393.mp3,f7d35c60d76f025c45a9495757d1ee0e2b7c206317a288...,The following year he was elected to be part o...,,2,0,teens,,United States English,,en,


In [22]:
# Show columns to inspect
print("Columns in validated.tsv:", df.columns.tolist())

Columns in validated.tsv: ['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']


In [23]:
# Dropping rows with missing or unknown age
df = df.dropna(subset=["age"])
df = df[df["age"] != "unknown"]

In [24]:
print("After dropping missing/unknown ages:", df.shape)

After dropping missing/unknown ages: (237, 13)


In [25]:
# Checking all unique age values
print("Unique age labels:")
print(sorted(df["age"].unique()))

Unique age labels:
['fifties', 'fourties', 'nineties', 'sixties', 'teens', 'thirties', 'twenties']


In [26]:
# Define age bins — you can customize further if needed
def bin_age(age):
    if age in ["teens"]:
        return "teen"
    elif age in ["twenties"]:
        return "young_adult"
    elif age in ["thirties", "forties"]:
        return "adult"
    elif age in ["fifties", "sixties"]:
        return "middle_aged"
    elif age in ["seventies", "eighties", "nineties"]:
        return "senior"
    else:
        return "unknown"

In [27]:
# ✅ Apply binning function to create new column
df["age_group"] = df["age"].apply(bin_age)

# ✅ Drop any that failed binning (i.e., stayed 'unknown')
df = df[df["age_group"] != "unknown"]

In [28]:
# ✅ Preview result
print(df["age_group"].value_counts())
df[["path", "age", "age_group"]].head()

age_group
young_adult    126
adult           56
middle_aged     33
teen            14
senior           1
Name: count, dtype: int64


Unnamed: 0,path,age,age_group
0,common_voice_en_41923025.mp3,thirties,adult
1,common_voice_en_42356358.mp3,teens,teen
3,common_voice_en_41921729.mp3,nineties,senior
4,common_voice_en_42528393.mp3,teens,teen
6,common_voice_en_42555516.mp3,twenties,young_adult


In [29]:
# Paths
clips_dir = Path("../data/raw/common_voice/clips/")
processed_dir = Path("../data/processed/")
processed_dir.mkdir(exist_ok=True, parents=True)

In [30]:
# Librosa settings
SAMPLE_RATE = 16000
MAX_DURATION = 3  # seconds
MAX_LEN = SAMPLE_RATE * MAX_DURATION  # samples
NUM_MFCC = 13

In [None]:
np.random.seed(42)