# Data Loading and Dataset Filtering

This notebook is responsible for loading the Mozilla Common Voice dataset metadata and creating a reproducible subset of samples for downstream feature extraction and model training.

In [None]:
# Import os library for file operations
import os
import pandas as pd
import numpy as np

# Root folder of the extracted Common Voice English corpus
DATASET_PATH = r"D:/mcv-scripted-en-v23.0/cv-corpus-23.0-2025-09-05/en"

# Output folder that receives the audio clips of the subset
OUTPUT_FOLDER = r"D:/mcv-scripted-en-v23.0/1000_samples_output"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Metadata file (tab-separated) to load from DATASET_PATH.
# NOTE(review): the original cell sampled from `df` without ever loading it;
# "validated.tsv" is assumed here -- confirm which Common Voice split is intended.
METADATA_FILE = "validated.tsv"

# Subset size and seed, kept as named constants so the subset is reproducible
N_SAMPLES = 1000
RANDOM_SEED = 42

# Load the metadata; the "path" column is used below to locate each clip.
# low_memory=False parses the file in one pass instead of chunk by chunk.
df = pd.read_csv(
    os.path.join(DATASET_PATH, METADATA_FILE),
    sep="\t",
    low_memory=False,
)

# Draw the reproducible random subset. This has to happen BEFORE any column is
# added to df_small (the original cell touched df_small before creating it,
# which raises NameError on a fresh kernel).
df_small = df.sample(n=N_SAMPLES, random_state=RANDOM_SEED)

# Create full paths to audio files and add them to the DataFrame
df_small["audio_path"] = df_small["path"].apply(
    lambda x: os.path.join(DATASET_PATH, "clips", x)
)

# Save Dataset Subset to CSV

This cell saves the metadata of the sampled subset to `subset_1000.csv` inside the dataset directory, so later experiments and debugging sessions can load it quickly.

In [None]:
# Location of the subset metadata file (written inside the dataset directory)
subset_path = os.path.join(DATASET_PATH, "subset_1000.csv")

# Write the sampled rows to disk, leaving out the DataFrame index,
# then report the destination
df_small.to_csv(subset_path, index=False)
print(f"Subset saved to {subset_path}")

# Copy Audio Files for Dataset Subset

This cell copies the audio files referenced in the subset DataFrame into a single output directory. 

In [None]:
# Import shutil library for high-level file operations
import shutil

# Number of files copied so far, and the source paths that do not exist on disk
copied = 0
missing = []

# Copy every referenced clip into the flat output folder, keeping its file name.
# Missing sources are recorded instead of being skipped silently.
for src in df_small["audio_path"]:
    if not os.path.exists(src):
        missing.append(src)
        continue
    dst = os.path.join(OUTPUT_FOLDER, os.path.basename(src))
    shutil.copy2(src, dst)  # copy2 also attempts to preserve file metadata
    copied += 1

# Print the number of copied files and the output folder location
print(f"Number of copied files: {copied} into {OUTPUT_FOLDER}")

# Make skipped files visible so an incomplete subset is not mistaken for a full one
if missing:
    print(f"Warning: {len(missing)} audio files were not found and were skipped, e.g. {missing[0]}")

# Verify Copied Audio Files

This cell lists the output directory and prints the number of entries it contains, as a quick check that the copy operation succeeded.

In [None]:
# Count the entries currently present in the output folder and report the total
n_output_files = len(os.listdir(OUTPUT_FOLDER))
print("Files in small dataset:", n_output_files)