In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Path to the tab-separated input file.
dataset_path = "dataset/train.tsv"
# Parse the file into a DataFrame, splitting fields on tabs rather than
# the default comma.
df = pd.read_csv(
    filepath_or_buffer=dataset_path,
    sep="\t",
)

In [None]:
# Narrow the frame to the five columns used by the rest of the notebook.
df = df.loc[:, ['path', 'sentence', 'age', 'gender', 'accents']]
# Keep only rows where 'accents', 'gender' and 'age' are all present and
# 'accents' is not the empty string.
df_filtered = df[
    df[['accents', 'gender', 'age']].notna().all(axis=1)
    & df['accents'].ne('')
]
# Preview the first rows and the per-column non-null counts.
print(df_filtered.head())
print(df_filtered.count())

In [None]:
# Chart the distributions of accents, gender and age as pie charts
# Create a function to display the actual counts
def make_autopct(values):
    """Build an ``autopct`` callback that labels pie wedges with counts.

    Args:
        values: Iterable of the numeric wedge sizes that are plotted.

    Returns:
        A function that takes a wedge's percentage of the whole and
        returns the corresponding count formatted as a decimal integer
        string.
    """
    # Sum once, at creation time. Summing inside the callback repeated the
    # work for every wedge and yielded a total of 0 from the second call
    # onwards when ``values`` was a one-shot iterable.
    total = sum(values)

    def my_autopct(pct):
        # Convert the percentage back into a count; round before casting so
        # floating-point error cannot drop the result by one.
        val = int(round(pct * total / 100.0))
        return '{v:d}'.format(v=val)

    return my_autopct

# Maximum number of categories shown in each pie chart.
top_values = 10

# One row of three side-by-side subplots: accents, gender, age.
fig, axs = plt.subplots(1, 3, figsize=(30, 20))

# Occurrence counts per category for each column of interest.
accents_value_counts = df_filtered['accents'].value_counts()
gender_value_counts = df_filtered['gender'].value_counts()
age_value_counts = df_filtered['age'].value_counts()

# (axes, counts, title, start angle, legend location) for each chart.
pie_specs = [
    (axs[0], accents_value_counts, "Distribution of Accents", 120, 'lower right'),
    (axs[1], gender_value_counts, "Distribution of Gender", 30, 'lower center'),
    (axs[2], age_value_counts, "Distribution of Age", 30, 'lower center'),
]

for ax, counts, title, angle, legend_loc in pie_specs:
    # Restrict the chart to the first `top_values` entries of the counts.
    top_counts = counts.head(top_values)
    ax.pie(top_counts, autopct=make_autopct(top_counts), startangle=angle)
    # Anchor the legend just left of the axes so it does not cover the pie.
    ax.legend(top_counts.index, loc=legend_loc, bbox_to_anchor=(-0.1, 0))
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
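# Disabled experiment: prepend a hand-written sample row to the first 10 rows
# of df_filtered (running it would overwrite df_filtered with those 11 rows)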
# new_row = pd.DataFrame({
#   "path": ["sample.wav"],
#   "sentence": ["Please call Stella, ask her to bring these things to the store"],
#   "age": ["twenties"],
#   "gender": ["male_masculine"],
#   "accents": ["England English"]
# })
# df_filtered = pd.concat([new_row, df_filtered.head(10)], ignore_index=True)
# df_filtered.tail()

In [None]:
# Destination of the filtered table.
output_path = "dataset/train_clean.csv"
# Write the filtered rows as CSV, leaving out the DataFrame index column.
df_filtered.to_csv(
    path_or_buf=output_path,
    index=False,
)

In [None]:
from datasets import load_dataset, Audio
# Read the cleaned CSV as the "train" split of a Hugging Face dataset, then
# cast its 'path' column to an Audio feature with sampling_rate=16000.
input_path = "dataset/train_clean.csv"
dataset = load_dataset(
    "csv",
    data_files=input_path,
    split="train",
).cast_column("path", Audio(sampling_rate=16000))

def load_audio(dataset):
    """Collect the 'path' entry of every item in the 'path' column.

    Used as a batched ``map`` callback, so ``dataset`` is whatever mapping
    ``map`` hands to the function (presumably one batch whose 'path' items
    are the dicts produced by the Audio feature — TODO confirm).

    Args:
        dataset: Mapping with a 'path' key holding an iterable of mappings
            that each have a 'path' key.

    Returns:
        A dict with a single 'path' key listing the extracted values in
        their original order.
    """
    extracted = []
    for item in dataset['path']:
        extracted.append(item['path'])
    return {"path": extracted}

# Apply load_audio over the dataset in batches and rebind the result.
dataset = dataset.map(
    function=load_audio,
    batched=True,
)
# Persist the mapped dataset to a directory on disk.
dataset.save_to_disk(dataset_path="dataset/train_clean")