# Install Dataset

This notebook installs a local copy of the ATC dataset in your project folder. The audio is hosted on Hugging Face, so downloading it requires a Hugging Face account and an access token.

Two CSV files (`train_data.csv` and `val_data.csv`) are generated, each containing the paths to the saved audio clips and their text transcriptions.

In [None]:
# recommend running this in a virtual environment

! pip install -r requirements.txt


## Import Dataset

In [None]:
from huggingface_hub import login
from datasets import load_dataset

import os
import numpy as np
import pandas as pd
import scipy.io.wavfile as wav

In [None]:
# get a token from huggingface
# instructions here: https://huggingface.co/docs/huggingface_hub/quick-start
token = "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" # replace with your token
login(token)
! huggingface-cli whoami

In [None]:
# Fetch the ATC dataset identified by its Hugging Face Hub repository id.
atc_dataset = load_dataset(path="jacktol/atc-dataset")

In [None]:
print(atc_dataset)

# Convert both splits to pandas DataFrames in one pass. The dataset's 'test'
# split is used as this project's validation set (df_val).
df_train, df_val = (atc_dataset[split].to_pandas() for split in ("train", "test"))

# Expected sizes: 11868 samples in train and 2927 in val


In [None]:
# Save audio files to disk

# Make sure the output folder for the training clips exists;
# exist_ok=True means no error is raised when it is already there.
os.makedirs(name="audio_files", exist_ok=True)

# Helper that writes one training clip to disk
def save_audio(idx, audio_data):
    """Write one clip's raw bytes to ``audio_files/audio_<idx>.wav``.

    Args:
        idx: Value embedded in the output file name.
        audio_data: Mapping whose ``"bytes"`` entry holds the file content,
            written verbatim (presumably WAV-encoded; not checked here).

    Returns:
        The relative path of the file that was written.

    The ``audio_files`` directory must already exist.
    """
    wav_path = f"audio_files/audio_{idx}.wav"
    payload = audio_data["bytes"]
    with open(wav_path, mode="wb") as wav_file:
        wav_file.write(payload)  # bytes are stored exactly as received
    return wav_path

# Write every training clip to disk and record its path in a new column.
# Each row's own index label is passed straight to save_audio by iterating
# over (index, audio) pairs. Looking the index up by comparing the audio
# dict against the whole column would cost O(n^2) comparisons of byte
# payloads, and rows with identical audio would all resolve to the first
# match, so later duplicates would never get a file of their own.
df_train["audio_path"] = [
    save_audio(row_idx, clip) for row_idx, clip in df_train["audio"].items()
]




In [None]:
# Make sure the output folder for the validation clips exists;
# exist_ok=True means no error is raised when it is already there.
os.makedirs(name="val_audio_files", exist_ok=True)

# Helper that writes one validation clip to disk.
# NOTE: this reuses the name `save_audio`, so once this cell has run, the
# name refers to this validation-folder version.
def save_audio(idx, audio_data):
    """Write one clip's raw bytes to ``val_audio_files/audio_<idx>.wav``.

    Args:
        idx: Value embedded in the output file name.
        audio_data: Mapping whose ``"bytes"`` entry holds the file content,
            written verbatim (presumably WAV-encoded; not checked here).

    Returns:
        The relative path of the file that was written.

    The ``val_audio_files`` directory must already exist.
    """
    wav_path = f"val_audio_files/audio_{idx}.wav"
    payload = audio_data["bytes"]
    with open(wav_path, mode="wb") as wav_file:
        wav_file.write(payload)  # bytes are stored exactly as received
    return wav_path

# Write every validation clip to disk and record its path in a new column.
# Each row's own index label is passed straight to save_audio by iterating
# over (index, audio) pairs. Looking the index up by comparing the audio
# dict against the whole column would cost O(n^2) comparisons of byte
# payloads, and rows with identical audio would all resolve to the first
# match, so later duplicates would never get a file of their own.
df_val["audio_path"] = [
    save_audio(row_idx, clip) for row_idx, clip in df_val["audio"].items()
]

In [None]:
# Show the first rows of each split, each preceded by a banner line.
for banner, frame in (
    ("***** Training Data *****", df_train),
    ("***** Validation Data *****", df_val),
):
    print(banner)
    print(frame.head())

In [None]:
# Write one CSV per split.
# The "index" column carries each row's index label, which corresponds to
# the index of the audio file.
df_train = (
    df_train
    .drop(columns=["audio"])   # the audio column itself is not exported
    .rename_axis("index")      # name the index ...
    .reset_index()             # ... and turn it into a regular column
)
df_train.to_csv("train_data.csv", index=False)  # no extra index column

df_val = (
    df_val
    .drop(columns=["audio"])
    .rename_axis("index")
    .reset_index()
)
df_val.to_csv("val_data.csv", index=False)



