# Install Dataset

This notebook installs a local copy of the ATC dataset in your project folder. The audio is downloaded from Hugging Face, which requires a Hugging Face account and an access token.

Two CSV files (`train_data.csv` and `val_data.csv`) are generated, each containing the paths to the saved audio clips and their text transcriptions.

In [4]:
# recommend running this in a virtual environment

! pip install -r requirements.txt


Collecting seaborn (from -r requirements.txt (line 12))
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Import Dataset

In [2]:
from huggingface_hub import login
from datasets import load_dataset

import os
import numpy as np
import pandas as pd
import scipy.io.wavfile as wav

In [None]:
# get a token from huggingface
# instructions here: https://huggingface.co/docs/huggingface_hub/quick-start
token = None # replace with your token
login(token)
! huggingface-cli whoami

jmp22


In [13]:
# Fetch the ATC dataset from the Hugging Face Hub by its repository id.
atc_dataset = load_dataset(path="jacktol/atc-dataset")

Generating train split: 100%|██████████| 11868/11868 [00:00<00:00, 17394.62 examples/s]
Generating test split: 100%|██████████| 2927/2927 [00:00<00:00, 17162.14 examples/s]


In [None]:
# Show the split layout; expect 11868 rows in train and 2927 in test.
print(atc_dataset)

# Convert both splits to DataFrames in one pass; the "test" split is used as
# the validation set from here on.
df_train, df_val = (atc_dataset[name].to_pandas() for name in ("train", "test"))


DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 11868
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 2927
    })
})


In [None]:
# Save audio files to disk

# Create a directory to save training audio files
os.makedirs("audio_files", exist_ok=True)


def save_audio(idx, audio_data, out_dir="audio_files"):
    """Write one clip's raw audio bytes to ``<out_dir>/audio_<idx>.wav``.

    Args:
        idx: Index label of the row; used to build the file name.
        audio_data: Mapping whose ``"bytes"`` entry holds the encoded audio.
        out_dir: Existing directory to write into. Defaults to the training
            directory created above.

    Returns:
        The relative path of the file that was written.

    Raises:
        KeyError: If ``audio_data`` has no ``"bytes"`` entry.
        TypeError: If the ``"bytes"`` entry is None (nothing to write).
    """
    payload = audio_data["bytes"]
    if payload is None:
        # Fail before opening the file so no empty .wav is left behind.
        raise TypeError(f"audio entry {idx!r} has no embedded bytes to write")

    file_path = f"{out_dir}/audio_{idx}.wav"
    with open(file_path, "wb") as f:
        f.write(payload)  # Save raw bytes to file
    return file_path

# Write every training clip to disk and record its path.
# .items() pairs each clip with its own index label, so there is no need to
# search the whole column for a row that compares equal to the clip. That search
# cost a full scan per row and resolved duplicate clips to the first matching
# index, giving them the same file path.
df_train["audio_path"] = [
    save_audio(idx, audio) for idx, audio in df_train["audio"].items()
]




In [43]:
# Create a directory to save validation audio files
os.makedirs("val_audio_files", exist_ok=True)


def save_audio(idx, audio_data, out_dir="val_audio_files"):
    """Write one clip's raw audio bytes to ``<out_dir>/audio_<idx>.wav``.

    Args:
        idx: Index label of the row; used to build the file name.
        audio_data: Mapping whose ``"bytes"`` entry holds the encoded audio.
        out_dir: Existing directory to write into. Defaults to the validation
            directory created above.

    Returns:
        The relative path of the file that was written.

    Raises:
        KeyError: If ``audio_data`` has no ``"bytes"`` entry.
        TypeError: If the ``"bytes"`` entry is None (nothing to write).
    """
    payload = audio_data["bytes"]
    if payload is None:
        # Fail before opening the file so no empty .wav is left behind.
        raise TypeError(f"audio entry {idx!r} has no embedded bytes to write")

    file_path = f"{out_dir}/audio_{idx}.wav"
    with open(file_path, "wb") as f:
        f.write(payload)  # Save raw bytes to file
    return file_path

# Write every validation clip to disk and record its path.
# .items() pairs each clip with its own index label, so there is no need to
# search the whole column for a row that compares equal to the clip. That search
# cost a full scan per row and resolved duplicate clips to the first matching
# index, giving them the same file path.
df_val["audio_path"] = [
    save_audio(idx, audio) for idx, audio in df_val["audio"].items()
]

In [54]:
# Preview the first rows of each split, banner first and table on the next line.
print("***** Training Data *****", df_train.head(), sep="\n")
print("***** Validation Data *****", df_val.head(), sep="\n")

***** Training Data *****
                                               audio  \
0  {'bytes': b'RIFF\xe4\xb7\x00\x00WAVEfmt \x10\x...   
1  {'bytes': b'RIFF$w\x01\x00WAVEfmt \x10\x00\x00...   
2  {'bytes': b'RIFF\xa4u\x00\x00WAVEfmt \x10\x00\...   
3  {'bytes': b'RIFF\xe4\x8a\x00\x00WAVEfmt \x10\x...   
4  {'bytes': b'RIFF$F\x00\x00WAVEfmt \x10\x00\x00...   

                                                text  
0                             o kilo hotel uniform u  
1  hotel charlie papa frequency change approved b...  
2               direct padka request three five zero  
3        praha good morning lufthansa two five three  
4                                   sa six mike alfa  
***** Validation Data *****
                                               audio  \
0  {'bytes': b'RIFF\x84\xc2\x00\x00WAVEfmt \x10\x...   
1  {'bytes': b'RIFF\xa4\xb1\x00\x00WAVEfmt \x10\x...   
2  {'bytes': b'RIFFd\xec\x00\x00WAVEfmt \x10\x00\...   
3  {'bytes': b'RIFF\xc4\xaa\x00\x00WAVEfmt \x10\x...   

In [49]:
# Save the dataframes to .csv files
# Index corresponds to the index of the audio file
#
# The raw "audio" column (embedded bytes) is dropped before export, so the CSVs
# keep only the remaining columns such as the transcription text and the saved
# audio path. errors="ignore" keeps this cell re-runnable: on a second run the
# column is already gone, which would otherwise raise KeyError.
df_train = df_train.drop(columns=["audio"], errors="ignore")
df_train.to_csv("train_data.csv", index=True)
df_val = df_val.drop(columns=["audio"], errors="ignore")
df_val.to_csv("val_data.csv", index=True)