# Creating ATCOSIM HuggingFace Dataset

### Initialize

In [2]:
import datasets
from datasets import Audio
import glob
import pandas as pd
import os

### Create Metadata

In [28]:
# Collect the audio clips and the filtered transcripts that sit directly in
# DATA_NON_EMPTY (non-recursive; sub-folders are not searched).
# NOTE(review): txt_files is not referenced by the other cells visible here.
wav_pattern = './WhisperModel/ATCOSIM/DATA_NON_EMPTY/*.wav'
txt_pattern = './WhisperModel/ATCOSIM/DATA_NON_EMPTY/*.filtered.txt'

wav_files, txt_files = glob.glob(wav_pattern), glob.glob(txt_pattern)

In [29]:
# Target share of the files for each split.
train_p = 0.75
val_p = 0.05
test_p = 0.20

# Train and validation sizes are truncated to whole files; the test split
# takes whatever is left, so the three counts always add up to the total.
n_files = len(wav_files)
train_frac = int(train_p * n_files)
val_frac = int(val_p * n_files)
test_frac = n_files - (train_frac + val_frac)

# The percentages printed are the nominal targets above, not the realised
# count / total ratios.
print("Files:", n_files)
for label, count, share in (
    ("Train:", train_frac, train_p),
    ("Val  :", val_frac, val_p),
    ("Test :", test_frac, test_p),
):
    print(label, count, 100*share, '%')

Files: 9559
Train: 7169 75.0 %
Val  : 477 5.0 %
Test : 1913 20.0 %


In [30]:
# Cut the file list into three consecutive, non-overlapping slices:
# [0, train_frac) -> train, [train_frac, val_end) -> val, [val_end, end) -> test.
# Explicit end offsets are used instead of negative indices because
# `-test_frac` collapses to 0 when test_frac == 0, which would leave the
# validation slice empty and put every file in the test slice.
# NOTE(review): the order is whatever glob returned; nothing in this cell
# shuffles or sorts the list, so the split depends on that order.
val_end = train_frac + val_frac

wav_files_train = wav_files[:train_frac]
wav_files_val   = wav_files[train_frac:val_end]
wav_files_test  = wav_files[val_end:]

# Show the working directory that the relative './WhisperModel/...' paths resolve against.
print(os.getcwd())

/scratch/junzisun/whisper


In [31]:
# Preview, for the first file only, the source/destination paths that the
# move loops derive for a clip and its transcript (nothing is moved here).
wav_file = wav_files[0]
orig_wav = wav_file

# os.path helpers replace manual '/'-splitting and fixed-width slicing.
wav_dir, wav_name = os.path.split(orig_wav)
split_dir = os.path.join(wav_dir, 'train')
txt_name = os.path.splitext(wav_name)[0] + '.filtered.txt'

dest_wav = os.path.join(split_dir, wav_name)
# Transcripts start in a 'TXTdata' folder that is a sibling of the audio folder ...
orig_txt = os.path.join(os.path.dirname(wav_dir), 'TXTdata', txt_name)
# ... and end up next to the audio clip inside the split folder.
dest_txt = os.path.join(split_dir, txt_name)

print(orig_wav, dest_wav)
print(orig_txt, dest_txt)

./WhisperModel/ATCOSIM/DATA_NON_EMPTY/zf3_02_060.wav ./WhisperModel/ATCOSIM/DATA_NON_EMPTY/train/zf3_02_060.wav
./WhisperModel/ATCOSIM/TXTdata/zf3_02_060.filtered.txt ./WhisperModel/ATCOSIM/DATA_NON_EMPTY/train/zf3_02_060.filtered.txt


In [32]:
# Move every training clip, together with its filtered transcript, into the
# 'train' sub-folder of the audio directory.
for wav_file in wav_files_train:
    orig_wav = wav_file
    wav_dir, wav_name = os.path.split(orig_wav)
    split_dir = os.path.join(wav_dir, 'train')
    txt_name = os.path.splitext(wav_name)[0] + '.filtered.txt'

    dest_wav = os.path.join(split_dir, wav_name)
    # Transcript is read from the sibling 'TXTdata' folder and placed beside the clip.
    orig_txt = os.path.join(os.path.dirname(wav_dir), 'TXTdata', txt_name)
    dest_txt = os.path.join(split_dir, txt_name)

    # os.rename does not create missing directories, so make sure the target exists.
    os.makedirs(split_dir, exist_ok=True)
    os.rename(orig_wav, dest_wav)
    os.rename(orig_txt, dest_txt)

In [33]:
# Move every validation clip, together with its filtered transcript, into the
# 'val' sub-folder of the audio directory.
for wav_file in wav_files_val:
    orig_wav = wav_file
    wav_dir, wav_name = os.path.split(orig_wav)
    split_dir = os.path.join(wav_dir, 'val')
    txt_name = os.path.splitext(wav_name)[0] + '.filtered.txt'

    dest_wav = os.path.join(split_dir, wav_name)
    # Transcript is read from the sibling 'TXTdata' folder and placed beside the clip.
    orig_txt = os.path.join(os.path.dirname(wav_dir), 'TXTdata', txt_name)
    dest_txt = os.path.join(split_dir, txt_name)

    # os.rename does not create missing directories, so make sure the target exists.
    os.makedirs(split_dir, exist_ok=True)
    os.rename(orig_wav, dest_wav)
    os.rename(orig_txt, dest_txt)

In [34]:
# Move every test clip, together with its filtered transcript, into the
# 'test' sub-folder of the audio directory.
for wav_file in wav_files_test:
    orig_wav = wav_file
    wav_dir, wav_name = os.path.split(orig_wav)
    split_dir = os.path.join(wav_dir, 'test')
    txt_name = os.path.splitext(wav_name)[0] + '.filtered.txt'

    dest_wav = os.path.join(split_dir, wav_name)
    # Transcript is read from the sibling 'TXTdata' folder and placed beside the clip.
    orig_txt = os.path.join(os.path.dirname(wav_dir), 'TXTdata', txt_name)
    dest_txt = os.path.join(split_dir, txt_name)

    # os.rename does not create missing directories, so make sure the target exists.
    os.makedirs(split_dir, exist_ok=True)
    os.rename(orig_wav, dest_wav)
    os.rename(orig_txt, dest_txt)

In [35]:
# Re-scan after the move: the clips now live one level down, inside the
# per-split sub-folders. This rebinds wav_files to the new locations.
split_wav_pattern = './WhisperModel/ATCOSIM/DATA_NON_EMPTY/*/*.wav'
wav_files = glob.glob(split_wav_pattern)

In [38]:
# Build the metadata table: one row per clip, holding the clip's path relative
# to the dataset folder ('<split>/<name>.wav') and its transcript text.
# Rows are collected in a list and turned into a DataFrame once; this avoids
# the per-row list search and the row-by-row DataFrame growth.
records = []
for wav_path in wav_files:
    # The transcript sits next to the clip: same name with 'wav' swapped for 'filtered.txt'.
    with open(wav_path[:-3]+'filtered.txt') as f:
        text = f.read()

    records.append({
        'file_name': '/'.join(wav_path.split('/')[-2:]),
        'text': text,
    })

# Passing `columns` keeps both headers even when no files were found.
df = pd.DataFrame(records, columns=['file_name', 'text'])

df.to_csv('./WhisperModel/ATCOSIM/DATA_NON_EMPTY/metadata.csv', index=False)

In [39]:
# Display the metadata table: split-relative clip path and transcript text.
df

Unnamed: 0,file_name,text
0,train/sm4_04_034.wav,and expect further climb in one minute when co...
1,train/zf3_01_059.wav,alitalia one five eight youre recleared after ...
2,train/zf1_04_074.wav,swissair six five five two stop climb level tw...
3,train/zf1_07_190.wav,airfrans six four seven reims one three four d...
4,train/zf2_04_217.wav,lufthansa four four one six ah left turn headi...
...,...,...
9554,val/sm3_05_091.wav,delta mike echo proceed direct to trasadingen
9555,val/zf1_08_017.wav,lufthansa eight two six eight climb to flight ...
9556,val/zf3_01_011.wav,alitalia six four seven four zurich radar good...
9557,val/zf3_03_152.wav,lufthansa three five five zero milan one three...


### Load Dataset

In [40]:
# Load the folder of split sub-directories plus metadata.csv through the
# "audiofolder" builder.
atcosim_data_dir = "./WhisperModel/ATCOSIM/DATA_NON_EMPTY/"
dataset = datasets.load_dataset("audiofolder", data_dir=atcosim_data_dir)

Resolving data files:   0%|          | 0/14338 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/954 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/3826 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /home/junzisun/.cache/huggingface/datasets/audiofolder/default-a3481a8dfb07a1a1/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/7170 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/7169 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/7169 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/478 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/477 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/477 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1914 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1913 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1913 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /home/junzisun/.cache/huggingface/datasets/audiofolder/default-a3481a8dfb07a1a1/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### Push to Hub

In [41]:
# Upload all splits of the loaded dataset to the Hub repository named below.
hub_repo_id = 'jlvdoorn/atcosim'
dataset.push_to_hub(hub_repo_id)

Pushing split train to the Hub.


Map:   0%|          | 0/1793 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1792 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1792 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1792 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/6 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Map:   0%|          | 0/1913 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/631 [00:00<?, ?B/s]

In [42]:
# Display the loaded dataset object (its splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 7169
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 477
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 1913
    })
})