# Creating ZCU-CZ-ATC HuggingFace Dataset

### Initialize

In [1]:
import glob
import os
import random

import datasets
import pandas as pd
from datasets import Audio

### Split Files and Create Metadata

In [3]:
# Collect every recording and every transcript that sits directly in the
# dataset root (non-recursive). glob does not guarantee any particular order.
WAV_PATTERN = './WhisperModel/ZCU_CZ_ATC/*.wav'
TXT_PATTERN = './WhisperModel/ZCU_CZ_ATC/*.txt'

wav_files = glob.glob(WAV_PATTERN)
txt_files = glob.glob(TXT_PATTERN)

In [4]:
# Requested share of the corpus for each split.
train_p = 0.75
val_p = 0.05
test_p = 0.20

# Despite the *_frac names these are file COUNTS. Train and validation are
# truncated to whole files; test receives whatever is left, so the three
# counts always add up to the number of recordings.
n_files = len(wav_files)
train_frac = int(train_p * n_files)
val_frac = int(val_p * n_files)
test_frac = n_files - train_frac - val_frac

# The percentage shown is the requested share, not the share actually obtained.
print("Files:", n_files)
for label, count, share in (
    ("Train:", train_frac, train_p),
    ("Val  :", val_frac, val_p),
    ("Test :", test_frac, test_p),
):
    print(label, count, 100*share, '%')

Files: 2657
Train: 1992 75.0 %
Val  : 132 5.0 %
Test : 533 20.0 %


In [13]:
# Shuffle once with a fixed seed so the split is random but reproducible.
# glob gives no ordering guarantee, and the file names start with a prefix
# such as ACCU-/APP-/TWR-, so plain sorted order would push whole prefixes
# into a single split. Sorting first makes the shuffle independent of the
# order glob happened to return.
SPLIT_SEED = 42
wav_files_shuffled = sorted(wav_files)
random.Random(SPLIT_SEED).shuffle(wav_files_shuffled)

# Slice with explicit, non-negative bounds. Negative-index slicing
# (`[-test_frac:]`, `[train_frac:-test_frac]`) misbehaves when test_frac == 0
# because -0 == 0: the test split would be the whole list and validation empty.
val_end = train_frac + val_frac
wav_files_train = wav_files_shuffled[:train_frac]
wav_files_val   = wav_files_shuffled[train_frac:val_end]
wav_files_test  = wav_files_shuffled[val_end:]

In [14]:
# Dry run on a single file: print where one recording and its transcript
# would be moved to, without touching the filesystem.
wav_file = wav_files[0]
orig_wav = wav_file
orig_txt = f'{orig_wav[:-3]}txt'  # transcript path: last three characters replaced by 'txt'

# Split each path at its last '/' and insert the split folder in between.
wav_parent, _wav_sep, wav_name = orig_wav.rpartition('/')
txt_parent, _txt_sep, txt_name = orig_txt.rpartition('/')
dest_wav = wav_parent + '/train/' + wav_name
dest_txt = txt_parent + '/train/' + txt_name

print(orig_wav, dest_wav)
print(orig_txt, dest_txt)

./WhisperModel/ZCU_CZ_ATC/TWR-4Sap7y.wav ./WhisperModel/ZCU_CZ_ATC/train/TWR-4Sap7y.wav
./WhisperModel/ZCU_CZ_ATC/TWR-4Sap7y.txt ./WhisperModel/ZCU_CZ_ATC/train/TWR-4Sap7y.txt


In [16]:
# Move every training recording and its transcript into a 'train' folder
# located next to the original files.
for wav_file in wav_files_train:
    orig_wav = wav_file
    orig_txt = os.path.splitext(orig_wav)[0] + '.txt'

    # Check before moving anything, so a missing transcript cannot leave a
    # recording moved while its text stays behind.
    if not os.path.isfile(orig_txt):
        raise FileNotFoundError(f'missing transcript for {orig_wav}: {orig_txt}')

    # os.rename does not create directories; make sure the target exists.
    dest_dir = os.path.join(os.path.dirname(orig_wav), 'train')
    os.makedirs(dest_dir, exist_ok=True)

    dest_wav = os.path.join(dest_dir, os.path.basename(orig_wav))
    dest_txt = os.path.join(dest_dir, os.path.basename(orig_txt))
    os.rename(orig_wav, dest_wav)
    os.rename(orig_txt, dest_txt)

In [17]:
# Move every test recording and its transcript into a 'test' folder located
# next to the original files.
for wav_file in wav_files_test:
    orig_wav = wav_file
    orig_txt = os.path.splitext(orig_wav)[0] + '.txt'

    # Check before moving anything, so a missing transcript cannot leave a
    # recording moved while its text stays behind.
    if not os.path.isfile(orig_txt):
        raise FileNotFoundError(f'missing transcript for {orig_wav}: {orig_txt}')

    # os.rename does not create directories; make sure the target exists.
    dest_dir = os.path.join(os.path.dirname(orig_wav), 'test')
    os.makedirs(dest_dir, exist_ok=True)

    dest_wav = os.path.join(dest_dir, os.path.basename(orig_wav))
    dest_txt = os.path.join(dest_dir, os.path.basename(orig_txt))
    os.rename(orig_wav, dest_wav)
    os.rename(orig_txt, dest_txt)

In [18]:
# Move every validation recording and its transcript into a 'val' folder
# located next to the original files. The saved loader output further down
# shows files under 'val' ending up in the 'validation' split.
for wav_file in wav_files_val:
    orig_wav = wav_file
    orig_txt = os.path.splitext(orig_wav)[0] + '.txt'

    # Check before moving anything, so a missing transcript cannot leave a
    # recording moved while its text stays behind.
    if not os.path.isfile(orig_txt):
        raise FileNotFoundError(f'missing transcript for {orig_wav}: {orig_txt}')

    # os.rename does not create directories; make sure the target exists.
    dest_dir = os.path.join(os.path.dirname(orig_wav), 'val')
    os.makedirs(dest_dir, exist_ok=True)

    dest_wav = os.path.join(dest_dir, os.path.basename(orig_wav))
    dest_txt = os.path.join(dest_dir, os.path.basename(orig_txt))
    os.rename(orig_wav, dest_wav)
    os.rename(orig_txt, dest_txt)

In [25]:
# Re-scan after the split: every recording/transcript exactly one level below DATA/.
# NOTE(review): the move cells above place files in <root>/train|test|val, whereas
# this looks under <root>/DATA/<split>/ — presumably the split folders were moved
# into DATA/ outside this notebook; confirm, otherwise a fresh run finds nothing.
DATA_WAV_PATTERN = './WhisperModel/ZCU_CZ_ATC/DATA/*/*.wav'
DATA_TXT_PATTERN = './WhisperModel/ZCU_CZ_ATC/DATA/*/*.txt'
wav_files = glob.glob(DATA_WAV_PATTERN)
txt_files = glob.glob(DATA_TXT_PATTERN)

# The two counts are expected to match (one transcript per recording).
print(len(wav_files), len(txt_files))

2657 2657


In [28]:
# Build metadata.csv: one row per recording with its path relative to DATA/
# ("<split>/<name>.wav") and the transcript read from the sibling .txt file.
# Rows are collected in a list and turned into a DataFrame once, instead of
# looking up each row index with list.index() and growing the frame cell by
# cell, which is quadratic in the number of files.
records = []
for file in wav_files:
    # Explicit encoding so the result does not depend on the machine's locale.
    # NOTE(review): the saved preview below shows garbled Czech diacritics;
    # confirm the transcripts really are UTF-8.
    with open(file[:-3] + 'txt', encoding='utf-8') as f:
        text = f.read()

    records.append({
        'file_name': '/'.join(file.split('/')[-2:]),
        'text': text,
    })

# `columns=` keeps the header intact even when no files were found.
df = pd.DataFrame(records, columns=['file_name', 'text'])
df.to_csv('./WhisperModel/ZCU_CZ_ATC/DATA/metadata.csv', index=False)

In [29]:
# Preview the metadata table (file_name, text) built in the previous cell.
df

Unnamed: 0,file_name,text
0,val/TWR-mbIQOx.wav,Skytravel 1 0 1 0 confirm ready for departure?...
1,val/APP-5aTgZQ.wav,Lufthansa 7 K K descend FL 9 0 K K descend l...
2,val/TWR-MgjO8P.wav,+zynÄ› tower Lot 5 2 6 dobrĂ˝ veÄŤer approac...
3,val/TWR-nLCO4H.wav,Contract R G RWY clear to land 3 1 Contactr ...
4,val/ACCU-RXM6rs.wav,Lufthansa 1 4 9 2 climb to FL 3 4 0 Lufthans...
...,...,...
2652,train/TWR-pINehs.wav,CSA 9 2 6 ready for departure CSA 9 2 6 roge...
2653,train/APP-K3EjsL.wav,Russia 1 2 Z high speed OK? yes climb FL 1...
2654,train/APP-UkAljF.wav,German Wings 7 P turn left heading 3 4 0 G...
2655,train/APP-pnMGlr.wav,Praha approach CSA 9 2 7 dobrĂ˝ den passing ...


### Load Dataset

In [30]:
# Load the split folders under DATA/ with the generic "audiofolder" builder.
# The saved examples further down carry a 'text' field next to 'audio'.
dataset = datasets.load_dataset(
    "audiofolder",
    data_dir="./WhisperModel/ZCU_CZ_ATC/DATA",
)

Resolving data files:   0%|          | 0/3984 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/264 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1066 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /home/junzisun/.cache/huggingface/datasets/audiofolder/default-11753bce73d0436a/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/1993 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1992 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1992 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/133 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/132 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/132 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/534 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/533 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/533 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /home/junzisun/.cache/huggingface/datasets/audiofolder/default-11753bce73d0436a/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### Push to Hub

In [31]:
# Upload all splits to the Hugging Face Hub repository below.
# NOTE(review): presumably relies on a Hugging Face login already cached on this
# machine, since no token is passed here — confirm.
HUB_REPO_ID = 'jlvdoorn/zcu-cz-atc'
dataset.push_to_hub(HUB_REPO_ID)

Pushing split train to the Hub.


Map:   0%|          | 0/996 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/996 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/3 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Map:   0%|          | 0/533 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/401 [00:00<?, ?B/s]

In [36]:
# Spot-check the first training example (audio path/array/sampling rate and transcript).
# NOTE(review): the saved output reports sampling_rate 8000, and `Audio` is imported
# at the top but never used in this notebook — if consumers need a different rate,
# a cast_column("audio", Audio(sampling_rate=...)) step may be wanted; confirm.
dataset['train'][0]

{'audio': {'path': '/scratch/junzisun/whisper/WhisperModel/ZCU_CZ_ATC/DATA/train/ACCU-0CUxlz.wav',
  'array': array([ 0.00036621,  0.00036621, -0.00036621, ...,  0.        ,
          0.        ,  0.        ]),
  'sampling_rate': 8000},
 'text': 'Lufthansa 7 3 9 Praha Lufthansa 7 3 9  radio descending down level 3 5 0 good afternoon Lufthansa 7 3 9 Praha radar contact descend FL 3 0 0 level by RAPET Lufthansa 7 3 9 descending FL 3 0 0 level  '}

In [37]:
# Spot-check the first validation example; the saved output shows its file under DATA/val/.
dataset['validation'][0]

{'audio': {'path': '/scratch/junzisun/whisper/WhisperModel/ZCU_CZ_ATC/DATA/val/ACCU-1AL1gO.wav',
  'array': array([ 0.00036621, -0.00036621,  0.00036621, ...,  0.        ,
          0.        ,  0.        ]),
  'sampling_rate': 8000},
 'text': 'Singapure 3 2 5 Praha climb to flight level 3 3 0  climb flight level 3 3 0 Singapore 3 2 5 Praha Radar dobrĂ˝ veÄŤer Wizzair 4 C B de+ 7 0 3 Wizzair 4 C B Praha Radar good evening radar contact '}

In [38]:
# Spot-check the first test example; the saved output shows its file under DATA/test/.
dataset['test'][0]

{'audio': {'path': '/scratch/junzisun/whisper/WhisperModel/ZCU_CZ_ATC/DATA/test/ACCU-07R4Pv.wav',
  'array': array([0.00036621, 0.00036621, 0.00036621, ..., 0.        , 0.        ,
         0.        ]),
  'sampling_rate': 8000},
 'text': 'Lufthansa 4 J A contact Munich 1 3 2 point 5 5 0 good bye Lufthansa 4 J A 1 3 2 . see you later (Praha(PrĂˇg))2 7 (9(najn)) H hello can you just confirm the frequency please1 2 7 . 4 2 5 ok we try again on 1 2 7 . 4 2 5 until we get no reply for the moment thank you radar Wizzair 4 good morning climbin+ Wizzair 4 P R Praha hello radar contact squawk 1 4 7 1 climb FL 3 4 0 climb FL Wizzair 4  to FL 2  Contract 8 T W Praha hello radar contact direct LAGAR direct LAGAR climb flight 8  7 (9(najn)) H Bratislava 1 3 4 . 4 7 5 Praha hello radar good morning  8 0 1 6 passing level 1 0 0 climb FL 2 4 0 inbound to LANUX Niki 8 0 1 6 Praha hello radar contact 7 (9(najn)) H Praha yeah  7 (9(najn)) H we got a radio no reply on 1 2 7 4 2 5 yeah right frequency is 