In [1]:
import os
import librosa
import json
import pandas as pd
from dataset import generate_data_csv, generate_hdf5
from tqdm import tqdm

## Data CSV: match AudioSet segment labels to the wav files on disk

In [2]:
# AudioSet metadata: the class-label table (index / mid / display_name) and the
# segment lists (YTID, start_seconds, end_seconds, positive_labels) per split.
label_csv = r'Z:\data\ai_dataset\audio\audio_set\class_labels_indices.csv'
train_csv = r'Z:\data\ai_dataset\audio\audio_set\balanced_train_segments.csv'
eval_csv = r'Z:\data\ai_dataset\audio\audio_set\eval_segments.csv'
# Folders holding the downloaded clips, stored as "<YTID>.wav".
balance_data_root = r'Z:\data\ai_dataset\audio\audio_set\balanced_train_segments'
eval_data_root = r'Z:\data\ai_dataset\audio\audio_set\eval_segments'

# Outputs of this notebook: the mid -> class-index lookup and the data listing csv.
saved_label_json = r'workspace\label.json'
saved_data_csv = r'workspace\ft_data.csv'

In [3]:
def wav_valid(wav_f, sr=32000, min_len=1):
    """Tell whether an audio file is long enough to be used.

    The file is decoded with ``librosa.load`` at sample rate ``sr`` and the
    number of resulting samples is compared against ``min_len`` seconds.

    Args:
        wav_f: path of the audio file to check.
        sr: sample rate passed to ``librosa.load``.
        min_len: minimum duration in seconds.

    Returns:
        True when the loaded signal has at least ``int(min_len * sr)`` samples,
        False otherwise.
    """
    samples, _unused_sr = librosa.load(wav_f, sr=sr)
    n_samples = samples.shape[0]
    required = int(min_len * sr)
    return n_samples >= required

In [4]:
# Label table: the first csv column (the class index) becomes the row index.
lb_df = pd.read_csv(label_csv, index_col=0)
# Reading the segment lists with sep='\t' keeps every line as one string in a
# single column (the first '#' comment line becomes the header). The two
# remaining comment rows are skipped and the fields split by hand further down.
train_df = pd.read_csv(train_csv, sep='\t')
eval_df = pd.read_csv(eval_csv, sep='\t')
# Preview the three frames.
lb_df.head(), train_df.head(8), eval_df.head(8)

(             mid                   display_name
 index                                          
 0       /m/09x0r                         Speech
 1      /m/05zppz      Male speech, man speaking
 2       /m/02zsn  Female speech, woman speaking
 3       /m/0ytgt     Child speech, kid speaking
 4      /m/01h8n0                   Conversation,
      # Segments csv created Sun Mar  5 10:54:31 2017
 0  # num_ytids=22160, num_segs=22160, num_unique_...
 1  # YTID, start_seconds, end_seconds, positive_l...
 2  --PJHxphWEs, 30.000, 40.000, "/m/09x0r,/t/dd00...
 3           --ZhevVpy1s, 50.000, 60.000, "/m/012xff"
 4  --aE2O5G5WE, 0.000, 10.000, "/m/03fwl,/m/04rlf...
 5  --aO5cdqSAg, 30.000, 40.000, "/t/dd00003,/t/dd...
 6  --aaILOrkII, 200.000, 210.000, "/m/032s66,/m/0...
 7           --cB2ZVjpnA, 30.000, 40.000, "/m/01y3hg",
      # Segments csv created Sun Mar  5 10:54:25 2017
 0  # num_ytids=20371, num_segs=20371, num_unique_...
 1  # YTID, start_seconds, end_seconds, positive_l...
 2  --4

In [5]:
# Give every class id ("mid") its row position in the label table and store
# that lookup as JSON so it can be reused outside this notebook.
label_list = lb_df['mid'].tolist()
label_map = dict(zip(label_list, range(len(label_list))))
with open(saved_label_json, 'w') as _fp:
    _fp.write(json.dumps(label_map, indent=4))

In [6]:
def _parse_segment_rows(segment_df):
    """Turn a single-column segments frame into ``[ytid, label_mids]`` pairs.

    Every data row looks like ``YTID, start, end, "mid,mid,..."``. Splitting on
    spaces therefore yields exactly four tokens: the first ends with a comma
    and the last is wrapped in double quotes.

    Args:
        segment_df: frame read from a segments csv as one string column; its
            first two rows are the remaining '#' comment lines and are skipped.

    Returns:
        A list of ``[ytid, 'mid,mid,...']`` lists in file order.

    Raises:
        AssertionError: if a row does not have the expected four-token layout.
    """
    parsed = []
    for _row in segment_df.iloc[:, 0].tolist()[2:]:
        _tokens = _row.split(' ')
        assert (
            len(_tokens) == 4
            and _tokens[-1].startswith('"')
            and _tokens[-1].endswith('"')
        ), f'unexpected segment row: {_row!r}'
        # Drop the trailing comma of the YTID and the quotes around the labels.
        parsed.append([_tokens[0][:-1], _tokens[-1][1:-1]])
    return parsed


# Same parsing for both splits; previously this was two copy-pasted pipelines.
train_file_info = _parse_segment_rows(train_df)
eval_file_info = _parse_segment_rows(eval_df)
print(f'train: total {len(train_file_info)}; {train_file_info[0]} {len(train_file_info[0][0])})')
print(f'eval: total {len(eval_file_info)}; {eval_file_info[0]} {len(eval_file_info[0][0])})')

train: total 22160; ['--PJHxphWEs', '/m/09x0r,/t/dd00088'] 11)
eval: total 20371; ['--4gqARaEJE', '/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk'] 11)


In [7]:
# File names of the downloaded clips for each split.
train_wavs = os.listdir(balance_data_root)
eval_wavs = os.listdir(eval_data_root)
# NOTE(review): element [0] is printed on both sides of the dash; possibly
# first/last was intended. Left unchanged here.
for _split, _wavs in (('train', train_wavs), ('eval', eval_wavs)):
    print(f'{_split}: {len(_wavs)}; {_wavs[0]}-{_wavs[0]}')

train: 20550; -pjK2u3Qtxc.wav--pjK2u3Qtxc.wav
eval: 18887; VMbJTgzMhKE.wav-VMbJTgzMhKE.wav


In [8]:
# One list of {'file': ..., 'label': ...} records per split, filled in below.
result = {_split: [] for _split in ('train', 'eval')}

In [9]:
# Match every balanced-train segment against the wav files present on disk.
# A set gives O(1) membership tests; scanning the listdir() list for each of
# the segments made this lookup quadratic.
train_wav_set = set(train_wavs)
train_records = []
train_miss = 0  # segments listed in the csv without a wav file on disk
for _file_info in tqdm(train_file_info):
    wav_name = _file_info[0] + '.wav'
    if wav_name not in train_wav_set:
        train_miss += 1
        continue

    wav_path = os.path.join(balance_data_root, wav_name)
    if not wav_valid(wav_path):
        # Clip is shorter than the minimum length wav_valid requires; drop it.
        continue
    train_records.append({
        'file': wav_path,
        # Translate "mid,mid,..." into comma-joined integer class indices.
        'label': ','.join(str(label_map[_mid]) for _mid in _file_info[1].split(',')),
    })
# Assign instead of appending to the shared list so that re-running this cell
# does not duplicate the train entries.
result['train'] = train_records
train_miss

100%|██████████| 22160/22160 [07:25<00:00, 49.78it/s]


1610

In [10]:
# Match every eval segment against the wav files present on disk.
# A set gives O(1) membership tests; scanning the listdir() list for each of
# the segments made this lookup quadratic.
eval_wav_set = set(eval_wavs)
eval_records = []
eval_miss = 0  # segments listed in the csv without a wav file on disk
for _file_info in tqdm(eval_file_info):
    wav_name = _file_info[0] + '.wav'
    if wav_name not in eval_wav_set:
        eval_miss += 1
        continue

    wav_path = os.path.join(eval_data_root, wav_name)
    if not wav_valid(wav_path):
        # Clip is shorter than the minimum length wav_valid requires; drop it.
        continue
    eval_records.append({
        'file': wav_path,
        # Translate "mid,mid,..." into comma-joined integer class indices.
        'label': ','.join(str(label_map[_mid]) for _mid in _file_info[1].split(',')),
    })
# Assign instead of appending to the shared list so that re-running this cell
# does not duplicate the eval entries.
result['eval'] = eval_records
eval_miss

100%|██████████| 20371/20371 [07:40<00:00, 44.28it/s]


1484

In [11]:
# Show how many records each split kept, together with its first record.
for _split in result:
    _records = result[_split]
    print(f'{_split}: {len(_records)}, samples: {_records[0]}')

train: 20547, samples: {'file': 'Z:\\data\\ai_dataset\\audio\\audio_set\\balanced_train_segments\\--PJHxphWEs.wav', 'label': '0,451'}
eval: 18884, samples: {'file': 'Z:\\data\\ai_dataset\\audio\\audio_set\\eval_segments\\--4gqARaEJE.wav', 'label': '73,361,74,72'}


In [12]:
# Hand the per-split record lists to generate_data_csv (from the local dataset
# module) as keyword arguments train=... and eval=...
# NOTE(review): presumably it writes a csv with file / type / label columns,
# judging from the frame read back in the h5 section — confirm in dataset.py.
generate_data_csv(saved_data_csv, **result)

## HDF5: pack the wav files listed in the data CSV into HDF5 files

In [15]:
# Destination HDF5 files for the train and eval splits.
train_hdf5 = r'E:\common\dataset\audio\audio_set\h5\ft_train.hdf5'
eval_h5df = r'E:\common\dataset\audio\audio_set\h5\ft_eval.hdf5'
# Same path as assigned earlier in the notebook.
saved_data_csv = r'workspace\ft_data.csv'

In [16]:
# Reload the data csv; its first column is used as the row index.
df = pd.read_csv(saved_data_csv, index_col=0)
df.head()

Unnamed: 0,file,type,label
0,Z:\data\ai_dataset\audio\audio_set\balanced_tr...,train,451
1,Z:\data\ai_dataset\audio\audio_set\balanced_tr...,train,375
2,Z:\data\ai_dataset\audio\audio_set\balanced_tr...,train,951370
3,Z:\data\ai_dataset\audio\audio_set\balanced_tr...,train,3234
4,Z:\data\ai_dataset\audio\audio_set\balanced_tr...,train,427431


In [17]:
# Separate the wav paths by the split recorded in the 'type' column.
_is_train = df['type'] == 'train'
train_files = df.loc[_is_train, 'file'].tolist()
_is_eval = df['type'] == 'eval'
eval_files = df.loc[_is_eval, 'file'].tolist()
train_files[0], eval_files[0]

('Z:\\data\\ai_dataset\\audio\\audio_set\\balanced_train_segments\\--PJHxphWEs.wav',
 'Z:\\data\\ai_dataset\\audio\\audio_set\\eval_segments\\--4gqARaEJE.wav')

In [18]:
# Write the train wavs listed in train_files to train_hdf5 using generate_hdf5
# from the local dataset module (implementation not shown in this notebook).
generate_hdf5(train_files, train_hdf5)

handle wav: 20547, file_apart 1


20547it [10:23, 32.93it/s]

finish writing 20547





In [19]:
# Write the eval wavs listed in eval_files to eval_h5df using generate_hdf5
# from the local dataset module (implementation not shown in this notebook).
generate_hdf5(eval_files, eval_h5df)

handle wav: 18884, file_apart 1


18884it [09:37, 32.68it/s]

finish writing 18884



