In [13]:
!pip install h5py



In [14]:
import h5py
import numpy as np
# Create the demo file in write mode and fill a 100-element dataset of
# dtype 'i' with the values 0..99; the file is closed when the block exits.
with h5py.File("mytestfile.hdf5", mode="w") as f:
    dset = f.create_dataset(name="mydataset", shape=(100,), dtype="i")
    dset[...] = np.arange(100)

In [15]:
# Reopen the file created above in 'r+' mode. The handle is deliberately not
# wrapped in a `with` block because the following cells keep using `f`.
f = h5py.File('mytestfile.hdf5', 'r+')
f

<HDF5 file "mytestfile.hdf5" (mode r+)>

In [16]:
# Look the dataset up by name; indexing the file yields a dataset handle.
dset = f['mydataset']
dset

<HDF5 dataset "mydataset": shape (100,), type "<i4">

In [17]:
# Overwrite the whole dataset with 0..99 (the same values written at creation).
dset[...] = np.arange(100)

In [18]:
# Read a single element by integer index.
dset[10]

10

In [19]:
# Strided slice: every 10th element of the first 100.
dset[0:100:10]

array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90], dtype=int32)

In [20]:
# Name of the File object itself.
f.name

'/'

In [21]:
# Create a group called "subgroup" directly on the file object.
grp = f.create_group("subgroup")

In [22]:
# Create a 50-element dataset of dtype 'f' inside the group.
dset2 = grp.create_dataset("another_dataset", (50,), dtype='f')

In [23]:
# Full path of the new dataset within the file.
dset2.name

'/subgroup/another_dataset'

In [24]:
# Create a dataset through a '/'-separated path. No visible cell creates
# "subgroup2" explicitly, so the intermediate group comes from this call.
dset3 = f.create_dataset('subgroup2/dataset_three', (10,), dtype='i')

In [25]:
# Full path of the dataset created via the path-style name.
dset3.name

'/subgroup2/dataset_three'

In [26]:
# Fetch the same dataset again, this time by its path from the file object.
dataset_three = f['subgroup2/dataset_three']

In [27]:
# The looked-up handle reports the same path as dset3.name.
dataset_three.name

'/subgroup2/dataset_three'

In [28]:
# Display the handle of the first dataset again.
dset

<HDF5 dataset "mydataset": shape (100,), type "<i4">

In [29]:
# Display the handle obtained through the path lookup.
dataset_three

<HDF5 dataset "dataset_three": shape (10,), type "<i4">

In [30]:
# The File object's name is unchanged after adding groups and datasets.
f.name

'/'

In [31]:
# Walk the file and print the name of every object visited.
f.visit(print)

mydataset
subgroup
subgroup/another_dataset
subgroup2
subgroup2/dataset_three


In [32]:
import pandas as pd
from pathlib import Path

download_path = Path.cwd().joinpath('UrbanSound8K')
metadata_file = download_path.joinpath('metadata', 'UrbanSound8K.csv')

# Read the metadata, derive '/fold<N>/<slice_file_name>' for every row and
# keep only that path together with the class label.
df = (
    pd.read_csv(metadata_file)
    .assign(
        relative_path=lambda meta: (
            '/fold' + meta['fold'].astype(str) + '/' + meta['slice_file_name'].astype(str)
        )
    )
    .loc[:, ['relative_path', 'classID']]
)
df.head()

Unnamed: 0,relative_path,classID
0,/fold5/100032-3-0-0.wav,3
1,/fold5/100263-2-0-117.wav,2
2,/fold5/100263-2-0-121.wav,2
3,/fold5/100263-2-0-126.wav,2
4,/fold5/100263-2-0-137.wav,2


In [33]:
import numpy as np
from tqdm import tqdm
def static_preprocessing(df, data_path, duration=4000, sr=44100, channel=2):
    """Apply the fixed (non-augmenting) audio preprocessing to every file in ``df``.

    Each file is opened, resampled, re-channelled and padded/truncated through
    the ``AudioUtil`` helpers, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'relative_path' column and a 'classID' column. The relative
        path is appended verbatim to ``data_path``, so it should begin with a
        path separator. Any index is accepted; rows are processed in
        positional order.
    data_path : str or os.PathLike
        Root directory of the audio files.
    duration : int, optional
        Forwarded to ``AudioUtil.pad_trunc`` (presumably milliseconds --
        confirm against that helper).
    sr : int, optional
        Forwarded to ``AudioUtil.resample`` (target sample rate, judging by
        the name -- confirm).
    channel : int, optional
        Forwarded to ``AudioUtil.rechannel`` (target channel count, judging by
        the name -- confirm).

    Returns
    -------
    pandas.DataFrame
        Columns "processed" (whatever ``AudioUtil.pad_trunc`` returns) and
        "classID", one row per input row, with a fresh 0..n-1 index.
    """
    # str() lets callers pass a pathlib.Path as well as a plain string; the
    # join itself is still plain concatenation, exactly as before.
    root = str(data_path)
    # Positional access: df.loc[idx] with idx taken from range(len(df)) fails
    # (or picks the wrong row) whenever df lacks a default 0..n-1 index.
    relative_paths = df['relative_path'].to_numpy()

    processed_audio_files = np.empty(len(df), dtype=object)

    for idx, relative_path in enumerate(tqdm(relative_paths)):
        # open and perform static processing
        aud = AudioUtil.open(root + relative_path)
        reaud = AudioUtil.resample(aud, sr)
        rechan = AudioUtil.rechannel(reaud, channel)
        # store inside the object array, one processed clip per slot
        processed_audio_files[idx] = AudioUtil.pad_trunc(rechan, duration)

    # concatenate processed audio and classID into a new dataframe
    processed_df = pd.DataFrame(
        np.column_stack((processed_audio_files, df["classID"])),
        columns=["processed", "classID"],
    )
    return processed_df


In [41]:
import pandas as pd
# Small two-column frame with string row labels, used for the HDF5 round trip.
df = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=list('abc'),
)

In [42]:
# Write the frame to data.h5 under key 'df'; mode='w' starts a fresh file.
df.to_hdf('data.h5', key='df', mode='w')

In [43]:
# Open the pandas-written file with h5py to inspect how it is laid out.
# NOTE(review): this rebinds `f`; no visible cell closes the handle opened
# earlier on 'mytestfile.hdf5' -- confirm it is closed elsewhere.
f = h5py.File('data.h5', 'r+')
f

<HDF5 file "data.h5" (mode r+)>

In [44]:
# Walk the file and print the name of every object visited.
f.visit(print)

df
df/axis0
df/axis1
df/block0_items
df/block0_values


In [53]:
# First entry of df/block0_items; it comes back as bytes.
f["df/block0_items"][0]

b'A'

In [54]:
# Second entry of df/block0_items; it comes back as bytes.
f["df/block0_items"][1]

b'B'