In [1]:
import h5py
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load h5 file. Run one file at a time

# 3 files to reformat and convert into a dataframe:
# /Users/greenapple/project3/data/raw/packed_features/bal_train.h5
# /Users/greenapple/project3/data/raw/packed_features/eval.h5
# /Users/greenapple/project3/data/raw/packed_features/unbal_train.h5

# Open explicitly read-only: the raw file is only ever read in this notebook,
# and without a mode older h5py versions open it writable and will create an
# empty file if the path is mistyped instead of raising.
data = h5py.File('/Users/greenapple/project3/data/raw/packed_features/unbal_train.h5', 'r')

  


In [3]:
# Names of the top-level datasets stored in the loaded h5 file
[key for key in data.keys()]

['video_id_list', 'x', 'y']

In [4]:
# Show the HDF5 dataset objects (shape and dtype) for the ids, features and labels
tuple(data[name] for name in ('video_id_list', 'x', 'y'))

(<HDF5 dataset "video_id_list": shape (2041789,), type "|S11">,
 <HDF5 dataset "x": shape (2041789, 10, 128), type "|u1">,
 <HDF5 dataset "y": shape (2041789, 527), type "|b1">)

In [5]:
# Reshape feature array to dataframe format: one row per clip, with all
# trailing feature axes flattened into a single axis (10 x 128 -> 1280 here).
x = np.array(data['x'])

# Derive both dimensions from the loaded array instead of hard-coding the
# unbal_train row count, so the same cell works for bal_train.h5 and eval.h5.
n_rows = x.shape[0]
n_features = int(np.prod(x.shape[1:]))
x_reshaped = x.reshape(n_rows, n_features)
x_reshaped.shape

(2041789, 1280)

In [6]:
# Make a single column with numbers for sound classes.
# Each row of y is a boolean vector over the sound classes; only the FIRST
# (lowest-index) True entry of a row is kept, as in the original loop, so any
# additional positive classes on the same clip are dropped.
y_array = np.array(data['y'])

# argmax over a boolean row returns the position of its first True value.
# It also returns 0 for an all-False row, so those rows are detected
# separately and marked with -1 rather than being mislabelled as class 0.
has_label = y_array.any(axis=1)
first_label = y_array.argmax(axis=1)

# Exactly one entry per input row keeps y_list aligned with video_id_list
# (the loop version appended nothing for a row without any True value,
# silently shifting every later label).
y_list = np.where(has_label, first_label, -1).tolist()

len(y_list)

2041789

In [7]:
# Make dataframe with reformatted arrays: video ids first, then the class index
video_ids = np.array(data['video_id_list'])
df = pd.DataFrame(video_ids, columns=['video_id_list']).assign(y=y_list)

In [8]:
# Load file with sound class labels and build a {class index: display name} lookup
labels = pd.read_csv('/Users/greenapple/project3/data/raw/class_labels_indices.csv')
display_names = labels.set_index('index')['display_name']
name_dict = display_names.to_dict()

In [9]:
# Add sound class labels to dataframe by translating each class index in 'y'
# through name_dict; indices missing from name_dict are left unchanged.
df['y_name'] = df['y'].replace(name_dict)

In [10]:
# Add columns with features: append the flattened feature matrix to the right
# of the id/label columns (feature columns are named by integer position).
x_df = pd.DataFrame(x_reshaped)
df = pd.concat((df, x_df), axis='columns')

In [11]:
# Display the assembled dataframe: video id, class index, class name,
# followed by the flattened feature columns.
df

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,b'--ZXVvvzZY4',0,Speech,87,97,115,170,239,52,131,...,154,140,0,179,117,184,189,43,111,113
1,b'--7T50tAIrg',0,Speech,73,77,122,94,17,87,180,...,255,31,255,96,255,191,44,255,153,255
2,b'--zhhN7ldMU',137,Music,174,106,132,48,78,53,148,...,255,4,69,126,160,72,241,173,92,71
3,b'--I878jVbrY',137,Music,228,107,52,107,79,193,123,...,112,198,0,129,39,96,47,70,198,153
4,b'--N8Xc-3C3k',0,Speech,61,98,166,182,203,32,219,...,213,255,173,67,246,255,0,185,36,91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2041784,b'zzSuwklObyQ',27,Singing,210,140,156,181,135,149,169,...,98,238,130,0,77,111,255,190,0,255
2041785,b'zz-ecEuWiCQ',0,Speech,20,208,52,0,255,183,230,...,255,177,255,145,173,0,174,255,255,0
2041786,b'zzvOEksj2V0',0,Speech,131,122,216,117,171,141,77,...,202,82,0,182,119,221,97,181,99,230
2041787,b'zzLl8g4g91I',189,Bowed string instrument,203,102,137,66,126,132,211,...,64,0,132,86,235,0,123,144,0,0


In [12]:
# Inspect the rows whose class name is 'Meow'
is_meow = df['y_name'] == 'Meow'
df.loc[is_meow]

Unnamed: 0,video_id_list,y,y_name,0,1,2,3,4,5,6,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
2509,b'-1kyiLNK6iQ',83,Meow,138,230,16,77,0,255,83,...,255,0,0,94,0,255,0,0,203,255
8092,b'-7x3M4faMEs',83,Meow,141,77,81,155,97,76,90,...,0,0,0,0,0,0,0,0,0,0
10989,b'-AGOo7oqPgI',83,Meow,81,48,158,58,83,144,16,...,114,117,121,126,126,130,114,133,130,187
17016,b'-GERSENU7I4',83,Meow,19,104,38,166,0,89,0,...,83,1,0,126,201,154,141,134,255,131
24860,b'-OBDFmzhD8Y',83,Meow,120,122,144,61,126,86,66,...,0,124,236,169,156,214,68,255,119,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988093,b'xklbg6QI32A',83,Meow,82,113,160,64,189,166,33,...,193,82,165,106,5,46,160,208,194,31
2009992,b'yf2IplueXIQ',83,Meow,95,190,155,62,215,131,151,...,187,66,79,116,86,8,0,255,255,225
2014299,b'yq3VxeaIaU4',83,Meow,63,87,170,35,175,99,67,...,103,126,1,49,102,175,106,0,207,25
2015003,b'ysVbRnNY2MM',83,Meow,178,138,166,98,61,192,255,...,155,146,156,22,255,177,45,150,136,210


In [14]:
# Number of clips per class name, most frequent first
class_names = df['y_name']
class_names.value_counts()

Speech           999421
Music            623184
Vehicle           77011
Singing           37733
Animal            19905
                  ...  
Sad music             1
Ratchet, pawl         1
Tabla                 1
Moo                   1
Splinter              1
Name: y_name, Length: 523, dtype: int64

In [15]:
# Pickle dataframe
# 3 dataframes to pickle: df_bal, df_eval, df_unbal
# (change the file name below to match the h5 file that was loaded)

out_path = '/Users/greenapple/project3/data/processed/df_unbal.pkl'
with open(out_path, mode='wb') as pkl_file:
    pickle.dump(df, pkl_file)