In [6]:
import pickle

import numpy as np
import pandas as pd

# def get_class_name(filename):
#     if 'cat' in filename:
#         class_name = 0
#     else:
#         class_name = 1 
#     return class_name

def load_data(dataset='training', data_dir='../data_processed'):
    """Load one pre-processed feature set from its pickle file.

    Parameters
    ----------
    dataset : str
        Name prefix of the set; the file read is ``<data_dir>/<dataset>_set.pkl``.
    data_dir : str or path-like
        Directory holding the pickled sets. The default reproduces the
        previously hard-coded ``'../data_processed'`` location.

    Returns
    -------
    object
        Whatever ``pd.read_pickle`` returns for that file.
    """
    # NOTE: unpickling can execute arbitrary code -- only point this at files
    # produced by this project's own preprocessing step.
    return pd.read_pickle('{}/{}_set.pkl'.format(data_dir, dataset))

def get_dimensions(shape='mel_only', frames=None):
    """Work out the (height, width, depth) used to reshape a flat frame buffer.

    Parameters
    ----------
    shape : str
        Layout name. Supported values and their (height, depth):
        ``'mel_only'`` -> (128, 1), ``'mel_delta'`` -> (256, 1),
        ``'mel_delta_stacked'`` -> (128, 2).
    frames : object with a ``shape`` attribute
        Buffer whose first-axis length is divided between height, width
        and depth.

    Returns
    -------
    tuple of int
        ``(mel_height, mel_width, mel_depth)`` where ``mel_width`` is the
        first-axis length of ``frames`` floor-divided by ``height * depth``.

    Raises
    ------
    ValueError
        If ``shape`` is not one of the supported layouts, or ``frames`` is None.
    """
    # layout name -> (mel_height, mel_depth)
    layouts = {
        'mel_only': (128, 1),
        'mel_delta': (256, 1),
        'mel_delta_stacked': (128, 2),
    }
    if shape not in layouts:
        # Previously an unknown name fell through the if/elif chain and
        # crashed later with an UnboundLocalError on mel_height.
        raise ValueError('unknown shape %r; expected one of %s'
                         % (shape, sorted(layouts)))
    if frames is None:
        raise ValueError('frames is required to compute the spectrogram width')

    mel_height, mel_depth = layouts[shape]
    # Floor division: any remainder that does not fill a full column is ignored.
    mel_width = int(frames.shape[0] // (mel_height * mel_depth))
    return mel_height, mel_width, mel_depth

In [207]:
def _class_from_file_id(file_id):
    """Return the integer class for a file id: 0 if it contains 'cat', else 1.

    Same rule as the commented-out ``get_class_name`` helper kept near the
    top of this notebook, which the extraction loop used to call.
    """
    return 0 if 'cat' in file_id else 1


def extract_mel_spectrograms(dataset='training', features=['Mel'], shape='mel_only', window_size=28):
    """Cut every file's spectrogram into fixed-width windows.

    Parameters
    ----------
    dataset : str
        Set name forwarded to ``load_data``.
    features : list of str
        Columns of the loaded frame whose values are concatenated per file.
        (The default list is only read, never mutated.)
    shape : str
        Layout name forwarded to ``get_dimensions``.
    window_size : int
        Number of columns (axis 1 of the reshaped spectrogram) per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, labels, files)``: the float32 windows, the class of each
        window, and the ``File_id`` each window was cut from.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1, or if a file's values cannot be
        reshaped to the requested layout (the message names the file).
    """
    if window_size < 1:
        raise ValueError('window_size must be a positive integer, got %r' % (window_size,))

    df = load_data(dataset)

    # Parallel accumulators: one entry per extracted window.
    files = []
    labels = []
    data = []

    for file in df.File_id.unique():
        class_name = _class_from_file_id(file)

        # Select this file's rows, keep only the requested columns, then
        # flatten the cells and join them along their first axis.
        frames = np.array(df[df['File_id'] == file][features])
        frames = np.concatenate(frames.ravel())

        mel_height, mel_width, mel_depth = get_dimensions(shape=shape, frames=frames)

        try:
            mel = np.reshape(frames, (mel_height, mel_width, mel_depth))
        except ValueError as err:
            # The old handler printed the file and re-ran the same failing
            # reshape; raise once, with the offending file in the message.
            raise ValueError('cannot reshape frames of file %r to %r: %s'
                             % (file, (mel_height, mel_width, mel_depth), err))

        # Only whole windows are kept; trailing columns that do not fill a
        # complete window are dropped.
        for i in range(mel.shape[1] // window_size):
            labels.append(class_name)
            files.append(file)
            data.append(mel[:, i * window_size:(i + 1) * window_size])

    return np.array(data, dtype=np.float32), np.array(labels), np.array(files)

In [214]:
# Build the training windows from the 'Mel' and 'Mel_deltas' columns, using the
# 'mel_delta_stacked' layout and 28-column windows.
# NOTE(review): dataset='Train' makes load_data read '../data_processed/Train_set.pkl',
# while load_data's default and the later `df = load_data(...)` cell use 'training' --
# confirm which file name is intended.
x_train, y_train, f_train = extract_mel_spectrograms(dataset='Train', features=['Mel', 'Mel_deltas'], 
                                                    shape='mel_delta_stacked', window_size=28)

In [3]:
# Unpack the four values returned by the project helper into train/test data and labels.
# NOTE(review): random_mini_batches is imported here but not used in the cells visible
# in this part of the notebook -- confirm it is needed.
from util_functions import load_features_with_deltas_stacking, random_mini_batches
train_data, test_data, train_labels, test_labels = load_features_with_deltas_stacking()

In [5]:
# Inspect the shape of the test split unpacked from load_features_with_deltas_stacking().
# NOTE(review): the recorded output is (0,), i.e. the test split came back empty --
# confirm the helper actually found its input data.
test_data.shape

(0,)

In [7]:
# Load the 'training' set via load_data and preview its first rows.
df = load_data(dataset='training')
df.head()

Unnamed: 0,File_id,Mel,Mel_deltas,audio,channels,file_name,mfcc_deltas,mfccs,sample_rates,Label
44,cat_140,"[[4.72798677418e-07, 6.43605592482e-06, 0.0006...","[[0.023151710141, 0.023151710141, 0.0231517101...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,data/cats_dogs/cat_140.wav,"[[78.3430072595, 78.3430072595, 78.3430072595,...","[[-621.888163714, -514.707871055, -328.0597368...",16000,0
47,cat_143,"[[0.400532454936, 0.370885300403, 0.1869364666...","[[-0.0317690699701, -0.0317690699701, -0.03176...","[-0.0133972, -0.0136414, -0.0125122, -0.009735...",1,data/cats_dogs/cat_143.wav,"[[-1.19749248861, -1.19749248861, -1.197492488...","[[-232.206815087, -160.990056594, -165.8800949...",16000,0
162,cat_98,"[[0.0936867859149, 0.0233685110979, 2.68289609...","[[-0.00741383724129, -0.00741383724129, -0.007...","[0.0514832, 0.0202026, 0.0200806, 0.0310059, 0...",1,data/cats_dogs/cat_98.wav,"[[-2.82157313679, -2.82157313679, -2.821573136...","[[-78.7450920377, -74.9702123871, -92.75200084...",16000,0
66,cat_161,"[[0.385523469687, 1.62445774303, 1.3270942123,...","[[1.49649656235, 1.49649656235, 1.49649656235,...","[0.00283813, 0.000671387, 0.00146484, 0.000274...",1,data/cats_dogs/cat_161.wav,"[[-0.95505003891, -0.95505003891, -0.955050038...","[[-446.583630298, -436.893587484, -424.2445981...",16000,0
157,cat_93,"[[1.97665634329e-05, 0.00225395298427, 0.00727...","[[4.09756489549e-05, 4.09756489549e-05, 4.0975...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,data/cats_dogs/cat_93.wav,"[[27.2208429681, 27.2208429681, 27.2208429681,...","[[-524.75263806, -279.940137321, -195.23618208...",16000,0
