In [22]:
import numpy as np
#np.random.seed(1001)

import os
import shutil
import pandas as pd
import gc
from util import *
from dataset import *
from cnn2d import *
from seresnet import *
from analyze import *

from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import StratifiedKFold
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)

import logging
import IPython.display as ipd

In [None]:
# Locations of the raw audio clips and of the metadata tables (indexed by file name).
train_root = '../data/audio_train/'
test_root = '../data/audio_test/'
train = pd.read_csv("../data/train.csv", index_col="fname")
test = pd.read_csv("../data/sample_submission.csv", index_col="fname")

# Assign every distinct label an integer id and store it next to the label.
LABELS = list(train.label.unique())
label_idx = dict(zip(LABELS, range(len(LABELS))))
train["label_idx"] = train.label.map(lambda name: label_idx[name])

# Keep only the first 10 rows of each table (presumably for a quick trial run).
train = train.iloc[:10]
test = test.iloc[:10]

argucnt = 3
config = Config(
    sampling_rate=16000,
    n_mels=128,
    n_mfcc=128,
    audio_duration=5,
    n_folds=10,
    learning_rate=0.001,
    postfunc='',
    mixup=5,
    mixup_alpha=2,
    use_mfcc=True,
)

In [None]:
def mfcc_stats_one(data):
    """Summarise one audio signal as a flat vector of MFCC and spectral statistics.

    Six statistics (mean, std, min, max, skewness, kurtosis) are computed along
    axis 1 of the 20-coefficient MFCC matrix and laid out statistic by statistic
    (all means, then all stds, ...). The same six statistics are then appended
    for row 0 of each spectral descriptor listed below. With 20 MFCC rows this
    yields 20 * 6 + 6 * 6 = 156 values, the width `mfcc_stats` allocates.

    Args:
        data: audio samples for a single clip (the caller passes a squeezed
            array; expected to be 1-D).

    Returns:
        1-D numpy array holding the concatenated statistics.
    """
    stat_funcs = (np.mean, np.std, np.min, np.max, skew, kurtosis)

    # The signal is passed as `y=`: recent librosa releases accept the audio
    # argument of the feature functions by keyword only, while older releases
    # accept the keyword form as well.
    M = librosa.feature.mfcc(y=data, sr=16000, n_mfcc=20)
    parts = [stat(M, axis=1) for stat in stat_funcs]

    spectral_features = [
        librosa.feature.spectral_centroid,
        librosa.feature.spectral_bandwidth,
        librosa.feature.spectral_contrast,
        librosa.feature.spectral_rolloff,
        librosa.feature.spectral_flatness,
        librosa.feature.zero_crossing_rate
    ]
    # NOTE(review): these descriptors are called without `sr`, so any that
    # depend on the sampling rate fall back to librosa's default instead of the
    # 16000 used for the MFCCs above. Left as-is so previously cached features
    # stay comparable -- confirm whether this is intended.
    for feat in spectral_features:
        # Only row 0 of the descriptor is summarised; for a multi-row
        # descriptor the remaining rows are ignored.
        S = feat(y=data)[0]
        parts.extend(stat(S) for stat in stat_funcs)

    return np.hstack(parts)

def mfcc_stats(wav):
    """Build the per-clip statistics matrix for a collection of signals.

    Args:
        wav: indexable collection of audio arrays; each entry is squeezed
            before being handed to `mfcc_stats_one`.

    Returns:
        float32 array of shape (len(wav), 156), passed through `clean_np`.
    """
    n_clips = len(wav)
    # 156 is the length of the vector produced by mfcc_stats_one.
    features = np.zeros((n_clips, 156), np.float32)
    for idx in tqdm(range(n_clips)):
        features[idx] = mfcc_stats_one(wav[idx].squeeze())
    return clean_np(features)

def get_cache_mfcc_statics(wav, config, flex = ''):
    """Return the `mfcc_stats` matrix for `wav`, cached on disk as a .npy file.

    The cache file lives under ``../cache/`` and its name is built from
    ``config.sampling_rate``, ``config.audio_duration``, ``len(wav)`` and
    `flex`. Two different inputs that share all four values map to the same
    file, so `flex` must distinguish them (e.g. 'train' / 'test').

    Args:
        wav: collection of audio arrays forwarded to `mfcc_stats`.
        config: object exposing `sampling_rate` and `audio_duration`.
        flex: free-form suffix appended to the cache key.

    Returns:
        The statistics array, loaded from the cache when the file exists and
        otherwise computed and saved first.
    """
    cache_key = '{}_{}_{}_{}'.format(config.sampling_rate, config.audio_duration,
                                     len(wav), flex)
    statics_path = '../cache/mfcc_statics_{}.npy'.format(cache_key)
    if os.path.exists(statics_path):
        return np.load(statics_path)

    mfcc_statics = mfcc_stats(wav)
    # Create the cache directory on first use; without it np.save raises
    # after the (expensive) feature extraction has already run.
    os.makedirs(os.path.dirname(statics_path), exist_ok=True)
    np.save(statics_path, mfcc_statics)
    return mfcc_statics

In [None]:
# Fetch the training arrays via the project helper, using 'constant' padding.
# (Judging by the names: raw signals, mel features, MFCC features and labels.)
(X_train_original, X_train_mel, X_train_mfcc, y) = get_cachedata_all_train(
    train, train_root, config, argucnt, 'constant')

In [None]:
# NOTE(review): despite the `mel_` prefix, this holds the output of
# get_cache_mfcc_statics (MFCC/spectral statistics), not mel statistics.
mel_new = get_cache_mfcc_statics(X_train_original, config, flex='train')

In [2]:
# Load one previously cached statistics array for inspection. The commented-out
# lines are the sibling cache files for the other feature families.
# The numbers in the file names presumably follow the same
# <sampling_rate>_<audio_duration>_<clip count>_<split> pattern that
# get_cache_mfcc_statics uses -- verify against the helpers that wrote them.
#mfcc = np.load('../cache/mfcc_statics_16000_5_9400_test.npy')
mel = np.load('../cache/mel_statics_16000_5_9400_test.npy')
#wav = np.load('../cache/wav_statics_16000_5_9400_test.npy')
#seg = np.load('../cache/segment_statics_16000_5_9400_test.npy')

In [4]:
# Display the dimensions of the loaded statistics array.
mel.shape

(9400, 192)

In [40]:
# Paths and metadata tables; no row subsetting is applied in this cell.
train_root = '../data/audio_train/'
test_root = '../data/audio_test/'

train = pd.read_csv("../data/train.csv", index_col="fname")
test = pd.read_csv("../data/sample_submission.csv", index_col="fname")

config = Config(
    sampling_rate=16000, n_mels=128,
    audio_duration=5, n_folds=10, learning_rate=0.001,
    postfunc='', mixup=0, mixup_alpha=2,
    use_mfcc=True,
)
argucnt = 3
padfunc = 'constant'

# Train/test arrays from the project cache helpers.
(X_train_original, X_train_mel, X_train_mfcc, y) = get_cachedata_all_train(
    train, train_root, config, argucnt, padfunc)
(X_test_original, X_test_mel, X_test_mfcc) = get_cachedata_all_test(
    test, test_root, config, padfunc)

# Waveform statistics; entries equal to -inf are overwritten with 0 in place.
wav_statics_train = get_cache_wav_statics(X_train_original, config, 'train')
wav_statics_test = get_cache_wav_statics(X_test_original, config, 'test')
wav_statics_train[np.isneginf(wav_statics_train)] = 0
wav_statics_test[np.isneginf(wav_statics_test)] = 0

# Remaining statistic families, each computed for train and test.
mfcc_statics_train = get_cache_mfcc_statics(X_train_original, config, flex='train')
mfcc_statics_test = get_cache_mfcc_statics(X_test_original, config, flex='test')

mel_statics_train = get_cache_mel_statics(X_train_original, config, flex='train')
mel_statics_test = get_cache_mel_statics(X_test_original, config, flex='test')

seg_statics_train = get_segment_statics(X_train_original, config, flex='train')
seg_statics_test = get_segment_statics(X_test_original, config, flex='test')

# Stack the four families side by side, in the same order for both splits.
train_statics = np.hstack((
    wav_statics_train,
    mfcc_statics_train,
    mel_statics_train,
    seg_statics_train,
))
test_statics = np.hstack((
    wav_statics_test,
    mfcc_statics_test,
    mel_statics_test,
    seg_statics_test,
))

../cache/wav_train_16000_5_9473_128_3.npy


In [25]:
# Min-max scaler with default settings; it is fitted in a later cell.
scaler = MinMaxScaler()

In [42]:
# Fit the scaler and transform in one step.
# NOTE(review): despite the name `all_statics`, only `test_statics` is used
# here; `train_statics` is neither fitted on nor transformed -- confirm this
# is intended before relying on the result.
all_statics = scaler.fit_transform(test_statics)

In [39]:
# Show the entries of the training waveform statistics that are -inf.
wav_statics_train[np.isneginf(wav_statics_train)]

array([-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
       -inf, -inf, -inf, -inf, -inf], dtype=float32)