#### EDAのまとめ
前提：単数ラベルのもののみ＆テストデータのラベルに含まれるもののみを訓練データから抜粋  
・訓練データを整理するとラベルの数は74個（テストデータは80個）  
・同一ラベルのサンプル数は、最も多いラベルで75個、最も少ないラベルで3個  
・基本的に短い音声が多い  
・１つだけ1分くらいの長い音声あり  　
  
  
#### アイデア
・ヒストグラムを見ると短い音声フレームに偏りがあるので正規化とか有効そう  
・カンマ区切りの複数ラベルを考慮する必要あり（one-hot encodingとか？）  
・とりあえず今は整形済みのデータ（train_curated）のみを使う  
・長い音声（1つ）の処理  

In [1]:
import librosa
import numpy as np
import pandas as pd
import scipy
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                            ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D, 
                          GlobalMaxPool1D, Input, MaxPool1D, concatenate)
from keras.utils import Sequence, to_categorical

# NOTE(review): not referenced in the visible cells; presumably switches
# between a full run and a quick smoke run — confirm against later cells.
COMPLETE_RUN = True

Using TensorFlow backend.


In [2]:
# Load the curated training table; its `labels` column is filtered on below.
train = pd.read_csv('../input/train_curated.csv')
# Load the sample submission; every column after the first is later used as
# the set of label names.
test = pd.read_csv('../input/sample_submission.csv')

In [3]:
# Display the (rows, columns) shape of the training table before filtering.
train.shape

(4970, 2)

In [9]:
# Keep only the rows whose `labels` value is exactly one of the test label
# names (all columns of `test` after the first). Presumably comma-separated
# multi-label strings match no column name and are dropped as well — confirm.
train = train[train.labels.isin(test.columns[1:])]
# Group the remaining training rows by label and count the rows per group.
category_group = train.groupby(['labels']).count()
# Name the single resulting count column 'counts'.
category_group.columns = ['counts'] 

In [10]:
class Config(object):
    """Bundle of hyper-parameters plus the input geometry derived from them.

    Stored as given:
        sampling_rate, audio_duration, n_classes, use_mfcc, n_mfcc,
        n_folds, learning_rate, max_epochs.

    Derived:
        audio_length: ``sampling_rate * audio_duration`` (samples per clip).
        dim: shape of one model input, without the batch axis.
            ``(audio_length, 1)`` for raw waveforms, or
            ``(n_mfcc, 1 + floor(audio_length / 512), 1)`` when ``use_mfcc``
            is truthy.

    Note:
        The default for ``n_classes`` is ``len(category_group)`` and is
        evaluated once, when this class is defined, so ``category_group``
        must already exist at that point.
    """

    def __init__(self,
                 sampling_rate=16000, audio_duration=2,
                 n_classes=len(category_group),
                 use_mfcc=False, n_folds=10, learning_rate=0.0001,
                 max_epochs=50, n_mfcc=20):
        # Audio framing.
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.audio_length = sampling_rate * audio_duration

        # Feature selection.
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc

        # Training setup.
        self.n_classes = n_classes
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # Shape of a single input sample.
        if not use_mfcc:
            self.dim = (self.audio_length, 1)
        else:
            # 512 is presumably the MFCC hop length in samples — TODO confirm.
            n_frames = 1 + int(np.floor(self.audio_length / 512))
            self.dim = (n_mfcc, n_frames, 1)

In [12]:
class DataGenerator(Sequence):
    """Batch generator that loads audio files and turns them into model inputs.

    Each item is one batch. Every clip is loaded with librosa at
    ``config.sampling_rate``, randomly cropped or zero-padded to exactly
    ``config.audio_length`` samples, and then converted either to MFCCs
    (when ``config.use_mfcc`` is truthy) or to a preprocessed raw waveform
    with a trailing channel axis.

    Args:
        config: Object providing ``dim``, ``audio_length``, ``sampling_rate``,
            ``use_mfcc``, ``n_mfcc`` and ``n_classes``.
        data_dir: Prefix prepended verbatim to each ID to build the file
            path, so a directory must include its trailing separator.
        list_IDs: Positionally indexable collection of file IDs.
        labels: Optional mapping from ID to integer class index. When
            ``None`` the batches contain inputs only.
        batch_size: Maximum number of clips per batch; the final batch may
            be smaller.
        preprocessing_fn: Callable applied to the fixed-length waveform.
            Only used on the raw-waveform path (ignored for MFCCs).
    """

    def __init__(self, config, data_dir, list_IDs, labels=None,
                 batch_size=64, preprocessing_fn=lambda x: x):
        # Harmless when the base class defines no __init__; newer Keras
        # releases expect dataset subclasses to call it.
        super().__init__()
        self.config = config
        self.data_dir = data_dir
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.preprocessing_fn = preprocessing_fn
        self.on_epoch_end()
        self.dim = self.config.dim

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Build and return batch number ``index``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order to sequential (no shuffling is done)."""
        self.indexes = np.arange(len(self.list_IDs))

    def _fit_to_length(self, data, input_length):
        """Randomly crop or zero-pad ``data`` to exactly ``input_length`` samples."""
        n_samples = len(data)
        if n_samples > input_length:
            # randint's upper bound is exclusive: offset is in [0, max_offset).
            offset = np.random.randint(n_samples - input_length)
            return data[offset:(input_length + offset)]

        if input_length > n_samples:
            # Place the clip at a random position inside the zero padding.
            offset = np.random.randint(input_length - n_samples)
        else:
            offset = 0
        return np.pad(data, (offset, input_length - n_samples - offset), "constant")

    def _to_features(self, data):
        """Convert a fixed-length waveform into one model input sample."""
        if self.config.use_mfcc:
            # ``y`` must be passed by keyword: recent librosa releases reject
            # a positional signal argument.
            data = librosa.feature.mfcc(y=data, sr=self.config.sampling_rate,
                                        n_mfcc=self.config.n_mfcc)
            return np.expand_dims(data, axis=-1)
        return self.preprocessing_fn(data)[:, np.newaxis]

    def __data_generation(self, list_IDs_temp):
        """Load the given IDs and return ``X`` or ``(X, one_hot_y)``.

        Returns:
            ``X`` of shape ``(len(list_IDs_temp), *self.dim)`` when no labels
            were supplied, otherwise ``(X, y)`` where ``y`` is the one-hot
            encoding of the labels over ``config.n_classes`` classes.
        """
        cur_batch_size = len(list_IDs_temp)
        X = np.empty((cur_batch_size, *self.dim))

        input_length = self.config.audio_length
        for i, ID in enumerate(list_IDs_temp):
            file_path = self.data_dir + ID

            # Read and resample the audio to the configured rate.
            data, _ = librosa.core.load(file_path, sr=self.config.sampling_rate,
                                        res_type='kaiser_fast')

            data = self._fit_to_length(data, input_length)
            X[i,] = self._to_features(data)

        if self.labels is None:
            return X

        y = np.array([self.labels[ID] for ID in list_IDs_temp], dtype=int)
        return X, to_categorical(y, num_classes=self.config.n_classes)

In [13]:
def audio_norm(data):
    """Min-max scale ``data`` and centre it around zero.

    The values are mapped linearly so that the minimum becomes -0.5 and the
    maximum lands just below +0.5. A small epsilon (1e-6) in the denominator
    avoids division by zero for constant signals, which map to -0.5
    everywhere.

    Args:
        data: Array of samples.

    Returns:
        Array of the same shape with values in [-0.5, 0.5).
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    scaled = (data - lowest) / (span + 1e-6)
    return scaled - 0.5

Unnamed: 0,fname,labels
4959,ff962f57.wav,Fart
4960,ff9a20a8.wav,Bark
4961,ffa4cfd1.wav,Sigh
4962,ffa689eb.wav,Printer
4963,ffad130c.wav,Whispering
4965,ffd4ed26.wav,Tick-tock
4966,ffdc411e.wav,Slam
4967,ffe2178b.wav,Bus
4968,fffa69b7.wav,Electric_guitar
4969,ffff4631.wav,Meow
