## Mode

In [1]:
# When True, HEAD becomes "debug_" and the competition CSV names built from it
# (train_curated / train_noisy / sample_submission) get that prefix.
DEBUG_MODE = False

## Parameters

In [35]:
# Data / model switches read by the loading and model-selection cells.
CURATED_ONLY = True # True: train_df / x_train hold only the curated set; False: the noisy set is appended
USE_CLEAN_NOISY = True # True: read the 'clean_train_noisy' CSV (trn_noisy_best50s.csv); False: read the raw train_noisy CSV
TRAIN_AUGMENT = True # use augmentation for training data? (the consumer is not visible in this part of the notebook)
MODEL = 'cnn8th' # choose among 'crnn', 'simple', 'cnn8th'; any value other than 'crnn'/'cnn8th' builds the simple CNN

In [34]:
# Training hyper-parameters.
SIZE=128
# EPOCHS, TTA and checkpoint_file each hold three entries -- presumably one per
# training stage/model; TODO confirm against the training loop.
EPOCHS = [303, 0, 0]
TTA = [19, 0, 0]
BATCH_SIZE = 32
checkpoint_file = ['model_best1.h5', 'model_best2.h5', 'model_best3.h5']
LR = 4e-4
PATIENCE = 9 # ReduceOnPlateau option (presumably the `patience` of Keras ReduceLROnPlateau -- TODO confirm)
LR_FACTOR = 0.75 # ReduceOnPlateau option (presumably the `factor` of Keras ReduceLROnPlateau -- TODO confirm)
VALID_AUGMENT = False
SEED = 1129  # passed to seed_everything()
USE_MIXUP = True
MIXUP_PROB = 0.25

# Audio / spectrogram settings.
SAMPLING_RATE = 44100  # 44.1[kHz]
SAMPLE_DURATION = 2  # 2[sec]
N_MEL = 128  # spectrogram y axis size
FRAME_PER_SEC = N_MEL  # tied to N_MEL, so changing N_MEL changes this as well
FFT_WINDOW_SIZE = 40
SPEC_AUGMENTATION_RATE = 2

# SPEC_AUGMENTATION
NUM_MASK = 2
FREQ_MASKING_MAX_PERCENTAGE = 0.15
TIME_MASKING_MAX_PERCENTAGE = 0.30

# K-fold cross-validation is not implemented yet; the settings below are placeholders.
# NUM_K_FOLDS = 5 # number of folds (K) to split the data into
# NUM_MODEL_RUN = 5 # number of models (<= K) to train [e.g. set to 1 for a simple train/test split]

In [4]:
# Directory names under <ROOT_PATH>/input: the raw competition data and the
# dataset holding the pre-computed mel pickles / cleaned noisy CSV.
COMPETITION_DATASET_NAME = "freesound-audio-tagging-2019"
PREPROCESSED_DATASET_NAME = "fat2019_prep_mels1"

## Training

In [5]:
# Activation of the final Dense layer in every create_model_* function.
# 'linear' makes the models emit raw logits, which is what BCEwithLogits
# (binary cross-entropy with from_logits=True) expects.
ACTIVATION = 'linear' 
# Name of the loss to use; how this string is resolved is not visible in this part of the notebook.
LOSS = 'BCEwithLogits' 

## Import

In [6]:
import gc
import os
import random
import numpy as np
import pandas as pd
import pickle
import librosa

from fastprogress import master_bar, progress_bar
from functools import partial
from inspect import currentframe
from numba import jit
from pathlib import Path
from PIL import Image
from psutil import cpu_count
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Import torch

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms

## Import Keras

In [38]:
import tensorflow as tf
import keras
from keras import backend as K
from keras import metrics
from keras.applications.mobilenet_v2 import preprocess_input as preprocess_mobile
from keras.layers import *
from keras.models import Sequential, load_model, Model
from keras.optimizers import Adam 
from keras.utils import Sequence

## Util

In [9]:
# Utility Cell
def chkprint(*args):
    """Print each argument prefixed with the name it is bound to in the caller's scope.

    The name is recovered by matching ``id(arg)`` against the caller frame's
    local variables. If several names refer to the same object, the one listed
    last in the caller's locals is used. Arguments that are not bound to any
    caller variable (literals, expression results) are labelled ``<unknown>``
    instead of raising ``TypeError``.

    Dictionaries (including dict subclasses) are rendered one ``key:<TAB>value``
    line per item between two ruler lines; everything else is rendered with
    ``str()``.

    Args:
        *args: objects to print.

    Returns:
        None. The assembled text is written with ``print``.
    """
    def view_dir(dictionary):
        # One "key:<TAB>value" line per item; str(key) keeps non-string keys from crashing.
        string = "\n========\n"
        for key, val in dictionary.items():
            string += str(key) + ":\t" + str(val) + "\n"
        string += "--------------\n"
        return string

    # Map object identity -> variable name, using the locals of whoever called us.
    caller_locals = currentframe().f_back.f_locals
    names = {id(v): k for k, v in caller_locals.items()}

    out = ""
    for arg in args:
        attr_name = names.get(id(arg), "<unknown>")
        out += attr_name + ": "
        if isinstance(arg, dict):
            out += view_dir(arg)
        else:
            out += str(arg) + "\n"
    print(out)

In [10]:
# True unless the machine's node name (os.uname()[1]) contains "local".
IS_KERNEL = "local" not in os.uname()[1]

## Path

In [11]:
# Project root: the relative parent directory when IS_KERNEL is set, otherwise
# the absolute parent of the current working directory.
if IS_KERNEL:
    ROOT_PATH = Path("..")
else:
    ROOT_PATH = Path(".").absolute().parents[0]

# Both datasets live side by side under <ROOT_PATH>/input.
dataset_dir = ROOT_PATH.joinpath("input", COMPETITION_DATASET_NAME)
preprocessed_dir = ROOT_PATH.joinpath("input", PREPROCESSED_DATASET_NAME)

In [12]:
# Prefix applied to the competition file stems below; empty outside debug mode.
if DEBUG_MODE:
    HEAD = "debug_"
else:
    HEAD = ""

CURATED_DIR = HEAD + "train_curated"
NOISY_DIR = HEAD + "train_noisy"
TEST_DIR = HEAD + "test"
SAMPLE_SUBMISSION = HEAD + "sample_submission"

In [13]:
# Label tables. The three competition CSV names carry the HEAD (debug) prefix
# via CURATED_DIR / NOISY_DIR / SAMPLE_SUBMISSION; the cleaned noisy CSV comes
# from the preprocessed dataset and has a fixed name.
csvs = {
    'train_curated': dataset_dir / '{}.csv'.format(CURATED_DIR),
    'train_noisy': dataset_dir / '{}.csv'.format(NOISY_DIR),
    'clean_train_noisy': preprocessed_dir / 'trn_noisy_best50s.csv',
    'sample_submission': dataset_dir / '{}.csv'.format(SAMPLE_SUBMISSION),
}

# Directories inside the competition dataset. Unlike the CSV names above,
# these do not use the HEAD prefix.
dataset = {
    'train_curated': dataset_dir / 'train_curated',
    'train_noisy': dataset_dir / 'train_noisy',
    'test': dataset_dir / 'test',
}

# Pickled mel features from the preprocessed dataset.
# NOTE(review): 'train_noisy' always points at the "best50s" pickle, even when
# USE_CLEAN_NOISY is False and the labels come from the raw train_noisy CSV --
# confirm features and labels stay aligned in that configuration.
mels = {
    'train_curated': preprocessed_dir / 'mels_train_curated.pkl',
    'train_noisy': preprocessed_dir / 'mels_trn_noisy_best50s.pkl',
}

In [14]:
# Utility Cell: print the resolved directories, the debug prefix and the three path tables for a visual check.
chkprint(dataset_dir, preprocessed_dir, HEAD, csvs, dataset, mels)

dataset_dir: /Users/berry/Kaggle/kaggle_freesound/input/freesound-audio-tagging-2019
preprocessed_dir: /Users/berry/Kaggle/kaggle_freesound/input/fat2019_prep_mels1
HEAD: 
csvs: 
train_curated:	/Users/berry/Kaggle/kaggle_freesound/input/freesound-audio-tagging-2019/train_curated.csv
train_noisy:	/Users/berry/Kaggle/kaggle_freesound/input/freesound-audio-tagging-2019/train_noisy.csv
clean_train_noisy:	/Users/berry/Kaggle/kaggle_freesound/input/fat2019_prep_mels1/trn_noisy_best50s.csv
sample_submission:	/Users/berry/Kaggle/kaggle_freesound/input/freesound-audio-tagging-2019/sample_submission.csv
--------------
dataset: 
train_curated:	/Users/berry/Kaggle/kaggle_freesound/input/freesound-audio-tagging-2019/train_curated
train_noisy:	/Users/berry/Kaggle/kaggle_freesound/input/freesound-audio-tagging-2019/train_noisy
test:	/Users/berry/Kaggle/kaggle_freesound/input/freesound-audio-tagging-2019/test
--------------
mels: 
train_curated:	/Users/berry/Kaggle/kaggle_freesound/input/fat2019_prep_

## Initialization

In [15]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The remaining seeders all take the seed as their single argument.
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

# Apply the global SEED from the parameter cell.
seed_everything(SEED)

In [16]:
# Rebind DataLoader so every loader created afterwards defaults to one worker
# per CPU reported by psutil.cpu_count(). This shadows the imported class name.
DataLoader = partial(DataLoader, num_workers=cpu_count())
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Definition of Keras functions

In [17]:
# from https://www.kaggle.com/rio114/keras-cnn-with-lwlrap-evaluation/
def tf_one_sample_positive_class_precisions(y_true, y_pred):
    """Compute the precision at the rank of every positive label.

    For each positive entry of ``y_true`` (value > 0) this finds the rank of
    that class among the sample's scores in ``y_pred`` (descending order), and
    returns (number of positive labels of the same sample ranked at or above
    it) / (its 1-based rank).

    Args:
        y_true: 2-D tensor of labels; entries > 0 are treated as positive.
        y_pred: 2-D tensor of scores, unpacked as (samples, classes).

    Returns:
        pos_class_indices: (sample, class) index pairs of the positive labels,
            as produced by ``tf.where(y_true > 0)``.
        precision_at_hits: float32 tensor with one precision per positive
            label, in the same order as ``pos_class_indices``.
    """
    _, num_classes = y_pred.shape

    # find true labels
    pos_class_indices = tf.where(y_true > 0)

    # Build, for every sample and rank position, the pair
    # (sample index, class found at that rank).
    retrieved_classes = tf.nn.top_k(y_pred, k=num_classes).indices
    sample_range = tf.zeros(shape=tf.shape(tf.transpose(y_pred)), dtype=tf.int32)
    sample_range = tf.add(sample_range, tf.range(tf.shape(y_pred)[0], delta=1))
    sample_range = tf.transpose(sample_range)
    sample_range = tf.reshape(sample_range, (-1, num_classes * tf.shape(y_pred)[0]))
    retrieved_classes = tf.reshape(retrieved_classes, (-1, num_classes * tf.shape(y_pred)[0]))
    retrieved_class_map = tf.concat((sample_range, retrieved_classes), axis=0)
    retrieved_class_map = tf.transpose(retrieved_class_map)
    retrieved_class_map = tf.reshape(retrieved_class_map, (tf.shape(y_pred)[0], num_classes, 2))

    # Every row is 0..num_classes-1, i.e. the rank value to store.
    class_range = tf.zeros(shape=tf.shape(y_pred), dtype=tf.int32)
    class_range = tf.add(class_range, tf.range(num_classes, delta=1))

    # class_rankings[sample, class] = 0-based rank of that class for that sample.
    class_rankings = tf.scatter_nd(retrieved_class_map,
                                   class_range,
                                   tf.shape(y_pred))

    # pick up the 0-based rank of every positive label
    num_correct_until_correct = tf.gather_nd(class_rankings, pos_class_indices)

    # add one so the rank can serve as the denominator of the precision
    num_correct_until_correct_one = tf.add(num_correct_until_correct, 1)
    num_correct_until_correct_one = tf.cast(num_correct_until_correct_one, tf.float32)

    # Build a [sample, rank] tensor holding 1 wherever a positive label sits
    # at that rank for that sample.
    sample_label = pos_class_indices[:, 0]
    sample_label = tf.reshape(sample_label, (-1, 1))
    sample_label = tf.cast(sample_label, tf.int32)

    num_correct_until_correct = tf.reshape(num_correct_until_correct, (-1, 1))
    retrieved_class_true_position = tf.concat((sample_label,
                                               num_correct_until_correct), axis=1)
    retrieved_pos = tf.ones(shape=tf.shape(retrieved_class_true_position)[0], dtype=tf.int32)
    retrieved_class_true = tf.scatter_nd(retrieved_class_true_position,
                                         retrieved_pos,
                                         tf.shape(y_pred))

    # running count of positive labels along the rank axis
    retrieved_cumulative_hits = tf.cumsum(retrieved_class_true, axis=1)

    # Read the hit count at each positive label's own (sample, rank) position.
    # Looking the positions up again with tf.where would return them ordered
    # by rank, while the denominators are ordered by class index; for samples
    # with several positive labels that pairs the wrong numerator with the
    # wrong denominator.
    correct_rank = tf.gather_nd(retrieved_cumulative_hits, retrieved_class_true_position)
    correct_rank = tf.cast(correct_rank, tf.float32)

    # compute precision
    precision_at_hits = tf.truediv(correct_rank, num_correct_until_correct_one)

    return pos_class_indices, precision_at_hits

In [18]:
def tf_lwlrap(y_true, y_pred):
    """Label-weighted label-ranking average precision as a TensorFlow op.

    Per-label precisions from ``tf_one_sample_positive_class_precisions`` are
    summed per class, divided by the number of positive labels of that class
    (plus 1e-7 to avoid dividing by zero for absent classes), and combined
    with weights equal to each class's share of all positive labels.

    Args:
        y_true: 2-D tensor of labels; entries > 0 are treated as positive.
        y_pred: 2-D tensor of scores, unpacked as (samples, classes).

    Returns:
        Scalar float32 tensor. The class weights divide by the total number
        of positive labels, which is not guarded against being zero.
    """
    _, num_classes = y_pred.shape
    pos_class_indices, precision_at_hits = tf_one_sample_positive_class_precisions(y_true, y_pred)

    # Count the positives per class and turn the counts into class weights.
    pos_flgs = tf.cast(y_true > 0, tf.int32)
    labels_per_class = tf.reduce_sum(pos_flgs, axis=0)
    weight_per_class = tf.truediv(tf.cast(labels_per_class, tf.float32),
                                  tf.cast(tf.reduce_sum(labels_per_class), tf.float32))

    # Sum the per-label precisions into their class bucket; this relies on
    # precision_at_hits[i] belonging to pos_class_indices[i].
    class_label = pos_class_indices[:, 1]
    sum_precisions_by_classes = tf.unsorted_segment_sum(precision_at_hits,
                                                        class_label,
                                                        num_classes)

    # The epsilon keeps classes without any positive label from dividing by zero.
    labels_per_class = tf.add(tf.cast(labels_per_class, tf.float32), 1e-7)
    per_class_lwlrap = tf.truediv(sum_precisions_by_classes, labels_per_class)

    out = tf.cast(tf.tensordot(per_class_lwlrap, weight_per_class, axes=1), dtype=tf.float32)
    return out

In [19]:
def BCEwithLogits(y_true, y_pred):
    """Binary cross-entropy on raw logits, averaged over the last axis."""
    elementwise_loss = K.binary_crossentropy(y_true, y_pred, from_logits=True)
    return K.mean(elementwise_loss, axis=-1)

## Load data

In [20]:
# Label tables: the curated set is always read; which noisy CSV is read depends on USE_CLEAN_NOISY.
train_curated = pd.read_csv(csvs['train_curated'])
if USE_CLEAN_NOISY:
    noisy_name = "clean_train_noisy"
else:
    noisy_name = "train_noisy"
train_noisy = pd.read_csv(csvs[noisy_name])

# Training labels: curated only, or curated followed by noisy.
if CURATED_ONLY is False:
    train_df = pd.concat([train_curated, train_noisy], sort=True, ignore_index=True)
else:
    train_df = train_curated

# Class names are the sample-submission columns after the first one.
test_df = pd.read_csv(csvs['sample_submission'])
labels = test_df.columns[1:].tolist()
num_classes = len(labels)

In [21]:
# Utility Cell
# Preview the three tables and the first class names in a single display call.
display(
    train_curated.head(5),
    train_noisy.head(5),
    test_df.head(5),
    labels[:10],
)
chkprint(num_classes)

Unnamed: 0,fname,labels
0,0006ae4e.wav,Bark
1,0019ef41.wav,Raindrop
2,001ec0ad.wav,Finger_snapping
3,0026c7cb.wav,Run
4,0026f116.wav,Finger_snapping


Unnamed: 0,fname,labels,singled
0,35688e71.wav,Bathtub_(filling_or_washing),True
1,60d25862.wav,Bathtub_(filling_or_washing),True
2,c0f6fce9.wav,Bathtub_(filling_or_washing),True
3,f3221561.wav,Bathtub_(filling_or_washing),True
4,b2af1dc9.wav,Bathtub_(filling_or_washing),True


Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,000ccb97.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0012633b.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,001ed5f1.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00294be0.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,003fde7a.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


['Accelerating_and_revving_and_vroom',
 'Accordion',
 'Acoustic_guitar',
 'Applause',
 'Bark',
 'Bass_drum',
 'Bass_guitar',
 'Bathtub_(filling_or_washing)',
 'Bicycle_bell',
 'Burping_and_eructation']

num_classes: 80



In [22]:
# Multi-hot target matrix: one row per training clip, one column per class.
y_train = np.zeros((len(train_df), num_classes), dtype=int)
# Each 'labels' cell is a comma-separated list of class names.
for i, row in enumerate(train_df['labels'].str.split(',')):
    for label in row:
        idx = labels.index(label)
        y_train[i, idx] = 1

In [23]:
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that come from a trusted preprocessing run.
with open(mels['train_curated'], 'rb') as curated:
    x_train = pickle.load(curated)

# Open the noisy pickle only when it is needed, so curated-only runs do not
# require the file. The condition mirrors the one used to build train_df.
if CURATED_ONLY is False:
    with open(mels['train_noisy'], 'rb') as noisy:
        x_train.extend(pickle.load(noisy))

# Features and labels are built from different files; fail fast if they do not line up.
assert len(x_train) == len(y_train), (
    "x_train has {} items but y_train has {} rows".format(len(x_train), len(y_train))
)

In [24]:
# Utility Cell
# Show the label matrix shape and the number of loaded feature items side by side.
print("y_train.shape: {}".format(y_train.shape))
print("length of x_train: {}".format(len(x_train)))

y_train.shape: (4970, 80)
length of x_train: 4970


## Definition of models

In [30]:
def conv_simple_block(x, n_filters):
    """Apply two (3,1) conv -> batch-norm -> ReLU stages, then average pooling.

    Args:
        x: input Keras tensor.
        n_filters: number of filters used by both convolutions.

    Returns:
        The pooled output tensor.
    """
    for _ in range(2):
        x = Convolution2D(n_filters, (3,1), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
    return AveragePooling2D()(x)

In [28]:
def create_model_simplecnn(n_out=num_classes):
    """Build the simple CNN: four conv blocks followed by a dense head.

    Args:
        n_out: number of output units; the default is the value ``num_classes``
            had when this function was defined.

    Returns:
        An uncompiled Keras ``Model`` taking (128, 128, 3) inputs.
    """
    inp = Input(shape=(128,128,3))

    # Convolutional trunk with 64 -> 128 -> 256 -> 128 filters.
    x = inp
    for n_filters in (64, 128, 256, 128):
        x = conv_simple_block(x, n_filters)

    # The feature map is flattened; a global average/max pooling sum was an
    # alternative considered by the original author.
    x = Flatten()(x)
    x = Dropout(0.2)(x)

    # Dense head: linear projection -> PReLU -> batch-norm -> dropout.
    x = Dense(128, activation='linear')(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    predictions = Dense(n_out, activation=ACTIVATION)(x)

    return Model(inputs=inp, outputs=predictions)

In [32]:
def output_of_lambda(input_shape):
    """Return ``input_shape`` reduced to its entries 0, 2 and 3 (entry 1 is dropped)."""
    return tuple(input_shape[axis] for axis in (0, 2, 3))

def my_max(x):
    """Take the maximum of ``x`` over axis 1 and drop that axis."""
    reduced = K.max(x, axis=1, keepdims=False)
    return reduced

def crnn_simple_block(x, n_filters):
    """Apply two (3,1) conv -> ReLU stages, then max pooling and dropout.

    Args:
        x: input Keras tensor.
        n_filters: number of filters used by both convolutions.

    Returns:
        The output tensor after pooling and dropout.
    """
    for _ in range(2):
        x = Convolution2D(n_filters, (3,1), padding="same")(x)
        x = Activation("relu")(x)
    x = MaxPooling2D()(x)
    return Dropout(0.2)(x)

def create_model_crnn(n_out= num_classes):
    """Build the CRNN: three conv blocks, a bidirectional GRU and a dense head.

    Args:
        n_out: number of output units; the default is the value ``num_classes``
            had when this function was defined.

    Returns:
        An uncompiled Keras ``Model`` taking (128, None, 3) inputs, i.e. the
        second spatial axis has variable length.
    """
    inp = Input(shape=(128,None,3))

    # Convolutional front-end with 64 -> 128 -> 256 filters.
    x = inp
    for n_filters in (64, 128, 256):
        x = crnn_simple_block(x, n_filters)

    # Collapse axis 1 with a max so the recurrent layer receives a 3-D tensor.
    x = Lambda(my_max, output_shape=output_of_lambda)(x)

    # Recurrent part (the CuDNN GRU implementation is used), max-pooled over the sequence.
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)

    # Dense head: linear projection -> PReLU -> batch-norm -> dropout.
    x = Dense(128, activation='linear')(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    predictions = Dense(n_out, activation=ACTIVATION)(x)

    return Model(inputs=inp, outputs=predictions)

In [33]:
# from the 8th solution in 2018 competition
# https://github.com/sainathadapa/kaggle-freesound-audio-tagging
def create_model_cnn8th(n_out=num_classes):
    """Build the CNN adapted from the 8th-place solution of the 2018 competition.

    Args:
        n_out: number of output units; the default is the value ``num_classes``
            had when this function was defined.

    Returns:
        An uncompiled Keras ``Model`` taking (128, 128, 3) inputs.
    """
    l2_weight = 0

    def conv(filters, kernel_size, strides, regularized=True):
        # All convolutions share initializer, activation and padding; only
        # some of them carry the L2 kernel regularizer.
        options = dict(strides=strides, kernel_initializer='he_uniform',
                       activation='relu', padding='same')
        if regularized:
            options['kernel_regularizer'] = regularizers.l2(l2_weight)
        return Conv2D(filters, kernel_size, **options)

    inp = Input(shape=(128,128,3))

    # Stage 1: 48 filters, 11x11 kernels.
    x = conv(48, 11, (1,1))(inp)
    x = BatchNormalization()(x)
    x = conv(48, 11, (2,3))(x)
    x = MaxPooling2D(3, strides=(1,2))(x)
    x = BatchNormalization()(x)

    # Stage 2: 128 filters, 5x5 kernels.
    x = conv(128, 5, (1,1))(x)
    x = BatchNormalization()(x)
    x = conv(128, 5, (2,3))(x)
    x = MaxPooling2D(3, strides=2)(x)
    x = BatchNormalization()(x)

    # Stage 3: 3x3 kernels; the two 192-filter layers have no regularizer.
    x = conv(192, 3, 1, regularized=False)(x)
    x = BatchNormalization()(x)
    x = conv(192, 3, 1, regularized=False)(x)
    x = BatchNormalization()(x)
    x = conv(128, 3, 1)(x)
    x = MaxPooling2D(3, strides=(1,2))(x)
    x = BatchNormalization()(x)

    # Classifier head: two dense + dropout stages.
    x = Flatten()(x)
    for _ in range(2):
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.5)(x)

    predictions = Dense(n_out, activation=ACTIVATION)(x)

    return Model(inputs=inp, outputs=predictions)

In [40]:
# Start from a clean Keras session before building the network.
K.clear_session()
preprocess_input = preprocess_mobile

# Pick the builder for MODEL; any unrecognised name falls back to the simple CNN.
model = {
    'crnn': create_model_crnn,
    'cnn8th': create_model_cnn8th,
}.get(MODEL, create_model_simplecnn)(n_out=num_classes)

print(MODEL)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
cnn8th
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128, 128, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 128, 128, 48)      17472     
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 128, 48)      192       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 43, 48)        278832    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 62, 21, 48)        0         
_________________________________________________________________
batch_normaliz