In [17]:
from __future__ import print_function
from __future__ import absolute_import

import warnings
import numpy as np
import os
import itertools
import sys
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.metrics import confusion_matrix
from copy import copy

import tensorflow as tf

from keras.utils import Sequence
from keras import regularizers
from keras import layers
from keras.models import Model, Sequential

from keras.layers import Conv2D,SeparableConv2D, Dense, Dropout, Flatten, Activation, AveragePooling2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Input, Lambda, BatchNormalization, Reshape, Permute
from keras.layers import Concatenate, concatenate, Add, add, Multiply, multiply
from keras.layers.recurrent import GRU, LSTM

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model

from keras import backend as K

# Number of target classes; the `genres` mapping defined in the data-loading cell has 10 entries.
NUM_OF_GENRES = 10
# Mini-batch size: default for GTZANGenerator and the basis for the per-epoch step counts.
BATCH_SIZE = 128

In [18]:
# data generator
class GTZANGenerator(Sequence):
    """Keras ``Sequence`` serving ``(X_batch, y_batch)`` mini-batches.

    Samples are visited through the permutation ``self.indexes``. For a
    training generator (``is_test=False``) the permutation is reshuffled when
    the generator is created and again in ``on_epoch_end``; for a test
    generator it stays in the original order so outputs line up with ``y``.

    Parameters
    ----------
    X : numpy.ndarray
        Samples, indexed along the first axis. Must support NumPy
        integer-array indexing.
    y : numpy.ndarray
        Labels aligned with ``X`` along the first axis.
    batch_size : int
        Samples per batch; the last batch may be smaller.
    is_test : bool
        When True, never shuffle and never augment.
    augment : bool
        When True (and ``is_test`` is False), apply the random cut-out
        augmentation to every training batch. Off by default, matching the
        previous behaviour where the augmentation call was commented out.
    """

    def __init__(self, X, y, batch_size=BATCH_SIZE, is_test = False, augment=False):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.is_test = is_test
        self.augment = augment
        # Build the first permutation so __getitem__ works before any epoch ends.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (ceil of samples / batch size)."""
        # float() keeps this a true division under Python 2 as well.
        return int(np.ceil(len(self.X) / float(self.batch_size)))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(signals, labels)``."""
        # Previously the data was sliced directly, so the shuffle performed in
        # on_epoch_end never had any effect. Index through the permutation.
        batch_idx = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        signals = self.X[batch_idx]
        labels = self.y[batch_idx]

        # Augmentation is opt-in and never applied to test data.
        if self.augment and not self.is_test:
            signals = self.__augment(signals)

        return signals, labels

    def __augment(self, signals, hor_flip = 0.5, random_cutout = 0.5):
        """Return a copy of ``signals`` with random rows/columns blanked out.

        With probability ``random_cutout`` per sample, 3 random indices along
        axis 0 and 4 random indices along axis 1 are overwritten with -80
        (marked "dB" in the original code). Each sample is indexed with three
        axes. ``hor_flip`` is kept for signature compatibility only; the
        horizontal flip is currently disabled.
        """
        spectrograms =  []
        for s in signals:
            signal = copy(s)

            # Perform random cutout of some frequency/time
            if np.random.rand() < random_cutout:
                lines = np.random.randint(signal.shape[0], size=3)
                # Columns must be drawn from axis 1; drawing them from axis 0
                # (as before) never reached columns >= shape[0].
                cols = np.random.randint(signal.shape[1], size=4)
                signal[lines, :, :] = -80 # dB
                signal[:, cols, :] = -80 # dB

            spectrograms.append(signal)
        return np.array(spectrograms)

    def on_epoch_end(self):
        """Rebuild the sample permutation; shuffle it unless this is test data."""
        self.indexes = np.arange(len(self.X))
        if not self.is_test:
            np.random.shuffle(self.indexes)
        return None

In [19]:
# confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a short header and plot the confusion matrix on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are drawn on the y axis ("True label"),
        columns on the x axis ("Predicted label").
    classes : iterable of str
        Tick labels, in matrix order. Any iterable is accepted (including
        dict key views); it is materialised into a list.
    normalize : bool
        When True each row is divided by its sum. Rows that sum to zero are
        left as all zeros instead of producing NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    classes = list(classes)

    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        # A class with no samples has a zero row sum; divide by 1 there so the
        # row stays 0 rather than turning the plot (and the threshold) into NaN.
        row_sums[row_sums == 0] = 1
        cm = cm.astype('float') / row_sums
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()
    
# majority vote
def majority_vote(scores):
    """Return the value that occurs most often in ``scores``.

    ``np.unique`` yields the distinct values in ascending order and ``argmax``
    picks the first maximal count, so ties resolve to the smallest value.
    """
    distinct, occurrences = np.unique(scores, return_counts=True)
    return distinct[occurrences.argmax()]


In [20]:
def XceptionNet(input_shape, num_classes=None):
    """Build an Xception-style classifier (uncompiled Keras ``Model``).

    The convolutional trunk is two plain convolutions (block1), three
    strided residual blocks of separable convolutions (block2-4, 128/256/728
    filters), eight identity-residual blocks of three 728-filter separable
    convolutions (block5-12), one more strided residual block (block13,
    728 -> 1024) and two final separable convolutions (block14, 1536/2048).
    It is followed by global average pooling and a dense head
    (1024-512-256-128 with dropout 0.25 after the first three) ending in a
    softmax layer named ``'result'``.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample (without the batch axis).
    num_classes : int, optional
        Width of the softmax output. Defaults to ``NUM_OF_GENRES`` so the
        output size follows the notebook-level constant instead of a
        hard-coded 10.

    Returns
    -------
    keras.models.Model
        Model mapping ``input_shape`` inputs to ``num_classes`` probabilities.
    """
    if num_classes is None:
        num_classes = NUM_OF_GENRES

    ipt = Input(shape=input_shape)

    # block1: two plain 3x3 convolutions, the first one strided.
    x = Conv2D(32, (3, 3), strides=(2, 2), use_bias=False, name='block1_conv1')(ipt)
    x = BatchNormalization(name='block1_conv1_bn')(x)
    x = Activation('relu', name='block1_conv1_act')(x)
    x = Conv2D(64, (3, 3), use_bias=False, name='block1_conv2')(x)
    x = BatchNormalization(name='block1_conv2_bn')(x)
    x = Activation('relu', name='block1_conv2_act')(x)

    # block2: the strided 1x1 shortcut matches the shape of the pooled main path.
    residual = Conv2D(128, (1, 1), strides=(2, 2),
                      padding='same', use_bias=False)(x)
    residual = BatchNormalization()(residual)

    x = SeparableConv2D(128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(x)
    x = BatchNormalization(name='block2_sepconv1_bn')(x)
    x = Activation('relu', name='block2_sepconv2_act')(x)
    x = SeparableConv2D(128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(x)
    x = BatchNormalization(name='block2_sepconv2_bn')(x)

    x = MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block2_pool')(x)
    x = layers.add([x, residual])

    # block3
    residual = Conv2D(256, (1, 1), strides=(2, 2),
                      padding='same', use_bias=False)(x)
    residual = BatchNormalization()(residual)

    x = Activation('relu', name='block3_sepconv1_act')(x)
    x = SeparableConv2D(256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(x)
    x = BatchNormalization(name='block3_sepconv1_bn')(x)
    x = Activation('relu', name='block3_sepconv2_act')(x)
    x = SeparableConv2D(256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(x)
    x = BatchNormalization(name='block3_sepconv2_bn')(x)

    x = MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block3_pool')(x)
    x = layers.add([x, residual])

    # block4
    residual = Conv2D(728, (1, 1), strides=(2, 2),
                      padding='same', use_bias=False)(x)
    residual = BatchNormalization()(residual)

    x = Activation('relu', name='block4_sepconv1_act')(x)
    x = SeparableConv2D(728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(x)
    x = BatchNormalization(name='block4_sepconv1_bn')(x)
    x = Activation('relu', name='block4_sepconv2_act')(x)
    x = SeparableConv2D(728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(x)
    x = BatchNormalization(name='block4_sepconv2_bn')(x)

    x = MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block4_pool')(x)
    x = layers.add([x, residual])

    # block5 .. block12: eight identical blocks with an identity shortcut.
    for i in range(8):
        residual = x
        prefix = 'block' + str(i + 5)

        x = Activation('relu', name=prefix + '_sepconv1_act')(x)
        x = SeparableConv2D(728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv1')(x)
        x = BatchNormalization(name=prefix + '_sepconv1_bn')(x)
        x = Activation('relu', name=prefix + '_sepconv2_act')(x)
        x = SeparableConv2D(728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv2')(x)
        x = BatchNormalization(name=prefix + '_sepconv2_bn')(x)
        x = Activation('relu', name=prefix + '_sepconv3_act')(x)
        x = SeparableConv2D(728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv3')(x)
        x = BatchNormalization(name=prefix + '_sepconv3_bn')(x)

        x = layers.add([x, residual])

    # block13: last strided residual block, widening 728 -> 1024.
    residual = Conv2D(1024, (1, 1), strides=(2, 2),
                      padding='same', use_bias=False)(x)
    residual = BatchNormalization()(residual)

    x = Activation('relu', name='block13_sepconv1_act')(x)
    x = SeparableConv2D(728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(x)
    x = BatchNormalization(name='block13_sepconv1_bn')(x)
    x = Activation('relu', name='block13_sepconv2_act')(x)
    x = SeparableConv2D(1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(x)
    x = BatchNormalization(name='block13_sepconv2_bn')(x)

    x = MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='block13_pool')(x)
    x = layers.add([x, residual])

    # block14: two final separable convolutions, no shortcut.
    x = SeparableConv2D(1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(x)
    x = BatchNormalization(name='block14_sepconv1_bn')(x)
    x = Activation('relu', name='block14_sepconv1_act')(x)

    x = SeparableConv2D(2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(x)
    x = BatchNormalization(name='block14_sepconv2_bn')(x)
    x = Activation('relu', name='block14_sepconv2_act')(x)

    # Classification head.
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu', name='dense1')(x)
    x = Dropout(0.25)(x)
    x = Dense(512, activation='relu', name='dense2')(x)
    x = Dropout(0.25)(x)
    x = Dense(256, activation = 'relu', name='dense3')(x)
    x = Dropout(0.25)(x)
    x = Dense(128, activation = 'relu', name='dense4')(x)
    predictions = Dense(num_classes, activation='softmax', name='result')(x)

    model = Model(inputs=ipt, outputs=predictions)
    return model

In [21]:
# load data
print('============================================================================')
print('Loading dataset...')
print('============================================================================')

# Directory holding the pre-computed .npy arrays. Defined once so the notebook
# can be pointed at another dataset by editing a single line.
DATA_DIR = '/home/jaehwlee/Genre_classification/mel_data'
# Alternative dataset used previously (train/test only, no validation split):
# '/home/jaehwlee/SE-ResNeXt/mel_data' with files 0519X_train.npy, 0519X_test.npy,
# 0519y_train.npy, 0519y_test.npy


def _load_split(name):
    """Load ``<DATA_DIR>/<name>.npy`` and return the array."""
    return np.load('{}/{}.npy'.format(DATA_DIR, name))


X_train = _load_split('X_train')
X_valid = _load_split('X_valid')
X_test = _load_split('X_test')
y_train = _load_split('y_train')
y_valid = _load_split('y_valid')
y_test = _load_split('y_test')

# Fail early if samples and labels are misaligned in any split.
assert len(X_train) == len(y_train), 'train samples/labels differ in length'
assert len(X_valid) == len(y_valid), 'validation samples/labels differ in length'
assert len(X_test) == len(y_test), 'test samples/labels differ in length'

song_samples = 660000
# Genre name -> class index; the index is the position in the one-hot labels
# as used by the confusion-matrix cell below.
genres = {'metal': 0, 'disco': 1, 'classical': 2, 'hiphop': 3, 'jazz': 4, 
          'country': 5, 'pop': 6, 'blues': 7, 'reggae': 8, 'rock': 9}

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

print(y_train.shape)
print(y_valid.shape)
print(y_test.shape)

print('============================================================================')
print('complete!')
print('============================================================================')

Loading dataset...
(24960, 128, 129, 1)
(6240, 128, 129, 1)
(7800, 128, 129, 1)
(24960, 10)
(6240, 10)
(7800, 10)
complete!


In [22]:
# callback function
# Multiply the learning rate by 0.95 when val_loss stops improving, with a
# floor of 1e-5.
reduceLROnPlat = ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.95,
    patience=3,
    verbose=1,
    mode='min',
    min_delta=0.0001,
    cooldown=2,
    min_lr=1e-5
)

# Keep only the weights of the epoch with the best validation accuracy.
# NOTE: save_weights_only=True, so this file holds weights only and must be
# restored with model.load_weights(...), not load_model(...).
mc = ModelCheckpoint('xception_net.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True, save_weights_only=True)

# `rl` used to be a second ReduceLROnPlateau built with exactly the same
# arguments; alias the instance above so the settings live in one place.
rl = reduceLROnPlat
callback_list = [mc,rl]

In [23]:
batch_size = BATCH_SIZE
train_generator = GTZANGenerator(X_train, y_train, batch_size=batch_size)
# len() of the generator is its number of batches, already an int.
steps_per_epoch = len(train_generator)

# Validation data must not be augmented, hence is_test=True.
validation_generator = GTZANGenerator(X_valid, y_valid, batch_size=batch_size, is_test=True)
# The step count has to come from the validation set itself. It was previously
# derived from X_test, which asked for more batches than the validation
# generator has.
val_steps = len(validation_generator)

In [24]:
# Build, compile and train the network.
model = XceptionNet(X_train[0].shape)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
hist = model.fit_generator(
    train_generator,
    # Step counts must be whole numbers; int() is a no-op if they already are.
    steps_per_epoch=int(steps_per_epoch),
    validation_data=validation_generator,
    validation_steps=int(val_steps),
    epochs=150,
    verbose=1,
    callbacks=callback_list)

# The ModelCheckpoint callback writes the best-val_acc weights to
# 'xception_net.h5'. Saving the last-epoch model to that same path would
# overwrite the best checkpoint, so the final model gets its own file.
model.save('xception_net_final.h5')

# NOTE: this scores the in-memory (last-epoch) model, not the best checkpoint.
score = model.evaluate(X_test, y_test, verbose=0)
print("test_loss = {:.3f} and test_acc = {:.3f}".format(score[0], score[1]))

Epoch 1/150

Epoch 00001: val_acc improved from -inf to 0.10031, saving model to xception_net.h5
Epoch 2/150

Epoch 00002: val_acc improved from 0.10031 to 0.17477, saving model to xception_net.h5
Epoch 3/150

Epoch 00003: val_acc improved from 0.17477 to 0.20666, saving model to xception_net.h5
Epoch 4/150

Epoch 00004: val_acc improved from 0.20666 to 0.23804, saving model to xception_net.h5
Epoch 5/150

Epoch 00005: val_acc improved from 0.23804 to 0.30837, saving model to xception_net.h5
Epoch 6/150

Epoch 00006: val_acc did not improve from 0.30837
Epoch 7/150

Epoch 00007: val_acc did not improve from 0.30837
Epoch 8/150

Epoch 00008: val_acc did not improve from 0.30837

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0009500000451225787.
Epoch 9/150

Epoch 00009: val_acc improved from 0.30837 to 0.31366, saving model to xception_net.h5
Epoch 10/150

Epoch 00010: val_acc improved from 0.31366 to 0.33282, saving model to xception_net.h5
Epoch 11/150

Epoch 00011: val_a


Epoch 00080: val_acc did not improve from 0.36728

Epoch 00080: ReduceLROnPlateau reducing learning rate to 0.00044012657308485355.
Epoch 81/150

Epoch 00081: val_acc did not improve from 0.36728
Epoch 82/150

Epoch 00082: val_acc did not improve from 0.36728
Epoch 83/150

Epoch 00083: val_acc did not improve from 0.36728
Epoch 84/150

Epoch 00084: val_acc did not improve from 0.36728

Epoch 00084: ReduceLROnPlateau reducing learning rate to 0.00041812024719547477.
Epoch 85/150

Epoch 00085: val_acc did not improve from 0.36728
Epoch 86/150

KeyboardInterrupt: 

In [None]:
# plot confusion matrix
# Convert softmax outputs and one-hot labels to class indices.
preds = np.argmax(model.predict(X_test), axis = 1)
y_orig = np.argmax(y_test, axis = 1)

# Genre names and their class ids, ordered by class id.
ordered_genres = sorted(genres.items(), key=lambda t: t[1])
keys = [genre_name for genre_name, _ in ordered_genres]
class_ids = [genre_id for _, genre_id in ordered_genres]

# sklearn's signature is confusion_matrix(y_true, y_pred): rows are true
# labels and columns are predictions, which is what plot_confusion_matrix
# labels its axes with. The arguments were previously swapped, transposing the
# matrix and normalising over predictions instead of true labels.
# `labels` pins the matrix to one row/column per genre even if a class never
# occurs in y_orig or preds.
cm = confusion_matrix(y_orig, preds, labels=class_ids)

plt.figure(figsize=(10,10))
plot_confusion_matrix(cm, keys, normalize=True)

In [None]:

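# Feature extraction (disabled): use the output of the 'dense4' layer as a feature vector.
# NOTE: `base_model` is not defined in the cells shown here; the trained network is bound to `model`.
#model = Model(inputs=base_model.input, outputs=base_model.get_layer('dense4').output)
#music_feature = model.predict(X_train)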