<div style="text-align: right"> - last read : 2019. 05. 18 </div>

## SSL (방위각 추정)
- Based on Spectrogram and Phase
- 2 Models: 2D-CNN followed by a Bidirectional GRU & 2D-CNN followed by a 1D-CNN

### Inputs : 
- Input : Magnitude and Phase from two stereo channels : So there are 4 feature maps
- Magnitude, Phase : Basic Magnitude and Phase with STFT. 
- STFT : n_fft=1024, hop = 512 with sr=48000 (the default of `mag_phase()` below; audio is resampled to this rate on load)
- Stack 4 feature maps.  

In [1]:
# Common imports
import numpy as np
import pandas as pd
import os, sys, glob  
import tensorflow as tf

import librosa
import librosa.display

# To plot pretty figures
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# plt.style.use('ggplot')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

def reset_graph(seed=42):
    """Start from a fresh default TF graph and re-seed NumPy and TensorFlow.

    seed : integer passed to both np.random.seed and tf.set_random_seed.
    """
    np.random.seed(seed)
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    
def reset_keras_session(seed=42):
    """Clear the current Keras session and re-seed NumPy and TensorFlow.

    seed : integer passed to both np.random.seed and tf.set_random_seed.
    """
    np.random.seed(seed)
    tf.keras.backend.clear_session()
    tf.set_random_seed(seed)

import warnings
warnings.filterwarnings("ignore")   # To rid of warnings 

os_sep = os.sep

# Resolve the per-machine base directories used by the rest of the notebook.
# `home` and `data_repo` are always defined, so later cells cannot hit a
# NameError on platforms that are neither Windows nor Linux (e.g. macOS).
if sys.platform == 'win32':   # Windows
    home = os.path.join('D:', os.sep, 'hblee')   # d:\hblee
    data_repo = os.path.join('D:', os.sep, 'Data_Repo_Win')   # d:\Data_Repo_Win
else:   # Linux (including the legacy "linux2" value) and any other platform
    home = os.path.expanduser("~")   # home = os.getenv("HOME")
    data_repo = os.path.join(home, 'Data_Repo')
    
#sys.path.append(os.path.join(home, 'Google_Sync', 'Dev_Exercise', 'utils'))
    
from tensorflow import keras 
keras.__version__, tf.VERSION

('2.2.4-tf', '1.13.1')

In [2]:

# Stereo recordings, one sub-directory per direction.
sample_data_repo = os.path.join(home, 'Downloads', 'audio')
samples = sorted(glob.glob(os.path.join(sample_data_repo, '**', '*.wav'), recursive=True))

# Voice/noise segmentation labels for the same recordings (adjust the path as needed).
sample_vad_seg_repo = os.path.join(home, 'Downloads', 'binary_segment')
# glob has no "a|b" alternation: '[npy|npz]' is a character class that matches ONE
# character out of n, p, y, |, z.  Search each extension explicitly instead.
samples_vad_seg = sorted(
    path
    for ext in ('npy', 'npz')
    for path in glob.glob(os.path.join(sample_vad_seg_repo, '**', '*.' + ext), recursive=True)
)

#  Checking
print('samples: ', len(samples), samples[25])
print('samples segmented: ', len(samples_vad_seg), samples_vad_seg[25])

samples:  315 /home/user4/Downloads/audio/100도/output31.wav
samples segmented:  315 /home/user4/Downloads/binary_segment/100도/output31.npz


# train data set 만들기

In [3]:
def mag_phase(file_path, sr=48000, n_fft=1024, hop_length=512, db=False, n_mels=50) :
    """
    Return the STFT magnitude and phase of the two channels of a stereo file,
    with the first frequency bin removed.

    Parameters
    ----------
    file_path : path to a stereo audio file.
    sr : sampling rate handed to librosa.load (the audio is loaded at this rate).
    n_fft, hop_length : STFT parameters.
    db : if True, magnitudes are converted with amplitude_to_db before scaling.
    n_mels : unused; kept so that existing calls keep working.

    Returns
    -------
    ((L_mag, L_phase), (R_mag, R_phase)) : four 2-D arrays.  Both magnitudes
    are standardised with the mean / std of the LEFT channel; phases are
    angles from np.angle.

    Raises
    ------
    ValueError : if the file does not contain at least two channels.
    """
    audio, sr = librosa.load(file_path, sr=sr, mono=False)
    # With mono=False a single-channel file comes back 1-D, so audio[0] would be
    # one sample rather than a channel.  Fail early with a readable message.
    if audio.ndim != 2 or audio.shape[0] < 2:
        raise ValueError("mag_phase expects a stereo file, got shape %r for %s"
                         % (audio.shape, file_path))

    DL = librosa.stft(audio[0], n_fft=n_fft, hop_length=hop_length)
    DL_mag, DL_phase = librosa.magphase(DL)

    DR = librosa.stft(audio[1], n_fft=n_fft, hop_length=hop_length)
    DR_mag, DR_phase = librosa.magphase(DR)

    if db :
        DL_mag = librosa.amplitude_to_db(DL_mag)
        DR_mag = librosa.amplitude_to_db(DR_mag)

    # Rescale both magnitudes w.r.t. the left channel statistics.
    avg = DL_mag.mean()
    stdv = DL_mag.std()
    if stdv == 0:
        # Constant left channel (e.g. digital silence): avoid dividing by zero.
        stdv = 1.0
    DL_mag = (DL_mag - avg)/stdv
    DR_mag = (DR_mag - avg)/stdv

    # Drop the first frequency bin of every map.
    return( (DL_mag[1:, :], np.angle(DL_phase)[1:, :]), (DR_mag[1:, :], np.angle(DR_phase)[1:, :]) )

In [4]:
def generatio_tensor_instances(array_2d, dest_path, seq_len, hop, label):
    """
    Cut a 2-D spectrogram into fixed-length windows, save every window to disk
    and return the stacked windows together with one label per window.

    array_2d : 2-D array, rows x frames (spectrogram).
    dest_path : path prefix; the window starting at frame j is written with
                np.savez to dest_path + '_' + str(j).
    seq_len : number of frames in an instance.
    hop : number of frames between two consecutive window starts (>= 1).
    label : 1-D array of 0 and 1's; it is mapped onto the frame axis
            proportionally (len(label) / number of frames).

    Returns (instances, labels): `instances` has shape
    (n_windows, rows, seq_len, 1) and `labels` is a list of n_windows values.
    When the clip is too short for a single window, an empty
    (0, rows, seq_len, 1) array and an empty list are returned.

    Raises ValueError if hop < 1 (the window start would never advance).
    """
    if hop < 1:
        raise ValueError("hop must be a positive number of frames, got %r" % (hop,))

    row_size, col_size = array_2d.shape[0], array_2d.shape[1]
    ratio = len(label)/col_size if col_size else 0.0   # label points per frame
    stack_array = []    # windows, each (rows, seq_len, 1)
    label_array = []

    # NOTE(review): this bound skips the window that would end exactly on the
    # last frame (start == col_size - seq_len).  Kept unchanged so instance
    # counts match earlier runs -- confirm whether that is intended.
    last_start = col_size - (seq_len+1)

    # Create the output directory once, and only if something will be written.
    # A bare filename prefix has an empty dirname, which os.makedirs rejects.
    dest_dir = os.path.dirname(dest_path)
    if last_start >= 0 and dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    j = 0
    while j <= last_start:
        context_frame = array_2d[:, j:(j+seq_len)]
        # Majority vote of the voice/noise labels covered by this window.
        seg_label = round( label[int(j*ratio):int((j+seq_len)*ratio)].mean() )

        dest_path_ext = ''.join([dest_path, '_', str(j)])
        np.savez(dest_path_ext, spectrogram = context_frame,
                 label=seg_label)

        stack_array.append(context_frame[:,:,np.newaxis])   # add a trailing channel axis
        label_array.append(seg_label)

        j = j+hop

    if not stack_array:
        # np.stack([]) raises; hand back a correctly shaped empty batch instead.
        return np.empty((0, row_size, seq_len, 1), dtype=array_2d.dtype), label_array

    return np.stack(stack_array, axis=0), label_array

#### 각각의 샘플에 대해 mag_phase() 함수와 generatio_tensor_instances() 함수를 사용하고 마지막 차원에 대해 concatenate하여 4d tensor를 만들어준다.

In [5]:
no_samples = len(samples)

# Audio files and label files are paired purely by their position in the two
# sorted lists, so a length mismatch means the pairing cannot be trusted.
if len(samples_vad_seg) != no_samples:
    raise ValueError("found %d audio files but %d label files"
                     % (no_samples, len(samples_vad_seg)))

# Window length and stride in STFT frames.  With mag_phase's defaults
# (hop_length=512, sr=48000) that is roughly 1.07 s windows every 0.107 s.
SEQ_LEN = 100
HOP = 10

mag_L_instances = []    # elements are ndarrays
mag_R_instances = []
phase_L_instances = []
phase_R_instances = []
label_instances = []         # elements are ndarrays of per-window labels

for i in range(0, no_samples):
    vad_path = samples_vad_seg[i]
    voice_noise_label = np.load(vad_path)
    # Decide by the real file extension (portable; no '/' splitting or substring match).
    if os.path.splitext(vad_path)[1].lower() == '.npy':
        label = voice_noise_label[0]        # use the left channel label.  this take care of 0 degree problem
    else:                                   # npz file
        label = voice_noise_label["label"]
    (mag_L, phase_L), (mag_R, phase_R) = mag_phase(samples[i], db=True)

    # Each feature map is windowed (and written to disk) separately.
    voice_dest_path = os.path.join("mag", "Left", str(i))
    mag_L_instances_sub, _ = generatio_tensor_instances(mag_L, voice_dest_path, SEQ_LEN, HOP, label)

    voice_dest_path = os.path.join("mag", "Right", str(i))
    mag_R_instances_sub, _ = generatio_tensor_instances(mag_R, voice_dest_path, SEQ_LEN, HOP, label)

    voice_dest_path = os.path.join("phase", "Left", str(i))
    phase_L_instances_sub, _ = generatio_tensor_instances(phase_L, voice_dest_path, SEQ_LEN, HOP, label)

    voice_dest_path = os.path.join("phase", "Right", str(i))
    phase_R_instances_sub, label_sub = generatio_tensor_instances(phase_R, voice_dest_path, SEQ_LEN, HOP, label)

    mag_L_instances.append(mag_L_instances_sub)
    mag_R_instances.append(mag_R_instances_sub)
    phase_L_instances.append(phase_L_instances_sub)
    phase_R_instances.append(phase_R_instances_sub)

    label_instances.append(np.array(label_sub))


print(len(mag_L_instances), len(phase_R_instances), len(label_instances))

mag_L_instances[0].shape, phase_R_instances[0].shape, label_instances[0].shape
# shapes of the first clip: (n_windows, freq_bins, SEQ_LEN, 1) and (n_windows,)

315 315 315


((28, 512, 100, 1), (28, 512, 100, 1), (28,))

In [6]:
# Per clip, join the four feature maps along the last (channel) axis in the
# order: left magnitude, left phase, right magnitude, right phase.
stacked_instances = [
    np.concatenate(
        (mag_L_instances[k], phase_L_instances[k], mag_R_instances[k], phase_R_instances[k]),
        axis=-1,
    )
    for k in range(no_samples)
]

len(stacked_instances), stacked_instances[0].shape    # L, R magnitudes and phases are stacked.

(315, (28, 512, 100, 4))

In [7]:
# Total number of instances over all clips.
total = sum(stacked_instances[k].shape[0] for k in range(no_samples))
print("Numer of the instances generated : : ",total)

# Mean of the per-clip label means.  Labels are still 0 (noise) / 1 (voice)
# here, so each clip mean is that clip's fraction of voice windows.
ave = [np.mean(clip_labels) for clip_labels in label_instances]
print("Percentage of voice instances: ", np.mean(ave) )

# Spot-check every 5th clip: instance tensor and label vector must agree in length.
for k in range(0, no_samples, 5):
    print(stacked_instances[k].shape, label_instances[k].shape)
    

Numer of the instances generated : :  30862
Percentage of voice instances:  0.4461591195302443
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(743, 512, 100, 4) (743,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(179, 512, 100, 4) (179,)
(179, 512, 100, 4) (179,)
(179, 512, 100, 4) (179,)
(743, 512, 100, 4) (743,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(743, 512, 100, 4) (743,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 51

각각의 음성 샘플에 대해 알맞게 데이터가 형성

To this point, the `label_instances` showed the labels for `voice (1) and non_voice (0)` instances.  Now transform the `label_instances` to show the **class labels of the voice directions.**

#### noise와 voice 방향에 따라 labeling
- noise : 0                 
- 0도 : 1    
- 20도 : 2
- 40도 : 3
- 60도 : 4
- 80도 : 5
- 100도 : 6
- 120도 : 7
- 140도 : 8
- 160도 : 9
- 180도 :10                   

In [8]:
# Turn the voice label (1) of every clip into the class id of its direction.
# Each entry is (first clip index, one-past-last clip index, class id); the
# indices refer to positions in the sorted `samples` list.
# NOTE(review): these ranges are hard-coded against the current directory
# listing -- they must be revisited whenever files are added or removed.
direction_ranges = [
    (143, 179, 2),           # 20 deg
    (179, 218, 3),           # 40 deg
    (277, 290, 4),           # 60 deg
    (218, 265, 5),           # 80 deg
    (0, 51, 6),              # 100 deg
    (290, 303, 7),           # 120 deg
    (51, 92, 8),             # 140 deg
    (92, 143, 9),            # 160 deg
    (303, no_samples, 10),   # 180 deg
]
# Clips 265..276 are not listed: their voice windows keep label 1
# (presumably the 0 deg clips, whose class id is already 1).

for start, stop, class_id in direction_ranges:
    for i in range(start, stop):
        clip_labels = label_instances[i]
        # In-place: noise windows (0) are left untouched.
        clip_labels[clip_labels == 1] = class_id

In [10]:
# Preview only the first few clips; dumping all per-clip label arrays floods the output.
label_instances[:3]

[array([0., 0., 0., 0., 0., 0., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.,
        6., 6., 6., 6., 6., 6., 6., 6., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.,
        6., 6., 6., 6., 6., 6., 6., 6., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 6., 6., 6., 6., 6., 6., 6.,
        6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

We have 30862 instances, labeled into 11 classes (noise + 10 directions).  Let's see how those labels are distributed.

In [11]:
# Flatten the per-clip label arrays into one vector.  The leading empty float
# array is kept as the first piece, exactly as the incremental version started.
instances_labels = np.hstack([np.array([]), *label_instances])

# Number of instances per class id, ordered by class.
pd.Series(instances_labels).value_counts().sort_index()

0.0     23999
1.0       153
2.0       655
3.0      1278
4.0       139
5.0      1715
6.0       910
7.0       203
8.0       733
9.0       901
10.0      176
dtype: int64

- Class-0 Noise instances have the largest number

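### Next we build `total_instances_tensors` & `total_label_tensors`
- `total_instances_tensors` : array of 315 elements.  Each element holds the instances of one audio clip
- `total_label_tensors` : array of 315 elements.  Each element holds the class labels of that clip's instances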

## Construct `train and validation set` split.
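- Out of 315 audio samples, we will take 221 samples for training set, and the remaining 94 to the validation set.  Try to mix them evenly.
- Note that the clip indices per class are (see the relabeling cell): 0~50 (100도, Class-6), 51~91 (140도, Class-8), 92~142 (160도, Class-9), 143~178 (20도, Class-2), 179~217 (40도, Class-3), 218~264 (80도, Class-5), 265~276 (0도, Class-1), 277~289 (60도, Class-4), 290~302 (120도, Class-7), 303~314 (180도, Class-10) and Class-0 is assigned to the noise
- 샘플을 랜덤하게 섞어주고 처음부터 221개는 train data set으로 나머지 94개는 validation data set으로 사용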

In [12]:
import copy

N_TRAIN_CLIPS = 221   # number of clips (not instances) that go to the training set
N_CLASSES = 11        # noise + 10 directions


def _as_object_array(items):
    """Pack a list of arrays with different leading sizes into a 1-D object array."""
    packed = np.empty(len(items), dtype=object)
    for k, item in enumerate(items):
        packed[k] = item
    return packed


# Voice-activity labels: `label_instances` already carries direction class ids
# at this point, so collapse every non-zero class back to 1 on a copy.
vad_label_instances = copy.deepcopy(label_instances)
for clip_labels in vad_label_instances:
    clip_labels[clip_labels > 0] = 1

# One element per clip.  Built explicitly as object arrays because the clips
# have different numbers of instances (np.array on such a ragged list is
# deprecated / rejected by newer NumPy releases).
total_instances_tensors = _as_object_array(stacked_instances)
total_label_tensors = _as_object_array(label_instances)
total_vad_label_tensors = _as_object_array(vad_label_instances)

# randomly choose indices to be split to training and validation set
np.random.seed(77)   # 19, 7, 5, 113, 34

idx = np.random.permutation(no_samples)

# Shuffle whole clips, so windows of one clip never end up on both sides of the split.
# Index with the array itself: `arr[[idx]]` relies on deprecated list-as-tuple indexing.
X = total_instances_tensors[idx]
y = total_label_tensors[idx]
y_vad = total_vad_label_tensors[idx]

# Test
for i in range(0, no_samples, 10):
    print(X[i].shape, y[i].shape)

# Split: the same clip boundary must be used for every array.
X_train = np.concatenate(list(X[:N_TRAIN_CLIPS]), axis=0)
y_train = np.concatenate(list(y[:N_TRAIN_CLIPS]), axis=0)
y_train_vad = np.concatenate(list(y_vad[:N_TRAIN_CLIPS]), axis=0)

X_val = np.concatenate(list(X[N_TRAIN_CLIPS:]), axis=0)
y_val = np.concatenate(list(y[N_TRAIN_CLIPS:]), axis=0)
y_val_vad = np.concatenate(list(y_vad[N_TRAIN_CLIPS:]), axis=0)

print()
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

# Convert scalar y's to One-Hot.  The VAD labels stay scalar because that task is binary.
y_train = keras.utils.to_categorical(y_train, N_CLASSES)
y_val = keras.utils.to_categorical(y_val, N_CLASSES)

(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(743, 512, 100, 4) (743,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(179, 512, 100, 4) (179,)
(20, 512, 100, 4) (20,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(13, 512, 100, 4) (13,)
(28, 512, 100, 4) (28,)
(12, 512, 100, 4) (12,)
(28, 512, 100, 4) (28,)
(31, 512, 100, 4) (31,)
(179, 512, 100, 4) (179,)
(743, 512, 100, 4) (743,)
(28, 512, 100, 4) (28,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(28, 512, 100, 4) (28,)
(21, 512, 100, 4) (21,)
(179, 512, 100, 4) (179,)
(28, 512, 100, 4) (28,)
(18, 512, 100, 4) (18,)
(21, 512, 100, 4) (21,)
(28, 512, 100, 4) (28,)
(19, 512, 100, 4) (19,)

(23053, 512, 100, 4) (23053,)
(7809, 512, 100, 4) (7809,)


# 모델1. 2D CNN + Bidirectional GRU

In [13]:
from tensorflow.keras.layers import TimeDistributed, Bidirectional
from tensorflow.keras.layers import Conv2D, Conv1D, MaxPooling2D, MaxPooling1D, Input, Flatten, Dropout
from tensorflow.keras import layers, models
from sklearn.metrics import confusion_matrix, classification_report

In [14]:
# reset_keras_session(100)

# Drop a model left over from an earlier run of this cell.
if 'model' in locals():
    del model

# One instance: (frequency bins, frames, feature maps).
input_spectrogram = Input(shape=tuple(X_train.shape[1:4]))

# Four identical stages: 3x3 convolution (64 filters) followed by 3x2 max-pooling.
conv_4_pool = input_spectrogram
for _ in range(4):
    conv_4_pool = Conv2D(64, (3, 3), activation='relu', padding='same')(conv_4_pool)
    conv_4_pool = MaxPooling2D((3, 2))(conv_4_pool)

# Collapse what is left of the height axis with a kernel spanning all of it.
shape_conv_4_pool = conv_4_pool.get_shape().as_list()    # (None, height, width, channel)
remaining_height = shape_conv_4_pool[1]
conv_5 = Conv2D(128, (remaining_height, 1), padding='valid', activation='relu')(conv_4_pool)
# Author's note: raising the conv_5 filter count to 256 made training progress poorly.

# (None, 1, width, channel) -> (timesteps, features) for the recurrent layer.
shape_conv_5 = conv_5.get_shape().as_list()
timesteps, features = shape_conv_5[2], shape_conv_5[3]
reshaped = layers.Reshape((timesteps, features))(conv_5)
bgru = Bidirectional(layers.GRU(units=128))(reshaped)  # open question: more GRU units?

fc1 = layers.Dense(32, activation='relu')(bgru)
fc1_drop = Dropout(0.5)(fc1)

dense_out = layers.Dense(11, activation='softmax')(fc1_drop)

model = models.Model(inputs=input_spectrogram, outputs=dense_out)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 512, 100, 4)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 512, 100, 64)      2368      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 170, 50, 64)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 170, 50, 64)       36928     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 56, 25, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)    

In [None]:
#show_model_graph(model)

In [None]:
model.compile(optimizer ='adam',loss='categorical_crossentropy', metrics =['acc'])

# NOTE(review): early stopping watches the TRAINING accuracy, so it does not
# react to over-fitting; consider monitoring a validation metric instead.
callbacks_list = [keras.callbacks.EarlyStopping(monitor='acc', patience=50),
                  keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                       factor=0.1, patience=50)]

# X_train is a clip-by-clip concatenation of overlapping windows, so without
# shuffling every mini-batch would come from one or two clips only.
history = model.fit(X_train, y_train,
                    epochs=100, batch_size=32,
                    callbacks=callbacks_list,
                    validation_data=(X_val, y_val),
                    shuffle=True)

Train on 23053 samples, validate on 7809 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [None]:
import sklearn.metrics

# Class ids from the one-hot ground truth and from the predicted probabilities.
y_val_pred = model.predict(X_val)
val_true_cls = np.argmax(y_val, axis=1)
val_pred_cls = np.argmax(y_val_pred, axis=1)

cm = sklearn.metrics.confusion_matrix(val_true_cls, val_pred_cls)
acc = sklearn.metrics.accuracy_score(val_true_cls, val_pred_cls)
print("Accuracy : ", acc)

model.save('SSL_STFT_2DConv_RNN.h5')

# Must be the last expression of the cell, otherwise the matrix is never displayed.
cm

In [108]:
# Reload the archived best model and score it on the same validation set for comparison.
best_model_path = os.path.join('best_models_archive', 'SSL_2DConv_RNN_STFT.h5')
best_model = keras.models.load_model(best_model_path)
y_val_pred = best_model.predict(X_val)

# Rows of the matrix are true classes, columns are predicted classes.
true_cls = np.argmax(y_val, axis=1)
pred_cls = np.argmax(y_val_pred, axis=1)
cm = sklearn.metrics.confusion_matrix(true_cls, pred_cls)
acc = sklearn.metrics.accuracy_score(true_cls, pred_cls)
print("Accuracy : ", acc)
cm

Accuracy :  0.826530612244898


array([[31,  1,  3,  5,  5],
       [ 1, 12,  0,  0,  0],
       [11,  2, 25,  0,  0],
       [ 1,  0,  0, 52,  0],
       [ 5,  0,  0,  0, 42]], dtype=int64)

### Best Model의 Validation Set에 대한 Confusion Matrix 분석
- 특히 Class_0 (Noise)에 대한 판단에 오류가 많다. 

In [103]:
# Class distribution of the validation ground truth
pd.Series(y_val.argmax(axis=1)).value_counts().sort_index()

0    45
1    13
2    38
3    53
4    47
dtype: int64

In [104]:
# Class distribution of the predictions
pd.Series(y_val_pred.argmax(axis=1)).value_counts().sort_index()

0    59
1    28
2    14
3    39
4    56
dtype: int64

- class-0 (Noise)가 실제로 45개 그 중 31개만 맞춤.  특히 ground-truth가 class-2 (60도) 인데, 이를 class-0 (noise)라 틀리게 분류한 것이 11개 
- class-0 noise를 class-3 또는 class-4라 잘 못 분류한 것이 10개

# 모델2. 2D CNN + 1D CNN
- 처음에는 2D CNN을 써서 `주파수-시간` 2차원 도메인.  2D CNN 뒤에 1D CNN을 stack.
- 앞서 만든 데이터셋 활용 : `X_train, y_train, X_val, y_val` 

In [133]:
# Shapes of the arrays reused from the first model.
for set_name, set_array in (('X_train', X_train), ('y_train', y_train),
                            ('X_val', X_val), ('y_val', y_val)):
    print(set_name, ": ", set_array.shape)

X_train :  (626, 512, 100, 4)
y_train :  (626, 5)
X_val :  (196, 512, 100, 4)
y_val :  (196, 5)


In [134]:
# Drop a model left over from an earlier run of this cell.
if 'model' in locals():
    del model

# The output layer must match the one-hot width of the labels; a fixed 5 breaks
# as soon as y_train is encoded with a different number of classes.
n_classes = y_train.shape[1]

input_spectrogram = Input(shape=(X_train.shape[1], X_train.shape[2], X_train.shape[3]))

# 2-D stage: convolutions with a growing filter count, each followed by 3x2 max-pooling.
conv_1 = Conv2D(32, (3, 3), activation='relu', padding='valid')(input_spectrogram)
conv_1_pool = MaxPooling2D((3, 2))(conv_1)

conv_2 = Conv2D(64, (3, 3), activation='relu', padding='valid')(conv_1_pool)
conv_2_pool = MaxPooling2D((3, 2))(conv_2)

conv_3 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv_2_pool)
conv_3_pool = MaxPooling2D((3, 2))(conv_3)

conv_4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv_3_pool)
conv_4_pool = MaxPooling2D((3, 2))(conv_4)

# Collapse what is left of the height axis with a kernel spanning all of it.
shape_conv_4_pool = conv_4_pool.get_shape().as_list()    # (None, height, width, channel)
conv_5 = Conv2D(512, (shape_conv_4_pool[1], 1), padding='valid', activation='relu')(conv_4_pool)

shape_conv_5 = conv_5.get_shape().as_list()
reshaped = layers.Reshape((shape_conv_5[2], shape_conv_5[3]))(conv_5)   # reshape to (timesteps, features) explicitly

# 1-D stage: convolution along the remaining (timesteps) axis.
conv_6 = Conv1D(1024, kernel_size=3, activation='relu')(reshaped)
flatten = layers.Flatten()(conv_6)

fc1 = layers.Dense(64, activation='relu')(flatten)
fc1_drop = Dropout(0.1)(fc1)

dense_out = layers.Dense(n_classes, activation='softmax')(fc1_drop)

model = models.Model(inputs=input_spectrogram, outputs=dense_out)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 512, 100, 4)       0         
_________________________________________________________________
conv2d_60 (Conv2D)           (None, 510, 98, 32)       1184      
_________________________________________________________________
max_pooling2d_48 (MaxPooling (None, 170, 49, 32)       0         
_________________________________________________________________
conv2d_61 (Conv2D)           (None, 168, 47, 64)       18496     
_________________________________________________________________
max_pooling2d_49 (MaxPooling (None, 56, 23, 64)        0         
_________________________________________________________________
conv2d_62 (Conv2D)           (None, 56, 23, 128)       73856     
_________________________________________________________________
max_pooling2d_50 (MaxPooling (None, 18, 11, 128)       0         
__________

In [135]:
model.compile(optimizer ='adam',loss='categorical_crossentropy', metrics =['acc'])

# NOTE(review): early stopping watches the TRAINING accuracy, so it does not
# react to over-fitting; consider monitoring a validation metric instead.
callbacks_list = [keras.callbacks.EarlyStopping(monitor='acc', patience=50),
                  keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                       factor=0.1, patience=50)]

# X_train is a clip-by-clip concatenation of overlapping windows, so without
# shuffling every mini-batch would come from one or two clips only.
history = model.fit(X_train, y_train,
                    epochs=150, batch_size=32,
                    callbacks=callbacks_list,
                    validation_data=(X_val, y_val),
                    shuffle=True)

Train on 626 samples, validate on 196 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150


Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150


In [137]:
import sklearn.metrics

# Score the second model on the validation set (rows: true class, columns: predicted class).
y_val_pred = model.predict(X_val)
true_cls = np.argmax(y_val, axis=1)
pred_cls = np.argmax(y_val_pred, axis=1)
cm = sklearn.metrics.confusion_matrix(true_cls, pred_cls)
acc = sklearn.metrics.accuracy_score(true_cls, pred_cls)
print("Accuracy : ", acc)
model.save('SSL_STFT_2DConv_1DConv.h5')

cm

Accuracy :  0.8010204081632653


array([[29,  9,  2,  2,  3],
       [ 0, 10,  0,  3,  0],
       [10,  0, 28,  0,  0],
       [ 9,  0,  0, 44,  0],
       [ 1,  0,  0,  0, 46]], dtype=int64)