In [6]:
import numpy as np
import pandas as pd
import os
import librosa,librosa.display
from tqdm.notebook import tqdm
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


https://www.kaggle.com/code/rajanmargaye/audio-mnist-with-lstm-auc-93/notebook

In [7]:
path='../data/0_A_0.wav'
raw_data,framerate=librosa.load(path)
raw_data,framerate

(array([-0.04289964, -0.04552099, -0.03932492, ..., -0.00223843,
         0.00246574,  0.        ], dtype=float32),
 22050)

In [8]:
data=pd.DataFrame(columns=['raw_data','duration','digit',"person","number_index"])
dir_path='../data/'
for i in tqdm(os.listdir(dir_path)):
        raw_data,frame_rate=librosa.load(dir_path+i)
        duration=librosa.get_duration(raw_data,frame_rate)
        data.loc[len(data.index)]=[raw_data,duration,i.split('_')[0],i.split("_")[1],i.split("_")[2].split(".")[0]]

  0%|          | 0/3000 [00:00<?, ?it/s]

In [9]:
data

Unnamed: 0,raw_data,duration,digit,person,number_index
0,"[-0.042899642, -0.045520995, -0.039324917, -0....",0.298005,0,A,0
1,"[0.0010019833, 0.00088148087, 0.0006781536, 0....",0.590884,0,A,1
2,"[-0.0077365004, -0.014448198, -0.016177949, -0...",0.744762,0,A,10
3,"[0.003040686, 0.0036373322, 0.003993512, 0.004...",0.457642,0,A,11
4,"[0.0020078255, 0.002592097, 0.00282455, 0.0028...",0.506259,0,A,12
...,...,...,...,...,...
2995,"[-0.0001598679, -0.0003172595, -0.00037161465,...",0.359637,9,F,5
2996,"[-0.00019665736, -7.2244766e-05, 0.00013556477...",0.347256,9,F,6
2997,"[0.00022448921, 0.00018810731, 0.00012995154, ...",0.351882,9,F,7
2998,"[0.00031612845, 0.0002592627, 0.0001870415, 0....",0.395510,9,F,8


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[['raw_data','duration']],data['digit'], test_size=0.3, random_state=45,stratify=data['digit'])

In [11]:
for i in range(0,101,10):
    print(i,' th percentile is ',np.percentile([len(i) for i in X_train['raw_data']],i))

0  th percentile is  3167.0
10  th percentile is  6092.0
20  th percentile is  7198.8
30  th percentile is  7933.0
40  th percentile is  8650.0
50  th percentile is  9270.0
60  th percentile is  9971.199999999999
70  th percentile is  10799.0
80  th percentile is  11761.6
90  th percentile is  13334.400000000005
100  th percentile is  50335.0


In [12]:
for i in range(90,101,1):
    print(i,' th percentile is ',np.percentile([len(i) for i in X_train['raw_data']],i))

90  th percentile is  13334.400000000005
91  th percentile is  13570.720000000001
92  th percentile is  13801.24
93  th percentile is  13995.190000000002
94  th percentile is  14217.84
95  th percentile is  14664.25
96  th percentile is  15274.519999999997
97  th percentile is  16422.18
98  th percentile is  18049.059999999998
99  th percentile is  20768.21999999983
100  th percentile is  50335.0


In [13]:
max_length=50335

### We are padding the sequence as we going to use LSTM

In [14]:
import tensorflow as tf
X_train_pad=tf.keras.preprocessing.sequence.pad_sequences(X_train['raw_data'],maxlen=max_length, dtype='float32')
X_test_pad=tf.keras.preprocessing.sequence.pad_sequences(X_test['raw_data'],maxlen=max_length, dtype='float32')
X_train_mask=np.where(X_train_pad>0.0,True,False)
X_test_mask=np.where(X_test_pad>0.0,True,False)

In [15]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model
from sklearn.metrics import f1_score

### Fourier Tranformation is computed on overlapping windowed segments of the signal, and we get what is called the spectrogram
### Hence we are converting our raw_data ie time series to spectogram
### Mel spectrogram is a spectrogram where the frequencies are converted to the mel scale

In [36]:
def convert_to_spectrogram(raw_data):
    '''converting to spectrogram'''
    spect = librosa.feature.melspectrogram(y=raw_data, n_mels=64) # n_mels as output shape
    mel_spect = librosa.power_to_db(S=spect, ref=np.max)
    return mel_spect

In [37]:
X_train_spectrogram=np.array([convert_to_spectrogram(np.array([float(i) for i in X_train_pad[k] ])) for k in range(len(X_train_pad)) ])
X_test_spectrogram=np.array([convert_to_spectrogram(np.array([float(i) for i in X_test_pad[k] ])) for k in range(len(X_test_pad)) ])

In [38]:
X_train_spectrogram.shape

(2100, 64, 99)

In [39]:
input_layer=Input(shape=(64,99), dtype=np.float32,name='input_layer')
lstm=LSTM(500,name='lstm_layer',return_sequences=True)(input_layer)
d1=Dense(120,activation='relu',name='dense1')(tf.math.reduce_mean(lstm, 2))
d2=Dense(60,activation='relu',name='dense2')(d1)
d3=Dense(10,activation='softmax',name='dense3')(d2)

In [40]:
model = Model(inputs=input_layer, outputs=d3)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 64, 99)]          0         
                                                                 
 lstm_layer (LSTM)           (None, 64, 500)           1200000   
                                                                 
 tf.math.reduce_mean (TFOpLa  (None, 64)               0         
 mbda)                                                           
                                                                 
 dense1 (Dense)              (None, 120)               7800      
                                                                 
 dense2 (Dense)              (None, 60)                7260      
                                                                 
 dense3 (Dense)              (None, 10)                610       
                                                             

In [41]:
def cal_f1(y_true,y_pred):
    return f1_score(y_true,y_pred,average='micro')
def micro_f1(y_true,y_prob):
    y_pred=tf.math.argmax(y_prob,axis=1)
    return tf.py_function(cal_f1,(y_true,y_pred),tf.double)

In [42]:
class LossHistory(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if logs.get('val_micro_f1', -1)>0.97:
            self.model.stop_training=True

loss_history=LossHistory()

filepath="model_save/weights-{epoch:02d}-{micro_f1:.4f}-{val_micro_f1:.4f}.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=filepath, monitor='val_micro_f1',  verbose=1, save_best_only=True, mode='max')

In [43]:
opt= tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy' ,metrics=['accuracy',micro_f1])

In [44]:
tf.keras.backend.clear_session()
model.fit(X_train_spectrogram,y_train.astype('int')\
           ,validation_data=(X_test_spectrogram,y_test.astype('int'))\
           ,batch_size=32,epochs=400\
           ,callbacks=[loss_history,checkpoint])

Epoch 1/400
Epoch 1: val_micro_f1 improved from -inf to 0.09698, saving model to model_save\weights-01-0.1034-0.0970.hdf5
Epoch 2/400
Epoch 2: val_micro_f1 did not improve from 0.09698
Epoch 3/400
Epoch 3: val_micro_f1 improved from 0.09698 to 0.14763, saving model to model_save\weights-03-0.1168-0.1476.hdf5
Epoch 4/400
Epoch 4: val_micro_f1 did not improve from 0.14763
Epoch 5/400
Epoch 5: val_micro_f1 improved from 0.14763 to 0.18427, saving model to model_save\weights-05-0.1517-0.1843.hdf5
Epoch 6/400
Epoch 6: val_micro_f1 did not improve from 0.18427
Epoch 7/400
Epoch 7: val_micro_f1 did not improve from 0.18427
Epoch 8/400
Epoch 8: val_micro_f1 did not improve from 0.18427
Epoch 9/400
Epoch 9: val_micro_f1 did not improve from 0.18427
Epoch 10/400
Epoch 10: val_micro_f1 improved from 0.18427 to 0.25970, saving model to model_save\weights-10-0.1767-0.2597.hdf5
Epoch 11/400
Epoch 11: val_micro_f1 did not improve from 0.25970
Epoch 12/400
Epoch 12: val_micro_f1 did not improve from 0

In [None]:
opt_res=os.listdir("model_save/")

In [None]:
result=pd.DataFrame()
epoch=[]
f1=[]
val_f1=[]
for i in opt_res:    
    epoch.append(i.split('-')[1])
    f1.append(i.split('-')[2])
    val_f1.append(i.split('-')[3][:6])
result['epoch']=epoch
result['f1']=f1
result['val_f1']=val_f1
values=result[result.epoch==str(result.epoch.astype('int').max())]

In [None]:
print("We have found optimum result at\nEpoch: ",values.iloc[0].epoch,"\nTrain F1 score: ",values.iloc[0].f1,"\nTest F1 score: ",values.iloc[0].val_f1)

We have found optimum result at
Epoch:  329 
Train F1 score:  0.7163 
Test F1 score:  0.7888
