In [2]:
# from google.colab import drive
# drive.mount('/content/drive/')

Importing libraries

In [3]:
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

Defining some constants

In [4]:
Fs = 16000
n_mfcc  = 20
Data_path = '/content/drive/MyDrive/EE603-Project/files'
Labels_csv_path = '/content/drive/MyDrive/EE603-Project/TrainLabels.csv'

In [5]:
labels = pd.read_csv(Labels_csv_path)
labels.head()

Unnamed: 0,filename,onset,offset,class
0,musph0_.wav,0.0,3.8219,silence
1,musph0_.wav,3.8219,6.8219,music
2,musph0_.wav,6.8219,7.4842,silence
3,musph0_.wav,7.4842,8.4842,speech
4,musph0_.wav,8.4842,10.0,silence


Extracting MFCC Feature

In [6]:
def extract_mfcc(f,on,of):
  '''
  f: it is the path to the file whose mfcc is to be extracted
  on and of: onset time and offset time of feature extraction
  '''
  a,_ = librosa.load(f,sr = Fs)
  d = a[int(on*Fs):int(of*Fs)]
  mfccs = librosa.feature.mfcc(y=d, sr=Fs, n_mfcc=n_mfcc)
  
  return np.mean(mfccs.T,axis=0)

Checking extract_mfcc:

In [7]:
f = '/content/drive/MyDrive/EE603-Project/files/musph0_.wav'
for i in range(4):
  on = labels['onset'][i]
  off = labels['offset'][i]
  print(extract_mfcc(f,on,off))


[-1131.371     0.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        0.        0.   ]
[-269.5914      70.749886     2.674555   -25.777128   -45.74757
    5.9979787   -9.86615    -35.33644    -21.507563    -3.6931431
    1.5742964    9.805985    -8.926946     3.8702295  -25.474064
   11.410781   -14.259767   -13.246624    -9.13262    -12.650727 ]
[-1131.3708     0.         0.         0.         0.         0.
     0.         0.         0.         0.         0.         0.
     0.         0.         0.         0.         0.         0.
     0.         0.    ]
[-2.0918173e+02  8.0522346e+01 -4.8038278e+00  3.7486053e+01
  1.4823071e+01  6.4469771e+00 -9.8323479e+00 -1.5065784e+00
 -3.7379432e-01 -1.2389159e+00 -1.5326183e+01 -6.7483759e-01
 -9.4394522e+00 -1.4500582e-01 -1.1870794e+01 -6.0792632e+00
 -9.9727039e+00 -3.7979274e+00 -1.1407868e+01 -3.6513884e+00]


Extracting Features and storing in a list

In [8]:
f = os.path.join(os.path.abspath(Data_path),str('musph0_.wav'))
f

'/content/drive/MyDrive/EE603-Project/files/musph0_.wav'

In [9]:
features = []
for id, ro in tqdm(labels.iterrows()):
  f = os.path.join(os.path.abspath(Data_path),str(ro['filename']))
  on = ro['onset']
  of = ro['offset']
  clas = ro['class']
  features.append([extract_mfcc(f,on,of),clas])

  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.sh

Storing features in the form of dataframe for ease of training

In [10]:
df = pd.DataFrame(features,columns=['mfccs','class'])

In [11]:
df.tail()

Unnamed: 0,mfccs,class
1448,"[-1131.371, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",silence
1449,"[-7.95147, 70.178665, 4.8991804, 25.28807, 6.5...",music
1450,"[-939.08984, -1.9651121e-05, -6.1828e-06, 4.85...",silence
1451,"[-157.78967, 94.30114, -17.505339, 31.592728, ...",speech
1452,"[-1131.3708, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",silence


Final processing for training

In [12]:
X = np.array(df['mfccs'].tolist())
Y = np.array(df['class'].tolist())
print(X.shape)
print(Y.shape)

(1453, 20)
(1453,)


Onehot encoding the labels

In [13]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
Y = to_categorical(le.fit_transform(Y))

Train-Test Split

In [14]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=42)

Modelling

In [15]:
input_shape = [X_train.shape[1]]
input_shape

[20]

Model Architechture

In [16]:
def model():

  model = keras.Sequential(name="My_sequential")


  model.add(Dense(64,input_shape=input_shape))
  model.add(Activation('relu'))
  model.add(Dropout(0.3))


  model.add(Dense(128))
  model.add(Activation('relu'))
  model.add(Dropout(0.4))


  model.add(Dense(64))
  model.add(Activation('relu'))
  model.add(Dropout(0.4))

  model.add(Dense(32))
  model.add(Activation('relu'))
  model.add(Dropout(0.2))


  model.add(Dense(3))
  model.add(Activation('softmax'))
  
  opt = keras.optimizers.Adam()
  model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer=opt) 


  return model


In [17]:
Model = model()

In [18]:
Model.summary()

Model: "My_sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1344      
                                                                 
 activation (Activation)     (None, 64)                0         
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 activation_1 (Activation)   (None, 128)               0         
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)              

In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint_path = '/content/drive/MyDrive/EE603-Project/model_saved'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Creating a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

Model.fit(X_train, Y_train, batch_size=16, epochs=50, validation_data=(X_test, Y_test), callbacks=[cp_callback], verbose=1)

Epoch 1/50
Epoch 00001: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 2/50
Epoch 00002: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 3/50
Epoch 00003: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 4/50
Epoch 00004: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 5/50
Epoch 00005: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 6/50
Epoch 00006: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 7/50
Epoch 00007: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 8/50
Epoch 00008: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 9/50
Epoch 00009: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 10/50
Epoch 00010: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 11/50
Epoch 00011: saving model to /content/drive/MyDrive/EE603-Project/model_saved
Epoch 12/50
Epoch 0

<keras.callbacks.History at 0x7f68d962bed0>

In [20]:
os.listdir(checkpoint_dir)

['Pritam_music.wav',
 'Speech.wav',
 'TrainLabels.csv',
 'files',
 'model_saved',
 'val_set',
 'model_saved.index',
 'checkpoint',
 'model_saved.data-00000-of-00001']

# Performing Prediction on Validation Data Provided 

In [21]:
Model2 = model()

In [22]:
Model2.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f68d95af7d0>

In [23]:
test_Data_path = '/content/drive/MyDrive/EE603-Project/val_set/wav'

sheet_name = "test_lables"
url = f"https://docs.google.com/spreadsheets/d/1_s_xHMGRWnH7F9faavlENbVX45eqGAnzeUec4P1vjvQ/gviz/tq?tqx=out:csv&sheet={sheet_name}"

df_test = pd.read_csv(url)

In [24]:
window_length = 2500
window_length_time = window_length/Fs

In [25]:
df_test.head()

Unnamed: 0,filename,onset,offset,class
0,S001,0.7545,1.963,speech
1,S001,3.033,4.365,speech
2,S001,5.285,6.591,speech
3,S001,7.634,9.019,speech
4,S002,0.158,1.06,speech


In [26]:
def extract_mfcc_test(audio):
    mfccs = librosa.feature.mfcc(y=audio, sr=Fs, n_mfcc=n_mfcc)
    return np.mean(mfccs.T,axis=0)

In [27]:
def func(df_test,window_length = 2500):

  features=[]
  final =[]
  window_length = 2500

  for id,ro in tqdm(df_test.iterrows()):

      f = os.path.join(os.path.abspath(test_Data_path),str(ro["filename"]+'.wav'))
      if ro["filename"] in final:
        continue;
      
      final.append(ro["filename"])
      audio,_ = librosa.load(f, Fs)

      for i in range(int(audio.shape[0]/window_length)):
        tempa = audio[i*window_length:(i+1)*window_length]
        td=extract_mfcc_test(tempa)
        fname = str(ro["filename"]) + str(i)
        features.append([fname, td])

  features_df=pd.DataFrame(features,columns=['filename','mfccs'])    
  return final, features_df


In [28]:
final, features_df = func(df_test,window_length = 2500)

40it [00:24,  1.61it/s]


In [29]:
features_df.head()

Unnamed: 0,filename,mfccs
0,S0010,"[-499.80484, 97.405014, -6.242565, 29.702097, ..."
1,S0011,"[-501.80035, 103.22322, -4.691728, 31.590801, ..."
2,S0012,"[-511.67734, 102.85643, 3.918148, 26.303196, 8..."
3,S0013,"[-521.1769, 105.81555, 6.26138, 27.474567, 8.4..."
4,S0014,"[-463.32495, 110.38657, 16.875725, 25.956945, ..."


In [30]:
def preprocessing1(features_df,final,Model2):

  X_test = np.array(features_df['mfccs'].tolist())
  result= []

  for i in tqdm(range(len(X_test))):
    a = Model2.predict(X_test[i].reshape(1,-1))
    result.append(a[0])
  
  res = np.array(result)
  result_dict = {}

  for i in range(res.shape[0]):
    arr = res[i]
    index = np.where(arr == np.amax(arr))[0][0]

    if index == 0:
      result_dict[i] = "music"
    elif index == 1:
      result_dict[i] = "silence"
    else:
      result_dict[i] = "speech"
  
  window_length_time = window_length/Fs
  tempar = ['music','speech']
  arr = []

  for id in result_dict:
    if result_dict[id] in tempar:
      arr.append(id)
  
  for item in arr:
    if (item + 3) < len(result_dict):
      if result_dict[item] == result_dict[item+2]:
        result_dict[item+1] = result_dict[item]
  return result_dict

In [31]:
result_dict = preprocessing1(features_df,final,Model2)

100%|██████████| 762/762 [00:41<00:00, 18.50it/s]


In [32]:
result_dict

{0: 'silence',
 1: 'silence',
 2: 'silence',
 3: 'silence',
 4: 'silence',
 5: 'music',
 6: 'music',
 7: 'music',
 8: 'music',
 9: 'music',
 10: 'music',
 11: 'silence',
 12: 'silence',
 13: 'silence',
 14: 'silence',
 15: 'silence',
 16: 'silence',
 17: 'silence',
 18: 'silence',
 19: 'music',
 20: 'music',
 21: 'music',
 22: 'music',
 23: 'music',
 24: 'music',
 25: 'music',
 26: 'silence',
 27: 'silence',
 28: 'silence',
 29: 'silence',
 30: 'silence',
 31: 'silence',
 32: 'silence',
 33: 'silence',
 34: 'silence',
 35: 'music',
 36: 'music',
 37: 'music',
 38: 'music',
 39: 'music',
 40: 'music',
 41: 'music',
 42: 'silence',
 43: 'silence',
 44: 'silence',
 45: 'silence',
 46: 'silence',
 47: 'silence',
 48: 'silence',
 49: 'music',
 50: 'speech',
 51: 'silence',
 52: 'music',
 53: 'music',
 54: 'music',
 55: 'music',
 56: 'silence',
 57: 'silence',
 58: 'silence',
 59: 'silence',
 60: 'silence',
 61: 'silence',
 62: 'silence',
 63: 'silence',
 64: 'silence',
 65: 'speech',
 66: '

Some constants

In [33]:
file_length = 10 # since each file is of 10 second length
n_frames = int(Fs*file_length/window_length)
threshold = 6

In [34]:
n_frames

64

In [35]:
def predictf(result_dict):
  count_dict = {'music':0,
              'speech':0}

  predictions = []
  threshold = 10
  cntMusic = 0
  cntSpeech = 0

  for id in result_dict:

    if id == (len(result_dict) - 1):
      if cntMusic >= threshold:
        count_dict['music'] = 1
      if cntSpeech >= threshold:
        count_dict['speech'] = 1


      predictions.append([count_dict['music'],count_dict['speech']])
      break


    
    if id % n_frames == 0:
      if id == 0:
        continue
      if cntMusic >= threshold:
        count_dict['music'] = 1
      if cntSpeech >= threshold:
        count_dict['speech'] = 1


      predictions.append([count_dict['music'],count_dict['speech']])

      cntMusic = 0
      cntSpeech = 0
      
      count_dict['music'] = 0
      count_dict['speech'] = 0
      continue
    if result_dict[id] == 'music':
      cntMusic +=1
    elif result_dict[id] == 'speech':
      cntSpeech += 1
    else:
      continue

  df = pd.DataFrame(predictions,columns=['music','speech'])
  return df

In [36]:
df = predictf(result_dict)

In [37]:
df

Unnamed: 0,music,speech
0,1,0
1,1,1
2,1,1
3,1,1
4,1,0
5,1,1
6,1,0
7,1,0
8,1,1
9,1,1


# Predicting for spectrograms

In [38]:
df_test.head()

Unnamed: 0,filename,onset,offset,class
0,S001,0.7545,1.963,speech
1,S001,3.033,4.365,speech
2,S001,5.285,6.591,speech
3,S001,7.634,9.019,speech
4,S002,0.158,1.06,speech


In [39]:
root = '/content/drive/MyDrive/EE603-Project/val_set/spectrogram'

In [40]:
f = '/content/drive/MyDrive/EE603-Project/val_set/spectrogram/S007.npy'

In [41]:
S_db = np.load(f)
S = librosa.db_to_power(S_db, ref=1.0)

In [42]:
S_db.shape

(513, 313)

Some Constants

In [43]:
n_fft = 1024
hop_length = 512
win_length = 1024
Fs = 16000

In [44]:
def duration(S_db):
  n_samples = (S_db.shape[1]-1)*hop_length + win_length
  return int(np.floor(n_samples/Fs))

In [45]:
def samples_to_duration(n):
  r = (n-1)*hop_length + win_length
  return r/Fs

In [46]:
samples_to_duration(313)

10.048

In [47]:
def duration_to_samples(t):
  n_s = t*Fs
  return int(((n_s-win_length)/hop_length ) + 1)

In [48]:
window_length_time = samples_to_duration(1)

In [49]:
window_length_time

0.064

In [50]:
duration_to_samples(0.064)

1

In [51]:
df_test.head()

Unnamed: 0,filename,onset,offset,class
0,S001,0.7545,1.963,speech
1,S001,3.033,4.365,speech
2,S001,5.285,6.591,speech
3,S001,7.634,9.019,speech
4,S002,0.158,1.06,speech


In [52]:
# for id,ro in tqdm(df_test.iterrows()):

#       print(os.path.join(os.path.abspath(root),str(ro["filename"]+'.npy')))

In [114]:
def extract_mfcc_spects(root):
  dir_list = os.listdir(root)
  M = []
  files = []

  for file in dir_list:
    
    f = os.path.join(os.path.abspath(root),file)
    S_db = np.abs(np.load(f))
    S = librosa.db_to_power(S_db, ref=3)
    m = []
    
    for i in range(1,S.shape[1]):
      segement = S[:,i]
      m.append([librosa.feature.mfcc(y = None, S = segement,sr = Fs,n_mfcc = 20)])
    
    M.append([file,m])
  return  pd.DataFrame(M,columns=['filename','mfccs'])

In [106]:
# feat = extract_mfcc_spects(root)

In [107]:
# feat.head()

In [60]:
#rand = np.array(feat['mfccs'].tolist())

In [61]:
#rand.shape

In [115]:
def preprocessing(features_df,Model2):

  X_test = np.array(features_df['mfccs'].tolist())
  result= []

  for i in tqdm(range(len(X_test))):
    for j in range(X_test.shape[1]):
      a = Model2.predict(X_test[i][j].reshape(1,-1))
      result.append(a[0])
  
  res = np.array(result)
  result_dict = {}

  for i in range(res.shape[0]):
    arr = res[i]
    index = np.where(arr == np.amax(arr))[0][0]

    if index == 0:
      result_dict[i] = "music"
    elif index == 1:
      result_dict[i] = "silence"
    else:
      result_dict[i] = "speech"
  
  
  #window_length_time = window_length/Fs
  tempar = ['music','speech']
  arr = []

  for id in result_dict:
    if result_dict[id] in tempar:
      arr.append(id)
  
  for item in arr:
    if (item + 3) < len(result_dict):
      if result_dict[item] == result_dict[item+2]:
        result_dict[item+1] = result_dict[item]
  return result_dict

In [121]:
n_frames = 313

In [122]:
feat = extract_mfcc_spects(root)

In [123]:
feat.head()

Unnamed: 0,filename,mfccs
0,music_noisy4.npy,"[[[567416939.137284577, -390191092.4099758999,..."
1,music_noisy7.npy,"[[[529262748.31073777797, -539447786.576617820..."
2,music_noisy10.npy,"[[[655471747.6879592922, -370230752.1423757451..."
3,music+speech_noisy1.npy,"[[[441139413.56552941384, -339023436.418825423..."
4,music_noisy3.npy,"[[[583699347.03540663864, -388447669.911481164..."


In [124]:
r = preprocessing(feat,Model2)

100%|██████████| 30/30 [08:01<00:00, 16.04s/it]


In [125]:
r

{0: 'silence',
 1: 'silence',
 2: 'silence',
 3: 'silence',
 4: 'silence',
 5: 'silence',
 6: 'silence',
 7: 'silence',
 8: 'silence',
 9: 'silence',
 10: 'silence',
 11: 'silence',
 12: 'silence',
 13: 'silence',
 14: 'silence',
 15: 'silence',
 16: 'silence',
 17: 'silence',
 18: 'silence',
 19: 'silence',
 20: 'silence',
 21: 'silence',
 22: 'silence',
 23: 'silence',
 24: 'silence',
 25: 'silence',
 26: 'silence',
 27: 'silence',
 28: 'silence',
 29: 'silence',
 30: 'silence',
 31: 'silence',
 32: 'silence',
 33: 'silence',
 34: 'silence',
 35: 'silence',
 36: 'silence',
 37: 'silence',
 38: 'silence',
 39: 'silence',
 40: 'silence',
 41: 'silence',
 42: 'silence',
 43: 'silence',
 44: 'silence',
 45: 'silence',
 46: 'silence',
 47: 'silence',
 48: 'silence',
 49: 'silence',
 50: 'silence',
 51: 'silence',
 52: 'silence',
 53: 'silence',
 54: 'silence',
 55: 'silence',
 56: 'silence',
 57: 'silence',
 58: 'silence',
 59: 'silence',
 60: 'silence',
 61: 'silence',
 62: 'silence',
 6

In [99]:
# for i in r:
#   if r[i] == 'speech':
#     print(i)

In [126]:
tempar = ['music','speech']

arr2 = []

for id in r:
  if r[id] in tempar:
    arr2.append(id)

In [127]:
window_length_time = samples_to_duration(1)

In [None]:
# for item in farr:

#   if(item == 0):
#     if
#     onset = 0
#     continue

#   if(item>0):

#     if r[item-1] != r[item]:
#       onset = samples_to_duration(item%313)
#       continue
#     else: r[item] == r[item+1]:
#       onset+= window_length_time



In [131]:
final = os.listdir(root)

In [128]:
window_length_time = samples_to_duration(1)

In [138]:
def predictf(result_dict):
  onset_list = []
  offset_list = []
  clas = []
  cnt = []

  count = 0
  si = 'silence'

  result_list2 = []
  ina = 0


  for id in result_dict:
    tempv = result_dict[arr2[ina]]

    if result_dict[id] == si:
      if (id == 0) or (id >= len(result_dict)-1):
        continue
      elif (result_dict[id+1] == result_dict[id-1]) and (result_dict[id+1] != si):
        continue
      elif(result_dict[id] == result_dict[id+1]):
        
        if(count>0 and count < 4):
          ina += 1
          count = 0
          continue

        elif(count>=4):
          result_list2.append([final[id//313],samples_to_duration((id-count)%313),samples_to_duration(id%313),tempv])
          count = 0
          ina += 1
          continue
        else:
          continue


      else:
        continue

    elif(result_dict[id]==tempv):
      count+=1
      continue
      
    elif(result_dict[id]!=tempv):
      if(count>0 and count < 4):
        ina += 1
        count = 1
        continue

      elif(count>=4):
        result_list2.append([final[id//313],(((id - count)%313)*window_length_time)%10,((id%313)*window_length_time)%10,tempv])
        count = 1
        ina += 1
        continue

      else:
        continue
  df1 = pd.DataFrame(result_list2,columns=['filename','onset','offset','class'])
  return df1

In [139]:
df = predictf(r)

In [140]:
df.head()

Unnamed: 0,filename,onset,offset,class
0,music+speech_noisy10.npy,4.992,5.12,speech
1,music_noisy2.npy,7.648,7.84,speech
2,music_noisy2.npy,8.032,8.192,speech
3,S007.npy,4.544,4.992,speech
4,S007.npy,2.912,4.352,music
