In [3]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import numpy as np
import soundfile as sf
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 
print(tf.__version__)

2.7.0


In [4]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


# **Data Preprocessing**

In [5]:
SR = 16000

In [6]:
audio_dataset_path='/content/drive/MyDrive/Audio_tagging/audio_files'
train_df=pd.read_csv('/content/drive/MyDrive/Audio_tagging/TrainLabels.csv')
train_df.head()

Unnamed: 0,filename,onset,offset,class
0,sound0_.wav,0.0,1.6189,silence
1,sound0_.wav,1.6189,3.6189,speech
2,sound0_.wav,3.6189,8.8667,silence
3,sound0_.wav,8.8667,9.8667,music
4,sound0_.wav,9.8667,10.0,silence


In [7]:
SR = 16000

In [8]:
def features_extractor(file_name, onset, offset):
    audio,_ = librosa.load(file_name,sr = SR)
    sample_rate = SR
    audio = audio[int(onset*sample_rate): int(offset*sample_rate)] 
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    return np.mean(mfccs_features.T,axis=0)

In [9]:
extracted_features=[]
for i,row in tqdm(train_df.iterrows()):
    file_name = os.path.join(os.path.abspath(audio_dataset_path),str(row["filename"]))
    final_class_labels=row["class"]
    onset = row["onset"]
    offset = row["offset"]
    if (offset-onset)<0.13:
      continue;
    data=features_extractor(file_name, onset, offset)
    extracted_features.append([data,final_class_labels])

1094it [01:15, 14.46it/s]


In [10]:
features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
features_df.head()

Unnamed: 0,feature,class
0,"[-1131.371, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",silence
1,"[-389.18173, 80.10276, -37.70203, -17.295511, ...",speech
2,"[-1129.0479, -1.5282719e-06, -5.393266e-07, 3....",silence
3,"[-820.6056, 72.38335, 20.072504, 4.8299403, 6....",music
4,"[-1098.3567, -5.4116546e-05, -1.8590968e-05, 0...",silence


In [11]:
X=np.array(features_df['feature'].tolist())
y=np.array(features_df['class'].tolist())

In [12]:
print(X.shape)
print(y.shape)

(1046, 40)
(1046,)


In [13]:
y[:3]

array(['silence', 'speech', 'silence'], dtype='<U7')

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

# **Model**

In [15]:
num_labels=3

#Model
model=Sequential()

model.add(Dense(128,input_shape=(40,)))
model.add(Activation('relu'))
model.add(Dropout(0.4))

model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.4))

model.add(Dense(num_labels))
model.add(Activation('softmax'))

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               5248      
                                                                 
 activation (Activation)     (None, 128)               0         
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 activation_1 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               3

In [16]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [17]:
num_epochs = 60
num_batch_size = 64

In [18]:
checkpoint_path = '/content/drive/MyDrive/Audio_event_detection/Saved_model'
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)
model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[cp_callback], verbose=1)

Epoch 1/60
Epoch 00001: saving model to /content/drive/MyDrive/Audio_event_detection/Saved_model
Epoch 2/60
Epoch 00002: saving model to /content/drive/MyDrive/Audio_event_detection/Saved_model
Epoch 3/60
Epoch 00003: saving model to /content/drive/MyDrive/Audio_event_detection/Saved_model
Epoch 4/60
 1/13 [=>............................] - ETA: 0s - loss: 4.8848 - accuracy: 0.8438
Epoch 00004: saving model to /content/drive/MyDrive/Audio_event_detection/Saved_model
Epoch 5/60
 1/13 [=>............................] - ETA: 0s - loss: 2.1150 - accuracy: 0.8750
Epoch 00005: saving model to /content/drive/MyDrive/Audio_event_detection/Saved_model
Epoch 6/60
 1/13 [=>............................] - ETA: 0s - loss: 1.5909 - accuracy: 0.8750
Epoch 00006: saving model to /content/drive/MyDrive/Audio_event_detection/Saved_model
Epoch 7/60
 1/13 [=>............................] - ETA: 0s - loss: 3.0149 - accuracy: 0.9531
Epoch 00007: saving model to /content/drive/MyDrive/Audio_event_detection/S

<keras.callbacks.History at 0x7f9709c9b750>

In [119]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.9961832165718079


# **Prediction using audio files**

In [20]:
test_audio_dataset_path='/content/drive/MyDrive/Test-Data/wav'
test_df=pd.read_csv('/content/drive/MyDrive/Test-Data/labels.csv')
test_df.head()

Unnamed: 0,filename,onset,offset,class
0,S001,0.7545,1.963,speech
1,S001,3.033,4.365,speech
2,S001,5.285,6.591,speech
3,S001,7.634,9.019,speech
4,S002,0.158,1.06,speech


In [21]:
def test_features_extractor(audio):
    sample_rate = SR
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    return mfccs_scaled_features

In [22]:
window_length = 2500
test_features=[]
check =[]
count = 0
for i ,row in tqdm(test_df.iterrows()):
    file_name = os.path.join(os.path.abspath(test_audio_dataset_path),str(row["filename"]+'.wav'))
    if row["filename"] in check:
      continue;
    check.append(row["filename"])
    audio,_ = librosa.load(file_name, SR)
    for i in range(int(audio.shape[0]/window_length)):
      temp_audio = audio[i*window_length:(i+1)*window_length]
      data = test_features_extractor(temp_audio)
      name = str(row["filename"]) + str(i)
      test_features.append([name, data])
      count +=1

40it [00:15,  2.57it/s]


In [23]:
check

['S001',
 'S002',
 'S003',
 'S004',
 'music_noisy1',
 'music_noisy2',
 'music_noisy3',
 'music_noisy4',
 'music+speech_noisy1',
 'music+speech_noisy2',
 'music+speech_noisy3',
 'music+speech_noisy4']

In [24]:
test_features_df = pd.DataFrame(test_features,columns=['filename','feature'])
test_features_df.head(20)

Unnamed: 0,filename,feature
0,S0010,"[-499.80484, 97.405014, -6.242565, 29.702097, ..."
1,S0011,"[-501.80035, 103.22322, -4.691728, 31.590801, ..."
2,S0012,"[-511.67734, 102.85643, 3.918148, 26.303196, 8..."
3,S0013,"[-521.1769, 105.81555, 6.26138, 27.474567, 8.4..."
4,S0014,"[-463.32495, 110.38657, 16.875725, 25.956945, ..."
5,S0015,"[-241.54382, 113.09074, -17.936165, 41.87146, ..."
6,S0016,"[-261.03754, 101.65785, 5.5851126, 24.882952, ..."
7,S0017,"[-259.40717, 124.6599, 3.2924523, 32.076763, -..."
8,S0018,"[-337.75558, 127.10234, 17.054003, 44.520363, ..."
9,S0019,"[-283.3834, 116.027306, 56.556385, 4.9752417, ..."


In [25]:

test_features_df.shape

(762, 2)

In [26]:
X=np.array(test_features_df['feature'].tolist())

In [27]:
result= []
for i in range(len(X)):
  a = model.predict(X[i].reshape(1,-1))
  result.append(a[0])

In [28]:
res = np.array(result)
result_list = []
for i in range(res.shape[0]):
  arr = res[i]
  index = np.where(arr == np.amax(arr))[0][0]

  if index == 0:
    result_list.append("music")
  elif index == 1:
    result_list.append("silence")
  else:
    result_list.append("speech")

In [29]:
result_list

['speech',
 'silence',
 'speech',
 'silence',
 'silence',
 'music',
 'music',
 'music',
 'speech',
 'music',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'silence',
 'silence',
 'silence',
 'silence',
 'music',
 'silence',
 'music',
 'music',
 'music',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'silence',
 'silence',
 'speech',
 'music',
 'music',
 'silence',
 'music',
 'music',
 'music',
 'speech',
 'speech',
 'silence',
 'silence',
 'speech',
 'silence',
 'silence',
 'silence',
 'music',
 'music',
 'silence',
 'speech',
 'speech',
 'music',
 'music',
 'silence',
 'silence',
 'silence',
 'silence',
 'speech',
 'speech',
 'silence',
 'speech',
 'silence',
 'music',
 'silence',
 'speech',
 'music',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'speech',
 'silence',
 'silence',
 'silence',
 'silence',
 'speech',
 'silence',
 'music',
 'silence',
 'speech',
 'music',
 'speech',
 'music',
 'silence',
 'speech',
 'speec

# **Convolution using silence padding**

In [30]:
# Silence padding
window_size = 3
for i in range(window_size):
  result_list.append("silence")

In [31]:
def maximum(a, b, c):
  
    if (a >= b) and (a >= c):
        largest = a
  
    elif (b >= a) and (b >= c):
        largest = b
    else:
        largest = c
          
    return largest

In [32]:
def max_window(index, array, window_size):
  count_music = 0
  count_silence = 0
  count_speech = 0
  for i in range(window_size):
    if array[i+index]=="music":
      count_music +=1
    elif array[i+index]=="speech":
      count_speech +=1
    else:
      count_silence +=1
  largest = maximum(count_music, count_silence, count_speech)
  if count_music == largest:
    return "music"
  elif count_speech == largest:
    return "speech"
  else:
    return "silence"

In [33]:

modified_result = []
result_array = np.array(result_list)
for i in range(result_array.shape[0]-window_size):
  modified_result.append(max_window(i, result_array, window_size))  
modified_result

['speech',
 'silence',
 'silence',
 'silence',
 'music',
 'music',
 'music',
 'music',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'music',
 'music',
 'music',
 'music',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'speech',
 'silence',
 'silence',
 'music',
 'music',
 'music',
 'music',
 'music',
 'music',
 'music',
 'speech',
 'speech',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'music',
 'music',
 'music',
 'speech',
 'speech',
 'music',
 'music',
 'silence',
 'silence',
 'silence',
 'silence',
 'speech',
 'speech',
 'speech',
 'silence',
 'music',
 'silence',
 'music',
 'music',
 'music',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'silence',
 'music',
 'silence',
 'music',
 'music',
 'speech',
 'music',
 'music',
 'music',
 'speech',
 'speech',
 'silence',
 'sil

In [40]:
modified_result_array = np.array(modified_result) #modified_result_array after convolution

In [41]:
modified_result_array.shape[0]

762

In [42]:
final_result = []
count = 0
onset_time = 0
offset_time = 0
frame = 2500/16000

for i in range(modified_result_array.shape[0]-1):
  var = modified_result_array[i]
  if modified_result_array[i+1] == modified_result_array[i]:
    offset_time += frame
  else:
    if (offset_time - onset_time)>0.5 and onset_time != offset_time and modified_result_array[i] != "silence":
      final_result.append([check[count], modified_result_array[i], onset_time, offset_time])
    offset_time += frame
    onset_time = offset_time

  if offset_time >= 10:
    dummy = offset_time - 10 
    offset_time = 10
    if onset_time == offset_time or modified_result_array[i] == "silence":
      continue
    if (offset_time - onset_time)>0.5 and onset_time != offset_time and modified_result_array[i] != "silence":
      final_result.append([check[count], modified_result_array[i], onset_time, offset_time])
    count +=1
    onset_time = 0
    offset_time = dummy



In [45]:
final_result[12:15]

[['music_noisy3', 'music', 0, 1.40625],
 ['music_noisy3', 'music', 2.03125, 7.03125],
 ['music_noisy3', 'music', 8.75, 10]]

# **Prediction using Spectrogram**

In [105]:
spectro_test_audio_dataset_path='/content/drive/MyDrive/new_test_data/spectrogram'

In [106]:
def spectro_mfcc(dir):
  dir_list = os.listdir(dir)
  spectro_test_audio_dataset = []
  files = []
  for file in dir_list:
    temp = []
    f = os.path.join(os.path.abspath(dir),file)
    S_db = np.load(f)
    S = librosa.db_to_power(S_db, ref=1.0)
    for i in range(1,S.shape[1]):
      part = S[:,i]
      temp.append([librosa.feature.mfcc(y = None, S = part,sr = SR,n_mfcc = 40)])
    f = file[:-4]
    spectro_test_audio_dataset.append([f,temp])
  return  pd.DataFrame(spectro_test_audio_dataset,columns=['filename','mfccs'])

In [107]:
spectro_test_audio_df = spectro_mfcc(spectro_test_audio_dataset_path)

In [108]:
spectro_test_audio_df.head(10)

Unnamed: 0,filename,mfccs
0,music+speech_noisy4,"[[[1.6800002870959215722e-05, 1.75402099189087..."
1,music+speech_noisy10,"[[[0.018144925504140484644, 0.0250460399718499..."
2,S001,"[[[3.9647090906431560457e-05, 5.51535419885171..."
3,music+speech_noisy5,"[[[3.952348594207683683e-05, 8.115720746082792..."
4,S004,"[[[1.8988564628101818995e-05, 2.60523862258827..."
5,music_noisy10,"[[[2.0155175956358044068e-05, 7.05915245646461..."
6,music_noisy9,"[[[2.4451935912818246589e-05, 4.71460114310301..."
7,music+speech_noisy3,"[[[1.4321536968189562371e-05, 3.01692441710436..."
8,music+speech_noisy1,"[[[3.4956031490022763402e-05, 3.19949882359048..."
9,S003,"[[[3.2586268740865896412e-05, 4.31008051100499..."


In [109]:
X=np.array(spectro_test_audio_df['mfccs'].tolist())

In [110]:
result= []
for i in tqdm(range(len(X))):
  for j in range(X.shape[1]):
    a = model.predict(X[i][j].reshape(1,-1))
    result.append(a[0])

100%|██████████| 30/30 [07:45<00:00, 15.53s/it]


In [111]:
res = np.array(result)
result_list = []
for i in range(res.shape[0]):
  arr = res[i]
  index = np.where(arr == np.amax(arr))[0][0]

  if index == 0:
    result_list.append("music")
  elif index == 1:
    result_list.append("silence")
  else:
    result_list.append("speech")

In [112]:
# Silence padding
window_size = 10
for i in range(window_size):
  result_list.append("silence")

In [113]:
modified_result = []
result_array = np.array(result_list)
for i in range(result_array.shape[0]-window_size):
  modified_result.append(max_window(i, result_array, window_size))  

In [114]:
modified_result_array = np.array(modified_result) #modified_result_array after convolution

In [120]:
final_result = []
count = 0
onset_time = 0
offset_time = 0
frame = 10/312

for i in range(modified_result_array.shape[0]-1):
  var = modified_result_array[i]
  if modified_result_array[i+1] == modified_result_array[i]:
    offset_time += frame
  else:
    if (offset_time - onset_time)>0.5 and onset_time != offset_time and modified_result_array[i] != "silence":
      final_result.append([spectro_test_audio_df['filename'][count], modified_result_array[i], onset_time, offset_time])
    offset_time += frame
    onset_time = offset_time

  if offset_time >= 10:
    dummy = offset_time - 10 
    offset_time = 10
    if onset_time == offset_time or modified_result_array[i] == "silence":
      continue
    if (offset_time - onset_time)>0.5 and onset_time != offset_time and modified_result_array[i] != "silence":
      final_result.append([spectro_test_audio_df['filename'][count],modified_result_array[i], onset_time, offset_time])
    count +=1
    onset_time = 0
    offset_time = dummy



In [121]:
df = pd.DataFrame(final_result,columns=['filename','event','onset','offset'])

In [122]:
df.to_csv('Task_1_Group_30.csv', index=False)