In [1]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import pandas as pd
import tensorflow as tf
from operator import itemgetter
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
model = tf.keras.models.load_model('tfModel')

In [3]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 64)                1344      
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               8320      
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 64)               

### Testing

In [4]:
import glob

In [5]:
def get_files(dirname, Fs=16000):
    signal = []
    test_filenames = []
    n =len(dirname) +1
    for name in glob.glob(dirname + '/*.npy'):
        data = readSpectrogram(name)
        temp2 = spec_to_aud(data,512,1024)
        signal.append(temp2)        
        filename = name[n:-4]            
        test_filenames.append(filename)
    signal = np.array(signal)
    test_filenames = np.array(test_filenames)
    return signal,test_filenames

In [6]:
def predict_class(x_val_test):
    x_ret = []
    curr_val_on = 0
    curr_val_off = len(x_val_test)
    j = curr_val_on
    while j<curr_val_off:        
        end = min(j+800,curr_val_off)
    #         print(" j and end "+ str(j)+ " "+ str(end))
        temp2 = x_val_test[j:end]
        mfcc_val = librosa.feature.mfcc(y=temp2,sr=16000,n_mfcc=20)
        mfcc_val_n =[]
        for k in range(mfcc_val.shape[0]):
            mfcc_val_n.append(mfcc_val[k].mean())
        mfcc_val_n = np.array(mfcc_val_n)
        x_ret.append(mfcc_val_n)
        j =end
    X_ret = np.array(x_ret)
    return X_ret

In [7]:
def argmax_classify(Y_res):
    y_ans = []
    for i in range(len(Y_res)):
        max_val = max(Y_res[i][0],Y_res[i][1],Y_res[i][2])
        if max_val== Y_res[i][0]:
            y_ans.append([1,0,0])
        if max_val== Y_res[i][1]:
            y_ans.append([0,1,0])
        if max_val== Y_res[i][2]:
            y_ans.append([0,0,1])
    y_ans = np.array(y_ans)
    return y_ans

In [8]:
def count_appearance(y_t,check):
    c =1
    c_time = []
    onset = -1
    offset = -1
    flag3=0
    for i in range(1,len(y_t)): 
        flag =1
        flag2 =1
        for j in range(0,3):
            if y_t[i][j]!=check[j]:
                flag=0
                break 
            elif y_t[i][j]!=y_t[i-1][j]:
                flag2 =0
                break
        if flag and flag2:
            c +=1
            if onset==-1:
                onset = i-1
        else:
            c =1
            offset = i-1        
            if onset!= -1:
                c_time.append([onset, offset])
                flag3=1
            onset = -1
    if onset!=-1:
        c_time.append([onset,len(y_t)-1])
    else:
        flag=0
        for j in range(0,3):
            n = len(y_t)-1
            if y_t[n][j]!=check[j]:
                flag=1
                break;
        if flag==0:
            c_time.append([n,n])
    return c_time

In [9]:
def audio_wav_files(dirname,Fs=16000):
    signal = []
    test_filenames = []
    n =len(dirname) +1
    for name in glob.glob(dirname + '/*.wav'):
        data,srr = librosa.load(name,sr=Fs)        
        signal.append(data)        
        filename = name[n:-4]            
        test_filenames.append(filename)
    signal = np.array(signal)
    test_filenames = np.array(test_filenames)
    return signal,test_filenames
    

In [10]:
def add_tag_sig(signal_sp,signal_mu,signal_si):
    b = []
    for i in range(len(signal_sp)):
        b.append([signal_sp[i][0],signal_sp[i][1],'speech'])    
    for i in range(len(signal_mu)):
        b.append([signal_mu[i][0],signal_mu[i][1],'music'])    
    for i in range(len(signal_si)):
        b.append([signal_si[i][0],signal_si[i][1],'silence'])
    b = sorted(b, key=itemgetter(0))
    return b

In [11]:
def leftover(tags_list,signal_class):
    i=1
    while i<len(tags_list):
        if tags_list[i][0] -tags_list[i-1][1]>=2:
            temp= tags_list[i-1][1] +1        
            index  = np.argmax(signal_class[temp])
            label = ""
            if index==0:
                label = 'speech'
            elif index==1:
                label = 'music'
            else:
                label = 'silence'
            temp_insert = [temp,temp,label]
            tags_list.insert(i,temp_insert)
        else:
            i +=1
    return tags_list

In [12]:
def split(tags_list):
    final_tags_list = []
    for i in range(len(tags_list)):
        for j in range(tags_list[i][0],tags_list[i][1]+1):
            final_tags_list.append([j,tags_list[i][2]])
    return final_tags_list

In [13]:
def edit(times):
    ans = []
    i=0
    count =0
    while i<len(times):
        #print(count)
        count_speech = 0
        count_music = 0
        count_silence = 0
        mini = min(i+3,len(times)-1)
        for j in range(i,mini):
            if(times[j][1]=='speech'):
                count_speech +=1
            if(times[j][1]=='music'):
                count_music +=1
            if(times[j][1]=='silence'):
                count_silence +=1
        max_event = max(count_speech,count_music,count_silence)        
        if max_event == count_speech:
            ans.append([i,mini,'speech'])
        elif max_event ==count_music:
             ans.append([i,mini,'music'])
        else:
             ans.append([i,mini,'silence'])
        i +=4
        count +=1
    return ans

In [14]:
def remove(tags_list):
    i=0
    while i<len(tags_list):
        if tags_list[i][2]=='silence' and tags_list[i][1]-tags_list[i][0]<3:
            tags_list.pop(i)
        elif tags_list[i][1]-tags_list[i][0]<10 and tags_list[i][2]!='silence':
            tags_list.pop(i)
        else:
            i +=1
    return tags_list

In [15]:
def time(tags_window):
    final_list = []    
    final_list.append([tags_window[0][0],tags_window[0][1],tags_window[0][2]])    
    for i in range(1,len(tags_window)):        
        if tags_window[i][2]==tags_window[i-1][2]:
            n = len(final_list)
            temp_list = [final_list[n-1][0],tags_window[i][1],tags_window[i][2]]
            final_list[n-1] = temp_list
        else:
            final_list.append([tags_window[i][0],tags_window[i][1],tags_window[i][2]])       
    return final_list

In [16]:
def time_stamps(final_tags_window,filename):
    predicted = []
    for i in range(len(final_tags_window)):
        if final_tags_window[i][2]!='silence':
            on_time = final_tags_window[i][0]*0.05
            off_time = (final_tags_window[i][1] +1)*0.05
            on_time = round(on_time,3)
            off_time = round(off_time,3)
            predicted.append([filename,on_time, off_time,final_tags_window[i][2]])
    return predicted

In [17]:
def audio_tagging(window,filename):
    speech_detected =0
    music_detected = 0
    tag =[]
    for i in range(len(window)):
        if window[i][2]=='speech':
            speech_detected = 1
        if window[i][2]=='music':
            music_detected = 1
    tag.append([filename, music_detected,speech_detected])
    return tag

In [18]:
def steps(signal,test_filenames):
    signal_time_stamps = []
    signal_tags = []
    for i in range(len(signal)):        
        signal_test = predict_class(signal[i])
        signal_pred = model.predict(signal_test)
        signal_class = argmax_classify(signal_pred)
        signal_sp = count_appearance(signal_class,[1,0,0])
        signal_mu = count_appearance(signal_class,[0,1,0])
        signal_si = count_appearance(signal_class,[0,0,1])
        tags = add_tag_sig(signal_sp,signal_mu,signal_si)
        tags_list = leftover(tags,signal_class)       
        final_tags_list = split(tags_list)
        tags_window = edit(final_tags_list)        
        final_tags_window = time(tags_window)
        final_tags_window2 = remove(final_tags_window)
        final_time_stamps = time_stamps(final_tags_window2,test_filenames[i])
        final_tags = audio_tagging(final_tags_window2,test_filenames[i])
        signal_time_stamps.append(final_time_stamps)
        signal_tags.append(final_tags)
    return signal_time_stamps,signal_tags


In [19]:
sig_wav,tf_wav = audio_wav_files('wav',Fs=16000)

In [20]:
sig_wav[0].shape

(160000,)

In [21]:
time_st,tags_st = steps(sig_wav,tf_wav)

In [22]:
time_st

[[['music+speech_noisy1', 0.2, 2.0, 'speech'],
  ['music+speech_noisy1', 2.8, 4.0, 'music'],
  ['music+speech_noisy1', 6.0, 8.4, 'speech']],
 [['music+speech_noisy10', 0.0, 2.0, 'music'],
  ['music+speech_noisy10', 2.6, 3.8, 'speech'],
  ['music+speech_noisy10', 4.0, 5.4, 'speech'],
  ['music+speech_noisy10', 6.4, 7.4, 'speech'],
  ['music+speech_noisy10', 8.0, 9.0, 'music']],
 [['music+speech_noisy2', 0.2, 1.6, 'music'],
  ['music+speech_noisy2', 2.8, 3.8, 'speech'],
  ['music+speech_noisy2', 5.4, 7.0, 'speech'],
  ['music+speech_noisy2', 8.4, 9.6, 'music']],
 [['music+speech_noisy3', 0.4, 1.6, 'music'],
  ['music+speech_noisy3', 3.0, 4.4, 'speech'],
  ['music+speech_noisy3', 5.4, 7.8, 'speech']],
 [['music+speech_noisy4', 0.4, 2.4, 'speech'],
  ['music+speech_noisy4', 3.2, 5.0, 'music'],
  ['music+speech_noisy4', 6.2, 8.0, 'music'],
  ['music+speech_noisy4', 8.4, 9.6, 'speech']],
 [['music+speech_noisy5', 0.2, 2.0, 'speech'],
  ['music+speech_noisy5', 2.6, 3.6, 'music'],
  ['music+sp

In [23]:
df_wav_event = pd.DataFrame(columns = ['filename', 'event', 'onset', 'offset'])

In [24]:
for i in range(len(time_st)):
    df_wav_event = df_wav_event.append(pd.DataFrame(time_st[i], columns=df_wav_event.columns))

In [25]:
df_wav_event.reset_index(inplace=True,drop=True)

In [26]:
df_wav_event[15:25]

Unnamed: 0,filename,event,onset,offset
15,music+speech_noisy4,0.4,2.4,speech
16,music+speech_noisy4,3.2,5.0,music
17,music+speech_noisy4,6.2,8.0,music
18,music+speech_noisy4,8.4,9.6,speech
19,music+speech_noisy5,0.2,2.0,speech
20,music+speech_noisy5,2.6,3.6,music
21,music+speech_noisy5,4.6,6.4,speech
22,music+speech_noisy5,7.2,8.6,speech
23,music+speech_noisy6,0.6,3.0,music
24,music+speech_noisy6,3.6,6.0,music


In [27]:
df_wav_tag = pd.DataFrame(columns = ['filename', 'Music', 'Speech'])

In [28]:
for i in range(len(tags_st)):
    df_wav_tag = df_wav_tag.append(pd.DataFrame(tags_st[i], columns=df_wav_tag.columns))

In [29]:
df_wav_tag.reset_index(inplace=True,drop=True)

In [30]:
df_wav_tag

Unnamed: 0,filename,Music,Speech
0,music+speech_noisy1,1,1
1,music+speech_noisy10,1,1
2,music+speech_noisy2,1,1
3,music+speech_noisy3,1,1
4,music+speech_noisy4,1,1
5,music+speech_noisy5,1,1
6,music+speech_noisy6,1,1
7,music+speech_noisy7,1,1
8,music+speech_noisy8,1,1
9,music+speech_noisy9,1,1


In [31]:
file_test = df_wav_tag.filename.values
music_test = df_wav_tag.Music.values
speech_test = df_wav_tag.Speech.values

In [32]:
df_tag2 = df_wav_tag.copy(deep=True)

In [33]:
df_tag2

Unnamed: 0,filename,Music,Speech
0,music+speech_noisy1,1,1
1,music+speech_noisy10,1,1
2,music+speech_noisy2,1,1
3,music+speech_noisy3,1,1
4,music+speech_noisy4,1,1
5,music+speech_noisy5,1,1
6,music+speech_noisy6,1,1
7,music+speech_noisy7,1,1
8,music+speech_noisy8,1,1
9,music+speech_noisy9,1,1


In [34]:
tag_m = [] 
tag_s = []

In [35]:
for i in range(len(file_test)):
    file_name = file_test[i]
    leng = len(file_name)    
    if leng==4:
        tag_m.append(0)
        tag_s.append(1)
    elif leng==12 or leng==13:
        tag_m.append(1)
        tag_s.append(0)
    else:
        tag_m.append(1)
        tag_s.append(1)

In [36]:
df_tag2['Tm'] = tag_m
df_tag2['Ts'] = tag_s

In [37]:
df_tag2

Unnamed: 0,filename,Music,Speech,Tm,Ts
0,music+speech_noisy1,1,1,1,1
1,music+speech_noisy10,1,1,1,1
2,music+speech_noisy2,1,1,1,1
3,music+speech_noisy3,1,1,1,1
4,music+speech_noisy4,1,1,1,1
5,music+speech_noisy5,1,1,1,1
6,music+speech_noisy6,1,1,1,1
7,music+speech_noisy7,1,1,1,1
8,music+speech_noisy8,1,1,1,1
9,music+speech_noisy9,1,1,1,1


In [38]:
file_test = df_tag2.filename.values
m_test = df_tag2.Music.values
s_test = df_tag2.Speech.values
m_act = df_tag2.Tm.values
s_act = df_tag2.Ts.values

In [39]:
conf_mat = np.zeros((3,3))
for i in range(len(file_test)):
    if s_act[i]==1 and m_act[i] ==0:
        if s_test[i]==1 and m_test[i]==0:
            conf_mat[0][0] +=1
        elif s_test[i]==0 and m_test[i]==1:
            conf_mat[0][1] +=1
        elif s_test[i]==1 and m_test[i]==1:
            conf_mat[0][2] +=1
    if s_act[i]==0 and m_act[i] ==1:
        if s_test[i]==1 and m_test[i]==0:
            conf_mat[1][0] +=1
        elif s_test[i]==0 and m_test[i]==1:
            conf_mat[1][1] +=1
        elif s_test[i]==1 and m_test[i]==1:
            conf_mat[1][2] +=1
    if s_act[i]==1 and m_act[i] ==1:
        if s_test[i]==1 and m_test[i]==0:
            conf_mat[2][0] +=1
        elif s_test[i]==0 and m_test[i]==1:
            conf_mat[2][1] +=1
        elif s_test[i]==1 and m_test[i]==1:
            conf_mat[2][2] +=1

    
            

In [40]:
conf_mat

array([[10.,  0.,  0.],
       [ 0.,  8.,  2.],
       [ 0.,  0., 10.]])

In [41]:
def rec_prec(conf_mat,index):
    totr=0;
    totc =0
    for i in range(3):
        totc += conf_mat[index][i]
        totr += conf_mat[i][index]
    recall = conf_mat[index][index]/totc
    precision = conf_mat[index][index]/totr
    return recall, precision

In [42]:
Recall_m, Precision_m= rec_prec(conf_mat,1)
Recall_s, Precision_s= rec_prec(conf_mat,0)
Recall_ms, Precision_ms= rec_prec(conf_mat,2)

In [43]:
avg_precision = (Precision_s +Precision_m +Precision_ms)/3

In [44]:
avg_precision

0.9444444444444445

In [45]:
avg_recall = (Recall_s + Recall_m + Recall_ms)/3

In [46]:
avg_recall

0.9333333333333332

In [47]:
def f_score(r,p):
    a = 2*p*r
    b = p+r
    f = a/b
    return f

In [48]:
macro_f_score= f_score(avg_recall,avg_precision)

In [49]:
macro_f_score

0.9388560157790926

In [50]:
def accuracy_score(conf_mat):
    a = np.trace(conf_mat)
    b= np.sum(conf_mat)
    return a/b

In [51]:
accuracy_score(conf_mat)

0.9333333333333333

In [52]:
#df_wav_event.to_csv('Event_Detection_Predicted.csv',index= False)

In [53]:
#df_wav_tag.to_csv('Audio_Tagging_Predicted.csv',index= False)