In [81]:
import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display
from scipy.io import wavfile as wav
%matplotlib inline
import numpy as np
import pandas as pd
import os

### Understanding Metadata


In [82]:
# Load the clip-level metadata from the CSV file
metadata = pd.read_csv('Audio_Data/BG_BR.csv')

# Preview the first rows of the metadata
metadata.head()

Unnamed: 0,slice_file_name,FSID,Maximum_Amplitude,Minimum_Amplitude,Start Time in original File(Sec),End Time in orignal File(Sec),Class ID,fold,class,Location on drive
0,Ball_Bounce_On_Ground.wav001_user.wav,211213,0.608783,-0.274387,[[4.65487528]],[[4.95482993]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
1,Ball_Bounce_On_Ground.wav002_user.wav,211213,0.688888,-0.962503,[[ 21.5752381 404.8645805]],[[21.87519274]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
2,Ball_Bounce_On_Ground.wav003_user.wav,211213,0.669433,-0.694868,[[23.8215873]],[[24.12154195]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
3,Ball_Bounce_On_Ground.wav004_user.wav,211213,0.597916,-0.766195,[[24.99750567]],[[25.29746032]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
4,Ball_Bounce_On_Ground.wav005_user.wav,211213,0.552153,-0.623119,[[30.62698413]],[[30.92693878]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...


In [83]:
# Check for class imbalance: count how many clips each class has.
# A large gap between the counts would mean the dataset is imbalanced.
metadata['class'].value_counts()

Ball Ground Hit    754
Ball Racket Hit    694
Name: class, dtype: int64

### Data Preprocessing 

In [138]:
# Root folder of the audio clips, and the metadata CSV describing them
audio_dataset_path = 'Audio_Data/Audio/'
metadata=pd.read_csv('Audio_Data/BG_BR.csv')
# Preview the first rows of the metadata
metadata.head()

Unnamed: 0,slice_file_name,FSID,Maximum_Amplitude,Minimum_Amplitude,Start Time in original File(Sec),End Time in orignal File(Sec),Class ID,fold,class,Location on drive
0,Ball_Bounce_On_Ground.wav001_user.wav,211213,0.608783,-0.274387,[[4.65487528]],[[4.95482993]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
1,Ball_Bounce_On_Ground.wav002_user.wav,211213,0.688888,-0.962503,[[ 21.5752381 404.8645805]],[[21.87519274]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
2,Ball_Bounce_On_Ground.wav003_user.wav,211213,0.669433,-0.694868,[[23.8215873]],[[24.12154195]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
3,Ball_Bounce_On_Ground.wav004_user.wav,211213,0.597916,-0.766195,[[24.99750567]],[[25.29746032]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...
4,Ball_Bounce_On_Ground.wav005_user.wav,211213,0.552153,-0.623119,[[30.62698413]],[[30.92693878]],2,1,Ball Ground Hit,C:/Users/Ankit Kumar/Audio_Data/BallBounceOnGr...


In [176]:
#the mel-frequency cepstrum (MFC) is a representation of the short-term power spectrum of a sound, based on a linear cosine transform of a log power spectrum on a nonlinear mel scale of frequency.
# n_mfcc = number of MFCCs to return
#sr = sample rate, y =audio time series

def features_extractor(file, n_mfcc=40):
    """Summarise one audio file as a fixed-length MFCC feature vector.

    Parameters
    ----------
    file : str or path-like
        Audio file to load; passed straight to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCCs to compute per frame.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc``: each coefficient averaged over all
        frames of the clip.
    """
    # Load from the `file` argument. The previous body read a global named
    # `file_name`, so the argument was ignored and the function only worked
    # when the caller happened to have defined that global.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so frames run along axis 0, then average them away.
    return np.mean(mfccs_features.T, axis=0)


In [177]:
import numpy as np
from tqdm import tqdm
### Iterate over every clip listed in the metadata and extract its
### Mel-Frequency Cepstral Coefficient (MFCC) feature vector.
extracted_features = []
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips are stored as <audio_dataset_path>/fold<N>/<slice_file_name>.
    # os.path.join inserts the platform separator itself; the previous
    # hard-coded Windows '\\' produced an invalid path on other systems.
    file_name = os.path.join(
        os.path.abspath(audio_dataset_path),
        'fold' + str(row["fold"]),
        str(row["slice_file_name"]),
    )
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])

1448it [00:07, 199.40it/s]


In [178]:
# Build a DataFrame from the list of extracted features (one row per clip)
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])

In [179]:
# Preview the first rows of the feature DataFrame (nothing is written to disk in this cell)
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-218.40462, 153.12389, -25.449078, 22.280703,...",Ball Ground Hit
1,"[-252.92578, 142.6328, -23.307566, 17.159676, ...",Ball Ground Hit
2,"[-270.47318, 151.5141, -31.51296, 22.09384, 13...",Ball Ground Hit
3,"[-263.1113, 145.61594, -30.467701, 20.84265, 1...",Ball Ground Hit
4,"[-270.6276, 146.31517, -32.099884, 19.047443, ...",Ball Ground Hit


In [180]:
### Independent variables: gather the per-clip feature vectors into one array
X=np.array(extracted_features_df['feature'].tolist())

In [181]:
### Dependent variable: the class label of each clip
y = np.array(extracted_features_df['class'].tolist())

In [182]:
# One-hot encode the class labels: one 0/1 column per class, which is the
# target format categorical cross-entropy expects.
# Encode from the DataFrame column rather than from `y` itself so that
# re-running this cell is idempotent (re-encoding the already-encoded 2-D
# array would fail), and request a float dtype explicitly because newer
# pandas versions return boolean dummy columns.
y = pd.get_dummies(extracted_features_df['class']).to_numpy(dtype='float32')

In [183]:
# Split the dataset into train and test sets (test_size=0.2 -> 20% held out for testing)
# random_state=0 fixes the shuffle seed, so the same split is produced on every run
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [184]:
# Report the shape of each split as
# (number of samples, number of features or labels).
# The bare `.shape` expressions that used to precede the prints displayed
# nothing (only a cell's last expression is shown), and the first label
# said "X_shape" although it reports X_train.
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

X_shape:  (1158, 40)
X_test:  (290, 40)
y_train:  (1158, 2)
y_test:  (290, 2)


## Model 

In [185]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()

In [186]:
# Number of output classes = number of one-hot columns in y
num_labels=y.shape[1]
num_labels

2

In [187]:
# Build the classifier: a fully connected network with three hidden blocks
# of 100, 200 and 100 neurons. Every hidden block is Dense -> ReLU ->
# Dropout(0.5); the output is a softmax with one unit per class.
model = Sequential([
    # hidden block 1 (declares the 40-feature input)
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    # hidden block 2
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    # hidden block 3
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    # output layer
    Dense(num_labels),
    Activation('softmax'),
])

In [188]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts)
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_32 (Dense)             (None, 100)               4100      
_________________________________________________________________
activation_32 (Activation)   (None, 100)               0         
_________________________________________________________________
dropout_24 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 200)               20200     
_________________________________________________________________
activation_33 (Activation)   (None, 200)               0         
_________________________________________________________________
dropout_25 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 100)              

In [189]:
# Select the loss function, the reported metric and the optimizer
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [190]:
## Training the model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

# Training hyper-parameters
num_epochs = 100
num_batch_size = 40

# Checkpoint callback: with save_best_only=True the file is only rewritten
# when the monitored value improves (the training log reports val_loss).
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.hdf5', 
                               verbose=1, save_best_only=True)
# Wall-clock timer around the training run
start = datetime.now()

# NOTE(review): the test split is passed as validation_data, so checkpoint
# selection sees the same data that is used for the final accuracy figure.
# Consider holding out a separate validation split.
model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.13997, saving model to saved_models\audio_classification.hdf5
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.13997
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.13997
Epoch 4/100

Epoch 00004: val_loss improved from 0.13997 to 0.13148, saving model to saved_models\audio_classification.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.13148 to 0.11287, saving model to saved_models\audio_classification.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.11287 to 0.11170, saving model to saved_models\audio_classification.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.11170 to 0.10739, saving model to saved_models\audio_classification.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 0.10739 to 0.10478, saving model to saved_models\audio_classification.hdf5
Epoch 9/100

Epoch 00009: val_loss improved from 0.10478 to 0.10422, saving model to saved_models\audio_classification.hdf5
Epoch 10


Epoch 00037: val_loss did not improve from 0.04969
Epoch 38/100

Epoch 00038: val_loss did not improve from 0.04969
Epoch 39/100

Epoch 00039: val_loss did not improve from 0.04969
Epoch 40/100

Epoch 00040: val_loss improved from 0.04969 to 0.04506, saving model to saved_models\audio_classification.hdf5
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.04506
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.04506
Epoch 43/100

Epoch 00043: val_loss improved from 0.04506 to 0.04438, saving model to saved_models\audio_classification.hdf5
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.04438
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.04438
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.04438
Epoch 47/100

Epoch 00047: val_loss improved from 0.04438 to 0.04364, saving model to saved_models\audio_classification.hdf5
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.04364
Epoch 49/100

Epoch 00049: val_loss did not improve f


Epoch 00078: val_loss did not improve from 0.04364
Epoch 79/100

Epoch 00079: val_loss did not improve from 0.04364
Epoch 80/100

Epoch 00080: val_loss did not improve from 0.04364
Epoch 81/100

Epoch 00081: val_loss did not improve from 0.04364
Epoch 82/100

Epoch 00082: val_loss did not improve from 0.04364
Epoch 83/100

Epoch 00083: val_loss did not improve from 0.04364
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.04364
Epoch 85/100

Epoch 00085: val_loss did not improve from 0.04364
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.04364
Epoch 87/100

Epoch 00087: val_loss did not improve from 0.04364
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.04364
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.04364
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.04364
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.04364
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.04364
Epoch 93/100

Epoch 00093: val_loss di

In [191]:
# The model was compiled with metrics=['accuracy'], so the result is indexed
# as [loss, accuracy]; element 1 is the accuracy.
# NOTE(review): this evaluates the in-memory model as left by fit(); the
# checkpoint file written during training is not reloaded in this cell.
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print('Accuracy is',test_accuracy[1])

Accuracy is 0.9965517520904541


In [194]:

# Classify a single audio file with the trained model

# NOTE(review): absolute, machine-specific path -- point this at the local
# copy of the clip before running on another machine.
filename="C:/Users/Ankit Kumar/Audio_Data/Audio/Ball_Racket_Stairwell_01016_user.wav"

# Same preprocessing as for the training data: 40 MFCCs averaged over time.
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

# The model expects a batch, so reshape the single vector to (1, n_features).
mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)

# Sequential.predict_classes() was removed from tf.keras (TensorFlow 2.6).
# For a softmax output the equivalent is the arg-max over the predicted
# class probabilities.
class_probabilities = model.predict(mfccs_scaled_features, verbose=0)
predicted_label = np.argmax(class_probabilities, axis=-1)
print(predicted_label)

[1]
