In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import os 
from IPython.display import Audio 
from scipy.io import wavfile 
import scipy
import scipy.signal
import soundfile as sf 
import tensorflow as tf 
import tensorflow_hub as hub 
import warnings 
warnings.filterwarnings('ignore') 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split 
import tensorflow as tf 
from tensorflow.keras import layers, models 
from sklearn.metrics import classification_report 





In [2]:
data_path = "./resampledData"


In [3]:
def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000): 
	if original_sample_rate != desired_sample_rate: 
		desired_length = int( 
			round(float(len(waveform))/original_sample_rate * desired_sample_rate)) 
		waveform = scipy.signal.resample(waveform, desired_length) 
	return desired_sample_rate, waveform 


In [4]:
def read_audio(filename): 
	wav_data, sample_rate = sf.read(file=filename, dtype=np.int16) 
	if len(wav_data.shape) > 1: 
		wav_data = np.mean(wav_data, axis=1) 
	sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data) 
	return sample_rate, wav_data 


In [5]:
audio_data = [] 
for i in os.listdir(data_path): 
	filename = data_path+"/"+i 
	filename = filename.format(i=i) 
	if os.path.isdir(filename):
		for j in os.listdir(filename): 
			path = os.path.join(filename, j) 
			audio_data.append([read_audio(path)[1], i]) 


In [6]:
audio_dataframe = pd.DataFrame(audio_data, columns=["audio_data", "class"]) 
print(audio_dataframe.head())


                                          audio_data     class
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  acoustic
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  acoustic
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  acoustic
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  acoustic
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  acoustic


In [7]:
desired_length = 80000  # Adjust this length according to your needs; should be five seconds

# Function to pad or truncate audio data to the desired length
def pad_or_truncate(audio, length):
    if len(audio) > length:
        return audio[:length]
    else:
        return np.pad(audio, (0, length - len(audio)), 'constant')

# Apply the function to each row in the "audio_data" column
audio_dataframe['audio_data'] = audio_dataframe['audio_data'].apply(lambda x: pad_or_truncate(x, desired_length))

In [8]:
audio_data = np.array(audio_dataframe["audio_data"].to_list()) 


In [9]:


model_yamnet = hub.load('https://tfhub.dev/google/yamnet/1') 













In [10]:
audio_embeddings = [] 
for i in audio_data: 
    waveform = i / tf.int16.max
    scores, embeddings, spectrogram = model_yamnet(waveform) 
    audio_embeddings.append(embeddings) 

In [11]:
padded_audio_embeddings = [] 
for i in audio_embeddings: 
    padding_needed = 100-i.shape[0] 
    padded_tensor = tf.pad(i, [[0, padding_needed], [0, 0]]) 
    padded_audio_embeddings.append(padded_tensor) 

In [12]:
ohe = OneHotEncoder(sparse_output=False) 
classes = ohe.fit_transform(audio_dataframe[["class"]]) 

In [13]:
xtrain, xtest, ytrain, ytest = train_test_split( 
    np.array(padded_audio_embeddings), classes, random_state=42, test_size=0.2) 

In [14]:
model = models.Sequential([ 
	layers.Input(shape=(100, 1024)), 
	layers.Flatten(), 
	layers.Dense(16, activation='relu'), 
	layers.Dropout(0.1), 
	layers.Dense(16, activation='relu'), 
	layers.Dropout(0.1), 
	layers.Dense(16, activation='relu'), 
	layers.Dense(5, activation='softmax') 
]) 
model.compile(optimizer='adam', loss='categorical_crossentropy', 
			metrics=['accuracy']) 
model.fit(xtrain, ytrain, epochs=20)


Epoch 1/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.3044 - loss: 1.6270
Epoch 2/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.5540 - loss: 1.3067
Epoch 3/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6613 - loss: 1.0446
Epoch 4/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.7162 - loss: 0.9336
Epoch 5/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.8080 - loss: 0.6791
Epoch 6/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.8052 - loss: 0.5798
Epoch 7/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.7944 - loss: 0.5782
Epoch 8/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8109 - loss: 0.5139
Epoch 9/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2ed7a3d65d0>

In [15]:
loss, accuracy = model.evaluate(xtest, ytest) 
print(loss, accuracy) 


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8869 - loss: 0.3533  
0.36590659618377686 0.8794326186180115


In [21]:
def pipeline(filename): 
	audio_data = read_audio(filename)[1] 
	audio_data = audio_data/tf.int16.max
	scores, embeddings, spectrogram = model_yamnet(audio_data) 
	padding_needed = 100-embeddings.shape[0] 
	padded_tensor = tf.pad(embeddings, [[0, padding_needed], [0, 0]]) 
	# Reshape the padded tensor to match the input shape expected by the model 
	padded_tensor = tf.reshape(padded_tensor, (1, 100, 1024)) 
	prob = model.predict(padded_tensor)[0] 
	print(prob)
	max_index = np.argmax(prob) 
	print(max_index)
	if max_index == 0: 
		return "bird"
	elif max_index == 1: 
		return "cat"
	else: 
		return "dog"


print("the audio given is of", pipeline(r".\resampledData\electric\Beachside Electric 03.wav"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[0.12140419 0.17809524 0.5363594  0.11755665 0.04658454]
2
the audio given is of dog
