## Audio classification

Classifying ***Capuchinbird*** chirps against other forest sounds, and identifying the chirps in forest recordings.

>Dataset downloaded from Kaggle.

***dataset contains***
- *forest recordings*
- *parsed Capuchinbird sounds*
- *parsed non Capuchinbird sounds*

>Because the latest NumPy release is not properly supported by the latest torch release, lists are used as an alternative in some places.

>torchaudio raised an OSError, so it is not used; the scipy.signal module is used instead.


In [164]:
from scipy.io import wavfile
import numpy as np
from scipy.signal import spectrogram,resample
import os
import random as r
import torch
import torch.nn as nn
import torchvision.transforms.functional as F

In [165]:
def loadWav(path,freq=16e3,length=48000):
    """Load a WAV file as a fixed-length, single-channel signal at `freq` Hz.

    Parameters
    ----------
    path : path of the WAV file (passed to scipy.io.wavfile.read).
    freq : target sampling rate in Hz.
    length : number of samples in the returned array. The default of
        48000 is 3 seconds at the default 16 kHz rate.

    Returns
    -------
    1-D numpy array with exactly `length` samples. Longer signals are
    truncated; shorter ones are padded on both sides by repeating the
    edge samples.
    """
    rate,wav=wavfile.read(path)
    if wav.ndim>1:
        # multi-channel audio: keep the first channel only
        wav=wav[:,0]
    # Multiply before dividing and floor exactly: int((len/rate)*freq) can
    # lose a sample when the float product lands just below an integer.
    nSamples=int(len(wav)*freq//rate)
    nwav=resample(wav,nSamples)[:length]
    padding=length-nwav.shape[0]
    if padding>0:
        # split the padding evenly; any odd sample goes to the right side
        left=padding//2
        nwav=np.pad(nwav,(left,padding-left),'edge')
    return nwav

def getSpectrogram(wav):
    """Return the standardised spectrogram of `wav` as a float32 tensor.

    The spectrogram is computed with scipy.signal.spectrogram defaults,
    given a leading axis of size 1, and shifted/scaled by its own global
    mean and standard deviation.

    A spectrogram with zero standard deviation (e.g. an all-zero signal)
    is returned centred but unscaled, instead of raising the ValueError
    that torchvision's normalize raises for a zero std.
    """
    _,_,Sxx=spectrogram(wav)
    Sxx=np.expand_dims(Sxx,axis=0)
    # list(...) is kept from the original: the notebook deliberately avoids
    # handing numpy arrays straight to torch.
    tensor=torch.tensor(list(Sxx),dtype=torch.float32)
    # Convert the statistics to the tensor dtype before use, which is what
    # torchvision's normalize did internally.
    mean=torch.as_tensor(float(Sxx.mean()),dtype=tensor.dtype)
    std=torch.as_tensor(float(Sxx.std()),dtype=tensor.dtype)
    centered=tensor-mean
    if std==0:
        return centered
    return centered/std
        

In [166]:
# Directories holding the pre-cut positive / negative clips.
posPATH='./sounds/Parsed_Capuchinbird_Clips/'
negPATH='./sounds/Parsed_Not_Capuchinbird_Clips/'

# Shuffled file lists; the first 150 of each class are used for training.
posFiles=[posPATH+name for name in os.listdir(posPATH)]
r.shuffle(posFiles)
negFiles=[negPATH+name for name in os.listdir(negPATH)]
r.shuffle(negFiles)

# (spectrogram, label) pairs: label 1 for Capuchinbird, 0 otherwise.
posData=[(getSpectrogram(loadWav(clip)),target)
         for clip,target in zip(posFiles[:150],torch.ones((150,1)))]
negData=[(getSpectrogram(loadWav(clip)),target)
         for clip,target in zip(negFiles[:150],torch.zeros((150,1)))]

# Mix both classes so training does not see them in blocks.
DATA=posData+negData
r.shuffle(DATA)

In [167]:
class MODEL(nn.Module):
    """Small CNN that scores a one-channel spectrogram as chirp / not chirp.

    Two unpadded 3x3 convolutions are followed by two linear layers and a
    sigmoid, so the output is a probability in [0, 1].

    Parameters
    ----------
    input_shape : (height, width) of the spectrogram fed to the network.
        The default (129, 214) gives 16*125*210 inputs to the first linear
        layer, exactly as in the original hard-coded layer, so saved
        weights remain loadable.
    """
    def __init__(self,input_shape=(129,214)):
        super().__init__()
        self.conv1=nn.Conv2d(1,16,3)
        self.conv2=nn.Conv2d(16,16,3)
        height,width=input_shape
        # each unpadded 3x3 convolution trims 2 from height and width
        self.lin=nn.Linear(16*(height-4)*(width-4),128)
        self.lin2=nn.Linear(128,1)
    def forward(self,x):
        """Score `x` of shape (1, H, W) or (N, 1, H, W).

        Returns a tensor of shape (1,) for unbatched input and (N, 1) for
        batched input.
        """
        x=nn.functional.relu(self.conv1(x))
        x=nn.functional.relu(self.conv2(x))
        # Flatten only (channels, height, width); a bare flatten() would
        # also merge the batch dimension and break batched input.
        x=x.flatten(start_dim=-3)
        x=nn.functional.relu(self.lin(x))
        return torch.sigmoid(self.lin2(x))


In [168]:
from torch.optim import SGD

# Network to train, binary cross-entropy on its sigmoid output,
# and plain SGD over all of its parameters.
classifier=MODEL()
lossFUNC=nn.BCELoss()
optimizer=SGD(classifier.parameters(),lr=1e-3)

In [None]:
# Train for 4 epochs, one (spectrogram, label) pair per optimisation step.
for i in range(4):
    tloss=0.0
    for spec,label in DATA:
        optimizer.zero_grad()
        output=classifier(spec)
        loss=lossFUNC(output,label)
        loss.backward()
        optimizer.step()
        # .item() detaches the value: summing the loss tensor itself would
        # keep autograd history alive across the whole epoch.
        tloss+=loss.item()
    print(f'epoch {i} - loss:{tloss}')

- *epoch 0* loss : 46.48808288574219
- *epoch 1* loss : 7.791569709777832
- *epoch 2* loss : 3.0420384407043457
- *epoch 3* loss : 1.5733261108398438

In [174]:
# Evaluate on the clips not used for training (everything past the first
# 150 of each class). results holds one bool per clip: True when the
# rounded prediction matches the clip's class.
results=[]
with torch.no_grad():
    for files,expected in ((posFiles[150:],1),(negFiles[150:],0)):
        for file in files:
            spec=getSpectrogram(loadWav(file))
            out=classifier(spec).item()
            results.append(round(out)==expected)

In [None]:
# Count the evaluated clips directly instead of re-deriving the number
# from the train/test split size, so the two cannot drift apart.
totaln=len(results)
correct=sum(results)
# no held-out clips: accuracy is undefined rather than a ZeroDivisionError
accuracy=(correct/totaln)*100 if totaln else float('nan')

print(f'test-accuracy : {accuracy:.2f}%')

*test accuracy* - 95.49%

In [184]:
# Persist only the learned parameters (state dict) to the file 'ears'.
torch.save(obj=classifier.state_dict(),f='ears')

identifying chirps in forest recordings

In [None]:
# Rebuild the architecture, restore the saved parameters from 'ears',
# and switch the network to inference mode.
model=MODEL()
state=torch.load('ears')
model.load_state_dict(state)
model.eval()

In [189]:
def loadWavfile(path,freq=16e3):
    """Load a whole WAV file as a single-channel signal resampled to `freq` Hz.

    Unlike a fixed-length clip loader, the full recording is returned
    without truncation or padding.

    Parameters
    ----------
    path : path of the WAV file (passed to scipy.io.wavfile.read).
    freq : target sampling rate in Hz.

    Returns
    -------
    1-D numpy array holding the resampled signal.
    """
    rate,wav=wavfile.read(path)
    if wav.ndim>1:
        # multi-channel audio: keep the first channel only
        wav=wav[:,0]
    # Multiply before dividing and floor exactly: int((len/rate)*freq) can
    # lose a sample when the float product lands just below an integer.
    nSamples=int(len(wav)*freq//rate)
    return resample(wav,nSamples)

def getSpectrogramfile(wav):
    """Return the standardised spectrogram of a full recording as a float32 tensor.

    The spectrogram is computed with scipy.signal.spectrogram defaults,
    given a leading axis of size 1, and shifted/scaled by its own global
    mean and standard deviation.

    A spectrogram with zero standard deviation (e.g. an all-zero signal)
    is returned centred but unscaled, instead of raising the ValueError
    that torchvision's normalize raises for a zero std.
    """
    _,_,Sxx=spectrogram(wav)
    Sxx=np.expand_dims(Sxx,axis=0)
    # list(...) is kept from the original: the notebook deliberately avoids
    # handing numpy arrays straight to torch.
    tensor=torch.tensor(list(Sxx),dtype=torch.float32)
    # Convert the statistics to the tensor dtype before use, which is what
    # torchvision's normalize did internally.
    mean=torch.as_tensor(float(Sxx.mean()),dtype=tensor.dtype)
    std=torch.as_tensor(float(Sxx.std()),dtype=tensor.dtype)
    centered=tensor-mean
    if std==0:
        return centered
    return centered/std
    
    

>The WAV files were created using Audacity.

In [186]:
# Forward slashes work on every platform (Windows included) and match the
# './sounds/...' style of the clip directories; backslashes are Windows-only.
path='./sounds/Forest Recordings/recording_08.wav'

In [None]:
# Load the whole forest recording at `path` (resampled, first channel only) ...
testfile=loadWavfile(path)
# ... and turn it into one normalised spectrogram tensor covering the
# entire recording; it is sliced into windows for the model afterwards.
spec=getSpectrogramfile(testfile)

In [190]:
results=[]
ncols=spec.shape[-1]//214
with torch.no_grad():
    for i in range(ncols):
        out=model(spec[:,:,214*i:214*(i+1)])
        results.append(out.item())
rounds=[round(i) for i in results]

In [192]:
from itertools import groupby
# groupby collapses every run of equal consecutive labels into one key,
# so summing the keys counts the runs of 1s (adjacent positive windows
# are treated as a single chirp).
totalchirps=sum(label for label,_ in groupby(rounds))
print(f'expect {totalchirps} in recording') #value=25

expect 24 in recording
