In [15]:
import pandas as pd
import numpy as np
import librosa
import IPython
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import glob
from pydub import AudioSegment
import pydub 

In [16]:
def feature_calc(time_feat):
    return np.mean(time_feat)

#function = 1 at x=1
def function(x):
    y = -(x-1)**2 + 1
    return y

In [17]:
def feature_extract(files,nbands=25,count_thr = 3):
    #--------------
    #--------------
    # PREPARING DATASETS
    #--------------
    #--------------

    #for ease of coding, we'll create df and then return the only row there is
    df_base = pd.DataFrame(columns = ["file","animal"])
    df_base["file"] = files
    df_base = df_base.set_index('file')

    for file in files:
        #get the animal
        if "dog_" in file:
            df_base.loc[file,"animal"] = 0
        elif "cat_" in file:
            df_base.loc[file,"animal"] = 1

    #create dataframess
    df_raw = df_base.copy()
    df_bands = df_base.copy()
    df_bands_nolin = df_base.copy()
    df_pitches = df_base.copy()

    #These values are a constant really but we are just gonna get them by analyzing the first file each tima
    #kinda inefficient but easy
    x, sr = librosa.load(files[0])
    chroma_stft = librosa.feature.chroma_stft(y=x, sr=sr,n_chroma=12, n_fft=500)
    X = librosa.stft(x,n_fft=500)

    #create columns for df
    #raw df
    for i in range(len(X)):
        df_raw['freq'+str(i)] = np.nan

    #pitch df
        for i in range(len(chroma_stft)):
            df_pitches['chroma_'+str(i)] = np.nan

    #bands lin
    for i in range(nbands):
        df_bands['band'+str(i)] = np.nan
        df_bands_nolin['band'+str(i)] = np.nan
    
    #--------------
    #--------------
    # EXTRACTING FEATURES FOR ALL FILES
    #--------------
    #--------------

    for file in files:
        #load file
        x, sr = librosa.load(file)

        #---
        #DETECTING INTERESTING PART OF AUDIO
        #---
        
        #first step, get the parts where the animal is making sound
        #get the chroma - pitch data
        chroma_stft = librosa.feature.chroma_stft(y=x, sr=sr,n_chroma=12, n_fft=500)
        X = librosa.stft(x,n_fft=500)
        Xdb = librosa.amplitude_to_db(abs(X))

        #get the activation counts 
        activation_count = np.array([0]*len(chroma_stft[0]))
        thr=0.5
        median = [np.median(chroma_stft[i]) for i in range(len(chroma_stft))] #meadian of activations over all image
        for i in range(len(chroma_stft[0])):
            for pitch in range(len(chroma_stft)):
                if(chroma_stft[pitch][i] <= median[pitch]*thr):
                    activation_count[i] += 1

        #these are the points on chroma points where the count is over threshold
        chroma_points = np.where(activation_count > count_thr)[0]
        #lenght of chroma analysis audio
        chr_len = len(chroma_stft[0]) #will be used to compute proportions 


        
        #Now that we have our cut references we can start by getting data for each df
        
        #-------
        #CHROMA PITCH FF
        #--------
        #let's start with what we already have cut
        for i in range(len(chroma_stft)):
            #for each chroma add points selected by activation
            df_pitches.loc[file,'chroma_'+str(i)] = feature_calc(chroma_stft[i][chroma_points])

        #--------
        # RAW FREQ DF
        #--------
        Xdb_len = len(Xdb[0]) #as Xdb is a matrix of (featuresx[samples] so only first feature), lenght of audio in Xdb
        phi = int(Xdb_len / chr_len)

        #to get our slices back we will directly append it to a new variable
        Xdb_cut = [np.nan]*len(Xdb)
        for ch_idx in chroma_points:
            #reference of center of slice in x points or "coordinates"
            Xdb_idx = ch_idx * phi

            #to get the whole slice get half and half around center, evade negative indexes
            slice_0 = Xdb_idx-int(phi/2)
            if slice_0 < 0 :
                slice_0 = 0
            elif slice_0 >= len(Xdb[0]):
                slice_0 = len(Xdb[0])-1

            slice_1 = Xdb_idx+int(phi/2)
            if slice_1 < 0:
                slice_1 = 0
            elif slice_1 >= len(Xdb[0]):
                slice_1 = len(Xdb[0])-1
            
            for a in range(len(Xdb)): #for all features, all frequencies 
                Xdb_cut[a] = np.append(Xdb_cut[a],Xdb[a][slice_0:slice_1])

        #add to the dataset the mean or other method that condensates all time-wise audio into a single value
        #do it for each feature
        for i in range(len(Xdb_cut)):
            #[1:] to avoid the first nan (little trick)
            df_raw.loc[file,'freq'+str(i)] = feature_calc(Xdb_cut[i][1:])


    #Now instead of calculating the bands for each file we can do the calculations
    #directly using pandas and convert one df to an other
    #--------
    #FREQ BANDS LINEAR DF
    #-------- 
    nfreq = len(Xdb)#same for all files because they all have the same sampling rate, so last file Xdb works for us
    Xdb_bnds = np.empty((nbands,len(Xdb_cut[0])))
    lamb = int(nfreq/(nbands)) #frequencies per band
    for i_b, band_txt in enumerate(df_bands.columns[1:]): #[1:] to avoid "animal" column
        cut1 = i_b*lamb
        cut2 = i_b*lamb+lamb 
        df_bands[band_txt] = df_raw.iloc[:,cut1+1:cut2+2].mean(axis=1) #get the mean over all selected frequencies #+1 to avoid "animals" column

    #--------
    #FREQ BANDS NON-LINEAR DF
    #--------
    for i_b, band_txt in enumerate(df_bands_nolin.columns[1:]): #[1:] to avoid "animal" column
        scalar1 = function(i_b/nbands) #scalar used to distort the cut1 so it is non linear over different bands
        cut1 = int(scalar1*lamb*nbands)
        scalar2 = function((i_b + 1)/nbands)#scalar used to distort the cut2 so it is non linear over different bands
        cut2 = int(scalar2*lamb*nbands)
        df_bands_nolin[band_txt] = df_raw.iloc[:,cut1+1:cut2+2].mean(axis=1) #get the mean over all selected frequencies #+1 to avoid "animals" column

    return df_raw , df_bands, df_bands_nolin, df_pitches

In [18]:
filename = "dataframes/bands_lin.csv"
df = pd.read_csv(filename)

X = df.drop(columns=['animal','file'], axis=1) 
y = df['animal']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.4, random_state=0)

cols = X_train.columns
scaler = RobustScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])

final_classifier = svm.SVC(kernel='linear',C=0.1)
final_classifier.fit(X_train, y_train)


In [19]:
#convert ogg to wav
oggs = glob.glob("demo_audios/*.ogg")
for ogg in oggs:
    print(ogg)
    audio = AudioSegment.from_ogg(ogg)
    audio.export(ogg[:-3]+"wav", format='wav')

demo_audios/WhatsApp Ptt 2022-12-13 at 15.09.49.ogg


In [20]:
#first, extract paths of all files in the folder to test:
audio_test_files = glob.glob("demo_audios/*.wav")

df_raw , df_bands, df_bands_nolin, df_pitches = feature_extract(audio_test_files)

df_final = df_bands.iloc[:,1:]

cols = df_final.columns

scaler = RobustScaler()

df_final = scaler.fit_transform(df_final)

df_final = pd.DataFrame(df_final, columns=[cols])

predictions = final_classifier.predict(df_final) # [1:] to have "animal" column out

  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['freq'+str(i)] = np.nan
  df_raw['

In [21]:
for i,pred in enumerate(predictions):
    #Recall that animal=0 is a dog and =1 is cat
    if pred == 0:
        animal = "DOG"
    else:
        animal = "CAT"
    print(audio_test_files[i])
    print("################")
    print("####"+animal+"####")
    print("################")
    IPython.display.display(IPython.display.Audio(audio_test_files[i]))
    print("")
    print("")

demo_audios/cat-meow-14536.wav
################
####DOG####
################




demo_audios/WhatsApp Ptt 2022-12-13 at 15.09.49.wav
################
####CAT####
################




demo_audios/DOG_BARKING_AND_WOOD_CLANKING_pfB.wav
################
####CAT####
################




