## Import and code

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import librosa
from tqdm import tqdm
import torchaudio
import math
from transformers import ASTFeatureExtractor
import torch

from scipy.io.wavfile import write
from sklearn.model_selection import train_test_split
from IPython.display import Audio
from google.colab import drive

In [5]:
def envelope(y, rate, threshold, w = 20): # this is the filter we use to remove the silence from the audio
  '''Build a keep/drop mask that marks the loud parts of a signal.

  The signal is rectified (absolute value) and smoothed with a centred
  rolling maximum; a sample is kept when that envelope is strictly above
  the threshold. Note that the envelope is a rolling max, not a rolling mean.

  y -> 1-D sequence of amplitudes
  rate -> sample rate of y
  threshold -> envelope level a sample has to exceed in order to be kept
  w -> the rolling window is rate / w samples long (never shorter than 1)

  Returns (mask, y_mean): mask is a list of bools, one per sample, and
  y_mean is the envelope as a pandas Series.
  '''
  y = pd.Series(y).abs()

  # rolling() rejects a zero-length window when min_periods=1, so clamp it.
  window = max(1, int(rate/w))
  y_mean = y.rolling(window=window,
                     min_periods=1,
                     center=True).max()

  # Vectorised comparison instead of a Python loop over every sample.
  mask = (y_mean > threshold).tolist()
  return mask, y_mean

def test(l1,threshold=0.1,w = 10):
  '''Show and play one recording before and after silence removal.

  l1 -> pair [wave, sample_rate]; the wave is expected to be normalised so
        that the threshold is comparable between recordings
  threshold -> envelope level passed to envelope()
  w -> envelope window divisor passed to envelope()

  Writes 'test0.wav' (original) and 'test.wav' (filtered) to the working
  directory, displays an audio player for each of them, draws comparison
  plots and returns the filtered wave.
  '''
  # Import the submodule explicitly so librosa.display is available even on
  # librosa versions that do not load it lazily.
  import librosa.display

  wav = l1[0]
  # Use the rate the audio was actually loaded at; hard-coding 16000 distorts
  # playback and the envelope window whenever the file was loaded at another rate.
  sr = int(l1[1]) if len(l1) > 1 else 16000

  mask, y_mean = envelope(wav, sr, threshold=threshold,w = w)
  keep = np.asarray(mask)
  filtered = wav[keep]

  print('Sample rate points in this file:',len(wav))
  print('Listen to original file')
  write('test0.wav', sr, wav)
  display(Audio(data = 'test0.wav', rate = sr))

  fig, ax = plt.subplots(2,2, figsize=(20, 8))
  librosa.display.waveshow(wav, ax=ax[0,0], sr = sr)
  librosa.display.waveshow(filtered, ax = ax[1,0],sr = sr)
  ax[0,0].set_title('Original file')
  ax[0,0].set_ylabel('Amplitude (norm)')
  ax[0,0].set_xlabel('Time (seconds)')
  ax[1,0].set_title('No sielence file')
  ax[1,0].set_xlabel('Time (seconds)')
  ax[1,0].set_ylabel('Amplitude (norm)')
  # Overlay of the removed samples, the kept samples and the envelope.
  ax[0,1].plot(wav*np.logical_not(keep), color='r', label='remove')
  ax[0,1].plot(wav*keep, color='c', label='keep')
  ax[0,1].plot(y_mean, color='m', label='envelope')
  ax[0,1].grid(False)
  ax[0,1].set_title('Effect of the filter')
  ax[0,1].set_xlabel('Sample Rate points')
  ax[0,1].set_ylabel('Amplitude (norm)')
  ax[0,1].legend(loc='best')
  ax[1,1].axis('off')
  plt.show()

  print('Listen to no sielence file')
  write('test.wav', sr, filtered)
  display(Audio(data = 'test.wav', rate = sr))

  return filtered

def _segment_wave(wav, segment_len):
  '''Cut a 1-D wave into chunks of exactly segment_len samples, looping the recording to fill a short chunk.'''
  if len(wav) == 0:
    return []
  if len(wav) <= segment_len:
    # Short recording: repeat it cyclically until it fills one segment.
    return [np.resize(wav, segment_len)]

  chunks = []
  for start in range(0, len(wav), segment_len):
    chunk = wav[start:start + segment_len]
    if len(chunk) < segment_len:
      # Last chunk: top it up with samples taken from the start of the recording.
      chunk = np.concatenate([chunk, wav[:segment_len - len(chunk)]])
    chunks.append(chunk)
  return chunks


def split_data(df, duration = 5 , overlap = 1, sr = 16000, threshold = 0.1, w = 10):
  ''' Split audio files into fixed-length wave segments with the silence removed.

  df -> dataframe with the columns `filename` (path relative to 'train_audio/')
        and `primary_label`
  duration -> duration of the resulting wave segments in seconds
  overlap -> accepted for backward compatibility but currently unused:
             consecutive segments do not overlap
  sr -> sample rate every file is resampled to when it is loaded
  threshold -> envelope level at or below which a sample counts as silence
  w -> sample rate / w is the window used by envelope()

  Returns a dataframe with the columns `wave` (array of duration*sr samples)
  and `label`, one row per segment. Recordings that are empty, completely
  silent, or have no samples above the threshold contribute no rows.
  '''
  segment_len = int(duration*sr)
  if segment_len < 1:
    raise ValueError('duration*sr must be at least one sample')

  # Collect the rows in plain lists and build the dataframe once at the end;
  # appending to a dataframe row by row gets slower as it grows.
  waves, labels = [], []
  for row in tqdm(range(df.shape[0])):
    record = df.iloc[row]
    audio, _ = librosa.load('train_audio/' + record.filename,sr = sr)

    peak = np.max(np.abs(audio)) if audio.size else 0
    if peak == 0:
      # Nothing to normalise (empty or all-zero file); dividing would only produce NaNs.
      continue
    # Normalise the amplitude so that the threshold makes sense for all entries.
    audio = audio/peak

    mask, _ = envelope(audio, sr, threshold=threshold,w = w)
    for chunk in _segment_wave(audio[mask], segment_len):
      waves.append(chunk)
      labels.append(record.primary_label)

  return pd.DataFrame({'wave': pd.Series(waves, dtype=object),
                       'label': pd.Series(labels, dtype=object)},
                      columns = ['wave','label'])



# Bird Classifier

[The Whole project with the data can be found here](https://drive.google.com/drive/folders/1YGw6GGCBEjsg3dFgiEruD7szzUpMcSVW?usp=sharing)

This project explores different ways to perform birdsong classification. The dataset comes from the Kaggle competition [BirdCLEF 2023](https://www.kaggle.com/competitions/birdclef-2023). As this project is mainly for learning purposes, I have only considered 11 of the bird species. The names of the folders with audio recordings represent those species:<br>
afpfly1 <br>
bkctch1<br>
cibwar1<br>
grewoo2<br>
laudov1<br>
rindov<br>
strher<br>
varsun2<br>
witswa1<br>
yebapa1<br>
yewgre1<br>
<br>
I have decided to use the pre-trained AST (Audio Spectrogram Transformer) model, which is implemented in PyTorch. I prepare a tensor with all the available data that I can save and load on the CPU, and then run the training on the GPU. In this document I show how I prepare the data before sending it as input to the AST model.


## Preparing the Train and Test datasets

We will perform the following actions:<br><br>
### 1. Loading the Records
Load the records for birds that have more than 100 and fewer than 120 recorded files (there are 10 such species in total). We also add one bird with only 5 recorded files to see how the model handles a label with fewer examples (the names of the birds are listed above).<br>


In [None]:
drive.mount('/content/gdrive')
%cd gdrive/MyDrive/Colab Notebooks/Birds2023
m_train = pd.read_csv('train_metadata.csv')
df = m_train[['filename', 'primary_label']].copy()
songs = list(df['primary_label'].value_counts()[(df['primary_label'].value_counts()>100) & (df['primary_label'].value_counts()<120)].index)
songs.append(list(df['primary_label'].value_counts()[df['primary_label'].value_counts() == 5].index)[-1])
df = df[df['primary_label'].isin(songs)]

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[Errno 2] No such file or directory: 'gdrive/MyDrive/Colab Notebooks/Birds2023'
/content/gdrive/MyDrive/Colab Notebooks/Birds2023


### 2. Apply a filter to remove the silence in the recording
An example is shown below.<br>You can listen to the recording before and after the filter.

In [None]:
# Interactive demo: show the silence filter on one recording at a time.
for i in range(df.shape[0]):
  # Load at 16 kHz, the rate used by split_data and the rest of the pipeline;
  # without `sr` librosa resamples to its 22050 Hz default.
  x = librosa.load('train_audio/'+df.iloc[i].filename, sr = 16000)
  # Normalise the amplitude so that the threshold is comparable between files.
  x1 = [x[0]/np.max(np.abs(x[0]),axis=0),x[1]]
  test(x1, threshold = 0.125, w = 10)
  print('Press "s" to stop or any letter to continue')
  stop = input()
  if stop == 's' : break

Output hidden; open in https://colab.research.google.com to view.

### 3. Save the filtered data

The `split_data` call loads each file, resamples it to 16000 samples per second, applies the filter to it and stores the resulting fixed-length segments in a dataframe.

In [6]:
# new_df0 = split_data(df, threshold = 0.125)
# new_df0.to_pickle('new_data.pkl')


drive.mount('/content/gdrive')
%cd gdrive/MyDrive/Colab Notebooks/Birds2023
new_df0 = pd.read_pickle('new_data.pkl')

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/Birds2023


### 4. Create Spectrograms from the wave data
We have our wave segments and we need to split them in a stratified fashion into train and test datasets. This is most easily done with `train_test_split` from the sklearn library. This approach is not strictly correct, but since we will use other methods later in this project, we keep things simple here.<br>
Then we use `ASTFeatureExtractor()` to convert those wave segments into spectrograms and save them as tensor files.

In [8]:
# Sanity check: confirm what type the pickle was loaded back as.
type(new_df0)

pandas.core.frame.DataFrame

In [None]:
# Stratified 75/25 split. The fixed seed makes the split, and the tensors saved
# from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    new_df0.loc[:,'wave'], new_df0.loc[:,'label'],
    test_size = 0.25, stratify = new_df0.loc[:,'label'], random_state = 42)


In [None]:
feature_extractor = ASTFeatureExtractor()

# Map every label to an integer class id. Sorting the labels of both splits
# makes the ids independent of the shuffle order and guarantees that a label
# appearing only in the test split is still mapped.
dict_labels = {label: idx for idx, label in enumerate(sorted(set(y_train) | set(y_test)))}

y_train = y_train.map(dict_labels, na_action='ignore')
y_test = y_test.map(dict_labels, na_action='ignore')
# A missing label would map to NaN and cannot be stored in an int64 tensor.
assert y_train.notna().all() and y_test.notna().all(), 'found a label without a class id'


# Convert the wave segments (one row per segment) into spectrogram tensors.
X_train = feature_extractor(np.array(X_train.tolist()), sampling_rate=16000, padding="max_length", return_tensors="pt").input_values
y_train = torch.tensor(y_train.values, dtype=torch.int64)


X_test = feature_extractor(np.array(X_test.tolist()), sampling_rate=16000, padding="max_length", return_tensors="pt").input_values
y_test = torch.tensor(y_test.values, dtype=torch.int64)


### 5. Save the datasets for further use

In [None]:
# Write each prepared tensor to its own file in the current working directory.
tensors_to_save = {
  'X_train_tensor.pt': X_train,
  'y_train_tensor.pt': y_train,
  'X_test_tensor.pt': X_test,
  'y_test_tensor.pt': y_test,
}
for file_name, tensor in tensors_to_save.items():
  torch.save(tensor, file_name)
