# Baseline for PyTorch Lightning-based submission

**Step 1: Generating spectrograms:** https://www.kaggle.com/code/nischaydnk/split-creating-melspecs-stage-1

**Step 2: Training notebook with PyTorch Lightning:** https://www.kaggle.com/code/nischaydnk/birdclef-2023-pytorch-lightning-training-w-cmap

Feel free to reach out in the comments in case you find bugs or have questions!

In [1]:
# NOTE(review): every `!` line is handed to its own shell, so these exports presumably
# never reach the kernel process or the workers it spawns. `N` and `N-M` also look like
# unfilled placeholders. If the OpenMP settings are meant to take effect, fill in real
# values and set them with `%env` / `os.environ` instead — TODO confirm intent.
!export OMP_NUM_THREADS=N

!export OMP_SCHEDULE=STATIC
!export OMP_PROC_BIND=CLOSE
!export GOMP_CPU_AFFINITY="N-M"

In [2]:
# Install the OpenVINO wheel from the attached dataset; `--no-index` together with
# `--find-links` restricts pip to the local wheel directory.
!pip install /kaggle/input/openvino-wheels/openvino-2022.3.0-9052-cp37-cp37m-manylinux_2_17_x86_64.whl --no-index --find-links /kaggle/input/openvino-wheels

Looking in links: /kaggle/input/openvino-wheels
Processing /kaggle/input/openvino-wheels/openvino-2022.3.0-9052-cp37-cp37m-manylinux_2_17_x86_64.whl
Installing collected packages: openvino
Successfully installed openvino-2022.3.0
[0m

In [3]:
import numpy as np
import pandas as pd
import os
import warnings
import joblib
import torch

In [4]:
class Config:
    """Static settings shared by the inference cells (paths, audio framing, checkpoints)."""

    # Number of target classes; the next cell overwrites it with the number of
    # distinct `primary_label` values found in train.csv.
    num_classes = 264

    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Input locations. Each path is defined exactly once.
    data_root = "/kaggle/input/birdclef-2023/"
    train_path = "/kaggle/input/bc2023-train-val-df/train.csv"
    valid_path = "/kaggle/input/bc2023-train-val-df/valid.csv"
    test_path = '/kaggle/input/birdclef-2023/test_soundscapes/'

    # Audio framing: sample rate in Hz, durations in seconds.
    SR = 32000
    DURATION = 5          # length of one prediction window
    infer_duration = 5    # duration used to derive the mel-spectrogram hop lengths
    train_duration = 10   # clip length cut around each window before the transforms

    # SED models (OpenVINO IR files); order matters, `pred` picks the input by position.
    model_ckpt = [
        '/kaggle/input/birdclef-openvino-comp/sed_v2s_final_30s_finetune/sed3_120.xml', #v2s
        '/kaggle/input/birdclef-openvino-comp/sed_se_half_ce/sed_se_120.xml', #seresnext26t
        '/kaggle/input/birdclef-openvino-comp/sed_b3ns_30s_finetune/sed3_b3ns_120.xml', #b3ns
    ]

    # CNN models (OpenVINO IR files); order matters, `pred` picks the input by position.
    re_model_ckpt = [
        '/kaggle/input/birdclef-openvino-comp/openvino_models_comp_half/re_120.xml', #resnet34d
        '/kaggle/input/birdclef-openvino-comp/re_b3ns_ce/re_b3ns_120.xml', #b3ns
        '/kaggle/input/birdclef-openvino-comp/re_v2s_30s_finetune/re_v2s_120.xml', #v2s
        '/kaggle/input/birdclef-openvino-comp/re_b0ns_final/re_b0ns_120.xml', #b0ns
    ]
    

In [5]:
# Training metadata is only needed for its label vocabulary: the class count here,
# and the submission column names further down.
df_train = pd.read_csv(Config.train_path)
primary_labels = df_train["primary_label"].unique()
Config.num_classes = len(primary_labels)

In [6]:
# Disabled helpers kept as a bare string literal, so the cell only echoes the text back.
# Later cells call the tensors' own .logit() / .sigmoid() methods instead.
'''
def sigmoid(a):
    return 1 / (1 + np.exp(-a))
def odds(p):
    return p / (1 - p)
def logit(p):
    return np.log(odds(p))
'''

'\ndef sigmoid(a):\n    return 1 / (1 + np.exp(-a))\ndef odds(p):\n    return p / (1 - p)\ndef logit(p):\n    return np.log(odds(p))\n'

In [7]:
def pred(df_test,num_workers=1,sleep=0,batch_size=1):
    """Run every OpenVINO model in ``Config`` over the audio files listed in ``df_test``.

    All imports and helpers are local to the function, presumably so that it stays
    self-contained when joblib ships it to a loky worker process (verify against caller).

    Args:
        df_test: DataFrame with a ``path`` column and a 0..n-1 index; rows are fetched
            with ``df.loc[idx, "path"]``.
        num_workers: number of DataLoader worker processes.
        sleep: seconds to wait after the models are compiled and before inference starts.
        batch_size: number of audio files per DataLoader batch.

    Returns:
        A list with one entry per DataLoader batch. Each entry is a list of tensors,
        one per model, ordered as ``Config.model_ckpt`` followed by
        ``Config.re_model_ckpt``.

    Raises:
        ValueError: if a file is shorter than one prediction window, or if
            ``Config.re_model_ckpt`` lists more checkpoints than there are input recipes.
    """
    import openvino.runtime as ov
    core = ov.Core()

    import numpy as np
    import torch
    from torch.utils.data import DataLoader
    import warnings

    warnings.filterwarnings('ignore')
    import librosa as lb
    import soundfile as sf
    import torchaudio
    import time
    from joblib.externals.loky.backend.context import get_context

    # Maximum number of clips handed to a model in a single infer() call.
    # NOTE(review): presumably the batch size the *_120.xml models were exported with — confirm.
    INFER_CHUNK = 120
    # Number of CNN checkpoints for which predict() knows how to build an input.
    MAX_RE_MODELS = 4

    class BirdDatasetSED(torch.utils.data.Dataset):
        """Dataset yielding, per audio file, the four spectrogram stacks used by the models."""

        def __init__(self, df, sr = Config.SR,n_mels=128, fmin=0, fmax=None, step=None, res_type="kaiser_fast",resample=True, duration = Config.DURATION, train = True):

            self.df = df
            self.sr = sr
            self.n_mels = n_mels
            self.fmin = fmin
            self.fmax = fmax or self.sr//2

            self.train = train
            self.duration = duration

            # Window length and stride in samples; the stride defaults to one full window.
            self.audio_length = self.duration*self.sr
            self.step = step or self.audio_length

            self.res_type = res_type
            self.resample = resample

        def __len__(self):
            return len(self.df)

        def read_file(self, filepath):
            """Load one file and return its windows as four spectrogram tensors.

            Each complete window is extended by half of
            ``Config.train_duration - Config.DURATION`` seconds on both sides. The
            context that would fall before the first window or after the last one is
            zeroed.
            """
            audio, orig_sr = sf.read(filepath, dtype="float32")

            if self.resample and orig_sr != self.sr:
                # Keyword arguments: the sample rates are keyword-only in librosa >= 0.10.
                audio = lb.resample(audio, orig_sr=orig_sr, target_sr=self.sr, res_type=self.res_type)

            # End time (in whole seconds) of every window that fits completely in the file.
            seconds = [
                int(end / self.sr)
                for end in range(self.audio_length, len(audio) + 1, self.step)
            ]
            if not seconds:
                raise ValueError(
                    f"{filepath}: audio is shorter than one {self.duration}-second window"
                )

            # Tile the signal three times and index into the middle copy so that slices
            # reaching before the start or past the end stay in bounds.
            offset = len(audio)
            audio = np.concatenate([audio,audio,audio])

            context = (Config.train_duration - Config.DURATION) / 2
            pad = int(self.sr * (Config.train_duration - Config.DURATION) / 2)
            last = len(seconds) - 1

            audios = []
            for i,second in enumerate(seconds):
                end_seconds = int(second)
                start_seconds = int(end_seconds - Config.DURATION)

                end_index = int(self.sr * (end_seconds + context)) + offset
                start_index = int(self.sr * (start_seconds - context)) + offset
                # astype() copies, so zeroing below never touches the tiled signal.
                y = audio[start_index:end_index].astype(np.float32)
                # Skip when there is no context at all: `y[-0:] = 0` would wipe the clip.
                if pad > 0:
                    if i == 0:
                        y[:pad] = 0
                    # Independent `if`: a single-window file is both first and last.
                    if i == last:
                        y[-pad:] = 0
                audios.append(y)
            audios = np.stack(audios)
            audios = torch.tensor(audios).float().unsqueeze(1)
            spec384,spec256,spec300_another,spec_rev2s=transform_to_spec(audios,train=False)
            return spec384,spec256,spec300_another,spec_rev2s

        def __getitem__(self, idx):

            return self.read_file(self.df.loc[idx, "path"])


    def make_melspec(hop_length, n_mels, f_min, f_max, n_fft):
        """Build a MelSpectrogram transform with the settings shared by all variants."""
        return torchaudio.transforms.MelSpectrogram(sample_rate=Config.SR, hop_length=hop_length, n_mels=n_mels, f_min=f_min, f_max=f_max, n_fft=n_fft, center=True, pad_mode='constant',norm='slaney',onesided=True,mel_scale='slaney')

    # Hop lengths are derived from a target frame count per infer_duration seconds.
    hop_length384 = Config.infer_duration*Config.SR // (384-1)
    melspec_transform = make_melspec(hop_length384, 128, 0, Config.SR//2, 2048)
    hop_length256 = Config.infer_duration*Config.SR // (256-1)
    melspec_transform256 = make_melspec(hop_length256, 128, 0, Config.SR//2, 2048)
    hop_length300 = Config.infer_duration*Config.SR // (300-1)
    melspec_transform300 = make_melspec(hop_length300, 128, 50, 14000, 1024)
    melspec_transform_rev2s = make_melspec(320, 64, 50, 14000, 1024)

    db_transform = torchaudio.transforms.AmplitudeToDB(stype='power',top_db=80)

    def transform_to_spec(audio,train=True):
        """Turn a batch of clips into the four scaled dB mel-spectrograms.

        ``train`` is accepted for interface compatibility and is not used.
        Two outputs are scaled as ``(dB + 80) / 80`` and two as ``dB / 255``.
        """
        spec = db_transform(melspec_transform(audio))
        spec256 = db_transform(melspec_transform256(audio))
        spec300_another = db_transform(melspec_transform300(audio))
        spec_rev2s = db_transform(melspec_transform_rev2s(audio))

        spec384 = (spec+80)/80
        spec256 = spec256/255
        spec300_another = spec300_another/255
        spec_rev2s = (spec_rev2s+80)/80
        return spec384,spec256,spec300_another,spec_rev2s


    def first_output(outputs):
        """Return the first entry of an infer() result mapping as a torch tensor."""
        return torch.tensor(outputs[list(outputs.keys())[0]])

    def openvino_infer(model,data,tta):
        """Run a two-input model (spectrograms plus the ``tta`` value)."""
        return first_output(model.infer(inputs=[data,tta]))

    def openvino_infer_re(model,data):
        """Run a single-input model."""
        return first_output(model.infer(inputs=[data]))

    def infer_in_chunks(model, data, tta=None):
        """Run ``model`` over ``data`` in slices of at most INFER_CHUNK rows.

        Every row is processed regardless of how many clips a batch contains.
        ``tta=None`` selects the single-input call, anything else the two-input call.
        """
        chunks = []
        for start in range(0, data.shape[0], INFER_CHUNK):
            part = data[start:start + INFER_CHUNK]
            if tta is None:
                chunks.append(openvino_infer_re(model, part))
            else:
                chunks.append(openvino_infer(model, part, tta))
        if len(chunks) == 1:
            return chunks[0]
        return torch.cat(chunks, dim=0)

    def compute_deltas(
            specgram: torch.Tensor,
            win_length: int = 5,
            mode: str = "replicate"
    ) -> torch.Tensor:
        r"""Compute delta coefficients of a tensor, usually a spectrogram:

        .. math::
           d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2}

        where :math:`d_t` is the deltas at time :math:`t`,
        :math:`c_t` is the spectrogram coefficients at time :math:`t`,
        :math:`N` is ``(win_length-1)//2``.

        Args:
            specgram (Tensor): Tensor of audio of dimension (..., freq, time)
            win_length (int, optional): The window length used for computing delta (Default: ``5``)
            mode (str, optional): Mode parameter passed to padding (Default: ``"replicate"``)

        Returns:
            Tensor: Tensor of deltas of dimension (..., freq, time)

        Example
            >>> specgram = torch.randn(1, 40, 1000)
            >>> delta = compute_deltas(specgram)
            >>> delta2 = compute_deltas(delta)
        """
        device = specgram.device
        dtype = specgram.dtype

        # pack batch
        shape = specgram.size()
        specgram = specgram.reshape(1, -1, shape[-1])

        assert win_length >= 3

        n = (win_length - 1) // 2

        # twice sum of integer squared
        denom = n * (n + 1) * (2 * n + 1) / 3

        specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode)

        kernel = torch.arange(-n, n + 1, 1, device=device, dtype=dtype).repeat(specgram.shape[1], 1, 1)

        output = torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom

        # unpack batch
        output = output.reshape(shape)

        return output

    def make_delta(
        input_tensor: torch.Tensor
    ):
        """Apply compute_deltas along dimension 2 of a 4-D tensor."""
        # compute_deltas works on the last dimension, so swap dims 2 and 3 around the call.
        input_tensor = input_tensor.transpose(3,2)
        input_tensor = compute_deltas(input_tensor)
        input_tensor = input_tensor.transpose(3,2)
        return input_tensor


    def image_delta(x):
        """Concatenate ``x`` with its first and second deltas along the channel dimension."""
        delta_1 = make_delta(x)
        delta_2 = make_delta(delta_1)
        x = torch.cat([x,delta_1,delta_2], dim=1)
        return x

    def reshp(images):
        """Merge the batch and clip dimensions of a 5-D tensor into one."""
        bs,clip_len,channel_num,mel_num,time_len = images.size()
        images=images.reshape((bs*clip_len,channel_num,mel_num,time_len))
        return images

    def predict(dataset, models,re_models):
        """Run all models over ``dataset`` and collect their outputs batch by batch."""
        if len(re_models) > MAX_RE_MODELS:
            raise ValueError(
                f"Config.re_model_ckpt lists {len(re_models)} models, but inputs are "
                f"only defined for the first {MAX_RE_MODELS}"
            )

        predictions = []
        dl_test = DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            # DataLoader rejects a multiprocessing context when it has no workers.
            multiprocessing_context=get_context('loky') if num_workers > 0 else None,
        )

        for spec384,spec256,spec300_another,spec_rev2s in dl_test:
            spec384 = reshp(spec384)
            spec256 = reshp(spec256)
            spec300_another = reshp(spec300_another)
            # Undo the /255 scaling and apply the (dB + 80) / 80 scaling instead.
            spec300_80 = (spec300_another*255+80)/80
            spec_rev2s = reshp(spec_rev2s)

            out = []
            # SED models: the input is selected by position in Config.model_ckpt.
            for i,model in enumerate(models):
                if i==0:
                    data, tta = image_delta(spec384).numpy(), 3
                elif i==2:
                    data, tta = image_delta(spec300_another).numpy(), 3
                else:
                    data, tta = spec256.numpy(), 2
                out.append(infer_in_chunks(model, data, tta))

            # CNN models: each one sees a slice of the last dimension, selected by
            # position in Config.re_model_ckpt.
            for i,model in enumerate(re_models):
                if i==0:
                    data = image_delta(spec256)[:,:,:,128:384].numpy()
                elif i==1:
                    data = image_delta(spec300_80)[:,:,:,150:450].numpy()
                elif i==2:
                    data = image_delta(spec_rev2s)[:,:,:,250:750].numpy()
                else:
                    data = spec256[:,:,:,128:384].numpy()
                out.append(infer_in_chunks(model, data))

            predictions.append(out)
        return predictions

    def load_infer_requests(checkpoints):
        """Read and compile each model for CPU and return one infer request per model."""
        requests = []
        for ckpt in checkpoints:
            model = core.read_model(model=ckpt)
            model = core.compile_model(model, device_name="CPU")
            requests.append(model.create_infer_request())
        return requests

    print("Create Dataloader...")

    ds_test = BirdDatasetSED(
        df_test,
        sr = Config.SR,
        duration = Config.DURATION,
        train = False
    )

    models = load_infer_requests(Config.model_ckpt)
    re_models = load_infer_requests(Config.re_model_ckpt)

    print("Running Inference..")
    # Optional delay before inference; callers pass different values per parallel job.
    time.sleep(sleep)
    preds = predict(ds_test, models,re_models)

    return preds

In [8]:
import pandas as pd
from pathlib import Path

# One row per test soundscape: the file stem, the stem split on "_" into name and id,
# and the full path.
test_records = []
for ogg_path in Path(Config.test_path).glob("*.ogg"):
    stem = ogg_path.stem
    test_records.append((stem, *stem.split("_"), ogg_path))

df_test = pd.DataFrame(test_records, columns=["filename", "name", "id", "path"])
print(df_test.shape)
df_test.head()

(1, 4)


Unnamed: 0,filename,name,id,path
0,soundscape_29201,soundscape,29201,/kaggle/input/birdclef-2023/test_soundscapes/s...


In [9]:
#df_test = pd.concat([df_test]*200,axis=0).reset_index(drop=True)

In [10]:
# Upper bound on the number of parallel inference jobs; the next cell caps it by the
# number of test files.
cpu_num=2

In [11]:
# Never start more jobs than there are files; `split` is the number of files per job.
n_files = len(df_test)
num_job = min(cpu_num, n_files)
split = n_files // num_job
num_job,split

(1, 1)

In [12]:
# Cut df_test into `num_job` consecutive parts of `split` rows each; whatever does not
# divide evenly ends up in df_test_left.
dfs_test = [
    df_test.iloc[job * split:(job + 1) * split].reset_index(drop=True)
    for job in range(num_job)
]
df_test_left = df_test.iloc[num_job * split:].reset_index(drop=True)
len(dfs_test),len(df_test_left)

(1, 0)

In [13]:
import time

# Per-job settings. They are derived from the number of splits, so raising `cpu_num`
# cannot silently drop splits the way fixed two-element lists zipped with dfs_test would.
WORKERS_PER_JOB = 2
BATCH_SIZE_PER_JOB = 2
START_STAGGER_SECONDS = 5  # job k waits k * 5 s before inference (0 s and 5 s for two jobs)

t1=time.time()
job_args = [
    (df_part, WORKERS_PER_JOB, job_idx * START_STAGGER_SECONDS, BATCH_SIZE_PER_JOB)
    for job_idx, df_part in enumerate(dfs_test)
]
results1 = joblib.Parallel(n_jobs=num_job, backend='loky')(joblib.delayed(pred)(*args) for args in job_args)
t2=time.time()
print(t2-t1)

Create Dataloader...
Running Inference..
76.37850403785706


In [14]:
# Files that did not fit into the even split are processed one file per task,
# with pred's default settings.
t1=time.time()
if len(df_test_left)>0:
    leftover_jobs = (
        joblib.delayed(pred)(df_test_left.iloc[i:i+1].reset_index(drop=True))
        for i in range(len(df_test_left))
    )
    results2 = joblib.Parallel(n_jobs=num_job, backend='loky')(leftover_jobs)
else:
    results2 = []
t2=time.time()
print(t2-t1)

0.0002052783966064453


In [15]:
# Even-split results first, leftovers last: the same order in which df_test was sliced.
results = [*results1, *results2]

In [16]:
# Flatten the per-job lists into a single list with one entry per DataLoader batch.
preds = [batch_outputs for job_result in results for batch_outputs in job_result]
len(preds)

1

In [17]:
# Regroup the batch-major outputs into one list per model; each batch must carry
# exactly seven tensors.
preds1, preds2, preds3, preds4, preds5, preds6, preds7 = ([] for _ in range(7))
per_model = (preds1, preds2, preds3, preds4, preds5, preds6, preds7)
for out1, out2, out3, out4, out5, out6, out7 in preds:
    for bucket, output in zip(per_model, (out1, out2, out3, out4, out5, out6, out7)):
        bucket.append(output)

In [18]:
# File stems in df_test order; they become the prefix of every row id.
filenames = df_test["filename"].values.tolist()

# Submission columns: `row_id` followed by one column per label, named after the
# dummy-encoded training labels.
label_dummies = pd.get_dummies(df_train['primary_label'])
bird_cols = list(label_dummies.columns)
sub_df = pd.DataFrame(columns=['row_id', *bird_cols])

In [19]:
sub_df

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1


# Generate submission CSV

In [20]:
def make_row_ids(file, num_rows=120, seconds_per_row=5):
    """Build the submission row ids for one soundscape file.

    Args:
        file: file stem used as the row-id prefix.
        num_rows: number of consecutive windows to label (default 120).
        seconds_per_row: window length in seconds (default 5); row ``i`` is labelled
            with its end time ``(i + 1) * seconds_per_row``.

    Returns:
        NumPy array of ``num_rows`` strings of the form ``"{file}_{end_second}"``.
    """
    end_seconds = range(seconds_per_row, (num_rows + 1) * seconds_per_row, seconds_per_row)
    return np.array([f'{file}_{second}' for second in end_seconds])

In [21]:
# Row ids are plain string formatting, so build them in-process instead of starting
# a process pool for it.
row_ids = np.concatenate([make_row_ids(file) for file in filenames], axis=0)

# Stack each model's per-batch outputs into one tensor. The three SED outputs are passed
# through logit while the four CNN outputs are used as they are.
# NOTE(review): presumably the SED models emit probabilities and the CNN models logits — confirm.
data1 = torch.cat(preds1,dim=0).logit()
data2 = torch.cat(preds2,dim=0).logit()
data3 = torch.cat(preds3,dim=0).logit()
data4 = torch.cat(preds4,dim=0)
data5 = torch.cat(preds5,dim=0)
data6 = torch.cat(preds6,dim=0)
data7 = torch.cat(preds7,dim=0)

In [22]:
def ensemble(sed_pred,sed_pred2,sed_pred3,re_pred,re_pred2,re_pred3,re_pred4):
    """Blend the seven model outputs with fixed weights (they sum to 1.0).

    The blend is returned as a new tensor. None of the inputs is modified, so the
    calling cell can be re-run without compounding the weights.

    Args:
        sed_pred, sed_pred2, sed_pred3: outputs of the three SED models.
        re_pred, re_pred2, re_pred3, re_pred4: outputs of the four CNN models.

    Returns:
        Tensor with the weighted sum of all inputs.
    """
    blended = (
        0.25*sed_pred
        + 0.1*sed_pred2
        + 0.21*sed_pred3
        + 0.1*re_pred
        + 0.15*re_pred2
        + 0.15*re_pred3
        + 0.04*re_pred4
    )
    return blended

In [23]:
# Blend the seven outputs, then squash the result with a sigmoid for the submission.
blended = ensemble(data1, data2, data3, data4, data5, data6, data7)
data = torch.sigmoid(blended).numpy()

In [24]:
# Fill the empty submission frame: row ids first, then all label columns at once from
# the blended prediction matrix. `data` needs one row per row id.
sub_df['row_id'] = row_ids
sub_df[bird_cols] = data
#sub_df = pd.concat(dfs).reset_index(drop=True)
sub_df

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.015879,0.015950,0.009024,0.006060,0.001950,0.007939,0.012441,0.004348,0.019459,...,0.007391,0.004979,0.004348,0.005263,0.020303,0.010041,0.006097,0.005021,0.012830,0.008851
1,soundscape_29201_10,0.006525,0.019365,0.004583,0.013513,0.001532,0.006741,0.009000,0.003172,0.007602,...,0.004965,0.005576,0.014945,0.017807,0.009958,0.009822,0.004264,0.004036,0.004701,0.006040
2,soundscape_29201_15,0.001874,0.007680,0.003788,0.003989,0.000505,0.004772,0.004129,0.002459,0.004296,...,0.002039,0.004400,0.004512,0.007469,0.004012,0.004736,0.001747,0.008886,0.008005,0.004691
3,soundscape_29201_20,0.001835,0.004417,0.003125,0.006368,0.001262,0.005583,0.002646,0.003330,0.004892,...,0.002492,0.008698,0.003230,0.002122,0.007496,0.011564,0.003144,0.005387,0.011032,0.009354
4,soundscape_29201_25,0.002562,0.006282,0.005061,0.011566,0.002033,0.005863,0.003340,0.004916,0.005113,...,0.003605,0.007702,0.004085,0.004642,0.010074,0.013033,0.003792,0.006818,0.014551,0.011374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,soundscape_29201_580,0.002593,0.005253,0.002667,0.006530,0.001388,0.005288,0.004196,0.004137,0.005507,...,0.003225,0.008011,0.003422,0.002826,0.007431,0.011528,0.004252,0.005403,0.007810,0.007802
116,soundscape_29201_585,0.004641,0.008179,0.004031,0.009059,0.001911,0.009275,0.012153,0.004938,0.006990,...,0.005787,0.005538,0.003850,0.004462,0.009571,0.014725,0.011624,0.007671,0.006069,0.019955
117,soundscape_29201_590,0.003719,0.007982,0.004254,0.009560,0.001848,0.010125,0.011808,0.005558,0.007640,...,0.003897,0.006354,0.004389,0.004118,0.007534,0.012009,0.008195,0.006543,0.009682,0.013014
118,soundscape_29201_595,0.003561,0.007186,0.002867,0.008423,0.003444,0.006225,0.003906,0.005485,0.007533,...,0.001454,0.007427,0.005109,0.005659,0.005596,0.014919,0.004194,0.003636,0.010204,0.007681


In [25]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv('submission.csv',index=False)