In [1]:
import torch
import librosa
from pydub import AudioSegment
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim
import time
import os

In [2]:
def GetMfccsFromPath(path, sr=10000, n_mfcc=24, max_size=200):
    """Load an audio file and return its MFCCs as a fixed-width tensor.

    Args:
        path: Audio file handed to ``librosa.load``.
        sr: Sample rate the audio is resampled to while loading.
        n_mfcc: Number of MFCC coefficients, i.e. rows of the feature matrix.
        max_size: Number of time frames in the result. Shorter clips are
            zero-padded on the right, longer clips are truncated.

    Returns:
        A ``torch.Tensor`` of shape ``(1, n_mfcc, max_size)``; the leading
        axis is added with ``unsqueeze(0)``.
    """
    y, sr = librosa.load(path, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Compare against max_size itself (not a second hard-coded literal) so the
    # pad and truncate branches can never disagree about the target width.
    n_frames = mfccs.shape[1]
    if n_frames < max_size:
        mfccs = np.pad(mfccs, ((0, 0), (0, max_size - n_frames)))
    else:
        mfccs = mfccs[:, :max_size]

    return torch.from_numpy(mfccs).unsqueeze(0)
import random,glob
# Paths of every test soundscape. glob makes no ordering guarantee (it depends
# on the filesystem), so sort to process the files in a reproducible order.
row_list = sorted(glob.glob("/kaggle/input/birdclef-2023/test_soundscapes/*.ogg"))

In [3]:
# from torch.utils.data import Dataset
# class TestDataset(Dataset):
#     def __init__(self, path_list):
#         self.path_list=path_list
#     def __getitem__(self, index):
#         data=GetMfccsFromPath(self.path_list[index])
#         return data
#     def __len__(self):
#         return len(self.path_list)
# test_dataset=TestDataset(row_list)

In [4]:
# from torch.utils.data import DataLoader
# test_loader = DataLoader(dataset=test_dataset, batch_size=94, shuffle=False,drop_last=False)

In [5]:
class Net(nn.Module):
    """Three-stage convolutional classifier producing 264 logits.

    Each stage is a stack of convolutions followed by max-pooling, batch
    normalisation and ReLU; the result is flattened and passed through three
    fully connected layers. The flatten size (128 * 5 * 27) corresponds to a
    single-channel 24 x 200 input.

    The layer attribute names are part of the module's state_dict keys, so
    they are left exactly as they were.
    """

    # Number of features after the last pooling stage: channels * height * width.
    _FLAT_FEATURES = 128 * 5 * 27

    def __init__(self):
        super().__init__()

        # Stage 1: 1 -> 64 channels.
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu1 = nn.ReLU()

        # Stage 2: 64 -> 128 channels.
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.relu2 = nn.ReLU()

        # Stage 3: 128 channels throughout; conv7 is a 1x1 convolution that
        # still pads by one pixel, which grows each spatial dimension by two.
        self.conv5 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(128, 128, kernel_size=1, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU()

        # Classifier head. The dropout layers are created but not applied in
        # forward().
        self.fc14 = nn.Linear(128 * 5 * 27, 1024)
        self.drop1 = nn.Dropout2d()
        self.fc15 = nn.Linear(1024, 1024)
        self.drop2 = nn.Dropout2d()
        self.fc16 = nn.Linear(1024, 264)

    def forward(self, x):
        """Return the 264 raw class scores (no softmax) for a batch ``x``."""
        # Every stage applies conv(s) -> pool -> batch-norm -> ReLU.
        x = self.relu1(self.bn1(self.pool1(self.conv2(self.conv1(x)))))
        x = self.relu2(self.bn2(self.pool2(self.conv4(self.conv3(x)))))
        x = self.relu3(self.bn3(self.pool3(self.conv7(self.conv6(self.conv5(x))))))

        # Flatten to (batch, features) for the fully connected head.
        flat = x.view(-1, self._FLAT_FEATURES)
        hidden = F.relu(self.fc14(flat))
        hidden = F.relu(self.fc15(hidden))
        return self.fc16(hidden)

In [6]:
# Deserialize the trained model onto the CPU. The result is used below as a
# callable module (net.eval(), net(batch)), so the file presumably holds a
# whole pickled model rather than a state_dict -- only load files you trust.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects fully pickled modules -- confirm against
# the runtime's torch version.
net = torch.load("/kaggle/input/models/model.h5", map_location="cpu")
# net=net.cuda()

In [7]:
import  torch.nn.functional as F

In [8]:
import pandas as pd

# --- Inference settings -------------------------------------------------------
# NOTE(review): GetMfccsFromPath in this notebook loads audio at sr=10000 while
# this loop uses 20000 -- confirm which rate the model was trained with.
SAMPLE_RATE = 20000
CHUNK_SECONDS = 5      # each submission row covers one 5-second window
N_CHUNKS = 120         # windows scored per soundscape (120 * 5 s)
N_MFCC = 24            # MFCC coefficients per frame (input height)
N_FRAMES = 200         # frames per window fed to the network (input width)
CHUNK_SAMPLES = SAMPLE_RATE * CHUNK_SECONDS


def _soundscape_to_batch(y, sr):
    """Split waveform ``y`` into N_CHUNKS windows and return their MFCCs.

    The waveform is zero-padded or truncated to exactly
    ``N_CHUNKS * CHUNK_SAMPLES`` samples first, so a recording whose length is
    not an exact multiple of the window count no longer makes the reshape fail.

    Returns:
        A float tensor of shape ``(N_CHUNKS, 1, N_MFCC, N_FRAMES)``.
    """
    total = N_CHUNKS * CHUNK_SAMPLES
    if y.shape[0] < total:
        y = np.pad(y, (0, total - y.shape[0]))
    windows = y[:total].reshape(N_CHUNKS, CHUNK_SAMPLES)

    batch = torch.zeros(N_CHUNKS, 1, N_MFCC, N_FRAMES)
    for k, window in enumerate(windows):
        mfccs = librosa.feature.mfcc(y=window, sr=sr, n_mfcc=N_MFCC)
        # Frames beyond N_FRAMES are dropped; missing frames stay zero, which
        # is the same right-hand zero padding the fixed np.pad used to apply.
        width = min(mfccs.shape[1], N_FRAMES)
        batch[k, 0, :, :width] = torch.from_numpy(
            np.ascontiguousarray(mfccs[:, :width])
        )
    return batch


s = time.time()

# Class labels are the training-audio folder names in sorted order; output
# column i of the network is written to the i-th name.
names = sorted(os.listdir("/kaggle/input/birdclef-2023/train_audio"))
csv_dic = {"row_id": []}
for key in names:
    csv_dic[key] = []

net.eval()
for path in row_list:
    soundspace = os.path.basename(path).replace(".ogg", "")
    y, sr = librosa.load(path, sr=SAMPLE_RATE)
    mfcc_batch = _soundscape_to_batch(y, sr)

    # Inference only: skip autograd bookkeeping for the whole batch.
    with torch.no_grad():
        probs = F.softmax(net(mfcc_batch), dim=1)

    if probs.shape[1] != len(names):
        raise ValueError(
            "model produced %d scores but %d class names were found"
            % (probs.shape[1], len(names))
        )

    # Truncate (not round) to five decimals, in float64 like the previous
    # per-element int(p * 100000) / 100000.
    truncated = np.trunc(probs.double().numpy() * 100000) / 100000
    for col, name in enumerate(names):
        csv_dic[name].extend(truncated[:, col].tolist())

    # Row ids carry the end time of each window: <file>_5, <file>_10, ...
    csv_dic["row_id"].extend(
        soundspace + "_" + str((k + 1) * CHUNK_SECONDS)
        for k in range(truncated.shape[0])
    )

print(time.time()-s)

20.760175466537476


In [9]:
# One column per key of csv_dic: "row_id" first, then one column per class name.
dataframe = pd.DataFrame.from_dict(csv_dic)

In [10]:
# Preview the first five prediction rows.
dataframe.head(5)

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.0,0.0,0.0,0.0,0.0,0.00131,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,soundscape_29201_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00054,0.0,0.0,0.0,0.0,0.0,0.0
2,soundscape_29201_15,0.0,0.00878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,soundscape_29201_20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,soundscape_29201_25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Write the predictions as a comma-separated file without the index column.
dataframe.to_csv(path_or_buf="submission.csv", sep=",", index=False)