In [102]:
import torch
import os
import io
import librosa
import torchaudio
import numpy as np
import matplotlib.pyplot as plt    
from torch.utils.data import Dataset, TensorDataset, DataLoader
import glob
import pandas as pd 
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch
import librosa
from tqdm import tqdm
import matplotlib.pyplot as plt    
import numpy as np
from pathlib import Path
import torchvision
from torchvision import datasets, models, transforms
device = torch.device("cpu")

In [103]:

# test_samples = list(os.listdir("./kaggle/input/birdclef-2023/test_soundscapes/*.ogg"))
test_samples = list(glob.glob("./kaggle/input/birdclef-2023/test_soundscapes/*.ogg"))
test_samples
 

['./kaggle/input/birdclef-2023/test_soundscapes/soundscape_29201.ogg']

In [104]:
class TrainAudioDataset(Dataset):
    def __init__(self):
        self.test_samples = list(glob.glob("./kaggle/input/birdclef-2023/test_soundscapes/*.ogg"))
         
    def __len__(self):
        return len(self.test_samples)

    def __getitem__(self, idx): 
        path = self.test_samples[idx]
        file_id = path.split(".ogg")[0].split("/")[-1]
        
        waveform, sample_rate = torchaudio.load(path)
        return ((waveform[0], sample_rate), file_id)

In [105]:
def stride_trick(a, stride_length, stride_step):
     """
     apply framing using the stride trick from numpy.

     Args:
         a (array) : signal array.
         stride_length (int) : length of the stride.
         stride_step (int) : stride step.

     Returns:
         blocked/framed array.
     """
     nrows = ((a.size - stride_length) // stride_step) + 1
     n = a.strides[0]
     return np.lib.stride_tricks.as_strided(a,
                                            shape=(nrows, stride_length),
                                            strides=(stride_step*n, n))

def framing(sig, fs=16000, win_len=0.025, win_hop=0.01):
     """
     transform a signal into a series of overlapping frames (=Frame blocking).

     Args:
         sig     (array) : a mono audio signal (Nx1) from which to compute features.
         fs        (int) : the sampling frequency of the signal we are working with.
                           Default is 16000.
         win_len (float) : window length in sec.
                           Default is 0.025.
         win_hop (float) : step between successive windows in sec.
                           Default is 0.01.

     Returns:
         array of frames.
         frame length.

     Notes:
     ------
         Uses the stride trick to accelerate the processing.
     """
     # run checks and assertions
     if win_len < win_hop: print("ParameterError: win_len must be larger than win_hop.")

     # compute frame length and frame step (convert from seconds to samples)
     frame_length = win_len * fs
     frame_step = win_hop * fs
     signal_length = len(sig)
     frames_overlap = frame_length - frame_step

     # compute number of frames and left sample in order to pad if needed to make
     # sure all frames have equal number of samples  without truncating any samples
     # from the original signal
     rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap)
     pad_signal = np.append(sig, np.array([0] * int(frame_step - rest_samples) * int(rest_samples != 0.)))

     # apply stride trick
     frames = stride_trick(pad_signal, int(frame_length), int(frame_step))
     return frames, frame_length

In [106]:
class DataProcessing: 
    @staticmethod
    def record_to_frames(waveform, sample_rate, frame_size=5):
        p1d = (1, sample_rate * frame_size)
        out = torch.nn.functional.pad(waveform, p1d, "constant", 0)
        return out.unfold(0, sample_rate * frame_size, sample_rate * frame_size)

    @staticmethod
    def my_collate(batch):
        frames = []
        labels = []
        for (data,file_id) in batch: 
                (waveform, sample_rate) = data
                l_frames, frame_length = framing(waveform, sample_rate, 5.0, 5.0)
   
                for index in range(len(l_frames)):
                    frame = l_frames[index]
                    # audio_spectogram = spectogram(frame)
                    # audio_spectogram = audio_spectogram.repeat(3, 1, 1)
                    frames.append((frame, sample_rate, f'{file_id}_{(index+1)*5}'))
                    labels.append(file_id) 
        return [frames, labels]
    
    @staticmethod
    def melgram_v2(audio, sample_rate,  to_file):
         
        plt.figure(figsize=(3,2))
        plt.axis('off')  # no axis
        plt.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[])  # Remove the white edge
        melspectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
       
        # height = S.shape[0]  
        # image_cropped = S[int(height*0.2  ):int(height*0.8 ),:]
        librosa.display.specshow(librosa.power_to_db(melspectrogram, ref=np.max))
        plt.savefig(to_file, bbox_inches=None, pad_inches=0)
        plt.close()

In [107]:
dataSet = TrainAudioDataset()
batch_size = 1
dataLoader = DataLoader(dataSet, batch_size=batch_size, collate_fn=DataProcessing.my_collate)

In [108]:
it = iter(dataLoader)

In [109]:
# (X, y) = next(it)
# print(y)

In [110]:

for (X, Y) in dataLoader:
    for batch, x in tqdm(enumerate(X)):
        fileName = Y[batch]
        (frame, sample_rate, name) = x
        path =  f'./kaggle/input/birdclef-2023/test_melspectrogram/{fileName}/'
        Path(path).mkdir(parents=True, exist_ok=True)
        DataProcessing.melgram_v2(frame , sample_rate, f'{path}/{name}.png')
            

120it [00:13,  8.98it/s]


In [111]:
transform = transforms.Compose([ transforms.ToTensor() ])
dataset = datasets.ImageFolder('./kaggle/input/birdclef-2023/test_melspectrogram', transform=transform,  )
testloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False)

In [112]:
class_names = sorted(os.listdir('./kaggle/input/birdclef-2023/train_audio'))

model_ft = models.resnet18(pretrained=False)
num_ftrs = model_ft.fc.in_features
model_ft.fc = torch.nn.Linear(num_ftrs, len(class_names))
model_ft = model_ft.to(device)
criterion = torch.nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = torch.optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

checkpoint = torch.load('./models/model2023-04-028.pth', map_location=torch.device('cpu'))
model_ft.load_state_dict(checkpoint['model_state_dict'])
optimizer_ft.load_state_dict(checkpoint['optimizer_state_dict'])

model_ft.eval() 



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [113]:
sample_sub:pd.DataFrame = pd.read_csv("./kaggle/input/birdclef-2023/sample_submission.csv")
sample_sub[class_names] = sample_sub[class_names].astype(np.float32)
sample_sub.drop(sample_sub.index, inplace=True)
sample_sub.head()

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1


In [114]:
indexToClass = {v: k for k, v in dataset.class_to_idx.items()}
columns = []
 
with torch.no_grad():
    for batch, (X, y) in tqdm(enumerate(testloader)):
        pred = model_ft(X)
        for index, predItem in enumerate(pred):
            m = torch.nn.Softmax(dim=0)
            output = m(predItem) 
            print(output.argmax(0))
            fileName = dataset.imgs[batch * 64 + index][0].split('/')[-1].split('.')[0]
             
            columns = sample_sub.columns
            
            collection = np.append([fileName], output.detach().numpy())
    
            newRow = pd.DataFrame([collection], columns=sample_sub.columns)
            pd.DataFrame(newRow)
            sample_sub = pd.concat([sample_sub, newRow], ignore_index=True) 

0it [00:00, ?it/s]

tensor(89)
tensor(221)
tensor(8)
tensor(229)
tensor(221)
tensor(8)
tensor(221)
tensor(8)
tensor(8)
tensor(8)
tensor(229)
tensor(132)
tensor(169)
tensor(230)
tensor(31)
tensor(30)
tensor(25)
tensor(168)
tensor(168)
tensor(25)
tensor(247)
tensor(221)
tensor(20)
tensor(221)
tensor(221)
tensor(165)
tensor(221)
tensor(247)
tensor(221)
tensor(233)
tensor(163)
tensor(70)
tensor(70)
tensor(56)
tensor(70)
tensor(137)
tensor(47)
tensor(47)
tensor(132)
tensor(221)
tensor(221)
tensor(20)
tensor(20)
tensor(20)
tensor(165)
tensor(20)
tensor(20)
tensor(20)


1it [00:01,  1.55s/it]

tensor(20)
tensor(224)
tensor(225)
tensor(73)
tensor(221)
tensor(20)
tensor(20)
tensor(165)
tensor(20)
tensor(131)
tensor(73)
tensor(73)
tensor(73)
tensor(165)
tensor(73)
tensor(56)
tensor(17)
tensor(20)
tensor(132)
tensor(20)
tensor(135)
tensor(73)
tensor(73)
tensor(163)
tensor(73)
tensor(20)
tensor(165)
tensor(73)
tensor(221)
tensor(53)
tensor(225)
tensor(132)
tensor(20)
tensor(245)
tensor(20)
tensor(10)
tensor(214)
tensor(132)
tensor(221)
tensor(165)
tensor(93)
tensor(113)
tensor(20)
tensor(73)
tensor(84)
tensor(221)
tensor(221)
tensor(132)
tensor(165)
tensor(73)
tensor(20)
tensor(20)
tensor(10)
tensor(131)
tensor(132)
tensor(27)
tensor(20)
tensor(20)
tensor(131)
tensor(221)
tensor(20)
tensor(20)
tensor(221)


2it [00:02,  1.42s/it]

tensor(8)
tensor(128)
tensor(132)
tensor(132)
tensor(113)
tensor(6)
tensor(131)
tensor(221)
tensor(221)





In [115]:
sample_sub.to_csv("submission.csv", index=False)

In [116]:
print(dic)

{'soundscape_29201_10': tensor(89), 'soundscape_29201_100': tensor(221), 'soundscape_29201_105': tensor(8), 'soundscape_29201_110': tensor(229), 'soundscape_29201_115': tensor(221), 'soundscape_29201_120': tensor(8), 'soundscape_29201_125': tensor(221), 'soundscape_29201_130': tensor(8), 'soundscape_29201_135': tensor(8), 'soundscape_29201_140': tensor(8), 'soundscape_29201_145': tensor(229), 'soundscape_29201_15': tensor(132), 'soundscape_29201_150': tensor(169), 'soundscape_29201_155': tensor(230), 'soundscape_29201_160': tensor(31), 'soundscape_29201_165': tensor(30), 'soundscape_29201_170': tensor(25), 'soundscape_29201_175': tensor(168), 'soundscape_29201_180': tensor(168), 'soundscape_29201_185': tensor(25), 'soundscape_29201_190': tensor(247), 'soundscape_29201_195': tensor(221), 'soundscape_29201_20': tensor(20), 'soundscape_29201_200': tensor(221), 'soundscape_29201_205': tensor(221), 'soundscape_29201_210': tensor(165), 'soundscape_29201_215': tensor(221), 'soundscape_29201_2

In [117]:
torch.FloatTensor(sample_sub.loc[9][1:].values.astype(np.float)).argmax(0)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  torch.FloatTensor(sample_sub.loc[9][1:].values.astype(np.float)).argmax(0)


tensor(8)

In [119]:
sample_sub.columns[8]

'afgfly1'