In [59]:
import numpy as np
import pandas as pd
from glob import glob
import onnxruntime as nxrun
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torch
import librosa
import timm
from torchsummary import summary

### Configuration

In [60]:
# torch.set_num_threads(1)


class CFG:
    sample_rate = 32000
    output_window_duration = 5
    input_window_duration = 5
    batch_size = 2
    image_width = 157
    audio_len = sample_rate * input_window_duration
    hop_length = audio_len // (image_width - 1)

### Prepare data

In [61]:
BASE_PATH = "../data/birdclef-2024"
MODEL_PATH = "../model2.onnx"

In [62]:
test_paths = glob(f"{BASE_PATH}/test_soundscapes/*ogg")
# During commit use `unlabeled` data as there is no `test` data.
# During submission `test` data will automatically be populated.
if len(test_paths) == 0:
    test_paths = glob(f"{BASE_PATH}/unlabeled_soundscapes/*ogg")[:40]
test_df = pd.DataFrame(test_paths, columns=["filepath"])

### Load model

In [63]:
onnx_model = nxrun.InferenceSession(MODEL_PATH)

In [64]:
model = torch.jit.load("../test_www.pt")

# model = timm.create_model(
#     "tf_efficientnet_b0_ns",
#     pretrained=True,
#     num_classes=182,
#     global_pool="avg",
#     in_chans=3,
# )

# model = model.eval()

### Data preparation

1. Create data loader
2. We need to classify track every 5 seconds, but model trained on 10 seconds windows. 
 - if track is less than 10 seconds, we will duplicate it to 10 seconds
 - if track is more than 10 seconds, we will split it to 10 seconds windows
 - we pass 10 seconds windows
 - new windows position is after 5 seconds from start of previous window.
 - last window will be 5 seconds long, we will duplicate it to 10 seconds

In [65]:
def generate_mel_spectrogram(
    waveform, sample_rate, n_mels, n_fft, hop_length, f_min, f_max, top_db
):
    mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
        f_min=f_min,
        f_max=f_max,
    )
    amplitude_db_transform = torchaudio.transforms.AmplitudeToDB(top_db=top_db)

    mel_spectrogram = mel_spectrogram_transform(waveform)
    mel_spectrogram_db = amplitude_db_transform(mel_spectrogram)

    return mel_spectrogram_db


def generate_mfcc(
    waveform, sample_rate, n_mfcc, n_mels, n_fft, hop_length, f_min, f_max
):
    mfcc_transform = torchaudio.transforms.MFCC(
        sample_rate=sample_rate,
        n_mfcc=n_mfcc,
        melkwargs={
            "n_mels": n_mels,
            "n_fft": n_fft,
            "hop_length": hop_length,
            "f_min": f_min,
            "f_max": f_max,
        },
    )
    mfcc = mfcc_transform(waveform)
    return mfcc


def generate_chroma_feature(waveform, sr, n_fft, hop_length, n_chroma, epsilon=1e-6):
    try:
        stft = torch.stft(
            waveform,
            n_fft=n_fft,
            hop_length=hop_length,
            return_complex=True,
            win_length=n_fft,
            # window=torch.hann_window(n_fft),
        )
        magnitude = stft.abs() + epsilon  # Adding epsilon to avoid log(0) issues
        chroma_filter = librosa.filters.chroma(sr=sr, n_fft=n_fft, n_chroma=n_chroma)
        chroma_filter = torch.tensor(chroma_filter, dtype=torch.float32)
        chroma = torch.matmul(chroma_filter, magnitude.squeeze(0))
        chroma = chroma / torch.max(chroma) + epsilon
        return chroma
    except Exception as e:
        print(e)
        # return torch.zeros((n_chroma, 1))


class MonoToThreeChannel(torch.nn.Module):
    def __init__(
        self,
        sample_rate,
        n_mels,
        n_fft,
        hop_length,
        f_min,
        f_max,
        top_db,
        n_mfcc,
        n_chroma,
    ):
        super(MonoToThreeChannel, self).__init__()
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max
        self.top_db = top_db
        self.n_mfcc = n_mfcc
        self.n_chroma = n_chroma

    def forward(self, waveform):
        mel_time = time.time()
        # Generate Mel Spectrogram
        mel_spectrogram_db = generate_mel_spectrogram(
            waveform,
            self.sample_rate,
            self.n_mels,
            self.n_fft,
            self.hop_length,
            self.f_min,
            self.f_max,
            self.top_db,
        )
        # print(f"Mel Time: {time.time() - mel_time}")

        mmc_time = time.time()
        # Generate MFCC
        mfcc = generate_mfcc(
            waveform,
            self.sample_rate,
            self.n_mfcc,
            self.n_mels,
            self.n_fft,
            self.hop_length,
            self.f_min,
            self.f_max,
        )
        # Resize MFCC to match Mel Spectrogram dimensions
        mfcc_resized = torch.nn.functional.interpolate(
            mfcc.unsqueeze(0), size=mel_spectrogram_db.shape[1:], mode="bilinear"
        ).squeeze(0)
        # print(f"MMC Time: {time.time() - mmc_time}")

        chroma_time = time.time()
        # Generate Chroma Features
        chroma = generate_chroma_feature(
            waveform,
            sr=self.sample_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_chroma=self.n_chroma,
        ).unsqueeze(0)

        # Resize Chroma to match Mel Spectrogram dimensions
        chroma_resized = torch.nn.functional.interpolate(
            chroma.unsqueeze(0), size=mel_spectrogram_db.shape[1:], mode="bilinear"
        ).squeeze(0)
        # print(f"Chroma Time: {time.time() - chroma_time}")

        stack_time = time.time()
        # Stack to create a 3-channel image
        final_output = torch.stack(
            [mel_spectrogram_db, mfcc_resized, chroma_resized], dim=0
        ).squeeze(1)
        # print(f"Stack Time: {time.time() - stack_time}")
        return final_output


class NormalizeData(torch.nn.Module):
    def __init__(self):
        super(NormalizeData, self).__init__()

    def forward(self, x):
        min_val = torch.min(x)
        max_val = torch.max(x)
        if max_val - min_val == 0:
            return x
        return (x - min_val) / (max_val - min_val)

In [66]:
class DataLoader:
    def __init__(self, path: str):
        self.path = path
        waveform, sample_rate = torchaudio.load(path)
        waveform = self.standardize_waveform(waveform, sample_rate)
        self.frames = self.get_frames(waveform)
        # self.frames = self.to_model_input(self.frames)

    def get_frames(self, waveform):
        predict_frame_size = CFG.sample_rate * CFG.output_window_duration
        model_frame_size = CFG.sample_rate * CFG.input_window_duration
        waveform = torch.cat(
            [
                waveform,
                waveform[:, -1 * predict_frame_size :],
            ],
            dim=-1,
        )
        waveform_with_padding = torch.nn.functional.pad(
            waveform, (0, predict_frame_size - waveform.shape[1] % predict_frame_size)
        )

        windows = []
        for i in range(
            0,
            waveform_with_padding.shape[1] - predict_frame_size,
            predict_frame_size,
        ):
            window = waveform_with_padding[
                :, i : i + CFG.sample_rate * CFG.input_window_duration
            ]
            window = self.to_model_input(window)
            windows.append(window)

        return torch.stack(windows)

    def to_model_input(self, frames):
        preparedWawe = torch.nn.Sequential(
            *[
                MonoToThreeChannel(
                    sample_rate=32000,
                    n_mels=128,
                    n_fft=2048,
                    hop_length=CFG.hop_length,
                    top_db=80,
                    f_min=0,
                    f_max=16000,
                    n_mfcc=20,
                    n_chroma=12,
                ),
                NormalizeData(),
            ]
        )
        return preparedWawe(frames)

    def standardize_waveform(
        self, waveform: torch.Tensor, sample_rate: int
    ) -> torch.Tensor:
        if len(waveform) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sample_rate != CFG.sample_rate:
            waveform = torchaudio.transforms.Resample(
                sample_rate, CFG.sample_rate, dtype=waveform.dtype
            )(waveform)

        return waveform

    def __iter__(self):
        for data in self.dataloader:
            yield data

    def __len__(self):
        return len(self.dataloader)

In [67]:
class DataLoader:
    def __init__(self, path: str):
        self.path = path
        waveform, sample_rate = torchaudio.load(path)
        waveform = self.standardize_waveform(waveform, sample_rate)
        self.frames = self.get_frames(waveform)
        # self.frames = self.to_model_input(self.frames)

    def get_frames(self, waveform):
        predict_frame_size = CFG.sample_rate * CFG.output_window_duration
        model_frame_size = CFG.sample_rate * CFG.input_window_duration
        waveform = torch.cat(
            [
                waveform,
                waveform[:, -1 * predict_frame_size :],
            ],
            dim=-1,
        )
        waveform_with_padding = torch.nn.functional.pad(
            waveform, (0, predict_frame_size - waveform.shape[1] % predict_frame_size)
        )

        windows = []
        for i in range(
            0,
            waveform_with_padding.shape[1] - predict_frame_size,
            predict_frame_size,
        ):
            window = waveform_with_padding[
                :, i : i + CFG.sample_rate * CFG.input_window_duration
            ]
            window = self.to_model_input(window)
            windows.append(window)

        return torch.stack(windows)

    def forward(self, waveform):
        return None

    def to_model_input(self, frames):
        preparedWawe = torch.nn.Sequential(
            *[
                MonoToThreeChannel(
                    sample_rate=32000,
                    n_mels=128,
                    n_fft=2048,
                    hop_length=CFG.hop_length,
                    top_db=80,
                    f_min=0,
                    f_max=16000,
                    n_mfcc=20,
                    n_chroma=12,
                ),
                NormalizeData(),
            ]
        )
        return preparedWawe(frames)

    def standardize_waveform(
        self, waveform: torch.Tensor, sample_rate: int
    ) -> torch.Tensor:
        if len(waveform) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sample_rate != CFG.sample_rate:
            waveform = torchaudio.transforms.Resample(
                sample_rate, CFG.sample_rate, dtype=waveform.dtype
            )(waveform)

        return waveform

    def __iter__(self):
        for data in self.dataloader:
            yield data

    def __len__(self):
        return len(self.dataloader)

In [68]:
import cProfile

with cProfile.Profile() as pr:
    DataLoader("../data/birdclef-2024/unlabeled_soundscapes/460830.ogg")
    pr.print_stats()

         62925 function calls (57780 primitive calls) in 0.284 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.284    0.284 3526370188.py:2(__init__)
       49    0.001    0.000    0.209    0.004 3526370188.py:40(to_model_input)
        1    0.000    0.000    0.000    0.000 3526370188.py:59(standardize_waveform)
        1    0.001    0.001    0.215    0.215 3526370188.py:9(get_frames)
       49    0.000    0.000    0.059    0.001 517676389.py:1(generate_mel_spectrogram)
       49    0.000    0.000    0.000    0.000 517676389.py:142(__init__)
       49    0.003    0.000    0.004    0.000 517676389.py:145(forward)
       49    0.000    0.000    0.063    0.001 517676389.py:20(generate_mfcc)
       49    0.002    0.000    0.065    0.001 517676389.py:38(generate_chroma_feature)
       49    0.000    0.000    0.002    0.000 517676389.py:60(__init__)
       49    0.002    0.000    0.200    0.004 51

In [69]:
class DataLoaderV2:
    preparedWawe = torch.nn.Sequential(
        *[
            MonoToThreeChannel(
                sample_rate=32000,
                n_mels=128,
                n_fft=2048,
                hop_length=CFG.hop_length,
                top_db=80,
                f_min=0,
                f_max=16000,
                n_mfcc=20,
                n_chroma=12,
            ),
            NormalizeData(),
        ]
    )

    def get_frames(self, waveform):
        predict_frame_size = CFG.sample_rate * CFG.output_window_duration
        waveform = torch.cat(
            [
                waveform,
                waveform[:, -1 * predict_frame_size :],
            ],
            dim=-1,
        )
        waveform_with_padding = torch.nn.functional.pad(
            waveform, (0, predict_frame_size - waveform.shape[1] % predict_frame_size)
        )

        windows = []
        for i in range(
            0,
            waveform_with_padding.shape[1] - predict_frame_size,
            predict_frame_size,
        ):
            window = waveform_with_padding[
                :, i : i + CFG.sample_rate * CFG.input_window_duration
            ]
            window = self.to_model_input(window)
            windows.append(window)

        return torch.stack(windows)

    def forward(self, path):
        self.path = path
        load_time = time.time()
        waveform, sample_rate = torchaudio.load(path)
        load_time = time.time() - load_time

        standardize_time = time.time()
        waveform = self.standardize_waveform(waveform, sample_rate)
        standardize_time = time.time() - standardize_time

        get_frames_time = time.time()
        frames = self.get_frames(waveform)
        get_frames_time = time.time() - get_frames_time

        # print(f"Load Time: {load_time}")
        # print(f"Standardize Time: {standardize_time}")
        # print(f"Get Frames Time: {get_frames_time}")
        return frames

    def to_model_input(self, frames):

        return self.preparedWawe(frames)

    def standardize_waveform(
        self, waveform: torch.Tensor, sample_rate: int
    ) -> torch.Tensor:
        if len(waveform) > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sample_rate != CFG.sample_rate:
            waveform = torchaudio.transforms.Resample(
                sample_rate, CFG.sample_rate, dtype=waveform.dtype
            )(waveform)

        return waveform

    def __iter__(self):
        for data in self.dataloader:
            yield data

    def __len__(self):
        return len(self.dataloader)

In [51]:
import cProfile

with cProfile.Profile() as pr:
    for i in range(100):
        DataLoader("../data/birdclef-2024/unlabeled_soundscapes/460830.ogg")
    pr.print_stats()

Mel Time: 0.0015118122100830078
MMC Time: 0.0014104843139648438
Chroma Time: 0.0013234615325927734
Stack Time: 3.743171691894531e-05
Mel Time: 0.0011112689971923828
MMC Time: 0.0013842582702636719
Chroma Time: 0.0013213157653808594
Stack Time: 3.409385681152344e-05
Mel Time: 0.0010654926300048828
MMC Time: 0.0012118816375732422
Chroma Time: 0.0012011528015136719
Stack Time: 3.0994415283203125e-05
Mel Time: 0.0012981891632080078
MMC Time: 0.0013170242309570312
Chroma Time: 0.0012440681457519531
Stack Time: 3.1948089599609375e-05
Mel Time: 0.0010657310485839844
MMC Time: 0.0012264251708984375
Chroma Time: 0.0014605522155761719
Stack Time: 3.5762786865234375e-05
Mel Time: 0.0011031627655029297
MMC Time: 0.0012440681457519531
Chroma Time: 0.0012421607971191406
Stack Time: 3.24249267578125e-05
Mel Time: 0.0010609626770019531
MMC Time: 0.001374959945678711
Chroma Time: 0.0013039112091064453
Stack Time: 3.4332275390625e-05
Mel Time: 0.0011110305786132812
MMC Time: 0.0012269020080566406
Chroma

KeyboardInterrupt: 

In [70]:
import cProfile
import time

loader = DataLoaderV2()
with cProfile.Profile() as pr:
    for i in range(1):
        loader.forward("../data/birdclef-2024/unlabeled_soundscapes/460830.ogg")
    pr.print_stats()

Load Time: 0.06781530380249023
Standardize Time: 2.4557113647460938e-05
Get Frames Time: 0.2005605697631836
         57962 function calls (53258 primitive calls) in 0.269 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.001    0.001    0.201    0.201 3219777498.py:19(get_frames)
        1    0.000    0.000    0.269    0.269 3219777498.py:46(forward)
       49    0.000    0.000    0.196    0.004 3219777498.py:65(to_model_input)
        1    0.000    0.000    0.000    0.000 3219777498.py:69(standardize_waveform)
       49    0.000    0.000    0.057    0.001 517676389.py:1(generate_mel_spectrogram)
       49    0.003    0.000    0.004    0.000 517676389.py:145(forward)
       49    0.000    0.000    0.060    0.001 517676389.py:20(generate_mfcc)
       49    0.002    0.000    0.061    0.001 517676389.py:38(generate_chroma_feature)
       49    0.002    0.000    0.191    0.004 517676389.py:83(forward)
      196    

### Prepare paths

In [52]:
test_paths = glob(f"{BASE_PATH}/test_soundscapes/*ogg")
# During commit use `unlabeled` data as there is no `test` data.
# During submission `test` data will automatically be populated.
if len(test_paths) == 0:
    test_paths = glob(f"{BASE_PATH}/unlabeled_soundscapes/*ogg")[:90]
test_df = pd.DataFrame(test_paths, columns=["filepath"])
test_df.head()

Unnamed: 0,filepath
0,../data/birdclef-2024/unlabeled_soundscapes/13...
1,../data/birdclef-2024/unlabeled_soundscapes/92...
2,../data/birdclef-2024/unlabeled_soundscapes/13...
3,../data/birdclef-2024/unlabeled_soundscapes/19...
4,../data/birdclef-2024/unlabeled_soundscapes/91...


In [53]:
# test_dataset = BirdCLEFDataset(test_df["filepath"].tolist())
# test_loader = DataLoader(
#     test_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=4
# )\

loader = DataLoader(test_df["filepath"].loc[0])
print(loader.frames.shape)

Mel Time: 0.0014636516571044922
MMC Time: 0.0014731884002685547
Chroma Time: 0.0013968944549560547
Stack Time: 4.00543212890625e-05
Mel Time: 0.0010900497436523438
MMC Time: 0.0012691020965576172
Chroma Time: 0.0013380050659179688
Stack Time: 3.6716461181640625e-05
Mel Time: 0.0010447502136230469
MMC Time: 0.001226186752319336
Chroma Time: 0.0012736320495605469
Stack Time: 3.170967102050781e-05
Mel Time: 0.00103759765625
MMC Time: 0.0012187957763671875
Chroma Time: 0.0012547969818115234
Stack Time: 3.218650817871094e-05
Mel Time: 0.0010285377502441406
MMC Time: 0.0014379024505615234
Chroma Time: 0.0018012523651123047
Stack Time: 5.435943603515625e-05
Mel Time: 0.0014197826385498047
MMC Time: 0.0017201900482177734
Chroma Time: 0.0017979145050048828
Stack Time: 5.626678466796875e-05
Mel Time: 0.0014812946319580078
MMC Time: 0.0017647743225097656
Chroma Time: 0.0018541812896728516
Stack Time: 5.888938903808594e-05
Mel Time: 0.0014052391052246094
MMC Time: 0.0017461776733398438
Chroma Time

### Prepare dataset

In [54]:
classMapperDF = pd.read_csv(f"../data/processed/fine_tune_mapper.csv")

In [55]:
pred_df = pd.DataFrame(columns=np.concatenate((["row_id"], classMapperDF["species"])))

In [None]:
pred_df

Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1


In [56]:
model = model.eval()

In [73]:
from datetime import datetime

loader = DataLoaderV2()
start_time = datetime.now()
new_rows = []
for path in test_df["filepath"]:
    base_row_id = path.split("/")[-1].split(".")[0]
    # data_loader = DataLoader(path)
    frames = loader.forward(path)
    with torch.no_grad():
        start_p_time = datetime.now()

        # output_array = onnx_model.run(None, {"input": frames.numpy()})[0]
        output_array = model(frames)
        output_array = torch.randn(49, 182)
        end_p_time = datetime.now()
        # print("Duration: {}".format(end_p_time - start_p_time))

        # result = model.run(None, {"input": frames.numpy()})
        # output_array = result[0]
        for frame_id in range(0, len(output_array)):
            row_id = base_row_id + f"_{(frame_id+1) * CFG.output_window_duration}"
            new_row_data = np.concatenate(([row_id], output_array[frame_id]))
            new_rows.append(new_row_data)

pred_df = pd.DataFrame(new_rows, columns=pred_df.columns)

end_time = datetime.now()
print("Duration: {}".format(end_time - start_time))

Load Time: 0.07143640518188477
Standardize Time: 1.52587890625e-05
Get Frames Time: 0.18832969665527344
Load Time: 0.07385754585266113
Standardize Time: 1.621246337890625e-05
Get Frames Time: 0.18227362632751465
Load Time: 0.07208895683288574
Standardize Time: 1.8358230590820312e-05
Get Frames Time: 0.17629432678222656
Load Time: 0.07301926612854004
Standardize Time: 1.7404556274414062e-05
Get Frames Time: 0.18079328536987305
Load Time: 0.07032513618469238
Standardize Time: 1.71661376953125e-05
Get Frames Time: 0.19057464599609375
Load Time: 0.07773184776306152
Standardize Time: 1.5497207641601562e-05
Get Frames Time: 0.17754673957824707
Load Time: 0.07842230796813965
Standardize Time: 1.9073486328125e-05
Get Frames Time: 0.1820204257965088
Load Time: 0.0733346939086914
Standardize Time: 1.71661376953125e-05
Get Frames Time: 0.1946108341217041
Load Time: 0.07330870628356934
Standardize Time: 1.8596649169921875e-05
Get Frames Time: 0.1809825897216797
Load Time: 0.08031153678894043
Stand

### Predict

In [22]:
for path in test_df["filepath"]:
    base_row_id = path.split("/")[-1].split(".")[0]
    data_loader = DataLoader(path)
    frames = data_loader.frames
    with torch.no_grad():
        output_array = model(frames)
        # output_array = torch.randn(49, 182)
        #     result = model.run(None, {"input": frames.numpy()})
        #     output_array = result[0]
        for frame_id in range(0, len(output_array)):
            row_id = base_row_id + f"_{(frame_id+1) * CFG.output_window_duration}"
            new_row_data = np.concatenate(([row_id], output_array[frame_id]))
            new_row = pd.DataFrame([new_row_data], columns=pred_df.columns)

            pred_df = pd.concat([pred_df, new_row], ignore_index=True)

In [88]:
import time

new_rows = []
# Initialize a dictionary to store execution times
execution_times = {
    "DataLoader": 0,
    "Frames extraction": 0,
    "Output array generation": 0,
    "Row ID generation": 0,
    "New row data concatenation": 0,
    "New row creation": 0,
    "Dataframe concatenation": 0,
}

for path in test_df["filepath"]:
    base_row_id = path.split("/")[-1].split(".")[0]

    start_time = time.time()
    data_loader = DataLoader(path)
    execution_times["DataLoader"] += time.time() - start_time

    start_time = time.time()
    frames = data_loader.frames
    execution_times["Frames extraction"] += time.time() - start_time

    with torch.no_grad():
        start_time = time.time()
        output_array = torch.randn(49, 182)
        execution_times["Output array generation"] += time.time() - start_time

        for frame_id in range(0, len(output_array)):
            start_time = time.time()
            row_id = base_row_id + f"_{(frame_id+1) * CFG.output_window_duration}"
            execution_times["Row ID generation"] += time.time() - start_time

            start_time = time.time()
            new_row_data = np.concatenate(([row_id], output_array[frame_id]))
            execution_times["New row data concatenation"] += time.time() - start_time

            # start_time = time.time()
            # new_row = pd.DataFrame([new_row_data], columns=pred_df.columns)
            # execution_times["New row creation"] += time.time() - start_time

            # start_time = time.time()
            # # pred_df = pd.concat([pred_df, new_row], ignore_index=True)
            # execution_times["Dataframe concatenation"] += time.time() - start_time
            new_rows.append(new_row_data)

# Print the total execution time for each operation
for operation, total_time in execution_times.items():
    print(f"{operation} total execution time: {total_time} seconds")

concat_start_time = time.time()
# pred_df = pd.concat([pred_df] + new_rows, ignore_index=True)
pred_df = pd.DataFrame(new_rows, columns=pred_df.columns)
concat_time = time.time() - concat_start_time
print(f"Dataframe concatenation total execution time: {concat_time} seconds")
# Print the final total execution time
print(f"Final total execution time: {sum(execution_times.values())} seconds")

DataLoader total execution time: 24.436153888702393 seconds
Frames extraction total execution time: 0.00016736984252929688 seconds
Output array generation total execution time: 0.0044710636138916016 seconds
Row ID generation total execution time: 0.0023369789123535156 seconds
New row data concatenation total execution time: 0.21045184135437012 seconds
New row creation total execution time: 0 seconds
Dataframe concatenation total execution time: 0 seconds
Dataframe concatenation total execution time: 0.27167797088623047 seconds
Final total execution time: 24.653581142425537 seconds


In [89]:
pred_df

Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1
0,1384345978_5,1.0776007,0.6741012,0.0544685,-0.78157264,0.53743535,0.09485035,1.8617955,1.0633079,0.86832786,...,0.7763495,-0.29598126,0.3969708,1.7034777,-2.2145097,1.4041456,-0.5184641,-0.120054275,-0.060386438,0.7225028
1,1384345978_10,0.6494597,-0.2901487,-0.48894414,-0.30237877,0.1788953,-0.4497408,0.22455451,-2.0998518,0.15366834,...,-0.08087396,0.26804015,-0.70137197,-0.11277991,1.4401516,0.7698393,0.4070188,-0.70735306,-2.2009292,0.9274572
2,1384345978_15,0.97071475,0.8023224,0.38441458,-0.15527017,-0.25191098,-0.23336534,-0.049061753,-1.5023974,0.47976297,...,2.2641761,-1.0962859,0.04842051,0.3163507,-1.1614702,-0.83594054,0.51708436,0.6178873,0.92735726,0.010133981
3,1384345978_20,-0.38427103,0.7773897,-2.4634264,1.2180364,1.113323,-0.16030781,-0.7830721,-0.84748226,0.86722046,...,0.15073575,-0.9688482,-0.13074529,0.47583234,-3.7785246,1.3148599,-1.4430399,0.7339534,-0.28226522,-0.011543943
4,1384345978_25,0.98909324,0.066606216,0.3482165,-0.37295425,-0.13830522,0.480619,0.8879841,-0.28469414,-1.047104,...,-1.1245956,0.16563845,-0.27964205,-0.80478144,-1.0982355,1.0347165,-0.6127792,0.43175706,-0.33373922,0.50800693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,236907488_225,0.5067533,-0.75665045,1.6858311,0.705261,0.34859753,-0.10771835,-0.17235884,0.04258785,-0.5867075,...,-0.15179333,-0.6391763,0.6333532,0.8112291,0.71963865,-0.78802913,-0.48651102,1.0615046,0.8402284,2.0393589
4406,236907488_230,-0.0044750934,1.5766262,0.13250346,0.4398087,0.89394915,1.0765836,-0.43309766,-0.49229658,-1.0057758,...,0.42885593,0.83300817,-0.979975,0.31617653,1.8032532,0.6013318,0.027082508,0.22106645,0.032018907,-0.48135352
4407,236907488_235,0.875149,-0.32174176,-0.5571391,0.29552817,0.38993287,0.37641156,-1.1992022,-0.5449401,-0.40128762,...,-0.14508022,-0.42784664,0.7519274,-0.010662538,-0.8135343,-0.020056078,-2.5095742,1.1171914,0.024066571,-1.6416885
4408,236907488_240,-0.1078298,1.83347,0.004952802,-0.896093,-1.6775184,-0.09543034,2.1938798,0.34432304,-0.2907345,...,-1.1534637,2.12393,-1.9457498,1.3559031,1.5873266,1.8450834,0.7023333,-0.45089382,-0.6395919,-0.50324607


In [34]:
for path in test_df["filepath"]:
    base_row_id = path.split("/")[-1].split(".")[0]
    data_loader = DataLoader(path)
    frames = data_loader.frames
    with torch.no_grad():
        output_array = onnx_model.run(None, {"input": frames.numpy()})[0]
        # output_array = model(frames)
        #     result = model.run(None, {"input": frames.numpy()})
        #     output_array = result[0]
        for frame_id in range(0, len(output_array)):
            row_id = base_row_id + f"_{(frame_id+1) * CFG.output_window_duration}"
            new_row_data = np.concatenate(([row_id], output_array[frame_id]))
            new_row = pd.DataFrame([new_row_data], columns=pred_df.columns)

            pred_df = pd.concat([pred_df, new_row], ignore_index=True)

In [35]:
output_array[0].shape[0]

182

In [None]:
model_to_save = timm.create_model(
    "tf_efficientnet_b0_ns",
    pretrained=True,
    num_classes=180,
    global_pool="avg",
    in_chans=3,
)

  model = create_fn(


In [None]:
model_to_save(torch.randn(150, 3, 128, 626))

tensor([[-0.2166,  0.3844,  0.0238,  ..., -0.2025,  0.6759,  0.1642],
        [-0.2514,  0.0551,  0.0518,  ..., -0.0594,  0.5920, -0.0513],
        [ 0.3398,  0.2339,  0.2568,  ...,  0.1496,  0.0366, -0.0982],
        ...,
        [-0.0703, -0.0876,  0.1078,  ...,  0.0125,  0.1958,  0.2140],
        [ 0.0760,  0.2633, -0.0925,  ...,  0.0630,  0.4420,  0.1603],
        [ 0.0654,  0.0424,  0.2009,  ...,  0.0391,  0.2000, -0.0471]],
       grad_fn=<AddmmBackward0>)

In [None]:
input_name = "input"
output_name = "output"

torch.onnx.export(
    model_to_save,
    torch.randn(1, 3, 128, 626),
    "test.onnx",
    input_names=[input_name],
    output_names=[output_name],
    dynamic_axes={input_name: {0: "batch_size"}, output_name: {0: "batch_size"}},
)

In [None]:
import onnxruntime as nxrun

onnx_model = nxrun.InferenceSession("test.onnx")

In [None]:
# output_array = onnx_model.run(None, {"input": torch.randn(1500, 3, 128, 626).numpy()})

In [None]:
pred_df.to_csv("submission.csv", index=False)

In [None]:
loader = DataLoader("../data/birdclef-2024/unlabeled_soundscapes/460830.ogg")

In [None]:
loader.frames.shape

torch.Size([49, 3, 128, 626])

In [None]:
loader.cframes.shape

AttributeError: 'DataLoader' object has no attribute 'cframes'

In [None]:
input_names = [input.name for input in model.get_inputs()]
print("Model input names: ", input_names)

Model input names:  ['input']


In [None]:
with torch.no_grad():
    output = model(loader.frames)

In [None]:
output.argmax(dim=1)

tensor([ 44,  44,  44,  39,  39,  39,  39,  44,  44,  39,  39,  44,  39,  43,
        136,  43,  43,  70,  43,  73,   4,   4,   4,   4,  73, 138, 140, 140,
        138,  73,  76,  76,  38, 155, 155,  90,   4,  80,  38, 155,  90,   4,
          4,   4,   4,   4,   4,   4,  73])

In [None]:
print("The model expects input shape: ", onnx_model.get_inputs()[0].shape)

batch_size = 32
input_tensor = torch.randn(batch_size, 3, 128, 626).numpy()

result = onnx_model.run(None, {"input": loader.frames.numpy()})
prob = result[0]

The model expects input shape:  ['batch_size', 3, 128, 626]


In [36]:
len(pred_df)

4410