In [2]:
# %load_ext autoreload
# %autoreload 2

import os
import sys
# Register the parent directory on the import path (at most once), presumably
# so the project-local modules imported below resolve when the notebook is
# launched from a sub-folder.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import tqdm
import librosa
import numpy as np
import matplotlib.pyplot as plt

import dcase_dataset
import sed_utils
import models
import train_model

import torch
import torch.utils.tensorboard

In [3]:
# TODO: maybe scaling and filterbanks should always be part of the neural network?
# 1. add trainable filterbanks
# 2. add trainable scaling

In [8]:
# Fetch the two PCEN configurations exposed by sed_utils (one named for
# bioacoustic recordings, one named for speech). They are consumed by the
# PCEN transforms defined further down in this cell.
bioacoustic_conf, speech_conf = (
    sed_utils.get_bioacoustic_pcen_conf(),
    sed_utils.get_speech_pcen_conf(),
)

# --- Data and feature-extraction configuration ------------------------------
sample_rate = 8000
# NOTE: machine-specific absolute path; the sample rate is baked into the
# directory name, so changing `sample_rate` selects a different folder.
root_path = '/mnt/storage_1/datasets/bioacoustics_dcase2022/Development_Set_{}Hz/Training_Set/'.format(sample_rate)
window_size = 4096
hop_size = window_size // 2  # half of window_size
include_background = True
n_classes = 48
n_time = 8

# TODO: I need to fix the memory issues with the data loading.
n_background = 500

# Hacky: not referenced by the transforms below (1 means "no scaling").
# TODO: Why does this scaling (e.g. 2**32) have such an effect?
audio_scaling = 1


def _remove_dc_offset(x):
    """Return ``x`` with its arithmetic mean over all elements subtracted.

    The mean is computed as ``np.sum(x) / np.size(x)``, exactly as the
    original inline expressions did, so numerical results are unchanged.
    """
    return x - (np.sum(x) / np.size(x))


# Named functions replace the former ``name = lambda ...`` assignments (PEP 8):
# they carry a real ``__name__`` in tracebacks and share one mean-removal
# helper instead of three copies of the same expression. ``sample_rate`` and
# the PCEN configs are still looked up at call time, as with the lambdas.
def transform_mel(x):
    """Mean-centre ``x`` and pass it to ``sed_utils.wav_to_mel`` with ``sample_rate``."""
    return sed_utils.wav_to_mel(_remove_dc_offset(x), sample_rate)


def transform_pcen_bio(x):
    """Mean-centre ``x`` and pass it to ``sed_utils.wav_to_pcen`` with ``bioacoustic_conf``."""
    return sed_utils.wav_to_pcen(_remove_dc_offset(x), sample_rate, bioacoustic_conf)


def transform_pcen_sch(x):
    """Mean-centre ``x`` and pass it to ``sed_utils.wav_to_pcen`` with ``speech_conf``."""
    return sed_utils.wav_to_pcen(_remove_dc_offset(x), sample_rate, speech_conf)

# Build the training dataset whose examples go through the bioacoustic-PCEN
# transform. The constructor arguments are gathered first so the whole
# configuration can be read (or reused) at a glance. In the recorded run this
# construction took roughly five and a half minutes.
pcen_bio_dataset_kwargs = dict(
    root_dir=root_path,
    window_size=window_size,
    hop_size=hop_size,
    sample_rate=sample_rate,
    n_classes=n_classes,
    n_time=n_time,
    n_background=n_background,
    transform=transform_pcen_bio,
    cache=True,
    is_validation_data=False,
)
base_dataset_pcen_bio = dcase_dataset.BioacousticDataset(**pcen_bio_dataset_kwargs)

# base_dataset_pcen_sch = dcase_dataset.BioacousticDataset(
#     root_dir           = root_path,
#     window_size        = window_size,
#     hop_size           = hop_size,
#     sample_rate        = sample_rate,
#     n_classes          = n_classes,
#     n_time             = n_time,
#     include_background = include_background, 
#     transform          = transform_pcen_sch,
#     cache              = True,
#     is_validation_data = False
# )

# base_dataset_mel = dcase_dataset.BioacousticDataset(
#     root_dir           = root_path,
#     window_size        = window_size,
#     hop_size           = hop_size,
#     sample_rate        = sample_rate,
#     n_classes          = n_classes,
#     n_time             = n_time,
#     include_background = include_background, 
#     transform          = transform_mel,
#     cache              = True,
#     is_validation_data = False
# )

# Registry of the datasets built in this cell, keyed by loudness-scaling
# variant. The speech-PCEN and mel entries stay disabled while their dataset
# constructions are commented out.
datasets = dict(
    pcen_bio=base_dataset_pcen_bio,
    # pcen_sch=base_dataset_pcen_sch,
    # mel=base_dataset_mel,
)

  0%|          | 0/174 [00:00<?, ?it/s]

building dataset ...


100%|██████████| 174/174 [05:31<00:00,  1.90s/it] 


In [9]:
# Smoke test: push one example through a freshly built model and print the
# shapes of the input, the target, and both model outputs.
x, y = datasets['pcen_bio'][0]
print("x shape: ", x.shape)
print("y shape: ", y.shape)

# The model is sized from the first two dimensions of the target array.
n_target_rows, n_target_cols = y.shape[0], y.shape[1]
my_net = models.get_model(n_target_rows, n_target_cols).double()

# Add leading singleton dimensions (1, 1, H, W) and cast to float64 to match
# the .double() model.
single_batch = torch.from_numpy(x.reshape((1, 1, x.shape[0], x.shape[1]))).double()
pred, x_rep = my_net(single_batch)
print(pred.shape)
print(x_rep.shape)

x shape:  (40, 33)
y shape:  (48, 8)
torch.Size([1, 48, 8])
torch.Size([1, 128])


In [10]:
# Feature variant to train on; must be a key of `datasets`.
loudness_scaling = 'pcen_bio'
base_dataset = datasets[loudness_scaling]

# 80/20 train/validation split; the validation part takes the remainder so
# the two sizes always add up to len(base_dataset).
train_size = int(0.8 * len(base_dataset))
valid_size = len(base_dataset) - train_size

# Seed the split explicitly. Without a generator, random_split draws from the
# global RNG, so every kernel restart yields a different partition and the
# validation losses of different runs are not comparable.
# NOTE(review): if neighbouring dataset items are overlapping windows, a
# purely random split can leak near-duplicates into validation. TODO confirm.
split_seed = 0
base_train, base_valid = torch.utils.data.random_split(
    base_dataset,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(split_seed),
)

In [11]:
# Train on the split created above. The experiment directory name encodes
# both the feature variant and the n_background setting.
experiment_dir = 'experiments/n_background/{}_n_background_{}'.format(loudness_scaling, n_background)
train_model.main(base_train, base_valid, experiment_dir=experiment_dir)

100%|██████████| 585/585 [00:13<00:00, 42.64it/s]

train loss: 0.05028160913638198



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.02447362700132225
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.92it/s]

train loss: 0.022699303840958442



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.0185644855440369
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.52it/s]

train loss: 0.018058210603771583



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.015700914625383967
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.42it/s]

train loss: 0.015911918758028866



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.014727717516184248
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.31it/s]

train loss: 0.014657389021355183



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.013902836294076437
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.30it/s]

train loss: 0.013810872901084901



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.013080429522495125
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.24it/s]

train loss: 0.01325481969901901



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.013245315464199646


100%|██████████| 585/585 [00:13<00:00, 43.10it/s]

train loss: 0.012737464202567281



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012720492328524789
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.07it/s]

train loss: 0.012282248606430292



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012849390475197389


100%|██████████| 585/585 [00:13<00:00, 43.09it/s]

train loss: 0.01194933235164351



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012287481262894
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.06it/s]

train loss: 0.011671284037199406



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.01244331204566239


100%|██████████| 585/585 [00:13<00:00, 42.86it/s]

train loss: 0.011391253676826887



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012002724233721868
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 43.06it/s]

train loss: 0.011065526550622061



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.01190802973261648
saving best model ...


100%|██████████| 585/585 [00:13<00:00, 42.81it/s]

train loss: 0.010879418561360758



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012155836330584294


100%|██████████| 585/585 [00:13<00:00, 42.78it/s]


train loss: 0.010655802699407893


  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.011970943421704792


100%|██████████| 585/585 [00:13<00:00, 42.44it/s]

train loss: 0.01037121345730524



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012281269135092464


100%|██████████| 585/585 [00:13<00:00, 42.17it/s]

train loss: 0.010126050659985826



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.01200015345264559


100%|██████████| 585/585 [00:14<00:00, 40.97it/s]

train loss: 0.009902454797752784



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012378343960174891


100%|██████████| 585/585 [00:14<00:00, 40.81it/s]

train loss: 0.009766851640203664



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.01217424848843974


100%|██████████| 585/585 [00:14<00:00, 40.26it/s]

train loss: 0.009592551938088832



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.011607488766107305
saving best model ...


100%|██████████| 585/585 [00:15<00:00, 38.98it/s]

train loss: 0.009399139932223152



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012378637444288671


100%|██████████| 585/585 [00:15<00:00, 38.00it/s]

train loss: 0.009179181733669495



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012312493985721897


100%|██████████| 585/585 [00:15<00:00, 37.59it/s]

train loss: 0.009023287799816914



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.01227619641099083


100%|██████████| 585/585 [00:15<00:00, 37.00it/s]

train loss: 0.008915101190525231



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012295606213376338


100%|██████████| 585/585 [00:16<00:00, 35.62it/s]

train loss: 0.008602069534937055



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012245527623797714


100%|██████████| 585/585 [00:16<00:00, 34.71it/s]

train loss: 0.008510373192150164



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012695475457494035


100%|██████████| 585/585 [00:16<00:00, 34.51it/s]

train loss: 0.008265192419514917



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012458331801220374


100%|██████████| 585/585 [00:17<00:00, 33.42it/s]

train loss: 0.00815147807432994



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.012473434848472935


100%|██████████| 585/585 [00:17<00:00, 33.39it/s]

train loss: 0.008045038078402627



  0%|          | 0/585 [00:00<?, ?it/s]

valid loss: 0.01270528146961996


100%|██████████| 585/585 [00:17<00:00, 32.99it/s]

train loss: 0.007891773653332998





valid loss: 0.013019155154467114
