In [1]:
from glob import glob
import librosa
import soundfile as sf
import numpy as np
import pandas as pd
import sys
import time
import datetime
from tqdm import tqdm
import random
import os
import gc
import cv2

from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, AdamW, lr_scheduler
from torch.distributions import Uniform
from torch.utils.data import DataLoader, Dataset

from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation


from efficientnet_pytorch import EfficientNet
from torchvision.models import resnet34, resnet50

import sys
sys.path.append('..')
from libs import transform as tr
from libs import spectrogram as spec
from libs import criterion as cr

import warnings
warnings.filterwarnings("ignore")



In [2]:
def set_seed(seed: int = 42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (via ``torch.manual_seed`` and ``torch.cuda.manual_seed``), stores the
    seed in the ``PYTHONHASHSEED`` environment variable, and switches cuDNN
    to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Each of these accepts the integer seed as its only argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(53)

In [3]:
# Sample rate (Hz) used throughout the notebook: recordings are resampled to
# it on load and it is passed as `sr` to the librosa feature calls.
SR = 48000

# Tables read from the shared data directory (paths are relative to the
# notebook's working directory).
submission = pd.read_csv('../../data/sample_submission.csv')
train_tp = pd.read_csv('../../data/train_tp.csv')

# Every column of the sample submission except the first one.
# NOTE(review): presumably the first column is the recording id and the rest
# are the prediction targets - confirm against the CSV.
pred_target = submission.columns[1:].tolist()

In [4]:
def normalize_melspec(X: np.ndarray):
    """Standardise an array and rescale it to the 0-255 ``uint8`` range.

    The input is centred on its global mean and divided by its global
    standard deviation (plus a small epsilon). The standardised values are
    then min-max scaled to [0, 255] and cast to ``np.uint8`` (the cast
    truncates towards zero). ``X`` itself is left untouched because every
    step allocates a new array.

    Args:
        X: Array to normalise (e.g. a spectrogram).

    Returns:
        A ``np.uint8`` array with the same shape as ``X``. If the
        standardised values span no more than the epsilon (for example a
        constant input), the result is all zeros.
    """
    eps = 1e-6
    centered = X - X.mean()
    standardized = centered / (centered.std() + eps)
    lo, hi = standardized.min(), standardized.max()

    if not ((hi - lo) > eps):
        # Effectively constant input: there is no range to stretch.
        return np.zeros_like(standardized, dtype=np.uint8)

    # `lo` and `hi` are the array's own extrema, so no clipping is needed
    # before stretching the values onto [0, 255].
    stretched = 255 * (standardized - lo) / (hi - lo)
    return stretched.astype(np.uint8)

In [5]:
class RfcxDataSet(Dataset):
    """Render recordings as stacked spectrogram images and cache them on disk.

    Each item corresponds to one row of ``df``. The recording named by the
    row's ``recording_id`` is read from ``data_path``, optionally augmented
    (train mode), cut into fixed-length windows, and every window is turned
    into a 3-channel image: dB mel spectrogram, PCEN, and dB of the mel
    spectrogram raised to the power 1.5. As a side effect of ``__getitem__``
    the result is written with ``torch.save`` to
    ``<save_dir>/<recording_id>.tensor``.

    Args:
        df: Table with a ``recording_id`` column; in train mode the columns
            ``species_id``, ``f_min`` and ``f_max`` are read as well.
        train: If true, apply the waveform augmentations, restrict the mel
            filter bank to the row's ``f_min``/``f_max`` and return a target.
        data_path: Directory containing ``<recording_id>.flac`` files.
        pcen_parameters: Keyword arguments forwarded to ``librosa.pcen``.
        n_mels: Number of mel bands.
        save_dir: Directory the cached tensors are written to. ``None``
            (default) keeps the original hard-coded train/test locations.

    NOTE(review): the cache file name depends only on ``recording_id``, so if
    ``df`` holds several rows for one recording, later rows overwrite earlier
    ones. The random train-time augmentation is also applied before saving,
    so whatever was drawn is what ends up in the cache. Confirm both are
    intended.
    """

    # Original hard-coded cache locations, kept as defaults for compatibility.
    DEFAULT_TRAIN_SAVE_DIR = '/home/yuigahama/kaggle/rfcx/data/train_bbox_tensor/'
    DEFAULT_TEST_SAVE_DIR = '/home/yuigahama/kaggle/rfcx/data/test_bbox_tensor/'

    def __init__(self,
                 df:pd.DataFrame,
                 train: bool,
                 data_path:str,
                 pcen_parameters:dict,
                 n_mels=128,
                 save_dir=None
    ):
        self.df = df
        self.path = data_path
        self.img_size = 128
        self.train = train
        self.n_mels = n_mels

        # Window length and hop, both in seconds.
        self.duration = 10
        self.stride = 10

        self.pcen_parameters = pcen_parameters

        if save_dir is None:
            save_dir = self.DEFAULT_TRAIN_SAVE_DIR if train else self.DEFAULT_TEST_SAVE_DIR
        self.save_dir = save_dir

        # Full-band defaults so calc_melspectrogram() is usable on its own;
        # __getitem__ passes the per-row band explicitly instead of mutating
        # these attributes.
        self.f_min = 0
        self.f_max = SR // 2

        self.transform = tr.Compose([
          tr.OneOf([
            tr.GaussianNoiseSNR(min_snr=10),
            tr.PinkNoiseSNR(min_snr=10)
          ]),
          tr.PitchShift(max_steps=2, sr=SR),
          #tr.TimeStretch(),
          #tr.TimeShift(sr=SR),
          tr.VolumeControl(mode="sine")
        ])

    def __len__(self):
        """Return the number of rows in ``df``."""
        return len(self.df)

    def load(self, record_path):
        """Read an audio file and resample it to ``SR`` when its rate differs."""
        y, orig_sr = sf.read(record_path)

        if orig_sr != SR:
            y = librosa.resample(y, orig_sr=orig_sr, target_sr=SR, res_type="kaiser_best")
        return y

    def calc_melspectrogram(self, y, f_min=None, f_max=None):
        """Convert one audio window into a float32 image of shape (3, H, W).

        Args:
            y: Waveform of a single window.
            f_min: Lower edge of the mel filter bank; ``None`` falls back to
                ``self.f_min``.
            f_max: Upper edge of the mel filter bank; ``None`` falls back to
                ``self.f_max``.

        Returns:
            Array with channels (dB mel, PCEN, dB of mel ** 1.5), each
            normalised by ``normalize_melspec`` and scaled to [0, 1], resized
            so that the height equals ``self.img_size``.
        """
        if f_min is None:
            f_min = self.f_min
        if f_max is None:
            f_max = self.f_max

        # `y` must be passed by keyword: it is keyword-only in librosa >= 0.10.
        melspec = librosa.feature.melspectrogram(
            y=y,
            sr=SR,
            fmin=f_min,
            fmax=f_max,
            n_mels=self.n_mels
        )
        pcen = librosa.pcen(melspec, sr=SR, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec ** 1.5)
        melspec = librosa.power_to_db(melspec)

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)
        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        # cv2.resize takes (width, height): the height becomes img_size and
        # the width is scaled by img_size * 0.5 / height.
        height, width, _ = image.shape
        image = cv2.resize(image, (int(width * self.img_size * 0.5 / height), self.img_size))
        image = np.moveaxis(image, 2, 0)  # HWC -> CHW
        image = (image / 255.0).astype(np.float32)

        return image

    def __getitem__(self, idx: int):
        """Build, cache and return the spectrogram stack for row ``idx``.

        Returns:
            ``(image, target)`` in train mode, where ``target`` is a one-hot
            float tensor of length 24; otherwise just ``image``. ``image`` is
            a NumPy array with one entry per window.
        """
        sample = self.df.iloc[idx, :]
        recording_id = sample['recording_id']

        if self.train:
            species_id = sample['species_id']
            f_min = int(round(sample['f_min']))
            f_max = int(round(sample['f_max']))
        else:
            f_min = 0
            f_max = SR // 2

        record_path = os.path.join(self.path, recording_id + '.flac')
        y = self.load(record_path)

        if self.train:
            y = self.transform(y)

        window = self.duration*SR
        stride = self.stride*SR

        # The range assumes 60-second recordings; a shorter clip yields
        # windows of unequal length and np.stack raises.
        y = np.stack([y[i:i+window] for i in range(0, 60*SR+stride-window, stride)])#[:-2])

        image = np.stack([self.calc_melspectrogram(_y, f_min, f_max) for _y in y])

        # torch.save does not create missing directories. exist_ok makes this
        # safe when several DataLoader workers reach it at once.
        os.makedirs(self.save_dir, exist_ok=True)
        save_path = os.path.join(self.save_dir, f'{recording_id}.tensor')

        if self.train:
            target = torch.zeros([24], dtype=torch.float32)
            target[species_id] = 1
            torch.save(
                {
                    'img': torch.from_numpy(image),
                    'target': target
                },
                save_path,
            )
            return image, target
        else:
            torch.save(
                torch.from_numpy(image),
                save_path,
            )
            return image

In [6]:
class SimpleRfcx(Dataset):
    """Dataset that serves objects previously written to disk with ``torch.save``.

    Row ``idx`` of ``df`` names the file ``path + recording_id + '.tensor'``
    (``path`` is concatenated as-is, so it must end with a separator).

    Args:
        df: Table with a ``recording_id`` column.
        path: Prefix prepended to every file name.
        train: If true, the loaded object is expected to be a mapping with
            ``'img'`` and ``'target'`` entries, returned as a pair;
            otherwise the loaded object is returned unchanged.
    """

    def __init__(self, df, path, train=True):
        self.df, self.path, self.train = df, path, train

    def __len__(self):
        """Return the number of rows in ``df``."""
        return len(self.df)

    def __getitem__(self, idx: int):
        """Load and return the cached entry for row ``idx``."""
        row = self.df.iloc[idx, :]
        tensor_file = ''.join((self.path, row['recording_id'], '.tensor'))
        payload = torch.load(tensor_file)

        if not self.train:
            return payload
        return payload['img'], payload['target']

In [7]:
# --- Experiment configuration ------------------------------------------------
# Everything tunable lives in this cell. Only `pcen_parameters`,
# `dataloder_params`, `train_path` and `test_path` are consumed by the cells
# shown below; the remaining settings are presumably used further down the
# notebook (TODO confirm).

TEST_NAME = 'baseline-pic-res'

# Cross-validation and training length.
n_splits, random_state, epochs = 5, 1, 55
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Prefer the GPU when one is visible.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_params = dict(
    model_type='res',  # res or efficientnet
    model_name='resnet50',
    output_size=25,
)

optim_params = dict(
    lr=1e-3,
    weight_decay=5e-5,
    betas=(0.9, 0.999),
)

# NOTE(review): these keys look like ReduceLROnPlateau arguments - confirm.
scheduler_params = dict(
    mode='max',
    patience=1,
    factor=0.6,
    verbose=False,
)

# Forwarded as keyword arguments to librosa.pcen by RfcxDataSet.
pcen_parameters = dict(
    gain=0.98,
    bias=2,
    power=0.5,
    time_constant=0.4,
    eps=0.000001,
)

# Locations of the cached tensors.
# NOTE(review): the keys match SimpleRfcx's `train` / `path` arguments -
# presumably unpacked into it later; confirm.
data_params = dict(
    train=True,
    path='/home/yuigahama/kaggle/rfcx/data/train_bbox_tensor/',
)

test_data_params = dict(
    train=False,
    path='/home/yuigahama/kaggle/rfcx/data/test_bbox_tensor/',
)

# DataLoader settings; the test variant only adds an explicit shuffle=False.
dataloder_params = dict(
    batch_size=13,
    num_workers=13,
    pin_memory=False,
)

test_dataloder_params = {**dataloder_params, 'shuffle': False}

# Directories holding the raw audio recordings.
train_path = '/home/yuigahama/kaggle/rfcx/data/train/'
test_path = '/home/yuigahama/kaggle/rfcx/data/test/'

In [8]:
# Datasets that render spectrograms and write them to disk as a side effect of
# being indexed (see RfcxDataSet.__getitem__).
train_dataset = RfcxDataSet(
    df=train_tp,
    train=True,
    data_path=train_path,
    pcen_parameters=pcen_parameters,
)
test_dataset = RfcxDataSet(
    df=submission,
    train=False,
    data_path=test_path,
    pcen_parameters=pcen_parameters,
)

# Both loaders iterate in order. test_dataloder_params is dataloder_params
# plus shuffle=False, so the two calls receive the same settings.
train_dataloader = DataLoader(train_dataset, **dataloder_params, shuffle=False)
test_dataloader = DataLoader(test_dataset, **test_dataloder_params)

In [9]:
# Exhaust both loaders once. The batches are discarded: the point is the side
# effect of RfcxDataSet.__getitem__, which writes one .tensor file per item.
for loader in (train_dataloader, test_dataloader):
    for _batch in tqdm(loader):
        pass

100%|██████████| 94/94 [02:51<00:00,  1.83s/it]
100%|██████████| 154/154 [01:09<00:00,  2.21it/s]
