In [1]:
from glob import glob
import librosa
import soundfile as sf
import numpy as np
import pandas as pd
import sys
import time
import datetime
from tqdm import tqdm
import random
import os
import gc
import cv2

from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, AdamW, lr_scheduler
from torch.distributions import Uniform
from torch.utils.data import DataLoader, Dataset

from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation

from efficientnet_pytorch import EfficientNet
from torchvision.models import resnet34, resnet50

import sys
sys.path.append('..')
from libs import transform as tr
from libs import spectrogram as spec
from libs import criterion as cr

import warnings
warnings.filterwarnings("ignore")



In [2]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = False  # type: ignore
set_seed(53)

In [3]:
train_tp = pd.read_csv('../../data/train_tp.csv')
train_fp = pd.read_csv('../../data/train_fp.csv')
submission = pd.read_csv('../../data/sample_submission.csv')

pred_target = list(submission.columns)[1:]

SR = 48000

In [4]:
def normalize_melspec(X: np.ndarray):
    eps = 1e-6
    mean = X.mean()
    X = X - mean
    std = X.std()
    Xstd = X / (std + eps)
    norm_min, norm_max = Xstd.min(), Xstd.max()
    if (norm_max - norm_min) > eps:
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

def save(fold, model, optim, criterion, file_path="../../model/"):
    if not TEST_NAME in os.listdir(file_path):
        os.mkdir(file_path+TEST_NAME)
    
    
    output_path = file_path + TEST_NAME + '/' + f"{TEST_NAME}_{fold}.model"
    
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.cpu().state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'criterion': criterion
        },
        output_path)
    
    model.to(device)
    
    return output_path

In [5]:
class RfcxDataSet(Dataset):
    def __init__(self,
                 tp:pd.DataFrame,
                 train: bool,
                 data_path:str,
                 pcen_parameters:dict,
                 pre_calc=True,
                 n_mels=128
    ):
        self.tp = tp
        self.path = data_path
        self.img_size = 256
        self.train = train
        self.n_mels = n_mels
        self.pre_calc = pre_calc
        
        self.transform = tr.Compose([
            tr.OneOf([
                tr.GaussianNoiseSNR(min_snr=10),
                tr.PinkNoiseSNR(min_snr=10)
            ]),
            tr.PitchShift(max_steps=2, sr=SR),
            #tr.TimeStretch(),
            #tr.TimeShift(sr=sr),
            tr.VolumeControl(mode="sine")
        ])
        
        self.pcen_parameters = pcen_parameters
        
    def __len__(self):
        return len(self.tp)
    
    def load(self, record_path):
        y, orig_sr = sf.read(record_path)
        
        if orig_sr != SR:
            y = librosa.resample(y, orig_sr=orig_sr, target_sr=SR, res_type="kaiser_best")
        return y
    
    def get_random_duration(self, duration=10):
        start_sec = random.randint(0, 60-duration)
        end_sec = start_sec + 10
            
        return start_sec, end_sec
    
    def get_duration(self, t_min, t_max, duration=10):
        annotated_duration = t_max - t_min
        
        if annotated_duration > duration:
            limit_sec = t_max - duration
            start_sec = random.randint(t_min, limit_sec)
            end_sec = start_sec + duration

        else:
            res_time = duration - annotated_duration
            front_limit = res_time if res_time < t_min else t_min
            
            front_time = random.randint(0, front_limit)
            
            back_limit = 60 - t_max
            
            tmp_time = res_time - front_time
            back_time = tmp_time if tmp_time < back_limit else back_limit
            
            if not tmp_time < back_limit:
                front_time += tmp_time - back_limit
            
            start_sec = t_min - front_time
            end_sec = t_max + back_time
            
        return start_sec, end_sec
    
    def create_mel(self, y):
        y = self.transform(y)
        
        melspec = librosa.feature.melspectrogram(
            y,
            sr=SR,
            fmin=0,
            fmax=15000,
            n_mels=128
        )

        pcen = librosa.pcen(melspec, sr=SR, **self.pcen_parameters)
        clean_mel = librosa.power_to_db(melspec ** 1.5)
        melspec = librosa.power_to_db(melspec)

        norm_melspec = normalize_melspec(melspec)
        norm_pcen = normalize_melspec(pcen)
        norm_clean_mel = normalize_melspec(clean_mel)

        image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)

        return image
    
    def __getitem__(self, idx: int):
        sample = self.tp.iloc[idx, :]
        recording_id = sample['recording_id']
        t_min = int(round(sample['t_min']))
        t_max = int(round(sample['t_max']))
        
        start_sec, end_sec = self.get_duration(t_min, t_max, 10)
            
        record_path = self.path + recording_id + '.flac'
        y = self.load(record_path)
        y =  y[start_sec*SR:end_sec*SR]
        
        if self.train:
            y = self.transform(y)
        
        species_id = sample['species_id']
        target = torch.zeros([24], dtype=torch.float32)
        target[species_id] = 1
        
        return y, target

In [6]:
def init_layer(layer):
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)


def init_bn(bn):
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.0)


def init_weights(model):
    classname = model.__class__.__name__
    if classname.find("Conv2d") != -1:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        model.bias.data.fill_(0)
    elif classname.find("BatchNorm") != -1:
        model.weight.data.normal_(1.0, 0.02)
        model.bias.data.fill_(0)
    elif classname.find("GRU") != -1:
        for weight in model.parameters():
            if len(weight.size()) > 1:
                nn.init.orghogonal_(weight.data)
    elif classname.find("Linear") != -1:
        model.weight.data.normal_(0, 0.01)
        model.bias.data.zero_()


def do_mixup(x: torch.Tensor, mixup_lambda: torch.Tensor):
    """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes
    (1, 3, 5, ...).
    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,)
    Returns:
      out: (batch_size, ...)
    """
    out = (x[0::2].transpose(0, -1) * mixup_lambda[0::2] +
           x[1::2].transpose(0, -1) * mixup_lambda[1::2]).transpose(0, -1)
    return out


class Mixup(object):
    def __init__(self, mixup_alpha, random_seed=1234):
        """Mixup coefficient generator.
        """
        self.mixup_alpha = mixup_alpha
        self.random_state = np.random.RandomState(random_seed)

    def get_lambda(self, batch_size):
        """Get mixup random coefficients.
        Args:
          batch_size: int
        Returns:
          mixup_lambdas: (batch_size,)
        """
        mixup_lambdas = []
        for n in range(0, batch_size, 2):
            lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
            mixup_lambdas.append(lam)
            mixup_lambdas.append(1. - lam)

        return torch.from_numpy(np.array(mixup_lambdas, dtype=np.float32))


def interpolate(x: torch.Tensor, ratio: int):
    """Interpolate data in time domain. This is used to compensate the
    resolution reduction in downsampling of a CNN.
    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, ratio to interpolate
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    (batch_size, time_steps, classes_num) = x.shape
    upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
    return upsampled


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Pad framewise_output to the same length as input frames. The pad value
    is the same as the value of the last frame.
    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, number of frames to pad
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    pad = framewise_output[:, -1:, :].repeat(
        1, frames_num - framewise_output.shape[1], 1)
    """tensor for padding"""

    output = torch.cat((framewise_output, pad), dim=1)
    """(batch_size, frames_num, classes_num)"""

    return output

In [7]:
class AttBlock(nn.Module):
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear",
                 temperature=1.0):
        super().__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.bn_att = nn.BatchNorm1d(out_features)
        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)


class AttBlockV2(nn.Module):
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)

In [8]:
class EfficientNetSED(nn.Module):
    def __init__(
        self,
        base_model_name: str,
        pretrained=False,
        num_classes=24,
        spectrogram_params={},
        logmel_extractor_params={},
        spec_augmenter_params={},
        pce_params={}
    ):
        super().__init__()
        self.spectrogram_extractor = Spectrogram(**spectrogram_params)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(**logmel_extractor_params)
        
        #Pcen converter
        self.pcen_converter = spec.pcen(**pce_params)

        # Spec augmenter
        #self.spec_augmenter = SpecAugmentation(**spec_augmenter_params)
        
        self.interpolate_ratio = 30  # Downsampled ratio
        self.mixup_alpha = 0.2
        self.random_state = np.random.RandomState(123)
        
        if pretrained:
            self.base_model = EfficientNet.from_pretrained(base_model_name)
        else:
            self.base_model = EfficientNet.from_name(base_model_name)

        in_features = self.base_model._fc.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(in_features, num_classes, activation="sigmoid")

        self.init_weight()
        
    def mixup(self, x):
        sizws = x.size()
        #lam = torch.from_numpy(self.random_state.beta(self.mixup_alpha, self.mixup_alpha, (sizws[0], 1))).cuda()
        lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
        index = list(range(x.size(0)))
        random.shuffle(index)
        #out = (x.view(sizws[0], -1) * lam + x[index].squeeze().view(sizws[0], -1) * (1-lam)).view(sizws[0], sizws[1], sizws[2], sizws[3],)
        out = (x * lam + x[index].squeeze() * (1-lam))
        return out.float(), {'lam': lam, 'index': index}

    def init_weight(self):
        init_layer(self.fc1)

    def forward(self, input):        
        x = self.spectrogram_extractor(input)
        x = self.logmel_extractor(x)
        
        x_mels = self.logmel_extractor.power_to_db(x)
        x_pcen = self.pcen_converter(x) 
        x_clear = self.logmel_extractor.power_to_db(x ** 1.5)
        
        
        x = torch.cat((x_mels,x_pcen,x_clear),1)
        #x = torch.cat((x,x,x),1)
        
        frames_num = x.size(2)
        
        if self.training:
            x, mix_info = self.mixup(x)
            #x = self.spec_augmenter(x)
        else:
            mix_info = None
                
        # (batch_size, channels, freq, frames)
        x = self.base_model.extract_features(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=3)

        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       self.interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, self.interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }

        return output_dict, mix_info

In [9]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, logit, target, mixup_info=None):
        target = target.float()
        max_val = (-logit).clamp(min=0)
        loss = logit - logit * target + max_val + \
            ((-max_val).exp() + (-logit - max_val).exp()).log()

        invprobs = F.logsigmoid(-logit * (target * 2.0 - 1.0))
        loss = (invprobs * self.gamma).exp() * loss
        if len(loss.size()) == 2:
            loss = loss.sum(dim=1)
            
        if mixup_info is None:
            return loss.mean()
        
        target = target[mixup_info['index']].float()
        logit = logit[mixup_info['index']]
        
        max_val = (-logit).clamp(min=0)
        loss2 = logit - logit * target + max_val + \
            ((-max_val).exp() + (-logit - max_val).exp()).log()

        invprobs2 = F.logsigmoid(-logit * (target * 2.0 - 1.0))
        loss2 = (invprobs2 * self.gamma).exp() * loss2
        if len(loss2.size()) == 2:
            loss2 = loss2.sum(dim=1)
        
        return (loss * mix_info['lam'] + loss2 * (1 - mix_info['lam'])).mean()


class ImprovedPANNsLoss(nn.Module):
    def __init__(self, output_key="logit", weights=[1, 1]):
        super().__init__()

        self.output_key = output_key
        if output_key == "logit":
            self.normal_loss = nn.BCEWithLogitsLoss()
        else:
            self.normal_loss = nn.BCELoss()

        self.bce = nn.BCELoss()
        self.weights = weights

    def forward(self, input, target):
        input_ = input[self.output_key]
        target = target.float()

        framewise_output = input["framewise_output"]
        clipwise_output_with_max, _ = framewise_output.max(dim=1)

        normal_loss = self.normal_loss(input_, target)
        auxiliary_loss = self.bce(clipwise_output_with_max, target)

        return self.weights[0] * normal_loss + self.weights[1] * auxiliary_loss


class ImprovedFocalLoss(nn.Module):
    def __init__(self, weights=[1, 1]):
        super().__init__()

        self.focal = FocalLoss()
        self.weights = weights

    def forward(self, input, target, mixup_info=None):
        input_ = input["logit"]
        target = target.float()

        framewise_output = input["framewise_logit"]
        clipwise_output_with_max, _ = framewise_output.max(dim=1)
        
        if mixup_info is None:
            normal_loss = self.focal(input_, target)
            auxiliary_loss = self.focal(clipwise_output_with_max, target)
        else:
            normal_loss = self.focal(input_, target, mixup_info)
            auxiliary_loss = self.focal(clipwise_output_with_max, target, mixup_info)

        return self.weights[0] * normal_loss + self.weights[1] * auxiliary_loss

In [10]:
# LRAP. Instance-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LRAP(preds, labels):
    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # i, j corresponds to rank of prediction in row i
    class_ranks = torch.zeros_like(ranked_classes)
    for i in range(ranked_classes.size(0)):
        for j in range(ranked_classes.size(1)):
            class_ranks[i, ranked_classes[i][j]] = j + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    pos_matrix = torch.tensor(np.array([i+1 for i in range(labels.size(-1))])).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = (scores.sum(-1) / labels.sum(-1)).mean()
    return score.item()

# label-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LWLRAP(preds, labels):
    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # i, j corresponds to rank of prediction in row i
    class_ranks = torch.zeros_like(ranked_classes)
    for i in range(ranked_classes.size(0)):
        for j in range(ranked_classes.size(1)):
            class_ranks[i, ranked_classes[i][j]] = j + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # Number of GT labels per instance
    num_labels = labels.sum(-1)
    pos_matrix = torch.tensor(np.array([i+1 for i in range(labels.size(-1))])).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = scores.sum() / labels.sum()
    return score.item()

def mixup_socre(cor, x, y, mix_info):
    return (cor(x, y) * mix_info['lam'] + cor(x, y[mix_info['index']].squeeze()) * (1-mix_info['lam'])).mean()

In [11]:
TEST_NAME = 'efficient-b0-sed-powd'

n_splits = 5
random_state = 1
epochs = 35

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

spectrogram_params = {
    'n_fft': 2048,
    'hop_length': 512,
    'win_length': 2048,
    'window': 'hann',
    'center': True,
    'pad_mode': 'reflect',
    'freeze_parameters': True
}

logmel_extractor_params = {
    'sr': SR,
    'n_fft': 2048,
    'n_mels': 256,
    'fmin': 0,
    'fmax': 15000,
    'ref' : 1.0,
    'amin': 1e-10,
    'top_db': None,
    'is_log': False,
    'freeze_parameters': True
}

spec_augmenter_params = {
    'time_drop_width':  64,
    'time_stripes_num': 2,
    'freq_drop_width':  8,
    'freq_stripes_num': 2
}

pce_params = {
    'gain': 0.98,
    'bias': 2,
    'power': 0.5,
    'time_constant': 0.4,
    'eps': 0.000001,
}

model_params = {
    'base_model_name': 'efficientnet-b1',
    'pretrained': True,
    'num_classes': 24,
    'spectrogram_params': spectrogram_params,
    'logmel_extractor_params': logmel_extractor_params,
    'spec_augmenter_params': spec_augmenter_params,
    'pce_params': pce_params
}


optim_params = {
    'lr': 1e-3,
    'weight_decay': 5e-5,
    'betas': (0.9, 0.999)
}

scheduler_params = {
    'mode': 'max',
    'patience': 1,
    'factor': 0.6,
    'verbose': False
}

pcen_parameters = {
    'gain': 0.98,
    'bias': 2,
    'power': 0.5,
    'time_constant': 0.4,
    'eps': 0.000001,
}

train_params = {
    'pcen_parameters': pcen_parameters,
    'pre_calc': False,
    'train': True,
    'data_path': '/home/yuigahama/kaggle/rfcx/data/train/'  
}

val_params = {
    'train': True,
    'pre_calc': True,
    'pcen_parameters': pcen_parameters,
    'data_path': '/home/yuigahama/kaggle/rfcx/data/train/'
}

test_data_params = {
    'train': False,
    'path': '/home/yuigahama/kaggle/rfcx/data/test_wo_fp/'
}

dataloder_params = {
    'batch_size': 16,
    'num_workers': 15,
    'pin_memory': False,
}

test_dataloder_params = {
    'batch_size': 32,
    'num_workers': 15,
    'pin_memory': False,
    'shuffle':False
}

In [12]:
tta = np.zeros((len(submission), 24))
cv_score = 0
lrs = np.arange(5e-4,1.0e-3,5e-5).tolist()

for fold_id, (train_index, val_index) in enumerate(skf.split(train_tp, train_tp.species_id)):
    print(f'---------- fold {fold_id} ----------')
    
    model = EfficientNetSED(**model_params).to(device)
    optim = Adam(model.parameters(), **optim_params)
    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optim, **scheduler_params)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer=optim, T_max=epochs-10)
    
    pos_weights = torch.ones(24)
    pos_weights = pos_weights * 24
    criterion = cr.ImprovedFocalLoss(weights=[1, 0.5])
    
    train_dataset = RfcxDataSet(train_tp.iloc[train_index], **train_params)
    val_dataset   = RfcxDataSet(train_tp.iloc[val_index], **val_params)

    train_dataloader = DataLoader(train_dataset, shuffle=True, **dataloder_params)
    val_dataloader = DataLoader(val_dataset, shuffle=False, **dataloder_params)
    
    es = 50
    bast_score = 0
    for epoch in range(1, epochs):
        if es <= 0:
            break
        
        
        if epoch <= 10:
            for g in optim.param_groups:
                g['lr'] = lrs[epoch-1]
        else:
            scheduler.step(val_score)
        
        
        start_time = time.time()
        
        # train
        model.train()
        train_loss = 0
        train_score = 0
        train_corr = 0
        
        for data in train_dataloader:
            image = data[0].float().to(device)
            label = data[1]
            
            optim.zero_grad()
            output, mix_info = model(image)
            
            mix_info['lam'] = mix_info['lam'].cpu()
            output = {k:v.cpu() for k,v in output.items()}
            
            #loss = mixup_socre(criterion, output, label, mix_info)
            loss = criterion(output, label, mix_info)
            pred_labels = output["framewise_output"].max(1)[0]
            score = mixup_socre(LWLRAP, pred_labels, label, mix_info)
            
            #score = LWLRAP(pred_labels, label)
                        
            loss.backward()
            optim.step()
            
            vals, answers = torch.max(pred_labels, 1)
            vals, targets = torch.max(label, 1)
            vals, targets2 = torch.max(label[mix_info['index']], 1)
            
            corrects = 0
            for i in range(0, len(answers)):
                if answers[i] == targets[i]:
                    corrects = corrects + 1
                if answers[i] == targets2[i]:
                    corrects = corrects + 1
                
                corrects = 1 if corrects > 0 else 0
                    
            
            train_corr += corrects
            train_loss += loss.item()
            train_score += score
            
        train_loss  /= len(train_dataloader)
        train_score /= len(train_dataloader)
        
        # val
        model.eval()
        val_loss = 0
        val_score = 0
        val_corr = 0

        with torch.no_grad():
            for val_data in val_dataloader:
                image = val_data[0].float().to(device)
                label = val_data[1]

                output, mix_info = model(image)
                output = {k:v.cpu() for k,v in output.items()}
                
                pred_labels = output["framewise_output"].max(1)[0]
                vals, answers = torch.max(pred_labels, 1)
                vals, targets = torch.max(label, 1)
                
                corrects = 0
                for i in range(0, len(answers)):
                    if answers[i] == targets[i]:
                        corrects = corrects + 1

                val_corr += corrects
                val_loss += criterion(output, label)
                val_score += LWLRAP(pred_labels, label)
                
        val_loss  /= len(val_dataloader)
        val_score /= len(val_dataloader)
        
        duration = str(datetime.timedelta(seconds=time.time() - start_time))[:7]
        print(f'E {epoch:3}| T | L: {train_loss:.3} | S: {train_score:.3} | C: {train_corr}/{len(train_dataset)} | V | L: {val_loss:.3} | S: {val_score:.3} | C: {val_corr}/{len(val_dataset)} | T: {duration} | es: {es}')

        if bast_score < val_score:
            bast_score = val_score
            bast_path = save(fold_id, model, optim, criterion)
        else:
            es -= 1
        
        if es <= 0:
            break
            
        #scheduler.step(val_score)
    
    print(f"bast score: {bast_score}")
    del model, train_dataset, val_dataset, train_dataloader, val_dataloader, optim
    gc.collect()

---------- fold 0 ----------
Loaded pretrained weights for efficientnet-b0
E   1| T | L: 3.92 | S: 0.179 | C: 50/972 | V | L: 2.53 | S: 0.168 | C: 11/244 | T: 0:00:37 | es: 50
E   2| T | L: 2.05 | S: 0.202 | C: 53/972 | V | L: 3.32 | S: 0.228 | C: 21/244 | T: 0:00:36 | es: 50
E   3| T | L: 1.95 | S: 0.208 | C: 51/972 | V | L: 2.94 | S: 0.286 | C: 35/244 | T: 0:00:36 | es: 50
E   4| T | L: 1.9 | S: 0.223 | C: 55/972 | V | L: 2.22 | S: 0.257 | C: 22/244 | T: 0:00:36 | es: 50
E   5| T | L: 1.86 | S: 0.238 | C: 55/972 | V | L: 1.75 | S: 0.327 | C: 32/244 | T: 0:00:36 | es: 49
E   6| T | L: 1.84 | S: 0.251 | C: 55/972 | V | L: 1.77 | S: 0.327 | C: 43/244 | T: 0:00:36 | es: 49
E   7| T | L: 1.83 | S: 0.266 | C: 58/972 | V | L: 1.63 | S: 0.351 | C: 41/244 | T: 0:00:36 | es: 49
E   8| T | L: 1.79 | S: 0.295 | C: 60/972 | V | L: 1.55 | S: 0.404 | C: 56/244 | T: 0:00:36 | es: 49
E   9| T | L: 1.77 | S: 0.304 | C: 60/972 | V | L: 1.53 | S: 0.417 | C: 61/244 | T: 0:00:36 | es: 49
E  10| T | L: 1.7

In [13]:
def create_mel(y, img_size=256): 
    melspec = librosa.feature.melspectrogram(
        y,
        sr=SR,
        fmin=0,
        fmax=15000,
        n_mels=128
    )

    pcen = librosa.pcen(melspec, sr=SR, **pcen_parameters)
    clean_mel = librosa.power_to_db(melspec ** 1.5)
    melspec = librosa.power_to_db(melspec)

    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)

    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)
    
    height, width, _ = image.shape
    image = cv2.resize(image, (img_size * 2, img_size))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)

    return image

def prediction_for_clip(audio_id: str,
                        clip: np.ndarray, 
                        model: EfficientNetSED,
                        threshold=0.5):
    PERIOD = 10
    audios = []
    y = clip.astype(np.float32)
    len_y = len(y)
    start = 0
    end = PERIOD * SR
    while True:
        y_batch = y[start:end].astype(np.float32)

        start = end - (5 * SR)
        end += 5 * SR
        
        #mel = create_mel(y_batch)
        audios.append(y_batch)
        
        if len_y < end:
            break
            
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    array = np.asarray(audios)
    image = torch.from_numpy(array).to(device)
    
    model.eval()
    estimated_event_list = []
    global_time = 0.0
    

    with torch.no_grad():
        prediction, _ = model(image)
        frame_pred = torch.sum(
            torch.sigmoid(torch.max(prediction["framewise_output"], 1)[0]), 0
        ).detach().cpu().numpy()
        framewise_outputs = torch.max(prediction["framewise_output"], 0)[0].detach(
            ).cpu().numpy()
        
    thresholded = framewise_outputs >= threshold
    
    for target_idx in range(thresholded.shape[1]):
        if thresholded[:, target_idx].mean() == 0:
            pass
        else:
            detected = np.argwhere(thresholded[:, target_idx]).reshape(-1)
            head_idx = 0
            tail_idx = 0
            while True:
                if (tail_idx + 1 == len(detected)) or (
                        detected[tail_idx + 1] - 
                        detected[tail_idx] != 1):
                    onset = 0.01 * detected[
                        head_idx] + global_time
                    offset = 0.01 * detected[
                        tail_idx] + global_time
                    onset_idx = detected[head_idx]
                    offset_idx = detected[tail_idx]
                    max_confidence = framewise_outputs[
                        onset_idx:offset_idx, target_idx].max()
                    mean_confidence = framewise_outputs[
                        onset_idx:offset_idx, target_idx].mean()
                    estimated_event = {
                        "audio_id": audio_id,
                        "ebird_code": target_idx,
                        "onset": onset,
                        "offset": offset,
                        "max_confidence": max_confidence,
                        "mean_confidence": mean_confidence
                    }
                    estimated_event_list.append(estimated_event)
                    head_idx = tail_idx + 1
                    tail_idx = tail_idx + 1
                    if head_idx >= len(detected):
                        break
                else:
                    tail_idx += 1
        global_time += PERIOD
        
    prediction_df = pd.DataFrame(estimated_event_list)
    return prediction_df, frame_pred

In [14]:
def prediction(test_df: pd.DataFrame,
               model: dict,
               threshold=0.5):
    #model = get_model(model_config, weights_path)
    unique_audio_id = test_df.recording_id.unique()

    warnings.filterwarnings("ignore")
    prediction_dfs = []
    frame_dict = dict()
    for audio_id in tqdm(unique_audio_id):
        clip, _ = librosa.load(
            f'/home/yuigahama/kaggle/rfcx/data/test/{audio_id}.flac',
            sr=SR,
            mono=True,
            res_type="kaiser_fast"
        )
        
        test_df_for_audio_id = test_df.query(
            f"recording_id == '{audio_id}'").reset_index(drop=True)
        prediction_df, frame_pred = prediction_for_clip(
            audio_id,
            clip=clip,
            model=model,
            threshold=threshold
        )
        frame_dict[audio_id] = frame_pred
        prediction_dfs.append(prediction_df)
    
    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df, frame_dict

In [15]:
preds = []
frams = [] 
for i in range(5):

    model = EfficientNetSED(**model_params).to(device)
    params = torch.load(f'/home/yuigahama/kaggle/rfcx/model/{TEST_NAME}/{TEST_NAME}_{i}.model')
    model.load_state_dict(params['model_state_dict'])
    
    prediction_df, frame_dict = prediction(
        test_df=submission,
        model=model,
        threshold=0.5
    )
    print(len(prediction_df.audio_id.unique()))
    preds.append(prediction_df)
    frams.append(pd.DataFrame(frame_dict).T)

  0%|          | 0/1992 [00:00<?, ?it/s]

Loaded pretrained weights for efficientnet-b0


100%|██████████| 1992/1992 [02:58<00:00, 11.15it/s]


1144


  0%|          | 0/1992 [00:00<?, ?it/s]

Loaded pretrained weights for efficientnet-b0


100%|██████████| 1992/1992 [02:57<00:00, 11.21it/s]


980


  0%|          | 0/1992 [00:00<?, ?it/s]

Loaded pretrained weights for efficientnet-b0


100%|██████████| 1992/1992 [02:56<00:00, 11.29it/s]


860


  0%|          | 0/1992 [00:00<?, ?it/s]

Loaded pretrained weights for efficientnet-b0


100%|██████████| 1992/1992 [02:57<00:00, 11.21it/s]


988


  0%|          | 0/1992 [00:00<?, ?it/s]

Loaded pretrained weights for efficientnet-b0


100%|██████████| 1992/1992 [02:57<00:00, 11.25it/s]


879


In [19]:
sub = pd.DataFrame(np.zeros((len(submission), 24)),columns=pred_target, index=submission['recording_id'])
for p,j in zip(frams, [0.1, 0.233, 0.1, 0.233, 0.234]):
    p.columns = pred_target
    sub += (p * j)
#sub /= 5
sub.reset_index().to_csv('submission.csv', index=False)
sub

Unnamed: 0_level_0,s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,...,s14,s15,s16,s17,s18,s19,s20,s21,s22,s23
recording_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000316da7,5.536902,5.428552,5.398044,5.666194,5.455990,5.579698,5.493617,5.528298,5.443487,5.436513,...,5.445397,5.501407,5.508829,5.512059,5.598620,5.435609,5.434749,5.431371,5.435540,5.563958
003bc2cb2,5.357302,5.459712,5.364480,5.604771,5.408080,5.477409,5.444747,5.527529,5.391532,5.397219,...,5.412023,5.435459,6.025512,5.513920,5.413760,5.381675,5.397167,5.401389,5.406348,5.530141
0061c037e,5.445302,5.472865,5.455980,5.592596,5.463711,5.553733,5.510018,5.648340,5.446731,5.444982,...,5.495149,5.516629,5.571565,5.572390,5.468091,5.546052,5.485317,5.478624,5.497894,5.665424
010eb14d3,5.971642,5.446492,5.372373,5.485849,5.487978,5.503754,5.454111,5.486998,5.860940,5.429837,...,5.420577,5.491217,5.431955,5.484108,5.632305,5.417309,5.389531,5.512753,5.397875,5.533357
011318064,5.440906,5.468907,5.410848,5.581291,5.437844,5.511499,5.461808,5.537067,5.443908,5.413940,...,6.074458,5.612804,5.481584,5.536711,5.543149,5.436938,5.450940,5.467243,5.436522,5.532111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff68f3ac3,5.530192,5.399386,5.401857,5.546243,5.458876,6.027565,5.453616,5.557828,5.453488,5.393347,...,5.432716,5.820098,5.434603,5.494606,5.482264,5.421736,5.404167,5.400099,5.394108,5.917825
ff973e852,5.420998,5.432414,5.389553,5.551100,5.405830,5.525096,5.494099,5.921758,5.417062,5.584220,...,5.479837,5.614740,5.488991,5.836863,5.448909,5.447275,5.499389,5.401888,5.463620,5.581737
ffa5cf6d6,5.409090,5.475642,5.441834,5.638551,5.404203,5.503583,5.474242,5.582300,5.405985,5.469386,...,5.436937,5.824130,5.549417,5.601403,5.450568,5.434418,5.433962,5.383908,5.431250,5.570698
ffa88cbb8,5.406117,5.616352,5.426845,5.666128,5.428423,5.520968,5.417374,5.724147,5.400343,5.503032,...,5.419724,5.491212,5.708699,5.486681,5.460002,5.383749,5.452403,5.459299,5.442833,5.563057


In [None]:
sum([0.1, 0.233, 0.1, 0.233, 0.234])

In [17]:
lrs = np.arange(5e-4,1.0e-3,5e-5).tolist()
lrs, len(lrs)

([0.0005,
  0.00055,
  0.0006000000000000001,
  0.0006500000000000001,
  0.0007000000000000001,
  0.0007500000000000001,
  0.0008000000000000001,
  0.0008500000000000002,
  0.0009000000000000002,
  0.0009500000000000002],
 10)

In [18]:
print(torch.tensor([[1],[2],[3]]).size())
a*torch.tensor([[1],[2],[3]])

torch.Size([3, 1])


NameError: name 'a' is not defined