In [1]:
from glob import glob
import librosa
import numpy as np
import pandas as pd
import sys
import time
import datetime
from tqdm import tqdm
import random
import os

from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, AdamW, lr_scheduler
from torch.distributions import Uniform
from torch.utils.data import DataLoader, Dataset

from torchaudio.transforms import Spectrogram, MelSpectrogram
from torchaudio.transforms import TimeStretch, AmplitudeToDB, ComplexNorm, Resample
from torchaudio.transforms import FrequencyMasking, TimeMasking

"""from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
"""

from efficientnet_pytorch import EfficientNet
from torchvision.models import resnet34, resnet50



In [2]:
train_tp = pd.read_csv('../../data/train_tp.csv')

SR = 22050
LEN_10SEC = SR * 10

In [3]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = False  # type: ignore
set_seed(1234)

In [4]:
class RfcxDataSet(Dataset):
    def __init__(self, df: pd.DataFrame, data_path:str):
        self.df =  df
        self.path = data_path
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx: int):
        sample = self.df.iloc[idx, :]
        recording_id = sample['recording_id']
        species_id = sample['species_id']
        
        record_path = self.path + recording_id + '.tensor'
        sound_tensor = torch.load(record_path)
        
        limit_sec = sound_tensor.size(0) - (SR * 10)
        
        start = random.randint(0, limit_sec)
        end = start + (SR * 10)
        
        target = torch.zeros([24], dtype=torch.float32)
        target[species_id] = 1

        return sound_tensor[start:end], target

In [5]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, pool=True):
        super().__init__()
        
        padding = kernel_size // 2
        self.pool = pool
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_channels + in_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )

        self._init_weights()
        
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.zeros_(m.bias)
        
    def forward(self, x): # x.shape = [batch_size, in_channels, a, b]
        x1 = self.conv1(x)
        x = self.conv2(torch.cat([x, x1],1))
        if(self.pool): x = F.avg_pool2d(x, 2)
        return x   # x.shape = [batch_size, out_channels, a//2, b//2]


In [6]:
class RondomStretchMelSpectrogram(nn.Module):
    def __init__(self, sample_rate, n_fft, top_db, max_perc):
        super().__init__()
        self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft//2+1)
        self.stft = Spectrogram(n_fft=n_fft, power=None)
        self.com_norm = ComplexNorm(power=2.)
        self.fm = FrequencyMasking(100)
        self.tm = TimeMasking(100)
        self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
        self.AtoDB= AmplitudeToDB(top_db=top_db)
        self.max_perc = max_perc
        self.sample_rate = sample_rate
        self.resamples = [
                Resample(sample_rate, sample_rate*0.6),
                Resample(sample_rate, sample_rate*0.7),
                Resample(sample_rate, sample_rate*0.8),
                Resample(sample_rate, sample_rate*0.9),
                Resample(sample_rate, sample_rate*1),
                Resample(sample_rate, sample_rate*1.1),
                Resample(sample_rate, sample_rate*1.2),
                Resample(sample_rate, sample_rate*1.3),
                Resample(sample_rate, sample_rate*1.4)
            ]
    
    def forward(self, x):
        #x = random.choice(self.resamples)(x)
        
        x = self.stft(x)

        if True:
            dist = Uniform(1.-self.max_perc, 1+self.max_perc)
            x = self.time_stretch(x, dist.sample().item())
            x = self.com_norm(x)
            x = self.fm(x, 0)
            x = self.tm(x, 0)
        else:
            x = self.com_norm(x)
        
        x = self.mel_specgram.mel_scale(x)
        x = self.AtoDB(x)
        
        size = torch.tensor(x.size())
        
        if size[2] > 280:
            x = x[:,:,0:280]
        else:
            x = torch.cat([x, torch.cuda.FloatTensor(size[0], size[1], 280 - size[2]).fill_(0)], dim=2)
        
        return x.unsqueeze(1)


In [7]:
def adaptive_concat_pool2d(x, sz=(1,1)):
    out1 = F.adaptive_avg_pool2d(x, sz).view(x.size(0), -1)
    out2 = F.adaptive_max_pool2d(x, sz).view(x.size(0), -1)
    return torch.cat([out1, out2], 1)

In [8]:
class BaseModel(nn.Module):
    def __init__(self, model_type, model_name, output_size, spectrogram_params, logmel_extractor_params, spec_augmenter_params):
        super().__init__()
        
        """self.spectrogram_extractor = Spectrogram(**spectrogram_params)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(**logmel_extractor_params)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(**spec_augmenter_params)
        
        self.bn0 = nn.BatchNorm2d(logmel_extractor_params['n_mels'])"""
        self.mel = RondomStretchMelSpectrogram(sample_rate=SR, n_fft=2**11, top_db=80, max_perc=0.4)
        
        self.conv1 = ConvBlock(1,64)
        self.conv2 = ConvBlock(64,128)
        self.conv3 = ConvBlock(128,256)
        self.conv4 = ConvBlock(256,512)
        self.conv5 = ConvBlock(512,1024)
        self.conv6 = ConvBlock(1024,2048,pool=False)
        
        self.fc = nn.Sequential(
            nn.BatchNorm1d(7936),
            nn.Linear(7936, 2048),
            nn.PReLU(),
            nn.BatchNorm1d(2048),
            nn.Linear(2048, 24),
        )

    def forward(self, input):
        """x = self.spectrogram_extractor(
            input)  # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)

        frames_num = x.shape[2]

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)
            
        print(x.size())"""
        
        x = self.mel(input)
        
        x1 = self.conv1(x)
        x1 = F.dropout(x1, p=0.2, training=self.training)
        x2 = self.conv2(x1)
        x2 = F.dropout(x2, p=0.2, training=self.training)
        x3 = self.conv3(x2)
        x3 = F.dropout(x3, p=0.2, training=self.training)
        x4 = self.conv4(x3)
        x4 = F.dropout(x4, p=0.2, training=self.training)
        x5 = self.conv5(x4)
        x5 = F.dropout(x5, p=0.2, training=self.training)
        x6 = self.conv6(x5)
        x6 = F.dropout(x6, p=0.2, training=self.training)
        
        
        x = torch.cat([adaptive_concat_pool2d(x2), adaptive_concat_pool2d(x3),
                       adaptive_concat_pool2d(x4),adaptive_concat_pool2d(x5),
                       adaptive_concat_pool2d(x6)], 1)
                
        return self.fc(x)

In [9]:
def save(epoch, fold, model, optim, criterion, file_path="../../model/"):
    if not TEST_NAME in os.listdir(file_path):
        os.mkdir(file_path+TEST_NAME)
    
    
    output_path = file_path + TEST_NAME + '/' + f"{TEST_NAME}_{fold}_{epoch}.model"
    
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.cpu().state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'criterion': criterion
        },
        output_path)
    
    model.to(device)
    
    return output_path

In [10]:
# LRAP. Instance-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LRAP(preds, labels):
    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # i, j corresponds to rank of prediction in row i
    class_ranks = torch.zeros_like(ranked_classes)
    for i in range(ranked_classes.size(0)):
        for j in range(ranked_classes.size(1)):
            class_ranks[i, ranked_classes[i][j]] = j + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    pos_matrix = torch.tensor(np.array([i+1 for i in range(labels.size(-1))])).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = (scores.sum(-1) / labels.sum(-1)).mean()
    return score.item()

# label-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LWLRAP(preds, labels):
    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # i, j corresponds to rank of prediction in row i
    class_ranks = torch.zeros_like(ranked_classes)
    for i in range(ranked_classes.size(0)):
        for j in range(ranked_classes.size(1)):
            class_ranks[i, ranked_classes[i][j]] = j + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # Number of GT labels per instance
    num_labels = labels.sum(-1)
    pos_matrix = torch.tensor(np.array([i+1 for i in range(labels.size(-1))])).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = scores.sum() / labels.sum()
    return score.item()

# Sample usage
# y_true = torch.tensor(np.array([[1, 1, 0], [1, 0, 1], [0, 0, 1]]))
# y_score = torch.tensor(np.random.randn(3, 3))
# print(LRAP(y_score, y_true), LWLRAP(y_score, y_true))

In [11]:
TEST_NAME = 'baseline'

n_splits = 5
random_state = 1
epochs = 35

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

spectrogram_params = {
    'n_fft': 1024,
    'hop_length': 320,
    'win_length': 1024,
    'window': 'hann',
    'center': True,
    'pad_mode': 'reflect',
    'freeze_parameters': True
}

logmel_extractor_params = {
    'sr': SR,
    'n_fft': 1024,
    'n_mels': 64,
    'fmin': 50,
    'fmax': 15000,
    'ref' : 1.0,
    'amin': 1e-10,
    'top_db': None,
    'freeze_parameters': True
}

spec_augmenter_params = {
    'time_drop_width':  64,
    'time_stripes_num': 2,
    'freq_drop_width':  8,
    'freq_stripes_num': 2
}

model_params = {
    'model_type': 'res',
    'model_name': 'resnet50',
    'output_size': 24,
    'spectrogram_params': spectrogram_params,
    'logmel_extractor_params': logmel_extractor_params,
    'spec_augmenter_params': spec_augmenter_params,
}

optim_params = {
    'lr': 1e-3,
    'weight_decay': 5e-5,
    'betas': (0.9, 0.999)
}

scheduler_params = {
    'mode': 'max',
    'patience': 1,
    'factor': 0.4,
    'verbose': False
}

data_params = {
    'path': '../../data/train_tp_torch/',
    'dataloder': {
        'batch_size': 128,
        'num_workers': 15,
        'pin_memory': True,
        'shuffle':False
    }
}

In [12]:
for fold_id, (train_index, val_index) in enumerate(skf.split(train_tp, train_tp.species_id)):
    print(f'---------- fold {fold_id} ----------')
    
    model = BaseModel(**model_params).to(device)
    optim = Adam(model.parameters(), **optim_params)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optim, **scheduler_params)
    criterion = nn.BCEWithLogitsLoss()
    
    train_dataset = RfcxDataSet(train_tp.iloc[train_index], data_params['path'])
    val_dataset = RfcxDataSet(train_tp.iloc[val_index], data_params['path'])

    train_dataloader = DataLoader(train_dataset, **data_params['dataloder'])
    val_dataloader = DataLoader(val_dataset, **data_params['dataloder'])
    
    for epoch in range(1, epochs):
        es = 5
        
        start_time = time.time()
        bast_score = 0
        
        # train
        model.train()
        train_loss = 0
        train_score = 0
        
        for data in train_dataloader:
            image = data[0].to(device)
            label = data[1].to(device)
            
            pred = model(image)
            
            optim.zero_grad()
            loss = criterion(pred, label)
            score = LWLRAP(pred.cpu(), label.cpu())
            
            loss.backward()
            optim.step()
            
            train_loss += loss.item()
            train_score += score
            
        train_loss  /= len(train_dataloader)
        train_score /= len(train_dataloader)
        
        # val
        model.eval()
        val_loss = 0
        val_score = 0
        
        with torch.no_grad():
            for val_data in val_dataloader:
                image = val_data[0].to(device)
                label = val_data[1].to(device)

                pred = model(image)
                loss = criterion(pred, label)
                score = LWLRAP(pred.cpu(), label.cpu())
                
                val_loss += loss.item()
                val_score += score

                
        val_loss  /= len(val_dataloader)
        val_score /= len(val_dataloader)
        
        duration = str(datetime.timedelta(seconds=time.time() - start_time))[:7]
        print(f'epoch {epoch:3}| T | loss: {train_loss:.3} | score: {train_score:.3} | V | loss: {val_loss:.3} | score: {val_score:.3} | time: {duration}')

        if bast_score < val_score:
            bast_score = val_score
            save(epoch, fold_id, model, optim, criterion)
        else:
            es -= 1
            
            if es == 0:
                break
        scheduler.step(val_score)

---------- fold 0 ----------


  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore


epoch   1| T | loss: 0.871 | score: 0.172 | V | loss: 9.41e+02 | score: 0.156 | time: 0:00:09
epoch   2| T | loss: 0.707 | score: 0.196 | V | loss: 12.6 | score: 0.156 | time: 0:00:09
epoch   3| T | loss: 0.653 | score: 0.195 | V | loss: 1.74 | score: 0.167 | time: 0:00:09
epoch   4| T | loss: 0.609 | score: 0.211 | V | loss: 0.795 | score: 0.167 | time: 0:00:09
epoch   5| T | loss: 0.551 | score: 0.244 | V | loss: 0.537 | score: 0.172 | time: 0:00:09
epoch   6| T | loss: 0.476 | score: 0.256 | V | loss: 0.526 | score: 0.185 | time: 0:00:09
epoch   7| T | loss: 0.39 | score: 0.229 | V | loss: 0.403 | score: 0.21 | time: 0:00:09
epoch   8| T | loss: 0.307 | score: 0.238 | V | loss: 0.305 | score: 0.198 | time: 0:00:09
epoch   9| T | loss: 0.244 | score: 0.247 | V | loss: 0.229 | score: 0.219 | time: 0:00:09
epoch  10| T | loss: 0.204 | score: 0.248 | V | loss: 0.219 | score: 0.215 | time: 0:00:09
epoch  11| T | loss: 0.182 | score: 0.263 | V | loss: 0.184 | score: 0.189 | time: 0:00:09


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/yuigahama/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-c776961bcd31>", line 39, in <module>
    train_loss += loss.item()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yuigahama/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2045, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yuigahama/anaconda3/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1170, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/home/yuigahama/anaconda3/li

TypeError: object of type 'NoneType' has no len()