In [1]:
%reload_ext autoreload
%autoreload 2

In [42]:
import os
import cv2
import sys
import time
import math

import random
import librosa
from warnings import simplefilter
import torchaudio
import torchvision
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import typing as tp
import IPython.display as ipd
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torch.nn.modules.utils import _pair
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU


pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
simplefilter("ignore")

In [27]:
data_dir = Path("../input/birdclef-2021")
train_audio_dir = data_dir / 'train_short_audio'
test_audio_dir = data_dir / 'test_soundscapes'
sample_file = data_dir / 'sample_submission.csv'

df_train = pd.read_csv("../input/birdclef-2021/train_metadata.csv")

LABEL_IDS = {label: label_id for label_id,label in enumerate(sorted(df_train["primary_label"].unique()))}
INV_LABEL_IDS = {val: key for key,val in LABEL_IDS.items()}

if not len(list(test_audio_dir.glob("*.ogg"))):
    test_audio_dir = data_dir / 'train_soundscapes'
    sample_file = None
    label_file =data_dir / 'train_soundscape_labels.csv'    

In [52]:
CLASSES = sorted(os.listdir(train_audio_dir))
NUM_CLASSES = len(CLASSES)
NUM_WORKERS = 4
THRESH = 0.5
SR = 32_000
DURATION = 5
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)


class AudioParams:
    sr = 32000
    stride = 5
    true_kernel_size = 5

    img_size = None
    
    # Melspectrogram
    n_mels = 128
    fmin = 20
    fmax = 16000

DEVICE: cuda


In [5]:
data = pd.DataFrame(
     [(path.stem, *path.stem.split("_"), path) for path in Path(test_audio_dir).glob("*.ogg")],
    columns = ["filename", "id", "site", "date", "filepath"]
)
print(data.shape)
data.head()

(20, 5)


Unnamed: 0,filename,id,site,date,filepath
0,10534_SSW_20170429,10534,SSW,20170429,../input/birdclef-2021/train_soundscapes/10534...
1,11254_COR_20190904,11254,COR,20190904,../input/birdclef-2021/train_soundscapes/11254...
2,14473_SSW_20170701,14473,SSW,20170701,../input/birdclef-2021/train_soundscapes/14473...
3,18003_COR_20190904,18003,COR,20190904,../input/birdclef-2021/train_soundscapes/18003...
4,20152_SSW_20170805,20152,SSW,20170805,../input/birdclef-2021/train_soundscapes/20152...


In [6]:
def seed_everything(seed):
    """
    Seeds basic parameters for reproductibility of results
    
    Arguments:
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True #False


def load_audio(path, sr):
    clip, _ = librosa.load(path, sr=sr, mono=True, res_type="kaiser_fast")
    return clip


def load_model_weights(model, weights):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    state_dict = torch.load(weights, map_location=device)
    model.load_state_dict(state_dict)
    
    
def compute_melspec(y, params):
    melspec = librosa.feature.melspectrogram(
        y,
        sr=params.sr,
        n_mels=params.n_mels,
        fmin=params.fmin,
        fmax=params.fmax
    )
    
    melspec = librosa.power_to_db(melspec).astype(np.float32)
    
    return melspec

def mono_to_color(X, eps=1e-6, mean=None, std=None):
    X = np.stack([X, X, X], axis=-1)

    # Standardize
    mean = mean or X.mean()
    std = std or X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = np.clip(X, _min, _max)
        V = 255 * (V - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V


def resize(image, size=None):
    if size is not None:
        h, w, _ = image.shape
        new_w, new_h = int(w * size / h), size
        image = cv2.resize(image, (new_w, new_h))

    return image


def normalize(image, mean=None, std=None):
    image = image / 255.0
    if mean is not None and std is not None:
        image = (image - mean) / std
    return np.moveaxis(image, 2, 0).astype(np.float32)

In [39]:
class BirdCLEFDataset(Dataset):
    def __init__(self, data, params):
        
        self.data = data
        self.params = params
        self.audio_length = DURATION * SR
        self.step = self.audio_length

    def __len__(self):
        return len(self.data)
    
    def audio_to_image(self, audio):
        melspec = compute_melspec(audio, self.params)
        
        image = mono_to_color(melspec)
        image = resize(image, self.params.img_size)
        image = normalize(image, mean=None, std=None)
        return image

    def read_file(self, filepath):
        #audio, orig_sr = sf.read(filepath, dtype="float32")
        audio = load_audio(filepath, self.params.sr)

        audios = []
        for i in range(self.audio_length, len(audio) + self.step, self.step):
            start = max(0, i - self.audio_length)
            end = start + self.audio_length
            audios.append(audio[start:end])
            
        if len(audios[-1]) < self.audio_length:
            audios = audios[:-1]
            
        images = [self.audio_to_image(audio) for audio in audios]
        images = np.stack(images)
        
        return images
    
        
    def __getitem__(self, idx):
        return self.read_file(self.data.loc[idx, "filepath"])

In [48]:
@torch.no_grad()
def get_thresh_preds(out, thresh=None):
    thresh = thresh or THRESH
    o = (-out).argsort(1)
    npreds = (out > thresh).sum(1)
    preds = []
    for oo, npred in zip(o, npreds):
        preds.append(oo[:npred].cpu().numpy().tolist())
    return preds

def get_bird_names(preds):
    bird_names = []
    for pred in preds:
        if not pred:
            bird_names.append("nocall")
        else:
            bird_names.append(" ".join([INV_LABEL_IDS[bird_id] for bird_id in pred]))
    return bird_names

def predict(nets, test_data, names=False):
    preds = []
    with torch.no_grad():
        for idx in  tqdm(list(range(len(test_data)))):
            xb = torch.from_numpy(test_data[idx]).to(DEVICE)
            pred = 0.
            for net in nets:
                o = net(xb)
                o = torch.sigmoid(o)

                pred += o

            pred /= len(nets)
            
            if names:
                pred = get_bird_names(get_thresh_preds(pred))

            preds.append(pred)
    return preds

def predict(model, dataset, batch_size=16):
    model.eval()
    preds = np.empty((0, NUM_CLASSES))
    
    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True
    )

    with torch.no_grad():
        for x in loader:
            y_pred = model(x.cuda()).detach()
            preds = np.concatenate([preds, torch.sigmoid(y_pred).cpu().numpy()])
    
    return preds

def preds_as_df(data, preds):
    sub = {
        "row_id": [],
        "birds": [],
    }
    
    for row, pred in zip(data.itertuples(False), preds):
        row_id = [f"{row.id}_{row.site}_{5*i}" for i in range(1, len(pred)+1)]
        sub["birds"] += pred
        sub["row_id"] += row_id
        
    sub = pd.DataFrame(sub)
    
    if sample_file:
        sample_sub = pd.read_csv(sample_file, usecols=["row_id"])
        sub = sample_sub.merge(sub, on="row_id", how="left")
        sub["birds"] = sub["birds"].fillna("nocall")
    return sub

In [35]:
model_name = "resnest50_fast_1s1x64d"
weights = [f"/root/kaggle/kaggle_birdcall_identification/build/checkpoints/2021-05-30/{i}/{model_name}_double_{i}.pt" for i in range(5)]

for w in weights:
    assert os.path.isfile(w), f"Weights {w} not found"
    
configs = [{
    "name": model_name,
    "weights": weights,
}]

models = []
for config in configs:
    models_ = []
    for weights in config["weights"]:
        model = get_model(config['name'])
        load_model_weights(model, weights)
        models_.append(model)
    models.append(models_)

Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master


In [40]:
test_data = BirdCLEFDataset(data=data, params=AudioParams)
len(test_data), test_data[0].shape

(20, (120, 3, 128, 313))

In [45]:
len(models), type(models[0]), len(models[0])

(1, list, 5)

In [46]:
pred_probas = predict(models[0], test_data, names=False)
print(len(pred_probas))

  0%|          | 0/20 [00:00<?, ?it/s]

20


In [53]:
preds = [get_bird_names(get_thresh_preds(pred, thresh=THRESH)) for pred in pred_probas]

In [54]:
sub = preds_as_df(data, preds)
print(sub.shape)
sub

(2400, 2)


Unnamed: 0,row_id,birds
0,10534_SSW_5,nocall
1,10534_SSW_10,nocall
2,10534_SSW_15,grhowl
3,10534_SSW_20,nocall
4,10534_SSW_25,nocall
...,...,...
2395,7954_COR_580,nocall
2396,7954_COR_585,nocall
2397,7954_COR_590,nocall
2398,7954_COR_595,nocall


In [55]:
sub.to_csv("submission.csv", index=False)

In [56]:
def get_metrics(s_true, s_pred):
    s_true = set(s_true.split())
    s_pred = set(s_pred.split())
    n, n_true, n_pred = len(s_true.intersection(s_pred)), len(s_true), len(s_pred)
    
    prec = n/n_pred
    rec = n/n_true
    f1 = 2*prec*rec/(prec + rec) if prec + rec else 0
    
    return {"f1": f1, "prec": prec, "rec": rec, "n_true": n_true, "n_pred": n_pred, "n": n}

In [57]:
if label_file:
    sub_target = pd.read_csv(label_file)
    sub_target = sub_target.merge(sub, how="left", on="row_id")
    
    print(sub_target["birds_x"].notnull().sum(), sub_target["birds_x"].notnull().sum())
    assert sub_target["birds_x"].notnull().all()
    assert sub_target["birds_y"].notnull().all()
    
    df_metrics = pd.DataFrame([get_metrics(s_true, s_pred) for s_true, s_pred in zip(sub_target.birds_x, sub_target.birds_y)])
    
    print(df_metrics.mean())

2400 2400
f1        0.696347
prec      0.705208
rec       0.692410
n_true    1.130000
n_pred    1.000417
n         0.705417
dtype: float64


In [58]:
sub_target[sub_target.birds_y != "nocall"]

Unnamed: 0,row_id,site,audio_id,seconds,birds_x,birds_y
14,7019_COR_75,COR,7019,75,nocall,rudpig
44,7019_COR_225,COR,7019,225,nocall,compau
64,7019_COR_325,COR,7019,325,nocall,compau
81,7019_COR_410,COR,7019,410,nocall,plupig2
107,7019_COR_540,COR,7019,540,nocall,bucmot2
241,11254_COR_10,COR,11254,10,nocall,wbwwre1
244,11254_COR_25,COR,11254,25,rubwre1,rubwre1
268,11254_COR_145,COR,11254,145,obnthr1,obnthr1
269,11254_COR_150,COR,11254,150,obnthr1,obnthr1
270,11254_COR_155,COR,11254,155,obnthr1,obnthr1


In [59]:
sub_target[sub_target.birds_x != "nocall"]

Unnamed: 0,row_id,site,audio_id,seconds,birds_x,birds_y
240,11254_COR_5,COR,11254,5,rubwre1,nocall
242,11254_COR_15,COR,11254,15,rubwre1,nocall
244,11254_COR_25,COR,11254,25,rubwre1,rubwre1
267,11254_COR_140,COR,11254,140,obnthr1,nocall
268,11254_COR_145,COR,11254,145,obnthr1,obnthr1
...,...,...,...,...,...,...
2391,54955_SSW_560,SSW,54955,560,grycat,nocall
2393,54955_SSW_570,SSW,54955,570,grycat,grycat
2394,54955_SSW_575,SSW,54955,575,chswar,nocall
2396,54955_SSW_585,SSW,54955,585,grycat,nocall


In [7]:
class TestDataset(Dataset):
    def __init__(self, df, clip, params):
        self.df = df
        self.clip = clip
        self.params = params
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        end_seconds = int(self.df['seconds'][idx])
        start_seconds = int(end_seconds - 5)

        start_index = self.params.sr * start_seconds
        end_index = self.params.sr * end_seconds

        y = self.clip[start_index:end_index].astype(np.float32)

        melspec = compute_melspec(y, self.params)
        
        image = mono_to_color(melspec)
        image = resize(image, self.params.img_size)
        image = normalize(image, mean=None, std=None)
        
        return image

In [9]:
def get_model(name):
    if "resnest" in name:
        #model = ResNet(**MODEL_CONFIGS[name])
        model = torch.hub.load('zhanghang1989/ResNeSt', 'resnest50', pretrained=True)
    elif "resnext101" in name:
#         model = resnext101_32x8d()
        model = torchvision.models.resnext101_32x8d(pretrained=False)
    elif "resnext50" in name:
        model = torchvision.models.resnext50_32x4d(pretrained=False)
    else:
        raise NotImplementedError

    nb_ft = model.fc.in_features
    del model.fc
    model.fc = nn.Linear(nb_ft, NUM_CLASSES)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    return model

In [10]:
seed_everything(SEED)

In [11]:
def predict(model, dataset, batch_size=16):
    model.eval()
    preds = np.empty((0, NUM_CLASSES))
    
    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True
    )

    with torch.no_grad():
        for x in loader:
            y_pred = model(x.cuda()).detach()
            preds = np.concatenate([preds, torch.sigmoid(y_pred).cpu().numpy()])
    
    return preds

In [12]:
def post_process_site_12(preds, threshold=0.5, maxpreds=3):
    preds = preds * (preds >= threshold)   # remove preds < threshold

    next_preds = np.concatenate([preds[1:], np.zeros((1, preds.shape[-1]))])   # pred corresponding to next window
    prev_preds = np.concatenate([np.zeros((1, preds.shape[-1])), preds[:-1]])  # pred corresponding to previous window
    
    score = preds + 0.5 * next_preds + 0.5 * prev_preds  # Aggregating with neighbouring predictions
    
    n_birds = (score >= threshold).sum(-1)   # Counting birds
    n_birds = np.clip(n_birds, 0, maxpreds)  # keep at most maxpreds birds
    
    labels = [np.argsort(- score[i])[:n_birds[i]] for i in range(len(preds))]  # Getting the n_birds most likely class indices
    
    class_labels = [" ".join([CLASSES[l] for l in label]) for label in labels]  # Getting class names
    
    return class_labels

In [13]:
def max_pred_gen(site, duration):
    if site != "site_3":
        return 3
    else:
        rets = [(7,2), (15, 3), (30, 5), (60, 7)]
        
        for ref_duration,thresh in rets:
            if ref_duration >= duration:
                return thresh
        return 10
    
def reformat_preds(preds, df, site):
    prediction_df = pd.DataFrame({
        "row_id": df['row_id'].values,
        "birds": preds
    })
    
    prediction_df['birds'] = prediction_df['birds'].replace([''],'nocall')
    
    return prediction_df

In [14]:
def inference(test_df, test_audio, configs, params, threshold=0.5):
    unique_audio_id = test_df.audio_id.unique()
    
    models = []
    for config in configs:
        models_ = []
        for weights in config["weights"]:
            model = get_model(config['name'])
            load_model_weights(model, weights)
            models_.append(model)
        models.append(models_)
        
    print(f'\t -> Using {len(models)} models, with {len(models[0])} weights per model.')
        
    pred_dfs = []
    for audio_id in unique_audio_id :
        
        audio_df = test_df[test_df['audio_id'] == audio_id].reset_index(drop=True)
        site = audio_df["site"].values[0]      
        
        print(f'\nMaking predictions for audio {audio_id} in {site} ')

        clip = load_audio(test_audio / (audio_id + ".mp3"), params.sr)
        clip_duration = len(clip) // params.sr
        
        dataset = TestDataset(audio_df, clip, params)
        
        preds = []
        for i, config in enumerate(configs):
            for j, weights in enumerate(config["weights"]):
                pred = predict(models[i][j], dataset, batch_size=16)
                preds.append(pred)
        preds = np.mean(preds, 0)
        
        maxpreds = max_pred_gen(site, clip_duration)
        print(f'Limiting the number of birds to {maxpreds}')
        
        if site == 'site_3':
            preds_pp = post_process_site_3(preds, threshold=threshold, maxpreds=maxpreds)
        else:
            preds_pp = post_process_site_12(preds, threshold=threshold, maxpreds=maxpreds)
        
        print("Predicted classes :", preds_pp)
        
        pred_df = reformat_preds(preds_pp, audio_df, site)
        pred_dfs.append(pred_df)
    
    sub = pd.concat(pred_dfs, axis=0, sort=False).reset_index(drop=True)
    return sub

In [15]:
def vote(preds, min_votes=3):
    votes = Counter(preds)
    return [c for c, count in votes.items() if count >= min_votes]

In [21]:
def inference_voting(test_df, test_audio, configs, params, threshold=0.5, min_votes=3):
    filepaths = test_df.filepath.values.tolist()
    
    models = []
    for config in configs:
        models_ = []
        for weights in config["weights"]:
            model = get_model(config['name'])
            load_model_weights(model, weights)
            models_.append(model)
        models.append(models_)
        
    print(f'\t -> Using {len(models)} models, with {len(models[0])} weights per model.')
        
    pred_dfs = []
    for filepath in filepaths :
        
        audio_df = test_df[test_df.filepath == filepath].reset_index(drop=True)
        site = audio_df["site"].values[0]   
        
        print(f'\nMaking predictions for {filepath} ')

        clip = load_audio(filepath, params.sr)
        clip_duration = len(clip) // params.sr
        
        dataset = TestDataset(audio_df, clip, params)
        
        
        all_preds = []
        for i, config in enumerate(configs):
            
            preds = []
            for j, weights in enumerate(config["weights"]):
                pred = predict(models[i][j], dataset, batch_size=16)
                preds.append(pred)
            preds = np.mean(preds, 0)

            maxpreds = max_pred_gen(site, clip_duration)
            print(f'Limiting the number of birds to {maxpreds}')

            preds_pp = post_process_site_12(preds, threshold=threshold, maxpreds=maxpreds)

            all_preds.append(preds_pp)
            print("Predicted classes :", preds_pp)
        
        final_preds = []
        for i in range(len(all_preds[0])):
            preds = []
            for m in range(len(all_preds)):
                preds += all_preds[m][i].split(' ')
                
            final_pred = vote(preds, min_votes=min_votes)
            final_preds.append(' '.join(final_pred))
        
        print("\n    -> Voted classes :", final_preds)
        
        pred_df = reformat_preds(final_preds, audio_df, site)
        pred_dfs.append(pred_df)
    
    sub = pd.concat(pred_dfs, axis=0, sort=False).reset_index(drop=True)
    return sub

In [17]:
configs = []

In [18]:
model_name = "resnest50_fast_1s1x64d"
weights = [f"/root/kaggle/kaggle_birdcall_identification/build/checkpoints/2021-05-30/{i}/{model_name}_double_{i}.pt" for i in range(5)]

for w in weights:
    assert os.path.isfile(w), f"Weights {w} not found"
    
configs = [{
    "name": model_name,
    "weights": weights,
}]

In [19]:
threshold = 0.5
min_votes = 2

In [22]:
# submission = inference(test, TEST_AUDIO_DIR, configs, AudioParams, threshold=threshold)
submission = inference_voting(data, test_audio_dir, configs, AudioParams, threshold=threshold, min_votes=min_votes)

Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master
Using cache found in /root/.cache/torch/hub/zhanghang1989_ResNeSt_master


	 -> Using 1 models, with 5 weights per model.

Making predictions for ../input/birdclef-2021/train_soundscapes/10534_SSW_20170429.ogg 


KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/root/miniconda3/envs/kaggle/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'seconds'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/root/miniconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/root/miniconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/root/miniconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-7-a53d3239549b>", line 11, in __getitem__
    end_seconds = int(self.df['seconds'][idx])
  File "/root/miniconda3/envs/kaggle/lib/python3.7/site-packages/pandas/core/frame.py", line 3024, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/root/miniconda3/envs/kaggle/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3082, in get_loc
    raise KeyError(key) from err
KeyError: 'seconds'
