In [1]:
!pip install '../input/birdmel/noisereduce-1.1.0/noisereduce-1.1.0'
!pip install '../input/birdmel/Keras_Applications-1.0.8-py3-none-any.whl'
!pip install '../input/birdmel/efficientnet-1.1.0/efficientnet-1.1.0'
!pip install '../input/birdmel/tensorflow_addons-0.11.1-cp37-cp37m-manylinux2010_x86_64.whl'

Processing /kaggle/input/birdmel/noisereduce-1.1.0/noisereduce-1.1.0
Building wheels for collected packages: noisereduce
  Building wheel for noisereduce (setup.py) ... [?25l- \ done
[?25h  Created wheel for noisereduce: filename=noisereduce-1.1.0-py3-none-any.whl size=7608 sha256=9d0ce30015fd20e035a8c57d704aad27d9e7002cf26a87cef66ec7341fc6fb31
  Stored in directory: /root/.cache/pip/wheels/49/8f/7b/bf5d8c00277dd5beab590b95f4e803aca73f99a7640bee9c71
Successfully built noisereduce
Installing collected packages: noisereduce
Successfully installed noisereduce-1.1.0
Processing /kaggle/input/birdmel/Keras_Applications-1.0.8-py3-none-any.whl
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Processing /kaggle/input/birdmel/efficientnet-1.1.0/efficientnet-1.1.0
Building wheels for collected packages: efficientnet
  Building wheel for efficientnet (setup.py) ... [?25l- \ done
[?25h  Created wheel for efficientnet: filena

In [2]:
import numpy as np 
import pandas as pd

import tensorflow as tf

import librosa
import torchaudio
import noisereduce as no
import cv2
import matplotlib.pyplot as plt

import efficientnet.tfkeras as efn
import tensorflow_addons as tfa

import functools
import operator
import time
import torch

## Load a pretrained EfficientNetB5 model

In [3]:
# Batch size used when building the validation dataset.
batch_size = 16
# Passed as `image_size` to `image_dataset_from_directory` when loading the images.
img_size = (256, 5150)
# Seed passed to the dataset loader for the validation split.
seed = 1
# Root directory of the images passed to `image_dataset_from_directory`.
storage_dir = '../input/birdmel/train_img_split/train_img_split'

In [4]:
# Build a dataset from the image directory, keeping only the "validation"
# subset (validation_split=0.01). `label_mode='categorical'` requests one-hot
# encoded labels.
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    storage_dir,
    validation_split=0.01,
    subset="validation",
    seed=seed,
    image_size=img_size,
    batch_size=batch_size,
    label_mode='categorical'
)

Found 50303 files belonging to 264 classes.
Using 503 files for validation.


In [5]:
# Class labels reported by the dataset, kept as a NumPy array so that they can
# be selected with a boolean mask; `class_num` is the number of labels.
class_names = np.array(val_ds.class_names)
class_num = len(class_names)

In [6]:
# Rescales inputs as x / 127.5 - 1, i.e. maps the range [0, 255] onto [-1, 1].
normalization_layer = tf.keras.layers.experimental.preprocessing.Rescaling(scale=1./127.5, offset=-1)

def resize(image, height=256, width=256):
    """Crop and/or pad `image` to exactly `height` x `width`.

    Thin wrapper around `tf.image.resize_with_crop_or_pad`; the image content
    is cropped or padded, never rescaled.
    """
    target_shape = dict(target_height=height, target_width=width)
    return tf.image.resize_with_crop_or_pad(image, **target_shape)

def augment_image_test(image):
    """Prepare one spectrogram image for inference.

    The image is first cropped/padded to 256 x 216, then cropped/padded again
    to the default 256 x 256, and finally passed through
    `normalization_layer`.
    """
    chunk = resize(image, height=256, width=216)
    squared = resize(chunk)
    return normalization_layer(squared)

def prepare(ds):
    """Map `augment_image_test` over the images of `ds`; labels pass through unchanged."""

    def _preprocess(x, y):
        return (augment_image_test(x), y)

    return ds.map(_preprocess)

In [7]:
# Load the saved Keras model (HDF5 file) from the attached dataset; `net` is
# the model the prediction cells below run inference with.
net = tf.keras.models.load_model('../input/birdmel/efficientnet_b5_tf_2_3_0.h5')

## Prediction

In [8]:
def load_audio_fast(file_path):
    """Load an audio file with torchaudio and return it as a 22050 Hz mono signal.

    The waveform is resampled from its native rate to 22050 Hz, averaged over
    dim 0 and converted to a squeezed NumPy array.

    Returns:
        tuple: (signal, sample_rate) where sample_rate is always 22050.
    """
    target_rate = 22050
    waveform, source_rate = torchaudio.load(file_path, normalization=False)
    resampler = torchaudio.transforms.Resample(source_rate, target_rate)
    mono = torch.mean(resampler(waveform), dim=0, keepdim=True)

    return (mono.cpu().numpy().squeeze(), target_rate)

In [9]:
def get_melspectrogram_db(file_path):
    """Compute a dB-scaled mel spectrogram for an audio file.

    The audio is loaded with `load_audio_fast`, passed through
    `noisereduce.reduce_noise` (the clip itself is supplied as the noise
    clip), turned into a mel spectrogram and converted to dB relative to its
    maximum.

    Args:
        file_path: path of the audio file to read.

    Returns:
        tuple: (spec_db, duration) where `duration` is the number of samples
        divided by the sample rate, i.e. the clip length in seconds.
    """
    n_fft = 2048
    hop_length = 512
    n_mels = 256
    fmin = 300

    signal, sr = load_audio_fast(file_path)

    signal = no.reduce_noise(audio_clip=signal, noise_clip=signal, verbose=False)

    # The audio is passed as `y=` rather than positionally: newer librosa
    # releases accept it by keyword only, and older ones accept both forms.
    spec = librosa.feature.melspectrogram(
        y=signal,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin
    )

    return (librosa.power_to_db(spec, ref=np.max), signal.shape[0]/sr)

In [10]:
def spec_to_image(spec, eps=1e-6):
    """Convert a spectrogram into an 8-bit grayscale image.

    The spectrogram is standardised (zero mean, unit variance), min-max
    scaled to [0, 255], cast to uint8 and flipped along axis 0.

    Args:
        spec: NumPy array holding the spectrogram.
        eps: small constant added to the standard deviation to avoid a
            division by zero.

    Returns:
        uint8 array with the same shape as `spec`. A constant spectrogram
        (no dynamic range) yields an all-zero image.
    """
    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min

    if value_range > 0:
        spec_scaled = 255 * (spec_norm - spec_min) / value_range
    else:
        # Constant input: the min-max scaling would be 0/0 and produce NaNs
        # (undefined once cast to uint8), so emit a black image instead.
        spec_scaled = np.zeros_like(spec_norm)

    spec_scaled = spec_scaled.astype(np.uint8)
    spec_scaled = np.flip(spec_scaled, axis=0)

    return spec_scaled

In [11]:
def img_to_imgs(img, size=216):
    """Split an image along its width (axis 1) into consecutive chunks.

    Args:
        img: 3-D array indexed as (rows, columns, channels).
        size: chunk width in columns. The default of 216 spectrogram frames
            corresponds to about 5 seconds at 22050 Hz with hop_length 512.

    Returns:
        list of arrays: every full chunk is `size` columns wide; when the
        width is not a multiple of `size`, a final narrower chunk holding the
        remaining columns is appended. An image with zero columns yields [].

    Raises:
        ValueError: if `size` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")

    width = img.shape[1]
    split_num, residual = divmod(width, size)
    end_split_index = split_num * size

    imgs = []
    if split_num > 0:
        # The slice width is an exact multiple of `size`, so every part
        # returned by array_split is exactly `size` columns wide.
        imgs = np.array_split(img[:, :end_split_index, :], split_num, axis=1)

    if residual > 0:
        imgs.append(img[:, end_split_index:, :])

    return imgs

In [12]:
def get_file_img(file):
    """Return the spectrogram image of an audio file together with its duration.

    Returns:
        tuple: (image produced by `spec_to_image`, duration as reported by
        `get_melspectrogram_db`).
    """
    mel_db, duration = get_melspectrogram_db(file)
    return (spec_to_image(mel_db), duration)

def convert_img(full_img, time, offset=0, duration=5):
    """Cut a time window out of a spectrogram image and split it into chunks.

    Args:
        full_img: 2-D spectrogram image; axis 1 is the time axis.
        time: duration of the whole recording in seconds, used to convert
            seconds into column indices.
        offset: start of the window in seconds.
        duration: length of the window in seconds, or None to take
            everything from `offset` to the end of the image.

    Returns:
        list of 3-channel chunks as produced by `img_to_imgs`.
    """
    frame_per_second = full_img.shape[1]/time

    starting_index = int(np.floor(offset*frame_per_second))
    if duration is None:
        ending_index = None
    else:
        ending_index = int(starting_index + np.ceil(duration*frame_per_second))
        ending_index = max(0, ending_index)

    # A negative index would be interpreted by NumPy as counting from the end
    # of the image and silently select the wrong (or an empty) region, so the
    # window is clipped to the start of the recording instead.
    starting_index = max(0, starting_index)
    img = full_img[:, starting_index:ending_index]

    # Replicate the single channel three times to obtain a 3-channel image.
    img = cv2.merge((img,img,img))
    imgs = img_to_imgs(img)

    return imgs

In [13]:
def predict_imgs(model, class_names, imgs, threshold = 0.9):
    """Predict the classes present in a list of spectrogram chunks.

    Args:
        model: model exposing `predict(x=batch)`; it is the model actually
            used for inference (previously the global `net` was used and this
            argument was ignored).
        class_names: NumPy array of class labels, indexable with a boolean mask.
        imgs: list of image chunks to classify.
        threshold: minimum predicted value for a class to be selected.

    Returns:
        Sorted list of the distinct class names whose predicted value reaches
        `threshold` in at least one chunk; an empty list when `imgs` is empty.
    """
    if len(imgs) == 0:
        # np.vstack raises on an empty sequence; no chunks means no detections.
        return []

    tensors = [tf.expand_dims(augment_image_test(img), 0) for img in imgs]
    batch = np.vstack(tensors)
    probas = model.predict(x=batch)

    selected_class_names = {
        name
        for chunk_probas in probas
        for name in class_names[chunk_probas >= threshold]
    }

    return sorted(selected_class_names)

In [14]:
def predict(test_folder='birdsong-recognition'):
    """Build the submission frame for the test set stored under `../input/<test_folder>`.

    Reads `test.csv`, processes the rows grouped by `audio_id` so that each
    audio file is converted to a spectrogram image only once, and predicts the
    birds for each row: the whole recording for `site_3` rows, otherwise the
    5-second window ending at the row's `seconds` value.

    Returns:
        DataFrame with the columns `row_id` and `birds`, where `birds` is a
        space-separated list of class names or 'nocall' when none is selected.
    """
    base_dir = '../input/' + test_folder
    audio_dir = base_dir + '/test_audio/'
    df = pd.read_csv(base_dir + '/test.csv').sort_values(by=['audio_id'])

    row_ids, birds = [], []

    # Cache of the most recently converted audio file; rows are sorted by
    # audio_id, so consecutive rows of the same file reuse it.
    current_filename = current_img = current_time = None

    for _, row in df.iterrows():
        filename = row['audio_id']
        if filename != current_filename:
            current_filename = filename
            current_img, current_time = get_file_img(audio_dir + filename + '.mp3')

        if row['site'] == 'site_3':
            window = dict(offset=0, duration=None)
        else:
            window = dict(offset=row['seconds'] - 5)
        images = convert_img(current_img, current_time, **window)

        row_ids.append(row['row_id'])
        birds.append(predict_imgs(net, class_names, images))

    return pd.DataFrame({
        'row_id': row_ids,
        'birds': [' '.join(names) if names else 'nocall' for names in birds]
    })

In [15]:
import warnings
warnings.filterwarnings('ignore')

# Predict on the competition test set; if that fails (for example because the
# test data is not available), fall back to the 'birdcall-check' data.
try:
    sub_df = predict(test_folder='birdsong-recognition')
except Exception as error:
    # `except Exception` instead of a bare `except:` keeps the fallback but
    # lets KeyboardInterrupt / SystemExit propagate, and the reason for the
    # fallback is reported instead of being silently discarded.
    print(f"Falling back to 'birdcall-check' test data: {error!r}")
    sub_df = predict(test_folder='birdcall-check')

In [16]:
# Write the predictions to `submission.csv` without the DataFrame index column.
sub_df.to_csv('submission.csv', index=False)

In [17]:
# sub_df = sub_df.sort_values(by=['row_id']).reset_index(drop=True)
# sample_sub = pd.read_csv('../input/birdmel/sample_submission.csv').sort_values(by=['row_id']).reset_index(drop=True)

In [18]:
# sum(sub_df['birds'] != sample_sub['birds'])