In [None]:
# Install the extra dependencies from local files under `../input/birdmel`
# (source directories and wheels) rather than by package name.
# NOTE(review): Keras_Applications is presumably installed before efficientnet
# because efficientnet depends on it — confirm before reordering.
!pip install '../input/birdmel/noisereduce-1.1.0/noisereduce-1.1.0'
!pip install '../input/birdmel/Keras_Applications-1.0.8-py3-none-any.whl'
!pip install '../input/birdmel/efficientnet-1.1.0/efficientnet-1.1.0'
!pip install '../input/birdmel/tensorflow_addons-0.11.1-cp37-cp37m-manylinux2010_x86_64.whl'

In [None]:
import numpy as np 
import pandas as pd

import tensorflow as tf

import librosa
import noisereduce as no
import cv2
import matplotlib.pyplot as plt

import efficientnet.tfkeras as efn
import tensorflow_addons as tfa

## Load a pretrained EfficientNetB5 model

In [None]:
# Settings forwarded to `image_dataset_from_directory` in the next cell.
batch_size = 16
# Passed as `image_size`.
# NOTE(review): presumably (height, width) of the stored spectrogram images — confirm.
img_size = (256, 5150)
# Seed for the validation split.
seed = 1
# Directory the image dataset is built from.
storage_dir = '../input/birdmel/train_img_split/train_img_split'

In [None]:
# Validation subset (fraction 0.1, fixed seed) of the image-folder dataset.
# In the cells shown here it is only used to read the class-name list.
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    storage_dir,
    label_mode='categorical',
    image_size=img_size,
    batch_size=batch_size,
    validation_split=0.1,
    subset="validation",
    seed=seed,
)

In [None]:
# Class labels as an ndarray so they can be selected with a boolean mask later.
class_names = np.array(val_ds.class_names)
# Number of classes.
class_num = len(class_names)

In [None]:
# Rescaling with scale 1/127.5 and offset -1: an input of 0 maps to -1 and 255 maps to 1.
normalization_layer = tf.keras.layers.experimental.preprocessing.Rescaling(
    scale=1.0 / 127.5,
    offset=-1,
)

def resize(image, height=256, width=256):
    """Bring `image` to `height` x `width` without rescaling its content.

    Thin wrapper around `tf.image.resize_with_crop_or_pad`, which crops
    and/or pads the image to reach the requested size.

    Args:
        image: image tensor accepted by `tf.image.resize_with_crop_or_pad`.
        height: target height in pixels.
        width: target width in pixels.

    Returns:
        The cropped and/or padded image.
    """
    return tf.image.resize_with_crop_or_pad(image, height, width)

def augment_image_test(image):
    """Apply the test-time preprocessing to a single image.

    The image is first cropped/padded to 256 x 216, then cropped/padded again
    to the default 256 x 256, and finally passed through `normalization_layer`.

    Args:
        image: image accepted by `resize`.

    Returns:
        The normalized 256 x 256 image.
    """
    narrowed = resize(image, height=256, width=216)
    squared = resize(narrowed)
    return normalization_layer(squared)

def prepare(ds):
    """Map the test-time image preprocessing over a dataset of (image, label) pairs.

    Args:
        ds: object with a `map(fn)` method whose elements are (image, label).

    Returns:
        Whatever `ds.map` returns; labels are passed through unchanged.
    """
    def _preprocess(image, label):
        return augment_image_test(image), label

    return ds.map(_preprocess)

In [None]:
# Load the saved Keras model from the .h5 file; `net` is the model used for prediction below.
# NOTE(review): loading presumably relies on the `efficientnet.tfkeras` import
# having registered the model's custom layers — confirm before removing that import.
net = tf.keras.models.load_model('../input/birdmel/efficientnet_b5_tf_2_3_0.h5')

## Prediction

In [None]:
fmin = 300  # lowest frequency passed to the mel filterbank


def get_melspectrogram_db(file_path):
    """Load an audio file and return its noise-reduced mel spectrogram in dB.

    The file is loaded with `librosa.load` using its default sample rate,
    denoised with `noisereduce`, converted to a 256-band mel power spectrogram
    and then to decibels relative to the spectrogram's maximum.

    Args:
        file_path: path of the audio file to load.

    Returns:
        The result of `librosa.power_to_db(spec, ref=np.max)`.
    """
    n_fft = 2048
    hop_length = 512
    n_mels = 256

    signal, sr = librosa.load(file_path)

    # The clip itself is supplied as the noise profile.
    signal = no.reduce_noise(audio_clip=signal, noise_clip=signal, verbose=False)

    # Pass the audio as `y=`: recent librosa releases make the arguments of
    # `melspectrogram` keyword-only, while older ones accept the keyword too.
    spec = librosa.feature.melspectrogram(
        y=signal,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin
    )

    return librosa.power_to_db(spec, ref=np.max)

In [None]:
def spec_to_image(spec, eps=1e-6):
    """Convert a spectrogram into a vertically flipped uint8 image.

    The input is standardized (zero mean, unit variance), min-max scaled to
    the 0..255 range, truncated to uint8 and flipped along axis 0.

    Args:
        spec: numeric array (anything `np.asarray` accepts).
        eps: small constant added to the standard deviation to avoid
            dividing by zero.

    Returns:
        A uint8 array with the same shape as `spec`. A constant input
        (zero value range) yields an all-zero image.
    """
    spec = np.asarray(spec)
    spec_norm = (spec - spec.mean()) / (spec.std() + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min

    if value_range == 0:
        # A constant spectrogram would give 0/0 here; NaN cast to uint8 is
        # undefined, so emit a blank image instead.
        spec_scaled = np.zeros(spec_norm.shape, dtype=np.uint8)
    else:
        spec_scaled = (255 * (spec_norm - spec_min) / value_range).astype(np.uint8)

    # Flip along axis 0 so the last input row becomes the first image row.
    return np.flip(spec_scaled, axis=0)

In [None]:
def img_to_imgs(img, size=216):
    """Split an image along its width (axis 1) into consecutive chunks.

    Args:
        img: array with at least two axes; axis 1 is the one that is split.
        size: chunk width in columns. The default of 216 is the width the
            original code annotates as 5 seconds.

    Returns:
        A list of slices of `img`: every full `size`-wide chunk in order,
        followed by one narrower chunk holding the leftover columns if the
        width is not a multiple of `size`. Empty list for a zero-width image.

    Raises:
        ValueError: if `size` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")

    width = img.shape[1]
    full_width = (width // size) * size

    # Slicing only axis 1 keeps this valid for 2-D and 3-D images alike.
    imgs = [img[:, start:start + size] for start in range(0, full_width, size)]

    if full_width < width:
        imgs.append(img[:, full_width:])

    return imgs

In [None]:
def file_to_imgs(file):
    """Turn an .mp3 file into a list of 3-channel spectrogram image chunks.

    Args:
        file: path of the audio file.

    Returns:
        The chunks produced by `img_to_imgs` for the file's spectrogram image,
        or an empty list when the path does not end with ".mp3".
    """
    if not file.endswith(".mp3"):
        return []

    gray = spec_to_image(get_melspectrogram_db(file))
    # Replicate the single channel three times to obtain a 3-channel image.
    rgb = cv2.merge((gray, gray, gray))
    return img_to_imgs(rgb)

In [None]:
def predict_img(model, class_names, img, threshold = 0.9):
    """Return the sorted class names whose predicted probability exceeds `threshold`.

    Args:
        model: object with a Keras-style `predict(x=...)` method returning one
            row of class scores per batch element.
        class_names: sequence of class labels aligned with the model outputs.
        img: single image to classify; preprocessed with `augment_image_test`.
        threshold: a class is selected when its score is strictly greater
            than this value.

    Returns:
        Sorted list of the selected class names (possibly empty).
    """
    tensor = augment_image_test(img)
    batch = tf.expand_dims(tensor, 0)
    # Use the model that was passed in, not a module-level one, so the caller
    # decides which network produces the scores.
    probas = model.predict(x=batch)[0]
    # asarray lets plain lists of names be boolean-masked as well.
    selected_class_names = np.asarray(class_names)[probas > threshold]

    return sorted(selected_class_names)

In [None]:
def predict_imgs(model, class_names, imgs, threshold = 0.9):
    """Predict every image in `imgs` and merge the results.

    Args:
        model: model forwarded to `predict_img`.
        class_names: class labels forwarded to `predict_img`.
        imgs: iterable of images.
        threshold: score threshold forwarded to `predict_img`.

    Returns:
        Sorted list of the distinct class names predicted for any image.
    """
    found = set()
    for single_img in imgs:
        found.update(predict_img(model, class_names, single_img, threshold))

    return sorted(found)

In [None]:
def predict_df(df, audio_dir='../input/birdsong-recognition/test_audio/'):
    """Predict the birds for every row of `df` and build the submission frame.

    Rows are processed in order. The images of a file are computed once, when
    its `audio_id` first differs from the previous row's. Rows whose `site` is
    'site_3' are predicted from all images of the file; every other row
    consumes the next unused image of the current file.

    Uses the module-level `net` and `class_names`.

    Args:
        df: frame with the columns 'row_id', 'audio_id' and 'site'.
        audio_dir: prefix prepended to '<audio_id>.mp3' (string concatenation,
            so it must end with a path separator).

    Returns:
        DataFrame with the columns 'row_id' and 'birds', where 'birds' is the
        space-joined predictions or 'nocall' when nothing was predicted.
    """
    row_ids = []
    birds = []

    current_file_name = None
    current_images = []
    for _, row in df.iterrows():
        filename = row['audio_id']

        if filename != current_file_name:
            current_file_name = filename
            current_images = file_to_imgs(audio_dir + filename + '.mp3')
            print(filename)

        if row['site'] == 'site_3':
            preds = predict_imgs(net, class_names, current_images)
        elif current_images:
            preds = predict_img(net, class_names, current_images.pop(0))
        else:
            # The file produced fewer images than it has rows: report no call
            # for this row instead of raising IndexError on an empty list.
            preds = []

        row_ids.append(row['row_id'])
        birds.append(preds)

    return pd.DataFrame({
        'row_id': row_ids,
        'birds': ['nocall' if len(p) == 0 else ' '.join(p) for p in birds]
    })

In [None]:
# Test metadata; `predict_df` reads its 'row_id', 'audio_id' and 'site' columns.
test_df = pd.read_csv('../input/birdsong-recognition/test.csv')

In [None]:
# Display the loaded test metadata for a quick visual check.
test_df

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Best-effort prediction: if anything goes wrong, still produce a valid
# submission frame by marking every row of the sample submission as 'error'.
try:
    sub_df = predict_df(test_df)
except Exception as exc:
    # Catch Exception rather than everything so KeyboardInterrupt/SystemExit
    # still propagate, and report the cause instead of hiding it.
    print('predict_df failed, writing fallback submission:', repr(exc))
    sub_df = pd.read_csv('../input/birdsong-recognition/sample_submission.csv')
    sub_df['birds'] = 'error'

In [None]:
# Write the submission file to the working directory, without the index column.
sub_df.to_csv('submission.csv', index=False)