In [None]:
! pip install efficientnet_pytorch
! pip install prefetch-generator
! pip install torchaudio_augmentations

In [1]:
import optuna
from optuna.trial import TrialState
import os
import pandas as pd
pd.options.mode.chained_assignment = None # avoids assignment warning
import numpy as np
import random
from glob import glob
from tqdm import tqdm
tqdm.pandas()  # enable progress bars in pandas operations
import gc

import librosa
import sklearn
import json
import argparse

# Import for visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import librosa.display as lid
import IPython.display as ipd

# from kaggle_datasets import KaggleDatasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers

from transformers import set_seed
from transformers import ASTFeatureExtractor
from transformers import ASTPreTrainedModel, ASTModel, AutoConfig, ASTConfig

  from .autonotebook import tqdm as notebook_tqdm
2023-04-18 17:43:23.687438: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-18 17:43:23.721228: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import os

class CFG:
    """Static configuration namespace shared by every cell of this notebook.

    All values are plain class attributes evaluated once, top to bottom, when
    the class statement runs.  Some attributes are derived from earlier ones
    (``audio_len``, ``hop_length``), and ``class_names`` is read from disk with
    ``os.listdir``, so defining the class requires the dataset directory to
    exist.  Several cells read (and one cell mutates) these attributes directly
    as ``CFG.<name>``.
    """
    # Debugging
    debug = False
    
    # Plot training history
    training_plot = True
    
    # Weights and Biases logging
    wandb = True
    competition   = 'birdclef-2023' 
    _wandb_kernel = 'awsaf49'
    
    # Experiment name and comment
    exp_name = 'baseline-v2'
    comment = 'EfficientNetB0|FSR|t=10s|128x384|up_thr=50|cv_filter'
    
    # Notebook link
    notebook_link = 'https://www.kaggle.com/awsaf49/birdclef23-effnet-fsr-cutmixup-train/edit'
    
    # Verbosity level
    verbose = 0
    
    # Device and random seed
    # NOTE(review): `device` is not read by the cells visible here, which build
    # their own torch `device` object -- confirm whether it is still needed.
    device = 'TPU-VM'
    seed = 42
    
    # Input image size and batch size
    img_size = [128, 384]
    batch_size = 32
    upsample_thr = 50 # min sample of each class (upsample)
    cv_filter = True # always keeps low sample data in train
    
    # Inference batch size, test time augmentation, and drop remainder
    infer_bs = 2
    tta = 1
    drop_remainder = True
    
    # Number of epochs, model name, and number of folds
    epochs = 25
    model_name = 'EfficientNetB0'
    fsr = True # reduce stride of stem block
    num_fold = 5
    
    # Selected folds for training and evaluation
    selected_folds = [0]

    # Pretraining, neck features, and final activation function
    pretrain = 'imagenet'
    neck_features = 0
    final_act = 'softmax'
    
    # Learning rate, optimizer, and scheduler
    lr = 1e-3
    scheduler = 'cos'
    optimizer = 'Adam' # AdamW, Adam
    
    # Loss function and label smoothing
    # The model classes in this notebook apply a sigmoid to their logits only
    # when loss == 'BCE'; otherwise they return raw logits.
    loss = 'BCE' # BCE, CCE
    
    label_smoothing = 0.05 # label smoothing
    
    # Audio duration, sample rate, and length
    duration = 10 # second
    sample_rate = 32000
    target_rate = 8000
    audio_len = duration*sample_rate  # number of samples in one clip
    
    # STFT parameters
    nfft = 2048
    window = 2048
    # Integer hop derived from the clip length and the image width
    # (320000 // 383 == 835 with the values above).
    hop_length = audio_len // (img_size[1] - 1)
    fmin = 20
    fmax = 16000
    normalize = True
    
    # Data augmentation parameters
    augment=True
    
    # Spec augment
    spec_augment_prob = 0.80
    
    mixup_prob = 0.65
    mixup_alpha = 0.5
    
    cutmix_prob = 0.0
    cutmix_alpha = 0.5
    
    mask_prob = 0.65
    freq_mask = 20
    time_mask = 30


    # Audio Augmentation Settings
    audio_augment_prob = 0.5
    
    timeshift_prob = 0.0
    
    gn_prob = 0.35

    # Data Preprocessing Settings
    # Prefer the Kaggle input directory; fall back to the server copy when the
    # Kaggle path does not exist on this machine.
    base_path = '/kaggle/input/birdclef-2023'  # for server: base_path = '/data/zjh_data/program/ml_project_birdclef23/birdclef-2023'
    if not os.path.exists(base_path):
        base_path = '/data/zjh_data/program/ml_project_birdclef23/birdclef-2023'
    # One class per entry of <base_path>/train_audio; sorted so that the
    # integer label assigned to each name is deterministic.
    class_names = sorted(os.listdir('{}/train_audio'.format(base_path)))
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    label2name = dict(zip(class_labels, class_names))
    name2label = {v:k for k,v in label2name.items()}

    # Training Settings
    target_col = ['target']
    tab_cols = ['filename']
    monitor = 'auc'
    
    ### added by plathzheng
    unilm_model_path = './pretrained_models/unilm/BEATs_iter3_plus_AS2M.pt'
    use_apex = True
    time_length = 10  # duration of the audio segment cropped during training for the BEATs model
    # ASTagModel freezes every encoder layer whose index is <= ast_fix_layer and
    # every parameter outside the encoder layers.
    # NOTE(review): the original note said "layer < ast_fix_layer" and a valid
    # range of [0, 5]; the code compares with <=, so confirm the intended range.
    ast_fix_layer = 9

In [4]:
# Seed the random number generators through the `set_seed` helper imported
# from transformers so repeated runs are reproducible.
set_seed(CFG.seed)
# NOTE(review): the CUDA auto-detection on the next line is overridden by the
# unconditional assignment right after it, so every later cell runs on CPU.
# Presumably intentional for a CPU-only submission -- confirm, and drop the
# dead line if so.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
device = torch.device('cpu')

In [5]:
# Collect the test soundscapes to score.
# * The directory is derived from CFG.base_path so the Kaggle/server fallback
#   configured in CFG also applies here (on Kaggle the path is unchanged).
# * The pattern is '*.ogg' (with the dot) so only files with that extension match.
# * glob returns files in arbitrary order; sorting makes the row order of the
#   predictions deterministic across runs.
test_paths = sorted(glob(os.path.join(CFG.base_path, 'test_soundscapes', '*.ogg')))
test_df = pd.DataFrame(test_paths, columns=['filepath'])
# File name without directory and extension, e.g. 'soundscape_29201'.
test_df['filename'] = test_df.filepath.map(lambda p: os.path.splitext(os.path.basename(p))[0])
test_df.head()

Unnamed: 0,filepath,filename
0,/kaggle/input/birdclef-2023/test_soundscapes/s...,soundscape_29201


In [7]:

# Shared feature extractor, built with the library defaults (no arguments are
# passed).  It is called below with sampling_rate=16000 and its 'input_values'
# entry is what gets fed to the AST model.
ast_feature_extractor = ASTFeatureExtractor()

class ASTDataset(Dataset):
    """Dataset that yields ``(path, features)`` pairs for the AST model.

    Args:
        df: object exposing a ``filepath`` column (``df.filepath.values``)
            with one audio file path per row.

    Each item loads the whole file as mono audio at ``CFG.sample_rate``,
    resamples it to 16 kHz and runs it through the module-level
    ``ast_feature_extractor``.
    """

    def __init__(self, df):
        self.paths = df.filepath.values

    def __getitem__(self, idx):
        """Return ``(path, input_values)`` for the file at position ``idx``."""
        path = self.paths[idx]
        raw, _ = librosa.load(path, sr=CFG.sample_rate, mono=True)
        # AST can only process audio sampled at 16 kHz.
        raw = librosa.resample(raw, orig_sr=CFG.sample_rate, target_sr=16000)
        # Fixed: the keyword was misspelled `return_tesnors`, so it was ignored
        # by the extractor and no tensor conversion was requested.
        inputs = ast_feature_extractor(raw, sampling_rate=16000, return_tensors='pt')
        # The extractor returns a batch of one; drop the batch dimension.
        return path, inputs['input_values'][0]

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.paths)

# Wrap the test soundscapes in a dataset / loader for batched AST inference.
dataset_eval = ASTDataset(test_df)
# NOTE(review): this mutates the shared CFG in place, so every later cell sees
# debug=True.  Its visible effect here is to force num_workers=0 on the next line.
CFG.debug=True
# shuffle=False and drop_last=False keep every file, in the order of test_df.
loader = DataLoader(dataset_eval, batch_size=CFG.batch_size, shuffle=False, drop_last=False, num_workers=0 if CFG.debug else 10)

In [6]:
class DenseLayer(nn.Module):
    """Classification head: layer normalisation followed by a linear projection.

    Args:
        config: object providing ``hidden_size`` (input width) and
            ``layer_norm_eps`` (epsilon of the layer norm).
        output_dim: number of output features of the projection.
    """

    def __init__(self, config, output_dim):
        super().__init__()
        width = config.hidden_size
        self.layernorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dense = nn.Linear(width, output_dim)

    def forward(self, hidden_state):
        """Normalise ``hidden_state`` over its last dimension, then project it."""
        normalised = self.layernorm(hidden_state)
        return self.dense(normalised)
    
class ASTagModel(ASTPreTrainedModel):
    """AST backbone with a mean-pooled ``DenseLayer`` classification head.

    The backbone is partially frozen at construction time: every parameter
    outside the encoder layers is frozen, and so is every encoder layer whose
    index is ``<= CFG.ast_fix_layer``.  The head outputs ``CFG.num_classes``
    values per example.
    """

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.audio_spectrogram_transformer = ASTModel(config)

        # Freezing happens before the head is created, so the head's own
        # parameters are never touched by this loop.
        for param_name, param in self.named_parameters():
            inside_encoder_layer = 'layer.' in param_name
            # The layer index is the fourth dot-separated component of the name.
            if inside_encoder_layer and int(param_name.split('.')[3]) > CFG.ast_fix_layer:
                continue  # layers above the threshold keep requires_grad as is
            param.requires_grad = False

        self.linear = DenseLayer(config, CFG.num_classes)
        self.n_class = CFG.num_classes

    def forward(self, input_values):
        """Return sigmoid probabilities when ``CFG.loss == 'BCE'``, else raw logits."""
        encoded = self.audio_spectrogram_transformer(input_values)
        # Mean over dim 1 of the last hidden state is used as the pooled
        # representation (the backbone's pooler_output is not used).
        pooled = encoded.last_hidden_state.mean(dim=1)
        logits = self.linear(pooled)
        if CFG.loss == 'BCE':
            return torch.sigmoid(logits)
        return logits

In [18]:
# Build the AST tagging model from a default ASTConfig; its weights come from
# the checkpoint loaded below, not from a pretrained download.
config = ASTConfig() 
model = ASTagModel(config=config)

# The checkpoint is passed straight to load_state_dict, so it is expected to be
# a state dict saved from the same architecture.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
ckpt = torch.load('experiments/ast/trial_0/ast.pth', map_location=device)
model.load_state_dict(ckpt)
model = model.to(device)

In [19]:
def file_load_chunk(audio_path, duration=5, sr=32000, target_sr=16000):
    """Load an audio file and split it into consecutive, non-overlapping chunks.

    Args:
        audio_path: path of the audio file to read.
        duration: chunk length in seconds.
        sr: sample rate used when decoding the file.  Previously this argument
            was ignored (and overwritten); the default equals the value of
            ``CFG.sample_rate`` that was used instead, so default calls behave
            as before.
        target_sr: sample rate the audio is resampled to before chunking
            (AST can only process 16 kHz audio, hence the default).

    Returns:
        A 2-D array of shape ``(n_chunks, duration * target_sr)``.  A trailing
        remainder shorter than one chunk is dropped.  Audio shorter than a
        single chunk is zero-padded to exactly one chunk instead of making
        ``librosa.util.frame`` raise.
    """
    raw, _ = librosa.load(audio_path, sr=sr, mono=True)
    raw = librosa.resample(raw, orig_sr=sr, target_sr=target_sr)
    frame_length = int(duration * target_sr)
    if len(raw) < frame_length:
        # librosa.util.frame rejects inputs shorter than frame_length.
        raw = np.pad(raw, (0, frame_length - len(raw)))
    # hop == frame length -> chunks neither overlap nor leave gaps.
    return librosa.util.frame(raw, frame_length=frame_length, hop_length=frame_length, axis=0)


# Run the AST model over every test soundscape and collect one probability row
# per chunk.  Produces `ids` (row ids) and `preds` (numpy array) for submission.
chunk_seconds = 5  # chunk length; also the step used to build the row ids

model.eval()
batch_probs = []  # one (n_chunks, num_classes) tensor per file
ids = []
for filepath in tqdm(test_df.filepath.tolist()):
    filename = os.path.splitext(os.path.basename(filepath))[0]
    chunks = file_load_chunk(filepath, duration=chunk_seconds)

    # Pass a list of float32 arrays instead of nested Python lists (much cheaper
    # to build).  np.array makes a writable copy of the framed view.
    waves = list(np.array(chunks, dtype=np.float32))
    # Fixed: the keyword was misspelled `return_tesnors`, so it was ignored and
    # the result had to be stacked and converted by hand.
    inputs = ast_feature_extractor(waves, sampling_rate=16000, return_tensors='pt')
    inputs = inputs['input_values'].to(device)
    with torch.no_grad():
        prob = model(inputs)
    # Accumulate in a list and concatenate once at the end, instead of seeding
    # the stack with a random dummy row and re-concatenating for every file.
    batch_probs.append(prob)
    # Row id = <file name>_<end time of the chunk in seconds>.
    ids += [f'{filename}_{(frame_id + 1) * chunk_seconds}' for frame_id in range(len(chunks))]

if batch_probs:
    pred_stack = torch.cat(batch_probs, dim=0)
else:
    # No test files: keep the (0, num_classes) shape the submission cell expects.
    pred_stack = torch.empty((0, CFG.num_classes), device=device)
preds = pred_stack.detach().cpu().numpy()
    

100%|██████████| 1/1 [01:47<00:00, 107.42s/it]


In [11]:
# Submit prediction
pred_df = pd.DataFrame(ids, columns=['row_id'])
pred_df.loc[:, CFG.class_names] = preds
pred_df.to_csv('submission.csv',index=False)
pred_df.head()

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.001188,0.008082,0.060659,0.004431,0.023877,0.029688,0.060907,0.000638,0.000295,...,0.013793,0.002948,0.023256,0.004431,0.003432,0.013142,0.137708,0.002249,0.016115,0.004742
1,soundscape_29201_10,0.001139,0.008449,0.061043,0.004339,0.02388,0.03062,0.063125,0.000606,0.000286,...,0.014135,0.002871,0.0238,0.004516,0.003359,0.013473,0.144527,0.002333,0.015734,0.0046
2,soundscape_29201_15,0.001139,0.008284,0.061248,0.004292,0.023421,0.030652,0.062929,0.000604,0.000284,...,0.014275,0.002887,0.023936,0.004503,0.003334,0.013365,0.143368,0.00235,0.015586,0.004561
3,soundscape_29201_20,0.00113,0.008335,0.060825,0.004328,0.023678,0.030282,0.062363,0.000611,0.000289,...,0.014226,0.002891,0.023714,0.004505,0.003416,0.013452,0.143186,0.002365,0.015755,0.004628
4,soundscape_29201_25,0.001148,0.008453,0.060764,0.004335,0.023844,0.030457,0.063132,0.000617,0.000291,...,0.014084,0.002899,0.023841,0.004527,0.0034,0.013581,0.143059,0.002378,0.015701,0.004613


In [8]:
from optuna_utils.modules import *
from optuna_utils.efficientnet_pytorch import EfficientNet

class Efficient(nn.Module):
    """EfficientNet-B0 classifier with ``CFG.num_classes`` outputs.

    Only ``self.model`` (the EfficientNet backbone with a replaced final
    linear layer) takes part in ``forward``.  The remaining sub-modules
    (``spec_bn``, the "Pons" front-end/back-end and the dense stack) are
    registered but not used by ``forward``; they still contribute keys to the
    module's ``state_dict``.

    Args:
        dataset: ``'msd'`` selects the wider back-end/dense sizes, any other
            value the narrower ones.
    """

    def __init__(self, dataset='mtat'):
        super().__init__()
        is_msd = dataset == 'msd'

        # Backbone: swap the final layer for one sized to our classes.
        self.model = EfficientNet.from_pretrained('efficientnet-b0')
        self.model._fc = nn.Linear(
            in_features=self.model._fc.in_features,
            out_features=CFG.num_classes,
            bias=True,
        )

        self.spec_bn = nn.BatchNorm2d(3)

        # Pons front-end
        self.layers = nn.ModuleList([
            Conv_V(3, 204, (int(0.7*96), 7)),
            Conv_V(3, 204, (int(0.4*96), 7)),
            Conv_H(3, 51, 129),
            Conv_H(3, 51, 65),
            Conv_H(3, 51, 33),
        ])

        # Pons back-end
        backend_channel = 512 if is_msd else 64
        self.layer1 = Conv_1d(561, backend_channel, 7, 1, 1)
        self.layer2 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)
        self.layer3 = Conv_1d(backend_channel, backend_channel, 7, 1, 1)

        # Dense
        dense_channel = 500 if is_msd else 200
        dense_in = (561 + (backend_channel * 3)) * 2
        self.dense1 = nn.Linear(dense_in, dense_channel)
        self.bn = nn.BatchNorm1d(dense_channel)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.dense2 = nn.Linear(dense_channel, CFG.num_classes)

    def forward(self, audio):
        """Return sigmoid probabilities when ``CFG.loss == 'BCE'``, else raw logits."""
        logits = self.model(audio)
        if CFG.loss == 'BCE':
            return torch.sigmoid(logits)
        return logits
# Alternative (disabled) loading path: build Efficient() and load a state dict.
# from efficientnet_pytorch import EfficientNet
# model = Efficient()
# ckpt = torch.load('experiments/efficient/trial_14/efficient.pth', map_location=device)
# model.load_state_dict(ckpt)
# model = model.to(device)
# Active path: this file holds a whole serialized model object (it is used as a
# module right away), not a state dict.  This rebinds the global `model`.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = torch.load('experiments/efficient_save/trial_0/efficient.pth', map_location='cpu')
model = model.to(device)
# Re-export just the weights as a state dict in the working directory.
torch.save(model.state_dict(), 'efficient.pth')

In [7]:
import torchaudio
def file_load_chunk(audio_path, duration=5, sr=32000, target_sr=16000):
    """Load an audio file and return one 3-channel mel spectrogram per chunk.

    NOTE: this redefines the ``file_load_chunk`` declared in an earlier cell
    (which returns raw waveform chunks); whichever cell ran last wins.

    Args:
        audio_path: path of the audio file to read.
        duration: chunk length in seconds.
        sr: sample rate used when decoding the file.  Previously this argument
            was ignored (and overwritten); the default equals the value of
            ``CFG.sample_rate`` that was used instead, so default calls behave
            as before.
        target_sr: sample rate the audio is resampled to before chunking.

    Returns:
        A list with one tensor of shape ``(3, CFG.img_size[0], n_frames)`` per
        non-overlapping chunk: the mel spectrogram repeated over 3 channels.
        A trailing remainder shorter than one chunk is dropped.  Audio shorter
        than a single chunk is zero-padded to exactly one chunk instead of
        making ``librosa.util.frame`` raise.
    """
    raw, _ = librosa.load(audio_path, sr=sr, mono=True)
    raw = librosa.resample(raw, orig_sr=sr, target_sr=target_sr)
    frame_length = int(duration * target_sr)
    if len(raw) < frame_length:
        # librosa.util.frame rejects inputs shorter than frame_length.
        raw = np.pad(raw, (0, frame_length - len(raw)))
    frames = librosa.util.frame(raw, frame_length=frame_length, hop_length=frame_length, axis=0)

    # Build the transform once instead of once per chunk.
    # NOTE(review): the waveform is at `target_sr` (16 kHz by default) while the
    # mel settings below are the CFG values for 32 kHz audio (sample_rate=32000,
    # f_max=16000).  Left unchanged because predictions must match whatever the
    # training pipeline did -- confirm against the training code.
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=CFG.sample_rate,
        n_fft=CFG.nfft,
        win_length=CFG.window,
        hop_length=CFG.hop_length,
        f_min=CFG.fmin,
        f_max=CFG.fmax,
        n_mels=CFG.img_size[0],
    )
    # One float32 copy of all chunks instead of converting each chunk to a
    # Python list of floats.
    waves = torch.from_numpy(np.array(frames, dtype=np.float32))
    # (n_mels, T) -> (1, n_mels, T) -> (3, n_mels, T): grey image to 3 channels.
    return [to_mel(wave).unsqueeze(0).repeat(3, 1, 1) for wave in waves]


# Run the spectrogram model over every test soundscape and collect one
# probability row per chunk.  Produces `ids` (row ids) and `preds` (numpy array).
chunk_seconds = 5  # chunk length; also the step used to build the row ids

model.eval()
batch_probs = []  # one (n_chunks, num_classes) tensor per file
ids = []
for filepath in tqdm(test_df.filepath.tolist()):
    filename = os.path.splitext(os.path.basename(filepath))[0]
    chunks = file_load_chunk(filepath, duration=chunk_seconds)
    # Stacking the per-chunk (3, n_mels, T) tensors always yields a 4-D batch,
    # so the former "unsqueeze if 2-D" branch could never trigger.
    inputs = torch.stack(chunks).to(device)
    with torch.no_grad():
        prob = model(inputs)
    # Accumulate in a list and concatenate once at the end, instead of seeding
    # the stack with a random dummy row and re-concatenating for every file.
    batch_probs.append(prob)
    # Row id = <file name>_<end time of the chunk in seconds>.
    ids += [f'{filename}_{(frame_id + 1) * chunk_seconds}' for frame_id in range(len(inputs))]

if batch_probs:
    pred_stack = torch.cat(batch_probs, dim=0)
else:
    # No test files: keep the (0, num_classes) shape the submission cell expects.
    pred_stack = torch.empty((0, CFG.num_classes), device=device)
preds = pred_stack.detach().cpu().numpy()

100%|██████████| 1/1 [00:02<00:00,  2.96s/it]


In [None]:
from optuna_utils.utils import *

def eval(model, loader):
    """Score ``model`` on every batch of ``loader``.

    NOTE: the name shadows Python's built-in ``eval``; it is kept so existing
    callers keep working.

    Args:
        model: module called as ``model(audio)``; it is switched to eval mode.
        loader: iterable of batches that ``set_device`` turns into an
            ``(audio, label)`` pair.

    Returns:
        The ``(acc, auc)`` pair computed by ``measurement`` from the stacked
        labels and predictions.
    """
    model.eval()
    label_stack = []
    prob_batches = []
    for batch in loader:
        audio, label = set_device(batch, device)
        with torch.no_grad():
            prob = model(audio)
        label_stack += label.cpu().numpy().tolist()
        # Collect per-batch outputs on the CPU and concatenate once at the end,
        # instead of seeding the stack with a random dummy row and growing it
        # on the device with one torch.cat per batch.
        prob_batches.append(prob.detach().float().cpu())
    if prob_batches:
        pred_stack = torch.cat(prob_batches, dim=0).numpy()
    else:
        # Empty loader: same (0, num_classes) float32 array as before.
        pred_stack = np.empty((0, CFG.num_classes), dtype=np.float32)
    label_stack = np.array(label_stack)
    acc, auc = measurement(label_stack, pred_stack)
    return acc, auc

# Build the model and evaluation loader, then report accuracy and AUC.
# NOTE(review): `args` and `df` are not defined in any cell visible here, and
# `load_data_model` presumably comes from the star import above -- this cell
# has no execution count and will fail on a fresh kernel unless they are
# provided.  It also rebinds the global `model`.
model, loader_eval = load_data_model(args, df)
acc, auc = eval(model, loader_eval)
print(acc, auc)