In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os
import librosa
from pathlib import Path
import torch
from torch import nn
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Absolute path on the author's machine; nothing in the visible cells reads it.
LOCAL_FOLDER = Path('/Users/julian/Downloads/birds/birdsong-recognition/')

# Model artefacts: use the Kaggle input folder when it exists, else ./models.
KAGGLE_MODEL_FOLDER = Path('../input/birdscall-model/')
if os.path.exists(KAGGLE_MODEL_FOLDER):
    MODEL_FOLDER = KAGGLE_MODEL_FOLDER
else:
    MODEL_FOLDER = Path('./models')

# Test data: use the Kaggle input folder only when it actually contains
# test_audio, otherwise fall back to the local ./test_check folder.
if os.path.exists('../input/birdsong-recognition/test_audio'):
    TEST_PATH = Path('../input/birdsong-recognition')
else:
    TEST_PATH = Path('./test_check/')
TEST_AUDIO_PATH = TEST_PATH / 'test_audio'

In [4]:
# Show which locations were resolved (Kaggle inputs vs. local fallbacks).
print(TEST_AUDIO_PATH, MODEL_FOLDER, TEST_PATH)

test_check/test_audio models test_check


In [5]:
# Test metadata; later cells read its site, row_id, seconds and audio_id columns.
test_df = pd.read_csv(TEST_PATH/'test.csv')

In [6]:
# Peek at the site_3 rows (the output below shows an empty `seconds` column for them).
test_df[test_df['site']=='site_3'].head(50)

Unnamed: 0,site,row_id,seconds,audio_id
71,site_3,site_3_9cc5d9646f344f1bbb52640a988fe902,,9cc5d9646f344f1bbb52640a988fe902
72,site_3,site_3_a56e20a518684688a9952add8a9d5213,,a56e20a518684688a9952add8a9d5213
73,site_3,site_3_96779836288745728306903d54e264dd,,96779836288745728306903d54e264dd
74,site_3,site_3_f77783ba4c6641bc918b034a18c23e53,,f77783ba4c6641bc918b034a18c23e53
75,site_3,site_3_856b194b097441958697c2bcd1f63982,,856b194b097441958697c2bcd1f63982


In [7]:
def get_pytorch_model_all_conv(window_size=1024, resnet='resnet18', pretrained=True, n_classes=10, init_fourier=False, train_fourier=False):
    """Build a raw-waveform classifier: two 1-D conv banks feeding a ResNet trunk.

    The front end applies two ``nn.Conv1d`` layers (``cos`` and ``sin``) with
    ``window_size // 2`` filters, kernel ``window_size`` and stride
    ``window_size // 4``. Their squared outputs are summed, converted with
    ``10 * log10(. + 1e-10)`` and rescaled by ``(x + 60) / 120``. The resulting
    map is repeated over three channels and passed to the hub model with its
    last child removed, followed by a 1x1 ``nn.Conv2d`` producing ``n_classes``
    outputs.

    Args:
        window_size: kernel size of the front-end convolutions.
        resnet: model name passed to ``torch.hub.load('pytorch/vision:v0.6.0', ...)``.
        pretrained: forwarded to ``torch.hub.load``.
        n_classes: number of output channels of the final 1x1 convolution.
        init_fourier: when true, the conv weights are overwritten with the
            arrays returned by ``get_fourier_weights(window_size)``.
        train_fourier: when false, the ``weight`` tensors of ``cos`` and ``sin``
            are frozen (their biases stay trainable).

    Returns:
        An ``nn.Module`` whose ``forward(x)`` takes a ``(batch, 1, samples)``
        tensor and returns ``(x_spec, out)``: the rescaled front-end map of
        shape ``(batch, window_size // 2, frames)`` and the flattened head
        output (``(batch, n_classes)`` when the trunk ends in global pooling,
        as the torchvision ResNets do).
    """
    kernel_size = window_size
    stride = kernel_size // 4
    filters = kernel_size // 2

    model_resnet = torch.hub.load('pytorch/vision:v0.6.0', resnet, pretrained=pretrained)

    # Read the trunk's feature width from its final linear layer when it has one;
    # the name-based guess (512 for resnet18, 2048 otherwise) is wrong for e.g.
    # resnet34 and is kept only as a fallback.
    fc_layer = getattr(model_resnet, 'fc', None)
    if isinstance(fc_layer, nn.Linear):
        linear_inp = fc_layer.in_features
    elif resnet == 'resnet18':
        linear_inp = 512
    else:
        linear_inp = 2048

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.cos = nn.Conv1d(1, filters, kernel_size, stride=stride)
            self.sin = nn.Conv1d(1, filters, kernel_size, stride=stride)
            if init_fourier:
                # NOTE(review): get_fourier_weights is not defined in the visible
                # part of this notebook; it must be in scope for init_fourier=True.
                cos_weights, sin_weights = get_fourier_weights(window_size)
                self.cos.weight.data = torch.from_numpy(cos_weights.reshape(cos_weights.shape[0], 1, cos_weights.shape[1])).float()
                self.sin.weight.data = torch.from_numpy(sin_weights.reshape(sin_weights.shape[0], 1, sin_weights.shape[1])).float()
            # Drop the last child of the hub model (its classification layer).
            self.resnet = nn.Sequential(*list(model_resnet.children())[:-1])
            self.conv_out = nn.Conv2d(linear_inp, n_classes, 1)

        def forward(self, x):
            min_power = 1e-10  # keeps log10 finite when the summed squares are 0
            x_spec = 10 * torch.log10(self.cos(x) ** 2 + self.sin(x) ** 2 + min_power)
            x_spec = (x_spec + 60) / 120
            # One row per filter. This was hard-coded to 512, which only matches
            # window_size == 1024.
            x = torch.reshape(x_spec, (len(x_spec), 1, filters, -1))
            # Repeat the single-channel map three times along the channel axis.
            x = torch.cat([x, x, x], dim=1)
            x = self.resnet(x)
            x = self.conv_out(x).flatten(start_dim=1)
            return x_spec, x

    model = Net()
    if not train_fourier:
        model.cos.weight.requires_grad = False
        model.sin.weight.requires_grad = False
    return model

In [8]:
# Rebuild the architecture without pretrained weights, then restore the
# checkpoint stored in MODEL_FOLDER and switch to inference mode.
window_size = 1024
n_classes = 264
model = get_pytorch_model_all_conv(
    window_size,
    resnet='resnet18',
    pretrained=False,
    n_classes=n_classes,
    init_fourier=False,
    train_fourier=False,
).to(device)

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load(
    MODEL_FOLDER/f'model_ambient_{n_classes}_0.627.pth',
    map_location=torch.device('cpu'),
)

model.load_state_dict(state_dict)
_ = model.eval()

Using cache found in /Users/julian/.cache/torch/hub/pytorch_vision_v0.6.0


In [9]:
# Array of class names stored next to the model; get_classes indexes into it
# to turn positive model outputs into names.
classes = np.load(MODEL_FOLDER/f'classes_{n_classes}.npy')
# print(classes)

In [10]:
class AudioDataset(Dataset):
    """Fixed-length, normalised windows cut from one in-memory recording.

    Args:
        items: sequence of 3-tuples; only the third element (window start, in
            seconds) is used by this class.
        classes: stored as ``self.vocab``; not otherwise used here.
        rec: 1-D numpy array holding the whole waveform. Its global mean and
            standard deviation are used to normalise every window.
        sample_rate: samples per second. ``None`` (default) falls back to the
            module-level ``SAMPLE_RATE``, looked up when an item is fetched.
        duration: window length in seconds (default 5).
    """
    def __init__(self, items, classes, rec, sample_rate=None, duration=5):
        self.items = items
        self.vocab = classes
        self.mean = rec.mean()
        self.std = rec.std()
        self.rec = rec
        self.sample_rate = sample_rate
        self.duration = duration

    def __getitem__(self, idx):
        """Return window ``idx`` as a float32 array of ``duration * sample_rate`` samples."""
        _, _, start = self.items[idx]
        sample_rate = SAMPLE_RATE if self.sample_rate is None else self.sample_rate
        window_len = self.duration * sample_rate
        first = start * sample_rate
        x = self.normalize(self.rec[first:first + window_len]).astype(np.float32)
        # A window that runs past the end of the recording would come back short
        # and break batch collation; pad it with zeros (the normalised mean).
        missing = window_len - len(x)
        if missing > 0:
            x = np.pad(x, (0, missing), mode='constant')
        return x

    def normalize(self, x):
        """Standardise ``x`` with the recording-wide mean and std."""
        # Guard against a constant (e.g. silent) recording, whose std is 0.
        scale = self.std if self.std > 0 else 1.0
        return (x - self.mean) / scale

    def __len__(self):
        return len(self.items)

In [11]:
# Audio settings shared by the dataset and the prediction functions.
SAMPLE_RATE = 22050  # sampling rate passed to librosa.load (sr=...)
duration = 5  # seconds of audio per model input window
res_type = 'kaiser_best'  # resampling mode passed to librosa.load (res_type=...)

In [12]:
def get_classes(preds, class_names=None, threshold=0):
    """Turn per-row model outputs into space-separated class-name strings.

    Args:
        preds: 2-D array-like (numpy array or torch tensor), one row per
            prediction and one column per class.
        class_names: sequence of names, one per column. ``None`` (default)
            uses the module-level ``classes`` array.
        threshold: a class is reported when its value is strictly greater
            than this (default 0).

    Returns:
        A list with one string per row: the selected names joined by single
        spaces, or ``'nocall'`` when no value exceeds the threshold.
    """
    if class_names is None:
        class_names = classes
    class_names = np.asarray(class_names)
    if hasattr(preds, 'detach'):
        # torch tensor: move to host memory before handing it to numpy
        preds = preds.detach().cpu().numpy()

    all_preds = []
    for row in preds:
        row_birds = class_names[np.where(np.asarray(row) > threshold)]
        all_preds.append(' '.join(row_birds) if len(row_birds) else 'nocall')
    return all_preds

In [13]:
def predict_sites_1_2():
    """Predict birds for every row of the site_1 / site_2 recordings.

    For each distinct ``audio_id`` on those sites, the mp3 is loaded once and
    one window is cut per test row. ``seconds`` marks the end of a window, so
    it starts ``duration`` seconds earlier. Windows are run through ``model``
    in batches of 128 and decoded with ``get_classes``.

    Relies on the notebook globals ``test_df``, ``TEST_AUDIO_PATH``,
    ``SAMPLE_RATE``, ``duration``, ``res_type``, ``classes``, ``model`` and
    ``device``.

    Returns:
        ``(row_ids, results)``: two index-aligned lists holding the row ids
        and the predicted label strings.
    """
    row_ids = []
    results = []
    site_rows = test_df[test_df.site.isin(['site_1', 'site_2'])]
    for audio_id in site_rows.audio_id.unique():
        audio_rows = test_df[test_df.audio_id == audio_id]
        items = [
            (row.row_id, row.audio_id, int(row.seconds) - duration)
            for row in audio_rows.itertuples(index=False)
        ]
        # Load the full recording once; the dataset slices windows out of it.
        rec = librosa.load(TEST_AUDIO_PATH/f'{audio_id}.mp3', sr=SAMPLE_RATE, res_type=res_type)[0]
        test_ds = AudioDataset(items, classes, rec)
        dl = DataLoader(test_ds, batch_size=128)
        with torch.no_grad():
            for batch in dl:
                _, preds = model(batch.reshape(-1, 1, SAMPLE_RATE*duration).to(device))
                results.extend(get_classes(preds.detach().cpu()))
        row_ids.extend(item[0] for item in items)
    return row_ids, results

In [14]:
%%time
# Run inference for the site_1 / site_2 recordings (timed by the %%time magic above).
row_ids_12, results_12 = predict_sites_1_2()



CPU times: user 26.9 s, sys: 2.02 s, total: 29 s
Wall time: 29.2 s


In [26]:
def predict_site_3():
    """Predict one label string per site_3 recording.

    Each recording is standardised with its own mean and std, cut into
    consecutive ``duration``-second chunks (a trailing partial chunk is
    dropped) and run through ``model`` as one batch. The class names found in
    any chunk are merged; ``'nocall'`` is kept only when no chunk produced a
    class.

    Relies on the notebook globals ``test_df``, ``TEST_AUDIO_PATH``,
    ``SAMPLE_RATE``, ``duration``, ``res_type``, ``model`` and ``device``.

    Returns:
        ``(row_ids, results)``: two index-aligned lists holding the first
        ``row_id`` of each site_3 ``audio_id`` and its space-separated, sorted
        label string.
    """
    row_ids = []
    results = []
    chunk_len = SAMPLE_RATE * duration
    for audio_id in test_df[test_df.site.isin(['site_3'])].audio_id.unique():
        row_id = test_df[test_df.audio_id == audio_id]['row_id'].values[0]
        row_ids.append(row_id)
        rec = librosa.load(TEST_AUDIO_PATH/f'{audio_id}.mp3', sr=SAMPLE_RATE, res_type=res_type)[0]
        # Guard against a constant (e.g. silent) recording, whose std is 0.
        std = rec.std()
        rec = (rec - rec.mean()) / (std if std > 0 else 1.0)
        chunks = len(rec) // chunk_len
        if chunks == 0:
            # Shorter than one window: zero-pad to a single full chunk instead of
            # failing in the reshape below.
            rec = np.pad(rec, (0, chunk_len - len(rec)), mode='constant')
            chunks = 1
        batch = torch.from_numpy(rec[:chunks * chunk_len].reshape(-1, 1, chunk_len))
        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            _, preds = model(batch.to(device))
        found = set()
        for pr_cl in get_classes(preds.detach().cpu()):
            found.update(pr_cl.split(' '))
        # 'nocall' from individual chunks is dropped once any chunk found a class.
        if len(found) > 1:
            found.discard('nocall')
        results.append(' '.join(sorted(found)))
    return row_ids, results

In [27]:
%%time
# Run inference for the site_3 recordings (timed by the %%time magic above).
row_ids_3, results_3 = predict_site_3()



CPU times: user 25.7 s, sys: 3.16 s, total: 28.8 s
Wall time: 29.2 s


In [28]:
# Concatenate the per-site outputs in the same order so ids and labels stay aligned.
row_ids = row_ids_12 + row_ids_3
results = results_12 + results_3

In [29]:
# One row per predicted row_id.
predicted = pd.DataFrame({'row_id': row_ids, 'birds': results})

# Left-join onto every test row_id so rows without a prediction are kept,
# label those rows 'nocall', and write the submission file.
sub = (
    pd.DataFrame({'row_id': test_df.row_id})
    .merge(predicted, how='left', on='row_id')
    .fillna('nocall')
)
sub.to_csv('submission.csv', index=False)

In [30]:
# Display the submission table for a visual check.
sub

Unnamed: 0,row_id,birds
0,site_1_41e6fe6504a34bf6846938ba78d13df1_5,aldfly
1,site_1_41e6fe6504a34bf6846938ba78d13df1_10,aldfly
2,site_1_41e6fe6504a34bf6846938ba78d13df1_15,aldfly
3,site_1_41e6fe6504a34bf6846938ba78d13df1_20,nocall
4,site_1_41e6fe6504a34bf6846938ba78d13df1_25,aldfly hamfly
...,...,...
71,site_3_9cc5d9646f344f1bbb52640a988fe902,aldfly comyel
72,site_3_a56e20a518684688a9952add8a9d5213,aldfly
73,site_3_96779836288745728306903d54e264dd,aldfly hamfly
74,site_3_f77783ba4c6641bc918b034a18c23e53,yebfly


In [34]:
# Instantiate a torchvision resnet18 through torch.hub without pretrained weights.
model_resnet = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=False)

Using cache found in /Users/julian/.cache/torch/hub/pytorch_vision_v0.6.0


In [41]:
# Serialise the whole module object (not just its state_dict) to resnet18.pth.
torch.save(model_resnet, 'resnet18.pth')

In [40]:
# Read back the file written by the torch.save cell to confirm it round-trips.
# The original read 'resnet_model.pth', a file this notebook never writes, so a
# fresh top-to-bottom run would fail here.
# NOTE(review): torch.load unpickles the file; only load files you trust.
torch.load('resnet18.pth')

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  