In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import time
import tarfile
import re
import io
import os

import numpy
import pandas
import matplotlib.pyplot as plt
import dask.array
import IPython.display

import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn import metrics

# Custom modules
import dcase2018bad
import features

# Load dataset

In [5]:
dataset = dcase2018bad.load_dataset()
print(dataset.shape)
dataset.head(3)

(48310, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,BUK4_20161103_204504_125,PolandNFC,,polandnfc
1,BUK4_20161016_012704_132,PolandNFC,,polandnfc
2,6wichura_deszcz_BUK4_20161005_022304_129,PolandNFC,,polandnfc


In [27]:
examples = list(dcase2018bad.wav_urls(dataset[dataset.hasbird == 1.0]))[10:12]
print('\n'.join(examples))

IPython.display.Audio(examples[0])

https://storage.googleapis.com/dcase2018-bad/birdvox/wav/0055615b-c316-43d7-a789-a0f6c2983081.wav
https://storage.googleapis.com/dcase2018-bad/birdvox/wav/005a19d0-e868-41c5-ab87-3fdaf5040a57.wav


In [7]:
with_labels = dataset[dataset.hasbird.notna()].copy()
print(with_labels.shape)
with_labels['hasbird'] = with_labels.hasbird.astype(bool)


(35690, 4)


In [8]:
trainset = with_labels.groupby('datasetid').apply(lambda g: g.sample(n=1000, random_state=1))
trainset.shape

(3000, 4)

# Feature extraction

In [9]:
import dask.distributed
scheduler = os.environ.get('DASK_SCHEDULER', None) # set to 'mycluster:port' to use remote cluster

c = dask.distributed.Client(scheduler)
c

0,1
Client  Scheduler: tcp://127.0.0.1:41651  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 1.91 GB


In [10]:
# Upload our custom Python modules to workers
c.upload_file('features.py')
c.upload_file('dcase2018bad.py')

In [23]:
# 64-band melspectrogram max-summarized
train_F = features.extract(dcase2018bad.wav_urls(trainset))
train_F

dask.array<concatenate, shape=(35700, 64), dtype=float64, chunksize=(50, 64)>

### Compute features

In [25]:
train_F = train_F.persist() # request to keep results in worker memory
train_X = train_F.compute()
train_X[0,:]

array([4.42485034e-01, 3.59241465e-01, 3.17188576e-01, 2.04637591e-01,
       4.12524190e-01, 2.58340449e-01, 4.42977160e-01, 1.87095160e-01,
       2.48387205e-01, 1.72009477e-01, 3.48023945e-01, 4.47225595e-01,
       4.68187764e-01, 3.15744833e-01, 7.46314851e-01, 1.00000000e+00,
       3.30624341e-01, 1.13837092e-01, 2.14221469e-01, 4.88002863e-01,
       1.00000000e+00, 1.00000000e+00, 2.99592239e-01, 8.96466825e-01,
       4.93006895e-01, 2.06181034e-01, 1.39237223e-01, 1.89311828e-01,
       1.63093913e-01, 1.45103495e-01, 2.08930718e-01, 3.78429665e-01,
       4.07082901e-01, 7.88849402e-01, 1.00000000e+00, 1.00000000e+00,
       8.14479537e-01, 4.52259986e-01, 2.27068266e-01, 1.36656770e-01,
       1.65054000e-01, 1.12763795e-01, 1.55173261e-01, 1.40202217e-01,
       2.93130885e-01, 1.19175711e-01, 1.00000000e+00, 9.61103796e-01,
       1.81041924e-01, 1.48358912e-01, 1.25898345e-01, 1.18783293e-01,
       8.15355188e-02, 7.72337381e-02, 5.14123465e-02, 3.18591472e-02,
      

# Model

In [27]:
rf = make_pipeline(
    RandomForestClassifier(n_estimators=100, min_samples_leaf=2, random_state=1),
)

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(train_X, train_Y, test_size=0.3)

start = time.time()
print('Starting train', X_train.shape, numpy.mean(Y_train))
rf.fit(X_train, Y_train)
end = time.time()
print('Train time', end-start)

print('train', model_selection.cross_val_score(rf, X_train, Y_train, scoring='roc_auc', cv=3))
print('test', model_selection.cross_val_score(rf, X_test, Y_test, scoring='roc_auc', cv=3))

#evaluate_model(rf, split)

Starting train (24983, 64) 0.5048232798302846
Train time 70.53949069976807
train [0.85059628 0.8534503  0.85257123]
test [0.84328148 0.83735134 0.83947204]
