In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import time
import tarfile
import re
import io

import numpy
import pandas
import matplotlib.pyplot as plt
import dask.array

import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn import metrics

# Custom modules
import dcase2018bad
import features

  from numpy.core.umath_tests import inner1d


# Load dataset

In [2]:
dataset = dcase2018bad.load_dataset()
print(dataset.shape)
dataset.head(3)

(48310, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,BUK4_20161103_204504_125,PolandNFC,,polandnfc
1,BUK4_20161016_012704_132,PolandNFC,,polandnfc
2,6wichura_deszcz_BUK4_20161005_022304_129,PolandNFC,,polandnfc


In [3]:
trainset = dataset[dataset.hasbird.notna()].copy()
print(trainset.shape)
trainset['hasbird'] = trainset.hasbird.astype(bool)
trainset.groupby('folder').head(1)

(35690, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,00053d90-e4b9-4045-a2f1-f39efc90cfa9,BirdVox-DCASE-20k,True,birdvox
0,64486,ff1010bird,False,ff1010bird
0,759808e5-f824-401e-9058,warblrb10k,True,warblr10k_public


In [4]:
evalset = dataset[dataset.hasbird.isna()].copy()
print(evalset.shape)
del dataset
evalset.groupby('folder').head(1)

(12620, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,BUK4_20161103_204504_125,PolandNFC,,polandnfc
0,64dbf3fd-bb82-4d76-b40e,chern,,chern
0,7ac6b210-51cb-4708-adfb,wabrlrb10k_test,,warblr10k_test


# Feature extraction

In [5]:
import dask.distributed
scheduler_address = '35.242.172.90:8786'
c = dask.distributed.Client(scheduler_address)
c

0,1
Client  Scheduler: tcp://35.242.172.90:8786  Dashboard: http://35.242.172.90:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 7.53 GB


In [6]:
c.upload_file('features.py')
c.upload_file('dcase2018bad.py')

In [23]:
train_F = features.extract(dcase2018bad.wav_urls(trainset))
train_F

dask.array<concatenate, shape=(35700, 64), dtype=float64, chunksize=(50, 64)>

In [24]:
if False:
    # TEMP: subsample for quicker testing
    sub = sample_chunkwise(train_X, 0.01)
    _train_F = train_F
    train_F = _train_F[sub,:]
    train_Y = trainset.iloc[sub].hasbird
else:
    train_Y = trainset.hasbird

### Compute features

In [25]:
#train_F = train_F.persist()
train_X = train_F.compute()
train_X[0,:]

array([4.42485034e-01, 3.59241465e-01, 3.17188576e-01, 2.04637591e-01,
       4.12524190e-01, 2.58340449e-01, 4.42977160e-01, 1.87095160e-01,
       2.48387205e-01, 1.72009477e-01, 3.48023945e-01, 4.47225595e-01,
       4.68187764e-01, 3.15744833e-01, 7.46314851e-01, 1.00000000e+00,
       3.30624341e-01, 1.13837092e-01, 2.14221469e-01, 4.88002863e-01,
       1.00000000e+00, 1.00000000e+00, 2.99592239e-01, 8.96466825e-01,
       4.93006895e-01, 2.06181034e-01, 1.39237223e-01, 1.89311828e-01,
       1.63093913e-01, 1.45103495e-01, 2.08930718e-01, 3.78429665e-01,
       4.07082901e-01, 7.88849402e-01, 1.00000000e+00, 1.00000000e+00,
       8.14479537e-01, 4.52259986e-01, 2.27068266e-01, 1.36656770e-01,
       1.65054000e-01, 1.12763795e-01, 1.55173261e-01, 1.40202217e-01,
       2.93130885e-01, 1.19175711e-01, 1.00000000e+00, 9.61103796e-01,
       1.81041924e-01, 1.48358912e-01, 1.25898345e-01, 1.18783293e-01,
       8.15355188e-02, 7.72337381e-02, 5.14123465e-02, 3.18591472e-02,
      

In [26]:
train_X = train_X[:35690] 

# Model

In [27]:
rf = make_pipeline(
    RandomForestClassifier(n_estimators=100, min_samples_leaf=2, random_state=1),
)

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(train_X, train_Y, test_size=0.3)

start = time.time()
print('Starting train', X_train.shape, numpy.mean(Y_train))
rf.fit(X_train, Y_train)
end = time.time()
print('Train time', end-start)

print('train', model_selection.cross_val_score(rf, X_train, Y_train, scoring='roc_auc', cv=3))
print('test', model_selection.cross_val_score(rf, X_test, Y_test, scoring='roc_auc', cv=3))

#evaluate_model(rf, split)

Starting train (24983, 64) 0.5048232798302846
Train time 70.53949069976807
train [0.85059628 0.8534503  0.85257123]
test [0.84328148 0.83735134 0.83947204]


In [29]:
eval_F = features.extract(dcase2018bad.wav_urls(evalset))
eval_F = eval_F.persist()
eval_F

dask.array<concatenate, shape=(12650, 64), dtype=float64, chunksize=(50, 64)>

In [30]:
eval_X = eval_F.compute()
eval_X.shape

(12650, 64)

In [34]:
eval_X = eval_X[:12620]

In [41]:
def make_submission():
    df = pandas.DataFrame({
        'item': evalset.itemid,
        'prediction': rf.predict_proba(eval_X)[:,1].astype('float'),
    })
    df = df.sort_values(by='item')

    f = 'submission.csv'
    df.to_csv(f, index=False, header=False)
    return f
    
print(make_submission())

submission.csv
