In [109]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import time
import tarfile
import re
import io

import numpy
import pandas
import matplotlib.pyplot as plt
import dask.array

import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn import metrics

import scipy.stats
import scipy.io.wavfile
import librosa

# Custom modules
import dcase2018bad
import features

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load dataset

In [124]:
dataset = dcase2018bad.load_dataset()
print(dataset.shape)
dataset.head(3)

(48310, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,BUK4_20161103_204504_125,PolandNFC,,polandnfc
1,BUK4_20161016_012704_132,PolandNFC,,polandnfc
2,6wichura_deszcz_BUK4_20161005_022304_129,PolandNFC,,polandnfc


In [125]:
trainset = dataset[dataset.hasbird.notna()].copy()
print(trainset.shape)
trainset['hasbird'] = trainset.hasbird.astype(bool)
trainset.groupby('folder').head(1)

(35690, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,00053d90-e4b9-4045-a2f1-f39efc90cfa9,BirdVox-DCASE-20k,True,birdvox
0,64486,ff1010bird,False,ff1010bird
0,759808e5-f824-401e-9058,warblrb10k,True,warblr10k_public


In [123]:
evalset = dataset[dataset.hasbird.isna()].copy()
print(evalset.shape)
del dataset
evalset.groupby('folder').head(1)

(12620, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,BUK4_20161103_204504_125,PolandNFC,,polandnfc
0,64dbf3fd-bb82-4d76-b40e,chern,,chern
0,7ac6b210-51cb-4708-adfb,wabrlrb10k_test,,warblr10k_test


# Feature extraction

In [128]:
train_X = features.extract(dcase2018bad.wav_urls(trainset))
train_X

dask.array<concatenate, shape=(35700, 64), dtype=float64, chunksize=(50, 64)>

In [129]:
def sample_chunkwise(da, amount):
    chunk_size = da.chunks[0][0]
    chunks = len(da.chunks[0])

    chosen_chunks = numpy.random.random(size=chunks) > (1-amount)
    samples = []
    for i, chosen in enumerate(chosen_chunks):
        if chosen:
            indices = (i*chunk_size)+numpy.array(range(0, chunk_size))
            assert len(indices) == chunk_size 
            samples.append(indices)
    return numpy.concatenate(samples)

sub = sample_chunkwise(train_X, 0.01)
trainset.iloc[sub].head()

Unnamed: 0,itemid,datasetid,hasbird,folder
1050,0dbc912a-58d5-4d2b-a1a4-a231442ac3cd,BirdVox-DCASE-20k,False,birdvox
1051,0dc86709-bb3a-4570-a24f-9cadc165db78,BirdVox-DCASE-20k,False,birdvox
1052,0dc97e9e-51a6-4824-a29b-a11ee7bae73d,BirdVox-DCASE-20k,True,birdvox
1053,0dcc834b-027c-47cf-a7fb-669bfc8f27ca,BirdVox-DCASE-20k,True,birdvox
1054,0dcdf7aa-96f5-428d-a3a2-31fbe02f4086,BirdVox-DCASE-20k,True,birdvox


In [132]:
# TEMP: subsample for testing
_train_X = train_X
train_X = _train_X[sub,:]

train_Y = trainset.iloc[sub].hasbird

In [133]:
train_X.persist().compute()
train_X[0,:]

dask.array<getitem, shape=(64,), dtype=float64, chunksize=(64,)>

In [134]:
train_X[0,:].compute()

array([1.92452669e-03, 6.60785407e-04, 2.15816140e-03, 1.07458552e-02,
       5.67068300e-02, 1.95372936e-01, 1.59412898e-01, 1.45568011e-01,
       1.13850264e-01, 5.98223067e-02, 3.57005004e-02, 6.81339160e-02,
       2.54984392e-01, 2.07301322e-02, 2.18170321e-02, 1.72691825e-02,
       2.01294154e-02, 4.07337492e-02, 1.25575315e-02, 1.01468376e-02,
       6.36580864e-02, 6.02020435e-01, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 3.76088452e-01, 8.52870687e-02,
       1.70031563e-02, 5.20163111e-03, 1.23029468e-03, 6.91389796e-04,
       3.41119921e-03, 4.67161613e-02, 5.46502964e-02, 2.57144911e-02,
       4.99059633e-02, 3.44320738e-02, 2.00624033e-02, 6.34100136e-03,
       1.85316350e-03, 4.23598189e-04, 3.05980832e-04, 3.91833838e-04,
       4.28603342e-04, 5.82811936e-04, 1.19638782e-03, 5.32385907e-04,
       7.02966319e-04, 1.35517516e-03, 1.33357917e-03, 1.16085574e-03,
       1.32440570e-03, 1.00296969e-03, 2.01670415e-03, 1.95161035e-03,
      

# Model

In [135]:
rf = make_pipeline(
    RandomForestClassifier(n_estimators=100, min_samples_leaf=2, random_state=1),
)

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(train_X, train_Y, test_size=0.3)

start = time.time()
print('Starting train', X_train.shape, numpy.mean(Y_train))
rf.fit(X_train, Y_train)
end = time.time()
print('Train time', end-start)

print('train', model_selection.cross_val_score(rf, X_train, Y_train, scoring='roc_auc', cv=3))
print('test', model_selection.cross_val_score(rf, X_test, Y_test, scoring='roc_auc', cv=3))

#evaluate_model(rf, split)

  return X[indices]
  return X[indices]


Starting train (245, 64) 0.5469387755102041


KeyboardInterrupt: 

In [103]:
compete = dataset.hasbird.isna()
dataset[compete].groupby('folder').head(2)

Unnamed: 0,itemid,datasetid,hasbird,folder
0,BUK4_20161103_204504_125,PolandNFC,,polandnfc
1,BUK4_20161016_012704_132,PolandNFC,,polandnfc
0,64dbf3fd-bb82-4d76-b40e,chern,,chern
1,149a895a-68cf-41b0-8b56,chern,,chern
0,7ac6b210-51cb-4708-adfb,wabrlrb10k_test,,warblr10k_test
1,81274cb3-f01d-4cc9-837d,wabrlrb10k_test,,warblr10k_test


In [None]:
eval_X = features.extract(dcase2018bad.wav_urls(evalset))
eval_X.persist().compute()

In [97]:
def make_submission():
    df = pandas.DataFrame({
        'item': evalset.index,
        'prediction': rf.predict_proba(eval_X)[1,:].astype('float'),
    })

    f = 'submission.csv'
    df.to_csv(f, index=False, header=False)
    return f
    
print(submission)

AttributeError: 'Array' object has no attribute 'index'