# RandomForest on mel-spectrogram summarization

This is very close to the `melspec-maxp` (max summarization) baseline model in [Automatic large-scale classification of bird sounds is strongly improved by unsupervised feature learning](https://peerj.com/articles/488/) (Dan Stowell, Mark D. Plumbley)


In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import time

import numpy
import pandas
import matplotlib.pyplot as plt

import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn import metrics

# Custom modules
import dcase2018bird

# Load dataset

In [2]:
dataset = dcase2018bird.load_dataset()
print(dataset.shape)
dataset.head(3)

(48310, 4)


Unnamed: 0,itemid,datasetid,hasbird,folder
0,BUK4_20161103_204504_125,PolandNFC,,polandnfc
1,BUK4_20161016_012704_132,PolandNFC,,polandnfc
2,6wichura_deszcz_BUK4_20161005_022304_129,PolandNFC,,polandnfc


In [16]:
dataset.datasetid.unique()

array(['PolandNFC', 'BirdVox-DCASE-20k', 'chern', 'ff1010bird',
       'warblrb10k', 'wabrlrb10k_test'], dtype=object)

## Split training and evaluation data
No labels available for evaluation, they are the thing to predict in competition

In [6]:
trainset = dataset[dataset.hasbird.notna()].copy()
print(trainset.shape)
trainset['hasbird'] = trainset.hasbird.astype(bool)
#trainset.groupby('folder').head(1)

(35690, 4)


In [7]:
evalset = dataset[dataset.hasbird.isna()].copy()
print(evalset.shape)
#evalset.groupby('folder').head(1)

(12620, 4)


# Load features

In [23]:
# 64-band melspectrogram max-summarized
train_F = features.extract(dcase2018bad.wav_urls(trainset))
train_F

dask.array<concatenate, shape=(35700, 64), dtype=float64, chunksize=(50, 64)>

### Compute features

# Model

In [27]:
rf = make_pipeline(
    RandomForestClassifier(n_estimators=100, min_samples_leaf=2, random_state=1),
)

X_train, X_test, Y_train, Y_test = \
  model_selection.train_test_split(train_X, train_Y, test_size=0.3)

start = time.time()
print('Starting train', X_train.shape, numpy.mean(Y_train))
rf.fit(X_train, Y_train)
end = time.time()
print('Train time', end-start)

print('train', model_selection.cross_val_score(rf, X_train, Y_train, scoring='roc_auc', cv=3))
print('test', model_selection.cross_val_score(rf, X_test, Y_test, scoring='roc_auc', cv=3))


Starting train (24983, 64) 0.5048232798302846
Train time 70.53949069976807
train [0.85059628 0.8534503  0.85257123]
test [0.84328148 0.83735134 0.83947204]


In [29]:
eval_F = features.extract(dcase2018bad.wav_urls(evalset))
eval_F = eval_F.persist()
eval_F

dask.array<concatenate, shape=(12650, 64), dtype=float64, chunksize=(50, 64)>

In [30]:
eval_X = eval_F.compute()
eval_X = eval_X[:12620] # last chunk is partially empty
eval_X.shape

(12650, 64)

In [41]:
def make_submission():
    df = pandas.DataFrame({
        'item': evalset.itemid,
        'prediction': rf.predict_proba(eval_X)[:,1].astype('float'),
    })
    df = df.sort_values(by='item')

    f = 'submission.csv'
    df.to_csv(f, index=False, header=False)
    return f
    
print(make_submission())

submission.csv
