In [96]:
%matplotlib inline

import numpy
import pandas
import matplotlib.pyplot as plt

import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn import metrics

In [64]:
import tarfile
import re
import io

# TODO: also calculate delta-frame features?
def features_from_mfcc(mfcc):
    return mfcc.values[:, 0:13].flatten()
    
def load_dataset(tarpath, labelpath, limit=None):
    labels = pandas.read_csv(labelpath, index_col='itemid')
    dataset = []
    labeldata = []
    
    with tarfile.open(tarpath, mode="r|gz") as tar:
        regex = r".*\/(\d+)\..*"
        for i, member in enumerate(tar):
            if limit is not None:
                if i >= limit:
                    break

            sample = int(re.findall(regex, member.name)[0])
            content = tar.extractfile(member).read()
            data = pandas.read_csv(io.BytesIO(content), encoding='utf8')

            features = features_from_mfcc(data)
            hasbird = labels.hasbird[sample].astype('bool')
            dataset.append(features)
            labeldata.append(hasbird)
        
    return numpy.array(dataset), numpy.array(labeldata)


In [65]:
X, Y = load_dataset('data/ff1010bird.mfcc.tgz', 'data/ff1010bird.labels.csv', limit=2000)
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=1)
X.shape, Y.shape, numpy.mean(Y)

((2000, 5603), (2000,), 0.233)

In [109]:
# Asssumes MFCC features as input
gmm = make_pipeline(
    StandardScaler(),
    GaussianMixture(n_components=2, covariance_type='spherical'),
)

scores = model_selection.cross_val_score(gmm, X_train, Y_train, scoring='roc_auc', cv=5)
print(scores)

gmm.fit(X_train, Y_train)
metrics.roc_auc_score(Y_test, gmm.predict_proba(X_test)[:,1])

[0.42922976 0.54513811 0.4357197  0.50196202 0.58879485]


0.44550067110134367

array([0.00000000e+000, 0.00000000e+000, 1.00000000e+000, 0.00000000e+000,
       1.00000000e+000, 0.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 0.00000000e+000, 1.00000000e+000, 0.00000000e+000,
       1.00000000e+000, 0.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       0.00000000e+000, 1.00000000e+000, 1.00000000e+000, 0.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       1.00000000e+000, 0.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 6.36265919e-153,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       0.00000000e+000, 1.00000000e+000, 1.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
       0.00000000e+000, 0

In [103]:
# Asssumes MFCC features as input
rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=100),
)

scores = model_selection.cross_val_score(rf, X_train, Y_train, scoring='roc_auc', cv=5)
print(scores)
rf.fit(X_train, Y_train)
metrics.roc_auc_score(Y_test, rf.predict_proba(X_test)[:,1])

[0.5489357  0.62489942 0.51634847 0.58605808 0.6240948 ]


0.6283499756633578