In [2]:
%matplotlib inline

import numpy
import pandas
import matplotlib.pyplot as plt

import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn import model_selection

In [33]:
import tarfile
import re
import io

# TODO: also calculate delta-frame features?
def features_from_mfcc(mfcc):
    return mfcc.values[:, 0:13].flatten()
    
def load_dataset(tarpath, labelpath):
    labels = pandas.read_csv(labelpath, index_col='itemid')
    dataset = []
    labeldata = []
    
    with tarfile.open(tarpath, mode="r|gz") as tar:
        regex = r".*\/(\d+)\..*"
        for i, member in enumerate(tar):
            if i > 2000:
                break

            sample = int(re.findall(regex, member.name)[0])
            content = tar.extractfile(member).read()
            data = pandas.read_csv(io.BytesIO(content), encoding='utf8')

            features = features_from_mfcc(data)
            hasbird = labels.hasbird[sample]
            dataset.append(features)
            labeldata.append(hasbird)
        
    return numpy.array(dataset), numpy.array(labeldata)

X, Y = load_dataset('data/ff1010bird.mfcc.tgz', 'data/ff1010bird.labels.csv')


In [34]:
X.shape, Y.shape

((2001, 5603), (2001,))

In [37]:
# Asssumes MFCC features as input
gmm = make_pipeline(
    StandardScaler(),
    GaussianMixture(n_components=1, covariance_type='spherical'),
)

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=1)
scores = model_selection.cross_val_score(gmm, X_test, Y_test, scoring='accuracy', cv=5)
scores

array([0.69421488, 0.74166667, 0.76666667, 0.75833333, 0.74166667])