In [None]:
import glob

from potoo.plot import *
from potoo.util import *
import sklearn

from cache import *
from constants import *
from datasets import *
from features import *
from load import *
from sp14.model import *
from util import *
from viz import *

# Apply the 'inline_short' figure-size preset for the rest of the notebook
# (NOTE(review): `figsize` presumably comes from the potoo.plot star import — confirm).
# The trailing `;` suppresses the cell's output repr.
figsize('inline_short');

In [None]:
# Load recs for the two datasets listed below, then show a summary plus the first 5 rows
recs = load_recs(['peterson-field-guide', 'recordings'])
# recs = recs[:50]  # Faster dev
display(
    df_summary(recs),
    recs[:5],
)

In [None]:
# recs = recs_load_audio(recs)
# display(df_summary(recs), recs[:5])

In [None]:
# Pass recs through recs_load_metadata and rebind `recs` to the result
# (later cells see the returned value), then show a summary plus the first 5 rows
recs = recs_load_metadata(recs)
display(
    df_summary(recs),
    recs[:5],
)

In [None]:
# features = Features()
# recs['spectro'] = features.spectro(recs)
# display(df_summary(recs), recs[:5])

In [None]:
# features = Features()
# recs['patches'] = features.patches(recs)
# display(df_summary(recs), recs[:5])

In [None]:
# Build a Features instance (kept in `features`) and rebind `recs` to the output of its
# transform, then show a summary plus the first 5 rows
features = Features()
recs = features.transform(recs)
display(
    df_summary(recs),
    recs[:5],
)

In [None]:
# Stats: per-(dataset, species) sums of n, duration_h, samples_gb, samples_n
(
    recs
    # n=1 per row, so the grouped sum of n is a row count
    .assign(n=1)
    .assign(duration_h=lambda df: df['duration_s'] / 3600)
    .assign(samples_gb=lambda df: df['samples_mb'] / 1024)
    # Drop unused species categories, else groupby includes all categories
    .assign(species=lambda df: df['species'].cat.remove_unused_categories())
    .groupby(['dataset', 'species'])
    [['n', 'duration_h', 'samples_gb', 'samples_n']]
    .sum()
    .dropna()
)

In [None]:
# Bar chart: how many recs per species x dataset?
(recs
    # Plot species by their longhand label
    .assign(species=lambda df: df.species_longhand)
    .assign(n=1)
    # Add an n=0 row for every (species, dataset) pair, so that every pair is present in the
    # grouped sums below even when it has no recs.
    # Uses pd.concat because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    .pipe(lambda df: pd.concat([
        df,
        (pd.DataFrame([
            dict(species=species, dataset=dataset, n=0)
            for species in df.species.unique()
            for dataset in df.dataset.unique()
        ])
        # Match the dtype of the existing species column before concatenating
        .astype({'species': df.species.dtype})),
    ]))
    # Summing n (1 per rec, 0 per filler row) gives the number of recs per pair
    .groupby(['dataset', 'species'])['n'].sum().reset_index()
    .pipe(df_reverse_cat, 'species')
    .pipe(ggplot, aes(x='species', y='n', color='dataset', fill='dataset'))
    + coord_flip()
    + stat_identity(geom='bar', position=position_dodge())
    + ylab('num recordings')
    + scale_color_cmap_d(mpl.cm.tab10)
    + scale_fill_cmap_d(mpl.cm.tab10)
    + theme_figsize('half')
    + ggtitle('How many recs per species x dataset?')
)

In [None]:
%%time
# Fit projection, add learned features
from sp14.model import *

k, n = 5,  6   # Faster dev
# k, n = 50,  60   # Faster dev
# k, n = 500, 300  # Mem safe (~17m uncached for 6 configs)
# k, n = 500, 535  # Full [XXX Not mem safe] [len(peterson) is currently 535]
recs_train_projection = (recs
    [lambda df: df.dataset == 'peterson-field-guide']
    .pipe(sklearn.utils.shuffle, random_state=0)
    .sample(n, random_state=0)
)

projection = Projection(k=k)
projection.fit(recs_train_projection)
# recs['feat'] = projection.transform(recs)

# projection.save('peterson-v0')