In [None]:
import glob

from potoo.plot import *
from potoo.util import *
import sklearn

from cache import *
from constants import *
from datasets import *
from features import *
from load import *
from sp14.model import *
from util import *
from viz import *

# Apply the 'inline_short' figure-size preset (figsize comes from the star imports above, presumably
# potoo.plot -- TODO confirm); the trailing `;` suppresses the cell's output
figsize('inline_short');

In [None]:
# Resolve recording paths for the one dataset used in this notebook, then show a summary + the first 5
recs_paths = load_recs_paths(['peterson-field-guide'])
display(df_summary(recs_paths), recs_paths[:5])

In [None]:
# Load the recordings for the paths resolved above (load_recs_data comes from the project's star imports)
recs = load_recs_data(recs_paths)

In [None]:
# Summary of the loaded recordings, plus the first 5 of them
display(df_summary(recs), recs[:5])

In [None]:
# Bar chart: one bar per species_longhand, bar length = number of recordings.
# The category order of species_longhand is reversed (df_reverse_cat) before plotting with flipped coords.
(
    ggplot(df_reverse_cat(recs, 'species_longhand'), aes(x='species_longhand'))
    + geom_bar()
    + coord_flip()
    + ylab('num recordings')
    + theme_figsize(aspect_ratio=1/2)
    + ggtitle('How many training recordings per species?')
)

In [None]:
# Per-species counts in long format: n_recs (recordings in `recs`) alongside the n_xc_recs column merged in
# from metadata.xc_counts.with_species. Nothing here depends on the loop variable below, so build it once
# instead of redoing the groupby/merge/melt for every plot.
count_cols = ['n_recs', 'n_xc_recs']
recs_counts = (recs
    .assign(
        n_recs=1,
        species=lambda df: df.species.cat.remove_unused_categories(),  # Else groupby includes all categories
        species_longhand=lambda df: df.species_longhand.cat.remove_unused_categories(),  # Else groupby includes all categories
    )
    .groupby(['species', 'species_longhand'])[['n_recs']].sum().reset_index()
    .dropna()
    .merge(
        metadata.xc_counts.with_species,
        how='left',
        left_on='species',
        right_on='shorthand',
    )
    .pipe(pd.melt, id_vars=['species', 'species_longhand'], value_vars=count_cols)
    # Cast the species columns to the dtypes they have in recs
    .astype({
        'species': recs.species.dtype,
        'species_longhand': recs.species_longhand.dtype,
    })
    .pipe(df_reverse_cat, 'species', 'species_longhand')
)

# One plot per count column
for col in count_cols:
    # repr() is called only for its side effect (the returned string is discarded) -- presumably it makes
    # plotnine render the figure from inside the loop; TODO confirm against the installed plotnine version
    repr(recs_counts
        [lambda df: df.variable == col]  # XXX Workaround for facet_grid(scales='free') not working
        .pipe(ggplot, aes(x='species_longhand', y='value'))
        + geom_col()
        + coord_flip()
        # + facet_grid('. ~ variable', scales='free')  # FIXME Why doesn't scales='free' work here?
        + ylab('num recordings')
        + theme_figsize(aspect_ratio=1/2)
        # Name the plotted column in the title, else the two plots are indistinguishable
        + ggtitle(f'How many training recordings per species? [{col}]')
    )

In [None]:
# Stats: totals per (dataset, species) -- recording count, duration in hours, sample size in GB, sample count
(recs
    .assign(
        n=1,
        duration_h=recs.duration_s / 3600,
        samples_gb=recs.samples_mb / 1024,
        species=recs.species.cat.remove_unused_categories(),  # Else groupby includes all categories
    )
    .groupby(['dataset', 'species'])
    [['n', 'duration_h', 'samples_gb', 'samples_n']]
    .sum()
)

In [None]:
# recs_test is the same object as recs (plain assignment, no copy)
recs_test = recs
# Summary, first 10 rows, and the number of recordings per (dataset, species)
display(
    df_summary(recs_test),
    recs_test[:10],
    recs_test.assign(n=1).groupby(['dataset', 'species'])['n'].sum(),
)

In [None]:
# Compute spectros
model = Model(verbose_config=False)
display(len(recs_test))
# NOTE: recs_test was bound to the same object as recs (plain assignment in an earlier cell), so this in-place
# column assignment is also visible through recs_test; rebinding recs to a new frame here would break that
recs['spectro'] = model.spectros(recs)

In [None]:
# Plot the first 3 spectros of one species (OCWA), each as its own wide, short figure
ocwa_spectros = recs.query("species == 'OCWA'").spectro[:3]
with with_figsize(width=24, aspect_ratio=1/12):
    for spectro in ocwa_spectros:
        spectro.plot(show_audio=True, fancy=True)
        plt.show()

In [None]:
# Overview of many spectros in one tall figure; only the last 50 recs are passed, to avoid a heavy plot
with with_figsize(width=25/2, height=212/2):
    plot_many_spectros(t_max=30, recs=recs[-50:])

In [None]:
def plot_results(exp, pca_rows=3, centroid_rows=6):
    """
    Show one experiment's fitted projection: PCA components on top, projection centroids below.

    Args:
        exp: Experiment record exposing `.config.proj_skm_config` and a fitted `.model`.
        pca_rows: `rows` passed to `model.plot_patches` for the PCA components panel.
        centroid_rows: `rows` passed to `model.plot_proj_centroids` for the centroids panel.

    Returns:
        None. Displays the config and shows the figure as side effects.
    """
    skm_config = exp.config.proj_skm_config
    display(skm_config)

    def patch_energy(patch):
        # Sort key for centroids: spread (via total energy, which happens to correlate)
        # Alternatives that were tried:
        #   patch.reshape(f, p).mean(axis=1).argmax()  # Freq mode
        #   patch.std()  # Less useful than .sum()
        return patch.sum()

    with with_figsize('full'):
        # Two stacked panels, the lower one twice as tall as the upper one
        grid = mpl.gridspec.GridSpec(nrows=2, ncols=1, height_ratios=[1, 2], hspace=.1)

        # Upper panel: PCA components
        plt.subplot(grid[0, 0])
        exp.model.plot_patches(exp.model.proj_skm_.pca.components_.T, rows=pca_rows)

        # Lower panel: centroids, sorted by descending patch_energy
        plt.subplot(grid[1, 0])
        exp.model.plot_proj_centroids(rows=centroid_rows, sort=dict(reverse=True, key=patch_energy))

        # Label the figure with the config that produced it
        plt.xlabel(str(skm_config))
        plt.show()

In [None]:
%%time
from sp14.model import *
# Sizes: k is passed through in proj_skm_config, n is how many recs are sampled for fitting
# k, n = 50,  60   # Faster dev
# k, n = 50,  300  # [Useful?]
k, n = 500, 300  # Mem safe (~17m uncached for 6 configs) [NOTE(review): there are 8 configs below -- timing may be stale]
# k, n = 500, 535  # Full [XXX Not mem safe] [len(recs) is currently 535]

# Experiment ordering:
#   - Order from least to most useful looking (determined mainly by high->low pca dimension)
#   - Interleave pca False->True since that's a very salient change to observe
#   - pca_whiten and do_pca are always toggled together
#   - The last config (normalize=False, standardize=False, pca on) is the SKM defaults
configs = [
    Dict(n=n, proj_skm_config=dict(
        k=k,
        normalize=normalize,
        standardize=standardize,
        pca_whiten=use_pca,
        do_pca=use_pca,
    ))
    for normalize in [True, False]
    for standardize in [True, False]
    for use_pca in [False, True]
]

# Fit one projection per config and collect the results in exps
exps = []
for i, config in enumerate(configs):
    print(f'\n\ni[{i}] config[{config}]\n')
    model = Model(verbose_config=False, proj_skm_config=config.proj_skm_config)

    # Fixed seeds: the shuffle + sample is called with the same arguments for every config
    recs_test_n = sklearn.utils.shuffle(recs_test, random_state=0).sample(config.n, random_state=0)
    model.fit_proj(recs_test_n)

    # Disabled: classifier fit + evaluation
    # model.fit_class(recs_test_n)
    # display(
    #     model.test(recs_test_n, 'classes'),
    #     model.test(recs_test_n, 'kneighbors'),
    # )

    exp = Dict(i=i, config=config, model=model)
    exps.append(exp)

    # Disabled: plot inline while fitting
    # plot_results(exp)

In [None]:
# Plot the PCA components and centroids of every experiment, in the order the experiments were run
for exp in exps:
    plot_results(exp)

# Conclusions
- Normalize and standardize are both junk (on top of the denoising we already have):
    - With pca, all 3 combos that enable norm and/or std increase the intrinsic dimensionality of the data, which is counterproductive
    - None of those 3 combos makes more visually plausible centroids than leaving both norm and std disabled
- PCA whitening is helping:
    - The “noise” centroids disappear with pca enabled
- Woohoo!

# Open Qs
- Why does [SP14] Fig 10 (below) have some "negative" centroids (lots of black, little white), whereas we have none?
    - Naively, I'd think that negative centroids aren't helpful, and that they are maybe due to noisy training data
    - But it's not clear what kind of training data would produce them...

In [None]:
# [SP14] Fig 10: the paper's centroids, shown for comparison with the centroids plotted above
# (the image is referenced by remote URL)
Image(url='https://user-images.githubusercontent.com/627486/40503140-fc125570-5f41-11e8-9b82-e8abd5c129fb.png')