In [None]:
import glob

from potoo.plot import *
from potoo.util import *
import sklearn

from cache import *
from datasets import *
from features import *
from load import *
from sp14.model import *
from util import *

# Apply the 'inline_short' figure size for the plots below (presumably a named preset of the star-imported
# figsize helper -- confirm); the trailing ';' suppresses the cell's output repr
figsize('inline_short');

In [None]:
# List the top-level entries of the mlsp-2013 dataset dir
glob.glob(f'{data_dir}/mlsp-2013/*')

[
  '/Users/danb/hack/bubo/data/mlsp-2013/light_data',
  '/Users/danb/hack/bubo/data/mlsp-2013/mlsp13birdchallenge_documentation.pdf',
  '/Users/danb/hack/bubo/data/mlsp-2013/mlsp_contest_dataset',
  '/Users/danb/hack/bubo/data/mlsp-2013/README.txt',
]

In [None]:
# List the entries of mlsp-2013/light_data
glob.glob(f'{data_dir}/mlsp-2013/light_data/*')

In [None]:
# List the entries of essential_data (the files read in the next cell live here)
glob.glob(f'{data_dir}/mlsp-2013/mlsp_contest_dataset/essential_data/*')

In [None]:
# Directory holding the contest metadata files; every read below is relative to it
mlsp_essential_data_dir = f'{data_dir}/mlsp-2013/mlsp_contest_dataset/essential_data'

# Boring
# pd.read_csv(f'{mlsp_essential_data_dir}/CVfolds_2.txt')[:50]

# Interesting
rec_id_to_filename_df = pd.read_csv(f'{mlsp_essential_data_dir}/rec_id2filename.txt')
sample_submission_df = pd.read_csv(f'{mlsp_essential_data_dir}/sample_submission.csv')
species_df = pd.read_csv(f'{mlsp_essential_data_dir}/species_list.txt')

# This one has variable numbers of columns, so parse it manually
#   - Split each line on the first ',' only -> [first field, rest of line]; a line with no ',' yields one
#     field, which pd.DataFrame pads with None in the second column
#   - Iterate the file lazily (no .readlines()) and skip blank lines, which would otherwise become rows with
#     an empty first field
with open(f'{mlsp_essential_data_dir}/rec_labels_test_hidden.txt') as f:
    rec_labels_test_hidden_df = (
        pd.DataFrame(line.rstrip().split(',', 1) for line in f if line.strip())
        .T.set_index(0).T  # Pull first row into df col names
    )

In [None]:
# Species table: shape first, then every row
display(species_df.shape)
display(species_df)

In [None]:
# rec_id -> filename table: shape first, then the first 10 rows
display(rec_id_to_filename_df.shape)
display(rec_id_to_filename_df.head(10))

In [None]:
# Sample submission: shape first, then the first 10 rows
display(sample_submission_df.shape)
display(sample_submission_df.head(10))

In [None]:
# Test examples are '[labels]' = '?'
display(rec_labels_test_hidden_df.shape)
display(rec_labels_test_hidden_df.head(10))

In [None]:
# Split train vs. test: rows whose '[labels]' is the placeholder '?' are test, everything else is train
train_labels_df = rec_labels_test_hidden_df.loc[lambda df: df['[labels]'].ne('?')]
test_labels_df = rec_labels_test_hidden_df.loc[lambda df: df['[labels]'].eq('?')]

In [None]:
# Test split: shape first, then the first 10 rows
display(test_labels_df.shape)
display(test_labels_df.head(10))

In [None]:
# Train split: shape first, then the first 10 rows
display(train_labels_df.shape)
display(train_labels_df.head(10))

In [None]:
# How many train vs. test recordings?
(rec_labels_test_hidden_df
    .assign(
        # Tag each row by split: the '?' placeholder marks a test recording
        group=lambda df: ['test' if labels == '?' else 'train' for labels in df['[labels]']],
        n=1,
    )
    .groupby('group')['n']
    .count()
)

In [None]:
# Histogram of the number of labelled species per training recording
(train_labels_df['[labels]']
    .fillna('')  # Missing labels -> empty string -> zero species
    # Parse the comma-separated class ids (int() still validates each one), then count them
    .map(lambda labels: len([int(class_id) for class_id in labels.split(',') if class_id]))
    .pipe(gghist)
    + xlab('num species in same recording')
    + ylab('num recordings')
    + ggtitle('How many species per training recording?')
)

In [None]:
# Bar chart of training recordings per species (a recording with several species counts once for each)
(train_labels_df
    # Recordings with no labels get the sentinel class id -1; it presumably matches no species_df.class_id, so
    # the left merge leaves code/species null and the final fillna relabels them 'none'
    .fillna({'[labels]': '-1'})
    .astype({'rec_id': 'int'})
    # No further fillna needed here: the sentinel fill above already removed every null from '[labels]'
    .set_index('rec_id')['[labels]']
    .map(lambda s: [int(x) for x in s.split(',') if x != ''])
    .apply(pd.Series).unstack()  # flatmap
    .reset_index(level=0, drop=True)  # Drop 'level' index
    .sort_index().reset_index()  # Sort and reset 'rec_id' index
    .rename(columns={0: 'class_id'})
    .dropna()  # Drop the null padding that .apply(pd.Series) adds for recordings with fewer labels
    .merge(species_df, how='left', on='class_id').drop(columns=['class_id'])
    .fillna({'code': 'none', 'species': 'none'})
    .pipe(ggplot, aes(x='code'))
    + geom_bar()
    + coord_flip()
    + xlab('species')
    + ylab('num recordings')
    + ggtitle('How many training recordings per species? (multiple species per recording)')
)

In [None]:
# Load the recording paths for the mlsp-2013 dataset, then show shape, 5 rows per dataset, and per-dataset counts
recs_paths = load_recs_paths(['mlsp-2013'])
display(recs_paths.shape)
display(recs_paths.groupby('dataset').head(5))
display(recs_paths['dataset'].value_counts())

In [None]:
# Load every recording listed in recs_paths
recs = load_recs_data(
    recs_paths,  # For faster dev: recs_paths.sample(1000)
    # FIXME pickling AudioSegment's across processes makes this (1) slow and (2) super-linearly slow
    #   - TODO We want 'processes' par for converting audio to std .wav format and 'threads' par for loading from std
    #     .wav, so split the convert (metadata_only=True) + load (metadata_only=False) steps to separate these concerns
    # metadata_only=True, dask_opts=dict(scheduler='processes'),
    metadata_only=False,
    dask_opts={'scheduler': 'threads'},
)

# Shape, the first 20 rows, and the first row transposed (one field per line)
display(recs.shape)
display(recs.head(20))
display(recs.head(1).T)

In [None]:
# Stats: per (dataset, species) sums of recording count, duration (hours), and sample size (GB and count)
(recs
    .fillna('')
    .assign(
        n=1,
        duration_h=lambda df: df['duration_s'] / 3600,
        samples_gb=lambda df: df['samples_mb'] / 1024,
    )
    .groupby(['dataset', 'species'])[['n', 'duration_h', 'samples_gb', 'samples_n']]
    .sum()
)

In [None]:
# TODO What did 'XXXX' vs. 'none' mean here? [see datasets.metadata_from_audio]

In [None]:
# Split the comma-separated 'species' col and flatten it via df_flatmap_list_col (presumably one output row
# per listed species -- confirm against the helper)
recs_multi = df_flatmap_list_col(recs, 'species', lambda s: s.str.split(','))
display(recs_multi.shape)
display(recs_multi.head(20))

In [None]:
# Stats: same per (dataset, species) sums as for recs, but over the flattened recs_multi
(recs_multi
    .fillna('')
    .assign(
        n=1,
        duration_h=lambda df: df['duration_s'] / 3600,
        samples_gb=lambda df: df['samples_mb'] / 1024,
    )
    .groupby(['dataset', 'species'])[['n', 'duration_h', 'samples_gb', 'samples_n']]
    .sum()
)

In [None]:
# TODO
#   - [ ] Inspect random spectros: how much non-bird time? how much noise?

In [None]:
# Keep only recordings with exactly one species that is neither 'XXXX' nor 'none'
recs_test = recs.loc[lambda df: (
    ~df['species'].isin(['XXXX', 'none'])
    & (df['species'].str.split(',').str.len() == 1)
)]

# Shape, the first 10 rows, and the number of kept recordings per (dataset, species)
display(recs_test.shape)
display(recs_test.head(10))
display(recs_test.assign(n=1).groupby(['dataset', 'species'])['n'].sum())

In [None]:
# Spectros
#   - Build a Model with default config and show that config
#   - Compute spectros for recs_test by calling Model._spectros on the class, passing the model's
#     patch_config.spectro_config as kwargs
model = Model(verbose_config=False)
display(dict(model.config))
# NOTE(review): _spectros is underscore-prefixed (private by convention); presumably returns one spectro per
# input rec -- confirm
spectros = Model._spectros(df_rows(recs_test), **model.config.patch_config.spectro_config)
display(len(spectros))

In [None]:
# Plot the first 10 spectros, one figure each, under a temporary wide/short figsize
with with_figsize(width=24, aspect_ratio=1/16):
    for spectro in spectros[:10]:
        spectro.plot(show_audio=False, fancy=False)
        plt.show()  # Render each figure before starting the next

In [None]:
%%time
# Sweep over proj_skm_config variants: for each one, build a Model, fit its projection on a fixed shuffled
# subset of recs_test, and plot the learned transform matrix
#   - (Keep %%time as the first line of the cell -- cell magics must come first)
# NOTE(review): sp14.model is already star-imported in the first cell; this re-import is presumably a leftover
# from interactive development -- confirm before removing
from sp14.model import *
for i, config in enumerate([
    # Dict(n=None, proj_skm_config=dict(k=40, normalize=False, standardize=False, pca_whiten=True, do_pca=True)),  # Full (slow)
    Dict(n=50, proj_skm_config=dict(k=40, normalize=False, standardize=False, pca_whiten=True, do_pca=True)),  # Defaults
    Dict(n=50, proj_skm_config=dict(k=40, normalize=False, standardize=False, pca_whiten=False, do_pca=False)),
    Dict(n=50, proj_skm_config=dict(k=40, normalize=True, standardize=False, pca_whiten=False, do_pca=False)),
    Dict(n=50, proj_skm_config=dict(k=40, normalize=False, standardize=True, pca_whiten=False, do_pca=False)),
    Dict(n=50, proj_skm_config=dict(k=40, normalize=True, standardize=True, pca_whiten=False, do_pca=False)),
    Dict(n=50, proj_skm_config=dict(k=40, normalize=True, standardize=True, pca_whiten=True, do_pca=True)),
]):
    print(f'\n\ni[{i}] config[{config}]\n\n')
    # Rebinds the notebook-level `model` on every iteration; after the loop it holds the last config's model
    model = Model(
        verbose_config=False,
        proj_skm_config=config.proj_skm_config,
    )
    # model.proj_skm_.args  # [Can't run until after model.fit_proj, below]

    # %%
    # Shuffle with a fixed seed (same subset for every config), then keep the first config.n recs
    recs_test_n = (recs_test
        .pipe(sklearn.utils.shuffle, random_state=0)
        [:config.n]
    )

    # %%time
    # One skm example per rec (not one per recs_multi)
    #   - recs: 248s, 645 recs, 552765 patches
    #   - recs[:100]: 30s, 100 recs, 85700 patches
    #   - recs[:10]: (fast), 10 recs, 8570 patches
    model.fit_proj(recs_test_n)

    # %%
    # # TODO
    # #   - [ ] Inspect learned centroid patches: do they look plausible?

    # %%
    # # skm.transform(X)
    # # = (skm._pca_transform(X).T @ skm.D).T
    # # = (skm.pca.transform(X.T) @ skm.D).T
    # # = (X.T @ skm.pca.components_.T @ skm.D).T
    # # = skm.D.T @ skm.pca.components_ @ X
    # #
    # # skm.transform
    # # = skm.D.T @ skm.pca.components_

    # %%
    skm = model.proj_skm_
    # Transpose of the transform matrix derived in the comment block above
    # NOTE(review): several configs in this sweep set do_pca=False -- confirm skm.pca (and .components_) is
    # still populated in that case, else this line fails for them
    skm_transform = (skm.D.T @ skm.pca.components_).T
    # display(
    #     skm.D.shape,
    #     skm.D,
    #     skm.pca.components_.shape,
    #     skm.pca.components_,
    #     skm_transform.shape,
    #     skm_transform,
    # )

    # %%
    # plt.imshow(skm.D, origin='lower')

    # %%
    # plt.imshow(skm_transform, origin='lower')

    # %%
    # Re-tile skm_transform for plotting: cut each of its k columns into p consecutive chunks of length f and
    # lay the chunks side by side as the columns of x, giving shape (f, p*k)
    #   - Columns of x are ordered by chunk index first, then by skm_transform column
    #   - The comprehension's own `i` doesn't clobber the outer loop's `i` (comprehension scope)
    # NOTE(review): p = 4 is hard-coded -- presumably the patch length in frames, making f the number of freq
    # bins per frame; confirm against the model's patch config and the patch flattening order
    (fp, k) = skm_transform.shape
    p = 4
    f = fp // p
    x = np.array([
        skm_transform[i*f:(i+1)*f, j]
        for i in range(p)
        for j in range(k)
    ]).T
    # display(skm_transform.shape)
    # display(x.shape)
    # NOTE(review): 160 == p * k when k == 40 (presumably the case for every config above), so this slice
    # keeps all columns; with a larger k it would silently truncate the plot
    plt.pcolormesh(x[:, :160])
    plt.show()

    # # %%
    # model.fit_class(recs_test_n)
    # display(
    #     model.test(recs_test_n, 'classes'),
    #     model.test(recs_test_n, 'kneighbors'),
    # )