In [None]:
# Star-import the shared notebook prelude. Names used later in this notebook (sg, memory, log_levels,
# progress_kwargs, metadata, xc_similar_html, ...) are not imported anywhere else here, so they
# presumably come from this import — TODO confirm
from notebooks import *
# Initialize sg with no app (this cell's log output shows api.server_globals/init running)
sg.init(app=None)

INFO     [19:18:28.379] [50511]  52 logging_/init_logging: {logging_yaml: None}




INFO     [19:18:28.684] [50511]  29 api.server_globals/init




INFO     [19:18:29.376] [50511]  31 api.server_globals/init: done




In [None]:

def opt(*args, **kwargs):
    """
    Set log verbosity and progress/par options in one call.

    Routing:
        - All positional args go to verbose()
        - Keyword args 'both', 'cache', 'audio' (the params verbose() accepts) go to verbose()
        - Every other keyword arg goes to par()

    Usage:
        opt('warn', use='dask', scheduler='threads')  # Quiet + par + progress bars
        opt(both='warn', use='sync')                  # Same verbosity, spelled as a keyword
    """
    # Include 'both' so that opt(both=...) reaches verbose() instead of leaking into par()
    verbose_keys = {'both', 'cache', 'audio'}
    verbose_kwargs = {k: v for k, v in kwargs.items() if k in verbose_keys}
    par_kwargs = {k: v for k, v in kwargs.items() if k not in verbose_keys}
    verbose(*args, **verbose_kwargs)
    par(**par_kwargs)

def par(**kwargs):
    """
    Forward all keyword args to progress_kwargs as its override.

    Examples (effects as noted by the original author):
        par(use=None)
            No par, no progress bars (easier to see audio read/write and cache hit/miss)
        par(use='sync')
            No par, cooperative progress bars (no lines printing over each other)
        par(use='dask', scheduler='threads')
            Par, uncooperative progress bars (lines print over each other)
    """
    overrides = kwargs
    progress_kwargs(override=overrides)

def verbose(
    both=None,
    *,
    cache='debug',
    audio='debug',
):
    """
    Set the cache and audio log levels.

    Args:
        both: Shorthand for setting both levels, e.g. verbose('warn') = verbose(cache='warn', audio='warn').
            When truthy it takes precedence over cache and audio.
        cache: Level assigned to memory.log.level
            - 'debug': Show cache hit/miss lines [WARNING Frequent hangs during bigger xc_similar_html calls...]
            - 'info':  Show cache hit/miss chars [WARNING (same)]
            - 'warn':  Quiet
        audio: Level passed (upper-cased) to log_levels under the 'load' key
            - 'debug': Show read/write
            - 'info':  Show write
            - 'warn':  Quiet
    """
    cache_level, audio_level = (both, both) if both else (cache, audio)
    memory.log.level = cache_level
    log_levels({'load': audio_level.upper()})

def n_recs_by_sp_quality(df):
    """
    Count recs per (species, quality) and pivot to one row per species and one column per quality.

    The pivot also adds 'total' margins; the 'total' column is moved first and rows are sorted by it, descending.

    FIXME The 'total' margins row ends up with a NaN species because .species is cast back to a category
    after the pivot (noted by the original author).
    """
    # Long form: one row per (species, quality) with its rec count in .n
    counts = (
        df_remove_unused_categories(df)
        .assign(n=1)
        .groupby(['species', 'quality'])
        .n.sum()
        .reset_index()
    )

    # Wide form: cats are stripped before .pivot_table to work around a category error when adding .total
    wide = df_cat_to_str(counts).pivot_table(
        index='species',
        columns='quality',
        values='n',
        fill_value=0,
        aggfunc='sum',
        margins=True,
        margins_name='total',
    )

    # Restore the .species cat and order rows by it
    wide = (
        wide
        .reset_index()
        .astype({'species': metadata.species.df.shorthand.dtype})
        .sort_values('species')
        .set_index('species')
    )

    # Drop the columns.name ('quality') left behind by .pivot_table
    wide = df_set_index_name(wide.T, None).T

    # .total comes from the pivot margins (an explicit row-sum via df_assign_first was tried and left disabled)
    wide = df_reorder_cols(wide, first=['total'])
    return wide.sort_values('total', ascending=False)

In [None]:
# Recs per (species, quality) over sg.xc_meta: display the first 20 rows of the pivot
(
    n_recs_by_sp_quality(
        sg.xc_meta
        # [lambda df: df.species.isin(['SNGO', 'HOFI', 'GWTE', 'YHBL'])]  # Faster dev
    )
    .head(20)
)

Unnamed: 0_level_0,total,A,B,C,D,E,no score
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,35233,12109,14928,5434,1168,301,1293
RECR,659,94,281,209,63,12,0
SOSP,648,251,246,116,19,8,8
BEWR,522,168,252,69,3,0,30
SPTO,521,234,198,70,8,2,9
FOSP,501,267,153,59,14,1,7
RWBL,497,142,247,79,19,5,5
HOWR,476,163,219,55,13,2,24
AMRO,443,140,205,75,12,8,3
NOCA,407,128,182,71,15,6,5


In [None]:
%%memit -c
# Benchmark cell: run xc_similar_html under %%memit + print_mem_delta and display the result
#   - The commented-out `xc_id=...` rows below are a log of past runs: t[] time, m[]/%m[] memory, n_recs[] count, p[] per-stage numbers
#   - NOTE(review): every `xc_id=..., n_total=..., n_sp=..., sample_r=...` row is commented out, so as written the call
#     passes none of those args — presumably one row is meant to be uncommented per run; confirm before re-running
opt('warn', use='dask', scheduler='threads')  # Quiet + par + progress bars
# opt('warn', use='sync')
# opt('debug', use='sync')  # Verbose + sync + no progress bars
# ExitStack lets both context managers below share one with-block
with ExitStack() as stack:
    stack.enter_context(print_mem_delta(collect_before=True, collect_after=False))
    stack.enter_context(cache_control(
        tags_fail_on_miss=['rec'],  # Require warmed cache for 'rec' funcs
        tags_refresh=['recs'],  # Measure un-warmed cache for 'recs' funcs
    ))
    display(
        xc_similar_html(
            sort='d_fc',
            sp_cols='species',
            view=False,  # Disable html view else n_recs_by_sp_quality gets junk .species values

            # HACK Drop uncached audios to avoid big slow O(n) "Falling back"
            #   - Good: this correctly drops audios whose input file is invalid, and thus doesn't produce a sliced cache/audio/ file
            #   - Bad: this incorrectly drops any valid audios that haven't been _manually_ cache-warmed
            #   - TODO Figure out a better way to propagate invalid audios (e.g. empty cache file) so we can more robustly handle this
            drop_uncached_slice=True,

            # Don't load audio for intermediate pre-ranking recs, only for the final n_total results
            #   - TODO Clean up code to make this the default behavior
            skip_load_audio=True,

            # [XXX] Timing: quiet + par + progress bars + cache hits
            # xc_id=381417, n_total=None, n_sp=1,    sample_r=1,     # t[1.3s]  m[   0mb] n_recs[      2] p[audio  .1                       spectro  .1 plot  .1]
            # xc_id=381417, n_total=None, n_sp=1,    sample_r=None,  # t[2.4s]  m[  35mb] n_recs[     33] p[audio  .9                       spectro  .1 plot  .5]
            # xc_id=381417, n_total=None, n_sp=2,    sample_r=None,  # t[4.7s]  m[  88mb] n_recs[     77] p[audio  .2 slice  .2 persist 2.4 spectro  .2 plot 1.2]
            # xc_id=381417, n_total=None, n_sp=10,   sample_r=None,  # t[ 40s]  m[3500mb] n_recs[    718] p[audio 1.9 slice 1.4 persist  20 spectro 1.2 plot  13]
            # xc_id=381417, n_total=None, n_sp=20,   sample_r=None,  # t[118s]  m[5300mb] n_recs[   1817] p[audio   7 slice   4 persist  58 spectro   3 plot  33]
            # xc_id=381417, n_total=None, n_sp=30,   sample_r=None,  # t[172s]  m[2600mb] n_recs[   2562] p[audio 9.4 slice   6 persist  92 spectro   4 plot  45]
            # xc_id=381417, n_total=None, n_sp=40,   sample_r=None,  # t[242s]  m[2650mb] n_recs[   3880] p[audio  21 slice  12 persist 105 spectro 6.4 plot  73]
            # xc_id=381417, n_total=None, n_sp=50,   sample_r=None,  # t[  ?s]  m[   ?mb] n_recs[   5334] p[] FIXME Seemed not mem safe? ^C'd

            # [XXX] Redo after to_paths_sliced fix + skip_load_audio
            # xc_id=381417, n_total=1,    n_sp=2,    sample_r=None,  # t[3.4s] %m[  32mb] n_recs[      –] p[paths  .1 audio 1.1 spectro  .1 plot  .1]
            # xc_id=381417, n_total=1,    n_sp=10,   sample_r=None,  # t[ 14s] %m[ 323mb] n_recs[      –] p[paths  .1 audio 9.6 spectro  .1 plot  .1]
            # xc_id=381417, n_total=1,    n_sp=20,   sample_r=None,  # t[ 36s] %m[ 830mb] n_recs[      –] p[paths  .1 audio  28 spectro  .1 plot  .1]
            # xc_id=381417, n_total=1,    n_sp=40,   sample_r=None,  # t[ 86s] %m[1617mb] n_recs[      –] p[paths  .1 audio  69 spectro  .1 plot  .1]
            # xc_id=381417, n_total=1,    n_sp=80,   sample_r=None,  # t[  ?s] %m[   ?mb] n_recs[      –] p[paths  .1 audio   ? spectro  .1 plot  .1]
            # xc_id=381417, n_total=1,    n_sp=160,  sample_r=None,  # t[  ?s] %m[   ?mb] n_recs[      –] p[paths  .1 audio   ? spectro  .1 plot  .1]
            # xc_id=381417, n_total=1,    n_sp=334,  sample_r=None,  # t[  ?s] %m[   ?mb] n_recs[      –] p[paths  .1 audio   ? spectro  .1 plot  .1]

            # [XXX] Redo after fixing slow audio_metadata for uncached batch
            # xc_id=381417, n_total=10,   n_sp=1,    sample_r=None,  # t[4.3s] %m[ 140mb] n_recs[      –] p[meta 0.1 feat 0.1 d_p2 0.1 d_pc 0.1]
            # xc_id=381417, n_total=10,   n_sp=10,   sample_r=None,  # t[ 11s] %m[ 101mb] n_recs[      –] p[meta 0.8 feat 1.8 d_p2 1.0 d_pc 1.0]
            # xc_id=381417, n_total=10,   n_sp=40,   sample_r=None,  # t[ 41s] %m[ 192mb] n_recs[      –] p[meta 3.7 feat 7.8 d_p2 6.0 d_pc 5.6]
            # xc_id=381417, n_total=10,   n_sp=80,   sample_r=None,  # t[ 99s] %m[ 280mb] n_recs[      –] p[meta 7.6 feat  20 d_p2  15 d_pc  15]
            # xc_id=381417, n_total=10,   n_sp=160,  sample_r=None,  # t[241s] %m[ 605mb] n_recs[      –] p[meta  16 feat  38 d_p2  43 d_pc  43]
            # xc_id=381417, n_total=10,   n_sp=None, sample_r=None,  # t[413s] %m[1040mb] n_recs[      –] p[meta  25 feat  70 d_p2  70 d_pc  72]

            # [XXX] Redo after fixing one_progress+dask
            # xc_id=381417, n_total=10,   n_sp=1,    sample_r=None,  # t[3.8s] %m[ 120mb] n_recs[      –] p[meta 0.1 feat 0.1 d_p2 0.1 d_pc 0.1]
            # xc_id=381417, n_total=10,   n_sp=10,   sample_r=None,  # t[  8s] %m[ 119mb] n_recs[      –] p[meta 0.7 feat 1.6 d_p2 1.0 d_pc 1.0]
            # xc_id=381417, n_total=10,   n_sp=40,   sample_r=None,  # t[ 30s] %m[ 233mb] n_recs[      –] p[meta 3.7 feat 8.2 d_p2 5.4 d_pc 5.6]
            # xc_id=381417, n_total=10,   n_sp=80,   sample_r=None,  # t[ 66s] %m[ 352mb] n_recs[      –] p[meta 7.6 feat  17 d_p2  15 d_pc  15]
            # xc_id=381417, n_total=10,   n_sp=160,  sample_r=None,  # t[147s] %m[ 691mb] n_recs[      –] p[meta  14 feat  31 d_p2  41 d_pc  41]
            # xc_id=381417, n_total=10,   n_sp=None, sample_r=None,  # t[255s] %m[ 953mb] n_recs[      –] p[meta  25 feat  55 d_p2  71 d_pc  72]

            # Redo after deduping species_proba computation
            #   - meta: O(n) cache hits
            #   - feat: O(n) cache hits
            #   - p:    O(n) sg.search.predict_proba [@cache would achieve perf ~between meta and feat, which is only ~2x speedup]
            # xc_id=381417, n_total=10,   n_sp=1,    sample_r=None,  # t[4.0s] %m[ 123mb] n_recs[   32,0] p[meta 0.1 feat 0.1 p 0.1]
            # xc_id=381417, n_total=10,   n_sp=10,   sample_r=None,  # t[7.1s] %m[ 107mb] n_recs[  717,0] p[meta 0.8 feat 1.5 p 0.5]
            # xc_id=381417, n_total=10,   n_sp=40,   sample_r=None,  # t[ 25s] %m[ 300mb] n_recs[ 3879,1] p[meta 3.7 feat 7.6 p 5.1]
            # xc_id=381417, n_total=10,   n_sp=80,   sample_r=None,  # t[ 51s] %m[ 310mb] n_recs[ 8203,1] p[meta 7.6 feat  18 p  14]
            # xc_id=381417, n_total=10,   n_sp=160,  sample_r=None,  # t[111s] %m[ 637mb] n_recs[15284,1] p[meta  14 feat  36 p  42]
            # xc_id=381417, n_total=10,   n_sp=None, sample_r=None,  # t[192s] %m[1186mb] n_recs[27033,2] p[meta  25 feat  62 p  72]

            # NOTE n_sp=None -> 27035/35233 recs because quality='ab'
            # TODO TODO Then continue [from notes]

        )
        # .pipe(n_recs_by_sp_quality)
    )

INFO     [19:31:02.255] [50511] 831 datasets/xc_meta_to_raw_recs: Loading xc.metadata -> xc_raw_recs (.audio, more metadata)... [slower]




INFO     [19:31:02.281] [50511] 816 datasets/xc_meta_to_paths: Converting xc_meta -> xc_paths...




to_paths_sliced: [

                                                                                 

] |   0% (1) |  0.0s

to_paths_sliced: [

#################################################################################

] | 100% (1) |  0.1s




audio_metadata: [

                                                                                  

] |   0% (1) |  0.0s

audio_metadata: [

##################################################################################

] | 100% (1) |  0.1s




feat: [

                                                                                            

] |   0% (1) |  0.0s

feat: [

############################################################################################

] | 100% (1) |  0.1s




INFO     [19:31:02.929] [50511] 831 datasets/xc_meta_to_raw_recs: Loading xc.metadata -> xc_raw_recs (.audio, more metadata)... [slower]




INFO     [19:31:02.947] [50511] 816 datasets/xc_meta_to_paths: Converting xc_meta -> xc_paths...









to_paths_sliced: [

                                                                              

] |   0% (5334) |  0.0s

to_paths_sliced: [

##############################################################################

] | 100% (5334) |  0.1s




audio_metadata: [

                                                                               

] |   0% (5333) |  0.0s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.1s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.2s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.4s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.5s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.6s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.7s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.8s

audio_metadata: [

                                                                               

] |   0% (5333) |  0.9s

audio_metadata: [

############                                                                   

] |  16% (5333) |  1.1s




[mem_delta] {
  'rss': '2052 KB',
  'vms': '2164 KB',
  'pfaults': '21 KB',
  'pageins': '0 KB',
  'uss': '87520 KB'
}




KeyboardInterrupt: 