In [None]:
# Import (and force-reload) the companion module by dotted name, then star-import its names into this notebook
#   - exec() is used because the module name is held in a string (so it can be interpolated into both statements)
#   - NOTE(review): names used by later cells (cv, recs, plotting helpers, ...) are not defined in this notebook;
#     presumably they come from this star-import -- confirm
module = 'notebooks._181211_train_cr'
exec(f'import {module}; import importlib; importlib.reload({module})')
exec(f'from {module} import *')

In [None]:
# NOTE(review): load_for_eval is not defined in this notebook; presumably it is provided by the star-import in the
# previous cell and loads the state (cv, recs, ...) that the cells below read -- confirm
load_for_eval()

In [None]:
# Precompute eval metrics + dims for the plots below
#   - Grain: models
#   - Metrics: train_score, test_score, ...
#   - Dims: model_id, fold, params_data, params_complexity, params_model, ...

# Faceting shared by the summary plots below. Alternatives to swap in:
#   facet_wrap('params_data_and_model')
#   facet_grid('params_model ~ params_data')
#   facet_grid('n_recs ~ n_species', labeller='label_both')
facet = facet_grid('params_data ~ params_model')

# Common figure width passed to theme_figsize(...) by the plots below
figsize_width = 12 * 1

# Extra plot component added next to `facet` in the plots below; currently just geom_blank(). Alternatives:
#   theme(strip_text_x=element_text(angle=5))
#   theme(strip_text_y=element_text(angle=-85))
theme_ = geom_blank()


# FIXME Default theme_gray() plots non-transparent bg, but theme_minimal() reverts back to transparent bg
def theme_minimal_white(*args, **kwargs):
    """theme_minimal(*args, **kwargs) with the plot background forced to white."""
    minimal = theme_minimal(*args, **kwargs)
    return minimal + theme(plot_background=element_rect('white'))
# Data-size params -> abbreviation used when rendering the params_data labels
ks_params_data = dict(n_species='sp', n_recs='recs')

# Params that make up the params_complexity label (currently disabled: 'c_n_estimators', 'c_class_weight')
ks_params_complexity = ['c_max_depth', 'c_C', 'c_alpha']


def abbrev_param_model(k, v):
    """
    Render one model param for the params_model label.

    Params listed in `templates` are shown as the bare value; every other param is shown as 'key[value]'.
    """
    templates = dict.fromkeys(['c_cls', 'c_solver', 'c_class_weight'], '%(v)s')
    template = templates.get(k, '%(k)s[%(v)s]')
    return template % dict(k=k, v=v)
# params_model = ...  # Everything else
# log.debug('Start')
# Build cv_models from cv_results_splits_df(cv.cv_results_): adds param-derived dims, recomputed train/test scores,
# per-model resource stats, and model stats, then displays the row count and the first 5 rows
#   - NOTE(review): cv, cv_results_splits_df, SearchEvals, df_ordered_cat, df_reorder_cols, or_else, one, tap, ...
#     are not defined in this notebook; presumably they come from the star-import in the first cell -- confirm
cv_models = (cv_results_splits_df(cv.cv_results_)
    # Slow yaml parsing, compute up front
    #   - params_dict combines the 'param_*' columns (except param_classifier) with the entries parsed out of the
    #     param_classifier yaml string, the latter re-keyed with a 'c_' prefix
    .assign(params_dict=lambda df: df.apply(axis=1, func=lambda row: dict(
        **{strip_startswith(k, 'param_'): row[k] for k in df if k.startswith('param_') and k not in ['param_classifier']},
        # HACK 'classifier.foo' -> 'c_foo'
        #   - TODO Refactor Search.classifier to get rid of the yaml strs ('c_cls', 'c_n_estimators', ...)
        **{'c_' + k: v for k, v in yaml.safe_load('{%s}' % row.param_classifier).items()},
        # 'classifier': yaml.safe_load('{%s}' % row.param_classifier),
    )))
    # Keep only the columns whose name doesn't start with 'param_' (note: 'params' and 'params_dict' are kept)
    [lambda df: [c for c in df if not c.startswith('param_')]]
    # Dims from params (for validation curves, learning curves, etc.)
    #   - Later entries in this assign() read columns created by earlier entries (params_data, params_model, ...)
    .assign(
        n_species=lambda df: df.params_dict.apply(lambda d: d['n_species']),
        n_recs=lambda df: df.params_dict.apply(lambda d: d['n_recs']),
        params_data=lambda df: df.params_dict.apply(lambda d: ', '.join(
            '%s[%s]' % (k_abbrev, d[k]) for k, k_abbrev in ks_params_data.items() if k in d
        )),
        params_complexity=lambda df: df.params_dict.apply(lambda d: ', '.join(
            '%s[%s]' % (k, d[k]) for k in ks_params_complexity if k in d
        )),
        # params_model = every param that is neither a data param nor a complexity param
        params_model=lambda df: df.params_dict.apply(lambda d: ', '.join(
            abbrev_param_model(k, d[k]) for k in d if k not in list(ks_params_data) + ks_params_complexity
        )),
        params_data_and_model=lambda df: df.apply(axis=1, func=lambda row: (
            '\n'.join([row.params_data, row.params_model])
        )),
        params_model_and_complexity=lambda df: df.apply(axis=1, func=lambda row: (
            '\n'.join([row.params_model, row.params_complexity])
        )),
    )
    # HACK Convert yaml strs ('x: y') to bracket style ('x[y]'), for visual consistency
    #   - TODO Refactor Search.classifier to get rid of the yaml strs ('c_cls', 'c_n_estimators', ...)
    #   - Applied to every str cell in the frame; non-str cells pass through unchanged
    .applymap(lambda x: x if not isinstance(x, str) else (
        re.sub(r'([^][:, ]+):\s+([^][:,]+)(, )?', r'c_\1[\2]\3',
            re.sub(r'classifier\[([^]]+)\]', r'\1',
                x,
            ),
        )
    ))
    #   - Restore the cats we just destroyed [copied from cv_results_splits_df]
    .pipe(df_ordered_cat,
        model_id=lambda df: df.model_id.unique(),
        params=lambda df: df.params.unique(),
        # Order params_data by (sp, recs) descending
        #   - Each 'abbrev[number]' token is parsed so the number sorts numerically instead of as a string
        params_data=lambda df: sorted(
            df.params_data.unique(),
            reverse=True,
            key=lambda s: [parse.parse('{}[{:g}]', t).fixed for t in s.split(', ')],
        ),
        # Order params_complexity like c_max_depth
        #   - Sort key is (name, numeric value) of the first 'name[value]' found; 'None' is treated as inf
        #   - NOTE(review): or_else(-np.inf, ...) presumably falls back to -inf when float(...) raises -- confirm
        params_complexity=lambda df: sorted(
            df.params_complexity.unique(),
            reverse=True,  # Match how the normal .unique() would come out (not clear why, and don't care)
            key=lambda s: one(
                (x['name'], or_else(-np.inf, lambda: float({'None': 'inf'}.get(x['value'], x['value']))))
                for x in [
                    parse.search('{name}[{value}]', s) or  # Parses first match, ignores rest
                    {'name': 'unk', 'value': None}
                ]
            ),
        ),
        params_model=lambda df: df.params_model.unique(),
    )
    # Order the combined labels by the order of their (now ordered) component columns
    .pipe(df_ordered_cat,
        params_data_and_model=lambda df: (df
            .sort_values(['params_data', 'params_model']).params_data_and_model.unique()
        ),
        params_model_and_complexity=lambda df: (df
            .sort_values(['params_model', 'params_complexity']).params_model_and_complexity.unique()
        ),
    )
    # .eval
    #   - One SearchEvals per row for each of train/test
    #   - df.pop(...) passes the raw column to SearchEvals and removes it from the frame in the same step
    .assign(
        train_evals=lambda df: np.vectorize(SearchEvals)(
            i=df.pop('train_i'),
            y=df.pop('train_y'),
            classes=df['classes'],
            y_scores=df.pop('train_predict_proba'),
            drop_missing_classes_for_n_species=True,  # HACK Migrate to sk Pipeline to avoid this
        ),
        test_evals=lambda df: np.vectorize(SearchEvals)(
            i=df.pop('test_i'),
            y=df.pop('test_y'),
            classes=df['classes'],
            y_scores=df.pop('test_predict_proba'),
            drop_missing_classes_for_n_species=True,  # HACK Migrate to sk Pipeline to avoid this
        ),
    )
    # .pipe(tap, f=lambda df: log.debug('SearchEvals.score'))
    .assign(
        # HACK Recompute (train_score, test_score) using SearchEvals.score so it can drop_missing_classes_for_n_species
        #   - Else you'll get scores that are too low (bad) because they include some -np.inf's in the median
        #   - Besides the default score(), also compute the mean and the 50th/75th/95th percentile aggregates
        train_score=lambda df: df.train_evals.map(lambda x: x.score()),
        train_mean_score=lambda df: df.train_evals.map(lambda x: x.score(agg=np.mean)),
        train_50p_score=lambda df: df.train_evals.map(lambda x: x.score(agg=partial(np.percentile, q=50))),
        train_75p_score=lambda df: df.train_evals.map(lambda x: x.score(agg=partial(np.percentile, q=75))),
        train_95p_score=lambda df: df.train_evals.map(lambda x: x.score(agg=partial(np.percentile, q=95))),
        test_score=lambda df: df.test_evals.map(lambda x: x.score()),  # (= 50p)
        test_mean_score=lambda df: df.test_evals.map(lambda x: x.score(agg=np.mean)),
        test_50p_score=lambda df: df.test_evals.map(lambda x: x.score(agg=partial(np.percentile, q=50))),
        test_75p_score=lambda df: df.test_evals.map(lambda x: x.score(agg=partial(np.percentile, q=75))),
        test_95p_score=lambda df: df.test_evals.map(lambda x: x.score(agg=partial(np.percentile, q=95))),
        # train_score=lambda df: map_progress(lambda x: x.score(), df.train_evals, use='dask', scheduler='threads'),
        # test_score=lambda df: map_progress(lambda x: x.score(), df.test_evals, use='dask', scheduler='threads'),
    )
    # 'classes' was only needed as an input to SearchEvals above
    .drop(columns=[
        'classes',
    ])
    # proc_stats
    #   - Per row: for each pid take (max - min) of each counter in proc_stats.stats, then sum across pids
    #   - The four resulting columns are renamed and joined back onto the frame
    # .pipe(tap, f=lambda df: log.debug('proc_stats'))
    .pipe(lambda df: df.join(df
        .apply(axis=1, func=lambda row: (row.proc_stats.stats
            .groupby('pid')[['cpu_user', 'cpu_system', 'mem_rss', 'mem_vms']].agg(lambda g: g.max() - g.min())
            .sum(axis=0)
        ))
        .rename(columns={
            'cpu_user': 'cpu_user_time',
            'cpu_system': 'cpu_system_time',
            'mem_rss': 'mem_rss_delta',
            'mem_vms': 'mem_vms_delta',
        })
    ))
    .assign(
        cpu_time=lambda df: df.cpu_user_time + df.cpu_system_time,
        cpu_time_m=lambda df: df.cpu_time / 60,  # s -> m
    )
    # For xgb_rf, rf, ovr-rf
    #   - .str.get(key) looks the key up in each row's params_dict
    .assign(
        c_multiclass=lambda df: df.params_dict.str.get('c_multiclass'),
        c_max_depth=lambda df: df.params_dict.str.get('c_max_depth'),
        # c_rf_max_depth=lambda df: df.apply(axis=1, func=lambda row: (
        #     row['c_max_depth'] * (10 if row['c_multiclass'] == 'ovr' else 1)  # HACK Undo rf_max_depth -> ovr_rf_max_depth
        # )),
    )
    # model_stats
    #   - NOTE(review): or_else(None, ...) presumably yields None when the model_stats lack n_iter / depth -- confirm
    .assign(
        n_iters=lambda df: df.model_stats.map(lambda stats: or_else(None, lambda: stats.n_iter.tolist())),
        forest_depth_mean=lambda df: df.model_stats.map(lambda stats: or_else(None, lambda: stats.depth.mean())),
        forest_depth_std=lambda df: df.model_stats.map(lambda stats: or_else(None, lambda: stats.depth.std())),
    )
    # Reorder
    .pipe(df_reorder_cols,
        first=['model_id', 'params', 'params_dict', 'params_data', 'params_complexity', 'params_model'],
        last=['train_evals', 'test_evals', 'proc_stats', 'model_stats', 'model'],
    )
    # .pipe(tap, f=lambda df: log.debug('display'))
    # Show the row count and a 5-row preview
    .pipe(tap, f=lambda df: display(
        # df_summary(df).T,
        # df,
        len(df),
        df[:5],
    ))
    # .pipe(tap, f=lambda df: log.debug('Done'))
)

In [None]:
# Train/test scores
#   - Grain: models x {train,test}
#   - Metrics: score
#   - Dims: group, params

# Score columns to plot: for each aggregate (mean, 50p, 75p, 95p) the train column followed by the test column
metrics = [
    f'{split}_{agg}_score'
    for agg in ['mean', '50p', '75p', '95p']
    for split in ['train', 'test']
]
(cv_models
    .pipe(df_reverse_cat, 'params_complexity')
    # Wide -> long: one row per (model, score column)
    .melt(
        id_vars=[
            'params', 'params_data', 'params_model', 'params_complexity',
            'params_data_and_model', 'params_model_and_complexity',
            'n_species', 'n_recs',
            'fold',
        ],
        value_vars=metrics,
        var_name='split_metric',
        value_name='score',
    )
    # 'train_mean_score' -> split_metric='train_mean', split='train', metric='mean'
    .assign(
        split_metric=lambda long: long.split_metric.str.replace('_score', ''),
        split=lambda long: long.split_metric.str.split('_').str[0],
        metric=lambda long: long.split_metric.str.split('_').str[1],
        group=lambda long: long.params_complexity.str.cat(long.split_metric, '/'),
    )
    .pipe(df_ordered_cat,
        split_metric=[strip_endswith(name, '_score') for name in metrics],
        group=lambda long: reversed(long.group.unique()),
    )
    # Train series are drawn faint/unfilled, test series solid; color encodes the aggregate
    .pipe(lambda long: (
        ggplot(long)
        + aes(x='params_complexity')
        + aes(y='score')
        + aes(color='metric')
        # + facet_grid('params_data ~ params_model')
        + facet + theme_
        + geom_hline(yintercept=-1, color='lightgrey')  # -1 is the max score (1 is the min coverage_error)
        + geom_point(long[long.split == 'train'], alpha=.3, fill='none')
        + geom_point(long[long.split == 'test'], alpha=.8)
        + geom_line(long[long.split == 'train'], alpha=.3, mapping=aes(group='split_metric', color='metric'))
        + geom_line(long[long.split == 'test'], alpha=.8, mapping=aes(group='split_metric', color='metric'))
        # TODO How to manually add a legend that shows test:filled, train:unfilled?
        # + geom_jitter(fill='none', size=3, height=1e-9, width=.05)
        # + geom_count(aes(size='..n..')) + scale_size_area()
        # + stat_summary(aes(group='group'), fun_data='mean_cl_boot', random_state=0, geom='errorbar')
        + coord_flip(ylim=(-17, 0))
        # + scale_y_continuous(breaks=np.arange(-10, 0))  # TODO TODO XXX
        + scale_color_cmap_d('Set1')
        + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
        + ylab('score (-coverage_error)')
        + ggtitle(f'Train/test scores ({recs_stats})')
    ))
)

In [None]:
# mean_score vs. cpu_time_m, one point per row of cv_models, colored by params_complexity
#   - Train scores are drawn faint and unfilled (alpha=.3, fill='none'), test scores solid (alpha=.8)
#   - coord_flip() swaps the axes, so cpu_time_m (mapped to x) is drawn on the vertical axis
(cv_models
    .pipe(ggplot)
    + aes(x='cpu_time_m')
    + aes(color='params_complexity')
    + facet + theme_
    + geom_line(aes(y='train_mean_score'), alpha=.3, color='darkgray')
    + geom_line(aes(y='test_mean_score'), alpha=.8, color='darkgray')
    + geom_point(aes(y='train_mean_score'), alpha=.3, fill='none')
    + geom_point(aes(y='test_mean_score'), alpha=.8)
    + geom_hline(yintercept=0, color='grey')
    + expand_limits(x=0)
    + coord_flip(
        # + ylim(-30, 0)
    )
    + scale_color_cmap_d(mpl_cmap_concat('tab20', 'tab20b', 'tab20c'))
    + guides(color=guide_legend(ncol=1)) + theme(legend_position='right', legend_box_spacing=.4, legend_key_height=8)
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle('mean_score ~ cpu_time_m')
)

In [None]:
# 75p_score vs. cpu_time_m, one point per row of cv_models, colored by params_complexity
#   - Train scores are drawn faint and unfilled (alpha=.3, fill='none'), test scores solid (alpha=.8)
#   - coord_flip() swaps the axes, so cpu_time_m (mapped to x) is drawn on the vertical axis
(cv_models
    .pipe(ggplot)
    + aes(x='cpu_time_m')
    + aes(color='params_complexity')
    + facet + theme_
    + geom_line(aes(y='train_75p_score'), alpha=.3, color='darkgray')
    + geom_line(aes(y='test_75p_score'), alpha=.8, color='darkgray')
    + geom_point(aes(y='train_75p_score'), alpha=.3, fill='none')
    + geom_point(aes(y='test_75p_score'), alpha=.8)
    + geom_hline(yintercept=0, color='grey')
    + expand_limits(x=0)
    + coord_flip(
        # + ylim(-30, 0)
    )
    + scale_color_cmap_d(mpl_cmap_concat('tab20', 'tab20b', 'tab20c'))
    + guides(color=guide_legend(ncol=1)) + theme(legend_position='right', legend_box_spacing=.4, legend_key_height=8)
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle('75p_score ~ cpu_time_m')
)

In [None]:
# 75p_score vs. model_size, one point per row of cv_models, colored by params_complexity
#   - Train scores are drawn faint and unfilled (alpha=.3, fill='none'), test scores solid (alpha=.8)
#   - coord_flip() swaps the axes, so model_size (mapped to x, labelled in bytes) is drawn on the vertical axis
#   - Title names the metric without a split prefix because both train and test series are drawn
#     (same '<metric> ~ <x>' form as the cpu_time_m plots)
(cv_models
    .pipe(ggplot)
    + aes(x='model_size')
    + aes(color='params_complexity')
    + facet + theme_
    + geom_line(aes(y='train_75p_score'), alpha=.3, color='darkgray')
    + geom_line(aes(y='test_75p_score'), alpha=.8, color='darkgray')
    + geom_point(aes(y='train_75p_score'), alpha=.3, fill='none')
    + geom_point(aes(y='test_75p_score'), alpha=.8)
    + geom_hline(yintercept=0, color='grey')
    + scale_x_continuous(labels=labels_bytes(), breaks=breaks_bytes())
    + expand_limits(x=0)
    + coord_flip(
        # + ylim(-30, 0)
    )
    + scale_color_cmap_d(mpl_cmap_concat('tab20', 'tab20b', 'tab20c'))
    + guides(color=guide_legend(ncol=1)) + theme(legend_position='right', legend_box_spacing=.4, legend_key_height=8)
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle('75p_score ~ model_size')
)

In [None]:
# Tune n_iter
#   - Left-joins onto cv_models one row per (model_id, n_iter), taken from each model's n_iters list
#   - NOTE(review): coalesce(row.n_iters, [0]) presumably substitutes [0] when n_iters is None -- confirm
#   - Point size shows how many rows share the same (params_complexity, n_iter)
(cv_models
    .merge(how='left', on='model_id', right=df_flatmap(cv_models, lambda row: (
        dict(model_id=row.model_id, n_iter=n_iter)
        for n_iter in coalesce(row.n_iters, [0])
    )))
    # Manually compute y.mean() per group
    # .groupby('params').apply(lambda g: g.assign(cpu_time_m_mean=lambda df: df.cpu_time_m.mean()))
    .pipe(df_reverse_cat, 'params_complexity')
    .pipe(ggplot)
    + aes(x='params_complexity')
    + facet + theme_
    + geom_count(aes(y='n_iter', size='..n..')) + scale_size_area()
    + expand_limits(y=0)
    + coord_flip(
        ylim=(0, 100),  # TODO
    )
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle(f'n_iter ({recs_stats})')
)

In [None]:
# Train time ~ n_species
#   - x: n_species, y: cpu_time_m, one line per params_complexity (color)
#   - Faceted by '-n_recs' (rows) x params_model (columns); uses its own facet_grid instead of the shared `facet`
(cv_models
    .pipe(df_reverse_cat, 'params_complexity')
    .pipe(ggplot)
    + facet_grid('-n_recs ~ params_model')
    + aes(color='params_complexity')
    + aes(x='n_species', y='cpu_time_m')
    + geom_point()
    + geom_line()
    + scale_color_cmap_d(mpl_cmap_concat('tab20', 'tab20b', 'tab20c'))
    + guides(color=guide_legend(ncol=1)) + theme(legend_position='right', legend_box_spacing=.4, legend_key_height=8)
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle(f'Train time ~ n_species ({recs_stats})')
)

In [None]:
# Train time ~ n_recs
#   - x: n_recs, y: cpu_time_m, one line per params_complexity (color)
#   - Faceted by '-n_species' (rows) x params_model (columns); uses its own facet_grid instead of the shared `facet`
(cv_models
    .pipe(df_reverse_cat, 'params_complexity')
    .pipe(ggplot)
    + facet_grid('-n_species ~ params_model')
    + aes(color='params_complexity')
    + aes(x='n_recs', y='cpu_time_m')
    + geom_point()
    + geom_line()
    + scale_color_cmap_d(mpl_cmap_concat('tab20', 'tab20b', 'tab20c'))
    + guides(color=guide_legend(ncol=1)) + theme(legend_position='right', legend_box_spacing=.4, legend_key_height=8)
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle(f'Train time ~ n_recs ({recs_stats})')
)

In [None]:
# Train + score cpu time (minutes) per params_complexity
#   - Bars: mean cpu_time_m over all rows sharing the same `params`
#   - Points: the individual rows' cpu_time_m
(cv_models
    # Manually compute y.mean() per group
    .groupby('params').apply(lambda g: g.assign(cpu_time_m_mean=lambda df: df.cpu_time_m.mean()))
    .pipe(df_reverse_cat, 'params_complexity')
    .pipe(ggplot)
    + aes(x='params_complexity')
    + facet + theme_
    + geom_col(aes(y='cpu_time_m_mean'), fill='darkgray', position=position_dodge())
    + geom_point(aes(y='cpu_time_m'), color='black', fill='none', size=2)
    + coord_flip()
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle(f'Train + score times ({recs_stats})')
)

In [None]:
# Model size per params_complexity (axis labelled in bytes)
#   - Bars: mean model_size over all rows sharing the same `params`
#   - Points: the individual rows' model_size
(cv_models
    # Manually compute y.mean() per group
    .groupby('params').apply(lambda g: g.assign(model_size_mean=lambda df: df.model_size.mean()))
    .pipe(df_reverse_cat, 'params_complexity')
    .pipe(ggplot)
    + aes(x='params_complexity')
    + facet + theme_
    + geom_col(aes(y='model_size_mean'), fill='darkgray', position=position_dodge()) # Summary per fold
    + geom_point(aes(y='model_size'), color='black', fill='none', size=2) # Distribution of folds
    + scale_y_continuous(labels=labels_bytes(), breaks=breaks_bytes())
    + coord_flip()
    + ylab('model_size')
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle(f'Model size ({recs_stats})')
)

In [None]:
# TODO Make not barf on non-tree models (blocks rest of notebook)
# Tree depth boxplots per params_complexity
#   - Per `params` group, the depths of all its models are concatenated (a None model_stats contributes nothing)
#     and summarized with mpl.cbook.boxplot_stats; the stats are merged back onto cv_models and drawn as-is
#     via geom_boxplot(stat='identity')
(cv_models
    # Manually compute boxplot stats, else it's _really_ slow (or you have to downsample, which misses outliers)
    #   - Based on stat_boxplot.compute_group
    .merge(how='left', on='params', right=cv_models.groupby('params').apply(lambda g: one(
        pd.Series(dict(
            params=g.name,
            ymin=boxplot['whislo'],
            lower=boxplot['q1'],
            middle=boxplot['med'],
            upper=boxplot['q3'],
            ymax=boxplot['whishi'],
            outliers=np.unique(boxplot['fliers']),  # np.unique else really slow, because lots of repeated (int) points
        ))
        for [boxplot] in [mpl.cbook.boxplot_stats(  # [boxplot] is 1 elem because X.ndim = 1
            X=np.concatenate([[] if x is None else x.depth for x in g.model_stats]),
            whis=1.5,
        )]
    )))
    .pipe(df_reverse_cat, 'params_complexity')
    .pipe(ggplot)
    + aes(x='params_complexity')
    + facet + theme_
    + geom_boxplot(
        stat='identity',
        outlier_size=.5,
        mapping=aes(ymin='ymin', ymax='ymax', upper='upper', lower='lower', middle='middle', outliers='outliers',
            width=.8,  # Close enough to geom_boxplot defaults [TODO Maybe should compute based on num categorical x's?]
        ),
    )
    + ylab('tree_depth')
    + coord_flip()
    + theme_figsize(width=figsize_width, aspect_ratio=1/1.5)
    + ggtitle(f'Tree depth ({recs_stats})')
)

In [None]:
# NOTE(review): bare name reference with no definition in this notebook; unless the star-import defines it, this
# raises NameError -- presumably an intentional stop marker so that Run All halts before the slow plots; confirm
SLOW_PLOTS_NEXT  # TODO(train_us)

# Model diagnostics: all models, all folds

In [None]:
# Scope of the diagnostics below: number of distinct param settings and number of CV folds
print('models:')
print(f'  params[*/{len(cv_models.params.cat.categories)}]')
print(f'  fold[*/{cv.cv.n_splits}]')

In [None]:
# TODO Cache
# Coverage errors: all models, all folds
#   - Subset: all models
#   - Grain: sum(recs[model.test_i] for model)
#   - Dims: model_id, params, fold, y_true, rec_id
#   - Metrics: coverage_error
coverage_errors_all_all = (cv_models
    # .sample(n=5, random_state=0)  # For faster dev
    # Explode each model row into one output row per test instance: (i, y_true, coverage_error) are zipped from
    # the row's test_evals, and the model-level dims are repeated on every output row
    .pipe(lambda df: DF(
        OrderedDict(
            # **row[['model_id', 'params', 'fold']],  # Slow (in this inner loop), unpack manually instead
            model_id=row.model_id,
            params=row.params,
            params_data=row.params_data,
            params_model=row.params_model,
            params_complexity=row.params_complexity,
            params_data_and_model=row.params_data_and_model,
            params_model_and_complexity=row.params_model_and_complexity,
            fold=row.fold,
            i=i,
            y_true=y_true,
            coverage_error=coverage_error,
        )
        for row in iter_progress(df_rows(df), n=len(df))
        for i, y_true, coverage_error in zip(
            row.test_evals.i,
            row.test_evals.y,
            row.test_evals.coverage_errors(),
        )
    ))
    # Cast the dim columns to the same dtypes they have in cv_models
    .astype(dict(
        model_id=cv_models.model_id.dtype,
        params=cv_models.params.dtype,
        params_data=cv_models.params_data.dtype,
        params_model=cv_models.params_model.dtype,
        params_complexity=cv_models.params_complexity.dtype,
        params_data_and_model=cv_models.params_data_and_model.dtype,
        params_model_and_complexity=cv_models.params_model_and_complexity.dtype,
    ))
    # Show a column summary and a 10-row preview
    .pipe(tap, lambda df: display(
        df_summary(df).T,
        df[:10],
    ))
)

In [None]:
# Print the available params_data labels, then pin the one analyzed by the cells below
list(map(print, coverage_errors_all_all.params_data.unique()));
params_data = 'sp[331], recs[1.0]'
# Recover n_species from the label (used to size the per-species facet grids further down)
n_species = parse.search('sp[{n_species:d}], recs[{n_recs:f}]', params_data)['n_species']
# Fail fast if the pinned label doesn't occur in the data
assert params_data in list(coverage_errors_all_all.params_data), params_data

In [None]:
# TODO Class imbalance
#   1. Is class imbalance causing a problem? [-> maybe]
#   2. Is class_weight solving it? [-> maybe a little bit]
#   - TODO Try again with larger class imbalance
# Coverage error vs. number of recordings of the true species, for the pinned params_data
#   - n_recs here is the number of rows in `recs` per species, joined onto each error row via y_true
#   - Point size shows how many rows share the same (n_recs, coverage_error)
(coverage_errors_all_all
    [lambda df: df.params_data == params_data]
    # .sample(100, random_state=0)  # Faster dev
    .merge(how='left', on='y_true', right=(recs
        .assign(n_recs=1).groupby('species')['n_recs'].sum().reset_index()
        .rename(columns={'species': 'y_true'})
    ))
    # .pipe(puts, f=lambda df: df[:3])  # XXX Debug
    .pipe(ggplot)
    # + facet_grid('params_complexity ~ params_model')
    + facet_wrap('params_model_and_complexity')
    + aes(x='n_recs', y='coverage_error')
    + geom_count(aes(size='..n..')) + scale_size_area()
    + expand_limits(x=0, y=0)
    + coord_flip()
    + theme_figsize(width=figsize_width, aspect_ratio=1/1)
    + ggtitle(rf'Coverage error by n_recs per species ({recs_stats}) [{params_data}]')
)

In [None]:
# Facet by params_no_ns, order by mean(coverage_error)
#   - Subset: all models
#   - Grain: sum(recs[model.test_i].groupby(params, y_true) for model)
#       - Over: fold, rec_id
#   - Dims: params, y_true
#   - Metrics: coverage_error.mean
# in: coverage_errors_all_all, recs
# One point per (model facet, species): the mean coverage_error, with species ordered by their overall mean
(coverage_errors_all_all
    [lambda df: df.params_data == params_data]
    # .sample(100, random_state=0)  # Faster dev
    # .pipe(df_reverse_cat, 'params', 'params_no_ns', 'ns')
    .pipe(df_ordered_cat,
        y_true=lambda df: (
            # Sort species by mean(coverage_error) (across all models)
            df.groupby('y_true').agg({'coverage_error': np.mean}).reset_index().sort_values('coverage_error').y_true
            # Sort species by taxo (hard to compare across models, unless they're pretty low noise)
            # reversed(recs.species.cat.categories)
        ),
    )
    .pipe(ggplot)
    + aes(x='y_true', y='coverage_error')
    + aes(color='y_true')
    # + facet_grid('params_complexity ~ params_model')
    + facet_wrap('params_model_and_complexity',
        # Bug: dir='v' inverts nrow/ncol [https://github.com/has2k1/plotnine/issues/163]
        # dir='v', nrow=coverage_errors_all_all.params_model.nunique(),
    )
    # + geom_line(aes(group='params'), stat='summary', fun_y=np.mean)  # TODO Bad interpolation with n_species
    + geom_point(aes(group='params'), stat='summary', fun_y=np.mean)
    + coord_flip()
    # + geom_hline(yintercept=recs.species.nunique(), color='grey')
    + scale_color_cmap_d(mpl_cmap_repeat(10, 'tab20', 'tab20b', 'tab20c'))
    + theme_minimal_white()  # Before other theme()
    + guides(color=guide_legend(nrow=70))
    + theme(legend_position='right', legend_box_spacing=.4, legend_key_height=8, legend_text=element_text(size=8))
    # Species names are read off the color legend instead of the axis
    + theme(axis_text_y=element_blank())
    + theme_figsize(width=figsize_width, aspect_ratio=1/2)
    + ggtitle(rf'Coverage error over fold $\times$ instance ({recs_stats}) [{params_data}]')
)

In [None]:
# TODO Slow, bad for notebook dev loop -- move lower, or disable by default?
# Facet by species
#   - Subset: all models
#   - Grain: sum(recs[model.test_i].groupby(params, y_true) for model)
#       - Over: fold, rec_id
#   - Dims: params, y_true
#   - Metrics: coverage_error.mean
# in: coverage_errors_all_all, recs
# Print the available params_model labels, then pin the one(s) to plot
[print(x) for x in coverage_errors_all_all.params_model.unique()];
params_model = [
    'ovr-logreg_ovr, liblinear, balanced',
]
# One facet per species: coverage_error by params_complexity for the pinned params_data and params_model
#   - Gray points: counts of instances per (params_complexity, coverage_error); colored points: the mean
(coverage_errors_all_all
    [lambda df: df.params_data == params_data]
    [lambda df: df.params_model.isin(params_model)]
    # .sample(200, random_state=0)  # Faster dev
    # [lambda df: df.y_true.isin(df.y_true.drop_duplicates().sample(n=3, random_state=0))]  # Faster dev
    .astype({'y_true': metadata.species.df.shorthand.dtype})
    # .pipe(df_reverse_cat, 'params_complexity')
    .pipe(ggplot)
    + aes(x='params_complexity', y='coverage_error')
    + aes(color='params_complexity')
    + facet_wrap('y_true',
        ncol=int((n_species * 1/(2/3)) ** .5),
    )
    + geom_hline(yintercept=1, color='grey')
    # + geom_hline(yintercept=recs.species.nunique(), color='grey')
    # Percentiles (faster, no overplot)
    # + geom_point(stat='summary', fun_y=np.mean)
    # + geom_linerange(stat='summary', fun_ymin=partial(np.percentile, q=25), fun_ymax=partial(np.percentile, q=75))
    # Violin (slow, no overplot)
    # + geom_violin()
    # Boxplot (very slow, no overplot)
    # + geom_boxplot()
    # Points (medium cost, high overplot)
    #   - n (count) instead of prop (proportion)
    #   - scale_size_area() instead of default scale_size(), because it's a count [I don't grok this but it looks good]
    + geom_count(aes(size='..n..'), color='lightgray')
    + scale_size_area()
    # + geom_point(stat='summary', fun_y=np.mean, shape='|', size=5, stroke=2)
    + geom_point(stat='summary', fun_y=np.mean, size=5)
    + coord_flip(
        ylim=(0, 40),
    )
    # + scale_color_cmap_d(mpl_cmap_repeat(10, 'tab10'))  # Strong
    + scale_color_cmap_d(mpl_cmap_repeat(1, 'tab20', 'tab20b', 'tab20c'))  # FIXME repeat(10) makes lots of blue/gray
    + guides(color=guide_legend(reverse=True))
    # Complete theme first: adding a complete theme replaces any theme() tweaks added before it, so the
    # axis_text_y size below would otherwise be discarded
    + theme_minimal_white()  # Before other theme()
    + theme(axis_text_y=element_text(size=6))
    + theme_figsize(width=figsize_width, aspect_ratio=1/1)
    + ggtitle(rf'Coverage error over fold $\times$ instance, by params_complexity ({recs_stats}) [{params_data}, {params_model}]')
)

In [None]:
# TODO Slow, bad for notebook dev loop -- move lower, or disable by default?
# Facet by species
#   - Subset: all models
#   - Grain: sum(recs[model.test_i].groupby(params, y_true) for model)
#       - Over: fold, rec_id
#   - Dims: params, y_true
#   - Metrics: coverage_error.mean
# in: coverage_errors_all_all, recs
# Print the available params_complexity labels, then pin the one(s) to plot
[print(x) for x in coverage_errors_all_all.params_complexity.unique()];
params_complexity = [
    # 'c_C[0.1]',
    'c_C[0.001]',
    # '',
]
# One facet per species: coverage_error by params_model for the pinned params_data and params_complexity
#   - Gray points: counts of instances per (params_model, coverage_error); colored points: the mean
(coverage_errors_all_all
    [lambda df: df.params_data == params_data]
    [lambda df: df.params_complexity.isin(params_complexity)]
    # [lambda df: df.params.astype(str).str.contains(r'c_n_estimators\[100\]')]  # XXX Subset models
    # .sample(200, random_state=0)  # Faster dev
    # [lambda df: df.y_true.isin(df.y_true.drop_duplicates().sample(n=3, random_state=0))]  # Faster dev
    .astype({'y_true': metadata.species.df.shorthand.dtype})
    # .pipe(df_reverse_cat, 'params_model')
    .pipe(ggplot)
    + aes(x='params_model', y='coverage_error')
    + aes(color='params_model')
    + facet_wrap('y_true',
        ncol=int((n_species * 1/(2/3)) ** .5),
    )
    + geom_hline(yintercept=1, color='grey')
    # + geom_hline(yintercept=recs.species.nunique(), color='grey')
    # Percentiles (faster, no overplot)
    # + geom_point(stat='summary', fun_y=np.mean)
    # + geom_linerange(stat='summary', fun_ymin=partial(np.percentile, q=25), fun_ymax=partial(np.percentile, q=75))
    # Violin (slow, no overplot)
    # + geom_violin()
    # Boxplot (very slow, no overplot)
    # + geom_boxplot()
    # Points (medium cost, high overplot)
    #   - n (count) instead of prop (proportion)
    #   - scale_size_area() instead of default scale_size(), because it's a count [I don't grok this but it looks good]
    + geom_count(aes(size='..n..'), color='lightgray')
    + scale_size_area()
    # + geom_point(stat='summary', fun_y=np.mean, shape='|', size=5, stroke=2)
    + geom_point(stat='summary', fun_y=np.mean, size=5)
    + coord_flip(
        ylim=(0, 40),
    )
    + scale_color_cmap_d(mpl_cmap_repeat(1, 'tab20', 'tab20b', 'tab20c'))  # FIXME repeat(10) makes lots of blue/gray
    + guides(color=guide_legend(reverse=True))
    # Complete theme first: adding a complete theme replaces any theme() tweaks added before it, so the
    # axis_text_y size below would otherwise be discarded
    + theme_minimal_white()  # Before other theme()
    + theme(axis_text_y=element_text(size=6))
    + theme_figsize(width=figsize_width, aspect_ratio=1/1)
    + ggtitle(rf'Coverage error over fold $\times$ instance, by params_model ({recs_stats}) [{params_data}, {params_complexity}]')
)

In [None]:
# NOTE(review): bare name reference with no definition in this notebook; unless the star-import defines it, this
# raises NameError -- presumably an intentional stop marker so that Run All halts before the one-model plots; confirm
ONE_MODEL_PLOTS_NEXT  # TODO(train_us)

# Model diagnostics: one model, all folds

In [None]:
# Pin the single model (param setting) to drill into; it must equal one of cv_models.params' categories exactly
params = 'n_species[331], n_recs[1.0], c_cls[ovr-logreg_ovr],c_solver[liblinear],c_C[0.001],c_class_weight[balanced]'
# List every available setting with its index, for copy/paste into `params` above
print('params:')
print('\n'.join(f'  {i}: {x!r}' for i, x in enumerate(cv_models.params.cat.categories)))
print()
print('models:')
# .index() raises ValueError if `params` is not one of the categories
params_i = list(cv_models.params.cat.categories).index(params)
print(f'  params[{params_i}/{len(cv_models.params.cat.categories)}]: {params!r}')
print(f'  fold[*/{cv.cv.n_splits}]')

In [None]:
# Coverage errors: one model, all folds
#   - Subset: models.params == params
#   - Grain: sum(recs[model.test_i] for model)
#   - Dims: model_id, params, fold, y_true, rec_id
#   - Metrics: coverage_error
# Keep only the rows of the pinned model (all of its folds)
coverage_errors_one_all = coverage_errors_all_all[coverage_errors_all_all.params == params]
# Column summary plus a 5-row preview
display(df_summary(coverage_errors_one_all).T, coverage_errors_one_all.iloc[:5])

In [None]:
# Coverage error by n_recs per species
#   - For the pinned model: every instance as a small gray point, plus one summary point + text label per species
#     (colored by species_group) and a linear fit
# Which axes use a log10 scale ('logy' and/or 'logx'; neither -> linear)
scale = [
    # 'linear',
    # 'logy',
    'logy', 'logx',
]
# Per-species summary statistic: 'mean' or '<q>p' for the q-th percentile
summary_fun_y = (
    # 'mean'
    # '75p'
    '80p'
    # '90p'
)
# Map the summary name to a function: np.mean for 'mean', else np.percentile with q parsed from '<q>p'
summary_fun = lambda s: (
    np.mean if s == 'mean' else
    partial(np.percentile, q=parse.parse('{:g}p', s)[0])
)
logy_breaks=[1, 5, 10, 20, 30, 40, 50, 100, 200, 300]  # HACK Data dependent
logx_breaks=[1, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 600, 700]  # HACK Data dependent
for _params in [params]:
    print(_params)
    # NOTE(review): repr() is called on the plot and its result discarded; presumably this forces the figure to
    # render from inside the loop (a bare expression is only auto-displayed as a cell's last statement) -- confirm
    repr(coverage_errors_all_all
        [lambda df: df.params == _params]  # One model, all folds
        # .sample(n=100, random_state=0)  # XXX Faster dev
        # n_recs = number of rows in `recs` per species, joined via y_true
        .merge(how='left', on='y_true', right=(recs
            .assign(n_recs=1).groupby('species')['n_recs'].sum().reset_index()
            .rename(columns={'species': 'y_true'})
        ))
        # Attach taxonomy columns for each species (species_group is used for color below)
        .merge(how='left', on='y_true', right=(metadata.species.df
            [['shorthand', 'species_group', 'family', 'order']]
            .rename(columns={'shorthand': 'y_true'})
        ))
        .astype({'y_true': metadata.species.df.shorthand.dtype})
        .pipe(df_reverse_cat, 'y_true')
        .pipe(df_remove_unused_categories)
        .pipe(ggplot)
        + theme_figsize('inline')
        + aes(x='n_recs', y='coverage_error')
        + geom_point(color='lightgray', size=.1)
        + stat_summary(
            fun_y=summary_fun(summary_fun_y),
            geom='point',
            mapping=aes(
                group='y_true', color='species_group',  # Color by species_group
                # group='y_true', color='y_true',  # Color by species
            ),
        )
        + stat_summary(
            fun_y=summary_fun(summary_fun_y),
            geom='text', size=5,
            mapping=aes(
                group='y_true', label='y_true', color='species_group',  # Color by species_group
                # group='y_true', color='y_true', label='y_true',  # Color by species
                # group='y_true', color='y_true', label='''[  # Color by species, label only if f(x,y)
                #     color if y >= 30 or x >= 300 else None  # (Great trick!)
                #     for color, x, y in zip(..color.., ..x.., ..y..)
                # ]''',
            ),
            # Label offset depends on whether y is on a log scale
            nudge_y=4 if 'logy' not in scale else .02,
        )
        + geom_smooth(method='lm', se=True)
        + theme_minimal_white()  # Before other theme()
        # geom_blank() is the no-op placeholder when the corresponding log scale is not requested
        + (geom_blank() if 'logy' not in scale else scale_y_log10(
            breaks=logy_breaks,
            labels=lambda breaks: [int(round(x)) for x in breaks],
        ))
        + (geom_blank() if 'logx' not in scale else scale_x_log10(
            breaks=logx_breaks,
            labels=lambda breaks: [int(round(x)) for x in breaks],
        ))
        + scale_color_cmap_d(mpl_cmap_with_colors('plasma_r', lambda colors: colors[256//8:]))  # Exclude the hard-to-see yellows
        # + guides(color=False)
        + guides(color=guide_legend(ncol=1))
        + theme(legend_position='right', legend_key_height=8, legend_text=element_text(size=6), legend_entry_spacing=0)
        + theme_figsize(width=18)
        + ggtitle(f'{summary_fun_y} coverage error by n_recs per species ({recs_stats})\n{_params}')
    )

In [None]:
# Coverage error per species (y_true) for one params setting, over fold x instance
#   - Subset: models.params == params
#   - Grain: sum(recs[model.test_i].groupby(y_true) for model)
#       - Over: fold, rec_id
#   - Dims: y_true
#   - Metrics: count, coverage_error.percentiles
# in: coverage_errors_all_all, params, metadata, n_species, recs_stats
for _params in [params]:
    print(_params)
    # NOTE(review): repr(...) presumably forces the plot to render from inside the loop body -- confirm
    repr(coverage_errors_all_all
        [lambda df: df.params == _params]  # One model, all folds
        # Cast y_true to the same dtype as metadata.species.df.shorthand
        .astype({'y_true': metadata.species.df.shorthand.dtype})
        .pipe(df_reverse_cat, 'y_true')
        .pipe(ggplot, aes(x='y_true', y='coverage_error'))
        # Reference lines at coverage_error = 1 and 10
        + geom_hline(yintercept=1, color='grey')
        + geom_hline(yintercept=10, color='grey')
        # + geom_hline(yintercept=recs.species.nunique(), color='grey')
        + geom_count(aes(size='..n..'), alpha=1)  # n (count) instead of prop (proportion)
        + scale_size_area()  # Instead of default scale_size(), because it's a count [I don't grok this but it looks good]
        # Red '|' marker at the mean coverage_error of each y_true
        + geom_point(stat='summary', fun_y=np.mean, alpha=1, color='red', shape='|', size=6, stroke=2)
        # Species on the vertical axis; coverage_error axis limited to [1, n_species]
        + coord_flip(
            ylim=(1, n_species),
        )
        # + theme_figsize('inline')
        # + theme_figsize('square')
        # + theme_figsize('half')
        + theme_figsize('half_dense')
        # + theme_figsize('full')
        # + theme_figsize('full_dense')
        + ggtitle(f'Coverage error over fold $\\times$ instance ({recs_stats})\n{_params}')
    )

In [None]:
# Confusion matrix for one params setting, summed elementwise over all of its folds
for _params in [params]:
    print(_params)
    # Other figsize options: 'square', 'full'
    with figsize('full_dense'):
        repr(
            cv_models
            .loc[lambda models: models.params == _params]
            .pipe(lambda folds: plot_confusion_matrix(
                # Class labels are taken from the first fold's test_evals
                classes=folds.iloc[0].test_evals.classes,
                # Stack the per-fold matrices, then sum over the fold axis
                M=np.array(
                    [r.test_evals.confusion_matrix_prob() for r in df_rows(folds)]
                ).sum(axis=0),
                # normalize=False,  # For counts
                raw=True,
                scale=10,  # Faster dev
                format=None,  # Omit numbers, too dense
                title=f'({recs_stats})',
            ))
        )

In [None]:
# NOTE(review): Bare name with no visible definition -- presumably a deliberate NameError so that "Run All"
# stops here, before the one-model/one-fold cells below. Confirm before removing.
BORING_ONE_MODEL_ONE_FOLD_PLOTS_NEXT

# Model diagnostics: one model, one fold

In [None]:
# Select the single (params, fold) model that the diagnostics cells below operate on
fold = 0
# params_i = ...  # Comment out to reuse from above
params = cv_models.params.cat.categories[params_i]
# Destructuring raises unless exactly one row matches (params, fold)
((_, model),) = cv_models.loc[
    lambda models: (models.params == params) & (models.fold == fold)
].iterrows()
# List every params category with its index, so params_i can be picked by eye
print("params:\n%s" % '\n'.join(
    '  %s: %r' % (i, x)
    for i, x in enumerate(cv_models.params.cat.categories)
))
print()
print('model:')
print('  params[%s/%s]: %r' % (
    params_i,
    len(cv_models.params.cat.categories),
    model.params,
))
print('  fold[%s/%s]' % (model.fold, cv.cv.n_splits))
print()
print(model)

In [None]:
# Unpack the selected model into flat names for the cells below
# in: model
model_id, params, fold = model.model_id, model.params, model.fold
test_evals = model.test_evals
# train_evals = model.train_evals

# Slice recs (and its X/y) down to this fold's test rows
# in: model, recs
test_recs = recs.iloc[test_evals.i]
test_X = Search.X(recs)[test_evals.i]
test_y = Search.y(recs)[test_evals.i]  # (Don't need to store cv_models.test_evals.y if we have recs -- which sometimes we don't?)
# train_recs = recs.iloc[train_evals.i]
# train_X = Search.X(recs)[train_evals.i]
# train_y = Search.y(recs)[train_evals.i]

# Show the test-set size (add len(train_recs) here if the train_* lines above are restored)
display(len(test_recs))

In [None]:
# TODO Restore this plot like 'Coverage error over ...' above, so we can see _one_ model instead of aggregating over n_splits models
# # TODO Update [kill the .merge, then species -> y_true]
# (search.coverage_error_by(test_recs, 'id')
#     [:5]
#     # .merge(test_recs[['id', 'species']], on='id', how='left')
#     # .pipe(ggplot, aes(x='species', y='coverage_error'))
#     # + geom_count(aes(size='..n..'))
#     # + stat_summary(fun_y=np.mean, geom='point', color='red', alpha=.5, shape='|', size=6, stroke=1)
#     # + stat_summary(
#     #     fun_ymin=partial(np.percentile, q=25), fun_ymax=partial(np.percentile, q=75),
#     #     geom='linerange', color='red', alpha=.5, size=1,
#     # )
#     # + coord_flip()
#     # + geom_hline(yintercept=len(search.classes_), color='grey')
#     # + scale_x_discrete(limits=list(reversed(test_recs.species.cat.categories)))
#     # + theme_figsize('square')
#     # + ggtitle(rf'Coverage error over instance ({model_id}) ({recs_stats})')
# )

In [None]:
# One-model/one-fold confusion matrix
# in: model, recs_stats
with figsize(
    'square',
    # 'full',
    # 'full_dense',
):
    plot_confusion_matrix_df(
        confusion_matrix_prob_df(model.test_evals.y, model.test_evals.y_scores, model.test_evals.classes),
        # One title only: passing title= twice is a SyntaxError ("keyword argument repeated"), which
        # prevented this cell from compiling. Both pieces of information are kept in a single title.
        title=f'{model.model_id} ({recs_stats})',
        # normalize=False,  # For counts
        raw=True, scale=10,  # Faster dev
    )

In [None]:
# NOTE(review): Bare name with no visible definition -- presumably a deliberate NameError so that "Run All"
# stops here and skips the debug plots below. Confirm before removing.
DEBUG_PLOTS_NEXT

# Debug plots, ignored by default

## Debug: resource usage

In [None]:
# mem_rss_delta per params: bar = mean over folds, points = the individual folds
# in: cv_models, recs_stats
(cv_models
    .pipe(df_reverse_cat, 'params', 'params_no_ns', 'ns')
    # Per-params mean of mem_rss_delta, broadcast back onto every row of that params group.
    # groupby().transform('mean') replaces groupby().apply(lambda g: g.assign(...)): same per-row values,
    # but it keeps the original index and row order instead of depending on groupby.apply's
    # version-dependent handling of group keys.
    .assign(mem_rss_delta_mean=lambda df: df.groupby('params')['mem_rss_delta'].transform('mean'))
    .pipe(ggplot, aes(x='ns', group='params_no_ns'))
    + geom_col(aes(y='mem_rss_delta_mean', fill='params_no_ns'), position=position_dodge()) # Summary per fold
    + geom_point(aes(y='mem_rss_delta'), fill='none', size=2, position=position_dodge(width=.9)) # Distribution of folds
    + coord_flip()
    + scale_fill_cmap_d(mpl_cmap_concat('tab20', 'tab20b', 'tab20c'))
    + scale_y_continuous(labels=labels_bytes(), breaks=breaks_bytes())
    + guides(fill=guide_legend(reverse=True))
    + theme(legend_position='bottom', legend_direction='vertical', legend_box_spacing=.4, legend_key_height=8)
    + theme_figsize(aspect_ratio=1/3*2)
    + ggtitle(f'Mem rss spread ($max-min$) ({recs_stats})')
)

In [None]:
# Explode cv_models into one row per proc_stats sample: each output row is the model's selected columns
# plus the fields of one entry of row.proc_stats.stats
# in: cv_models
# TODO Very slow with ~200 models
cv_models_proc_stats = (cv_models
    # Enable these as needed (at the cost of mem usage)
    [[
        'model_id',
        'params',
        # 'params_dict',
        'params_data',
        'params_complexity',
        'params_model',
        'fold',
        # 'train_score',
        # 'test_score',
        # 'fit_time',
        # 'score_time',
        'mem_rss_delta',
        # 'mem_vms_delta',
        # 'train_evals',
        # 'test_evals',
        'proc_stats',
    ]]
    # TODO Faster way to do this? (.merge is >2x slower)
    .pipe(df_flatmap, lambda row: [
        # pd.concat instead of Series.append: Series.append was deprecated in pandas 1.4 and removed in
        # pandas 2.0; concat of the two Series yields the same combined row on old and new versions
        pd.concat([row, pd.Series(dict(**stats))])
        for stats in row.proc_stats.stats
    ])
    # .pipe(lambda df: (df
    #     .merge(how='left',
    #         right=DF(
    #             OrderedDict(model_id=row.model_id, **stats)
    #             for row in df_rows(df)
    #             for stats in row.proc_stats.stats
    #         )
    #     )
    # ))
    # HACK Restore the cats that the df_flatmap just destroyed [copied from cv_results_splits_df]
    .pipe(df_ordered_cat,
        model_id=lambda df: df.model_id.unique(),
        params=lambda df: df.params.unique(),
    )
    # Turn the cpu_user/cpu_system samples into sample-to-sample deltas, scaled by 100
    # NOTE(review): .diff() runs over the whole frame, so the first sample of each model is diffed against
    # the last sample of the previous model -- presumably should be per model_id; confirm before changing
    .assign(
        cpu_user=lambda df: df.cpu_user.diff() * 100,
        cpu_system=lambda df: df.cpu_system.diff() * 100,
    )
    # Show a summary and the first rows without breaking the chain
    .pipe(tap, f=lambda df: display(
        df_summary(df).T,
        df[:5],
    ))
)

In [None]:
# cpu over time, one line pair per model_id: cpu_system (dashed) and cpu_system + cpu_user (solid)
# TODO Might need to stack/geom_area for overlapping runs
(cv_models_proc_stats
    .pipe(ggplot, aes(x='time', color='model_id'))
    + geom_line(aes(y='cpu_system'), linetype='dashed', size=.5)
    + geom_line(aes(y='cpu_system + cpu_user'), linetype='solid', size=.5)
    + expand_limits(y=0)
    + ylab('cpu')
    + ggtitle('cpu over time (system + user)')
    + scale_x_datetime(date_labels='%H:%M:%S')
    # Render y ticks as percentages
    + scale_y_continuous(labels=lambda ticks: ['%.3g%%' % tick for tick in ticks])
    + theme(
        legend_position='bottom',
        legend_direction='vertical',
        legend_box_spacing=.4,
        legend_key_height=8,
    )
    + theme_figsize('inline_short')
)

In [None]:
# Memory over time, one line pair per model_id: mem_rss (solid) and mem_vms (dashed)
(cv_models_proc_stats
    .pipe(ggplot, aes(x='time', color='model_id'))
    + geom_line(aes(y='mem_rss'), linetype='solid', size=.5)
    + geom_line(aes(y='mem_vms'), linetype='dashed', size=.5)
    + expand_limits(y=0)
    + ylab('mem')
    + ggtitle('Mem over time (rss, vms)')
    + scale_x_datetime(date_labels='%H:%M:%S')
    + scale_y_continuous(labels=labels_bytes(), breaks=breaks_bytes(pow=3))
    + theme(
        legend_position='bottom',
        legend_direction='vertical',
        legend_box_spacing=.4,
        legend_key_height=8,
    )
    + theme_figsize('inline_short')
)

## Debug: RF tuning

In [None]:
# RF depth vs. n_species: forest_depth_mean per model with a +/- 2*forest_depth_std range,
# colored by params_no_ns, plus a linear fit per color group
# in: cv_models, recs_stats
# TODO Revive if insightful
(cv_models
    .pipe(df_reverse_cat, 'params_no_ns')
    # Pull n_species out of each row's params_dict (.str[...] does the per-element lookup)
    .assign(n_species=lambda df: df.params_dict.str['n_species'])
    .pipe(ggplot, aes(x='n_species', y='forest_depth_mean', color='params_no_ns'))
    # + facet_wrap('params_no_ns', ncol=2)
    + geom_point()
    + geom_pointrange(aes(ymin='forest_depth_mean - 2*forest_depth_std', ymax='forest_depth_mean + 2*forest_depth_std'))
    + geom_smooth(method='lm', se=False)  # Disable se because it only knows forest_depth_mean, no measure of spread
    + expand_limits(x=0)
    + scale_color_cmap_d(mpl_cmap_concat('tab20', 'tab20b', 'tab20c'))
    + guides(color=guide_legend(reverse=True))
    + theme(legend_position='bottom', legend_direction='vertical', legend_box_spacing=.4, legend_key_height=8)
    # + theme_figsize(aspect_ratio=1/3)
    + theme_figsize(aspect_ratio=1/3)
    + ggtitle(f'RF depth vs. n_species ({recs_stats})')
)