# Processor Data Analysis

This document contains debugging and exploratory information based on data collected by the CODEX processor CLI (saved as `processor_data.json`). It is often helpful for the following:
- Tracking large (and possibly erroneous) drift compensations
- Viewing best focal plane selections for each region and tile (often useful for finding bad parts of a sample)
- Diagnosing over- or under-saturation in individual channels
- Examining execution times by operation

### Parameters

In [None]:
# Parameters
# Path to the processor data JSON file to analyze. The None placeholder must be
# replaced with a real path before the notebook runs; the validation cell that
# follows fails while it is still None.
# NOTE(review): presumably this cell is overridden by a notebook parameterization
# tool at execution time -- confirm before changing its layout.
processor_data_path = None

In [None]:
# Validation
# Fail fast with an explicit exception instead of an `assert`: assert statements
# are removed when Python runs with optimizations enabled (-O), which would let
# a missing path slip through to a less helpful error further down.
if processor_data_path is None:
    raise ValueError('Must set parameter for path to processor data json file')

### Load Processor Data

In [None]:
from codex.ops import op, best_focus, deconvolution, drift_compensation, tile_summary, tile_generator
from codex import cli
from IPython import display
import plotnine as pn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import json

# Derive the processor-data key of each operation from its implementing class
# instead of hard-coding the names, so the keys stay in sync with refactoring
_op_key_for = op.CodexOp.get_op_for_class
drift_comp_op = _op_key_for(drift_compensation.CodexDriftCompensator)
best_focus_op = _op_key_for(best_focus.CodexFocalPlaneSelector)
tile_summary_op = _op_key_for(tile_summary.CodexTileSummary)
decon_op = _op_key_for(deconvolution.CodexDeconvolution)
tile_gen_op = _op_key_for(tile_generator.CodexTileGenerator)

# Read the processor data: a dict holding one data frame per operation type
data = cli.read_processor_data(processor_data_path)

# Tile generator stats are deliberately excluded from the analysis; discard
# them if they were recorded
data.pop(tile_gen_op, None)

In [None]:
# Preview the leading rows recorded for every operation type, using the
# operation key as the table caption
for k, op_frame in data.items():
    preview = op_frame.head().style.set_caption(k)
    display.display(preview)

### Drift Compensation Summary

In [None]:
df = data[drift_comp_op].copy()

# Name the translation components after the trailing axes of (z, y, x): a
# 2-element translation maps to (y, x), a 3-element one to (z, y, x)
n_dims = len(df['translation'].iloc[0])
dims = ['z', 'y', 'x'][-n_dims:]

# Keep the first translation found for each region / target cycle pair
df = df.groupby(['region', 'target_cycle'])['translation'].first().reset_index()

# Expand each translation vector into one delta_<axis> column per dimension
for i, dim in enumerate(dims):
    df['delta_' + dim] = df['translation'].apply(lambda v, axis=i: v[axis])

# Rows are (region, target cycle) pairs, columns are the per-axis deltas
heatmap_values = df.set_index(['region', 'target_cycle']).drop('translation', axis=1)
ax = sns.heatmap(
    heatmap_values,
    annot=True, cmap='viridis',
    yticklabels=1, cbar=False
)
# Figure height grows with the number of regions shown
ax.figure.set_size_inches(5, 3 * df['region'].nunique())
ax.set_ylabel('Region Index - Target Cycle Index')
ax.set_title('Drift Compensation Translation by Region/Cycle')

### Focal Plane Summary

In [None]:
df = data[best_focus_op].copy()

# Regions are faceted into a fixed number of columns, so the figure height must
# scale with the number of facet ROWS (ceiling division). The previous
# expression `4 * (n + 1) // 2` was evaluated as `(4 * (n + 1)) // 2`, which
# grows with every region instead of every row.
facet_cols = 2
facet_rows = (df['region'].nunique() + facet_cols - 1) // facet_cols

# Z planes are rendered as text labels to get a discrete fill scale, but the
# labels are kept in numeric order. Plain strings would be ordered lexically
# ('10' before '2'), scrambling the sequential palette once a sample has ten or
# more planes.
z_labels = df['best_z'].apply(lambda v: '{}'.format(v))
z_order = (
    df['best_z'].drop_duplicates().sort_values()
    .apply(lambda v: '{}'.format(v)).tolist()
)
(
    pn.ggplot(
        df.assign(
            z=pd.Categorical(z_labels, categories=z_order, ordered=True),
            region_index=df['region'].apply(lambda v: 'Region {}'.format(v))
        ),
        pn.aes(x='tile_x', y='tile_y', fill='z', width=.95, height=.95)
    ) +
    # Reverse the y axis so tile row 0 is drawn at the top of each panel
    pn.scale_y_reverse() +
    pn.scale_fill_brewer(palette='YlOrRd', guide=pn.guide_legend(title='Best Z (0-Based)')) +
    pn.facet_wrap('~region_index', ncol=facet_cols) +
    pn.geom_tile() +
    pn.xlab('Tile X Coordinate') +
    pn.ylab('Tile Y Coordinate') +
    pn.ggtitle('Best Focal Plane by Region/Tile') +
    pn.theme(figure_size=(12, 4 * facet_rows))
)

### Tile Summary Statistics

In [None]:
df = data[tile_summary_op].copy()

# Regions are faceted into a fixed number of columns, so the figure height must
# scale with the number of facet ROWS (ceiling division). The previous
# expression `6 * (n + 1) // 2` was evaluated as `(6 * (n + 1)) // 2`, which
# grows with every region instead of every row.
facet_cols = 2
facet_rows = (df['region'].nunique() + facet_cols - 1) // facet_cols

# Order channels along the x axis by their overall mean intensity (ascending)
channel_order = df.groupby('channel')['mean'].mean().sort_values().index
(
    pn.ggplot(
        df.assign(
            region=df['region'].apply(lambda v: 'Region {}'.format(v)),
            channel=pd.Categorical(df['channel'], categories=channel_order, ordered=True)
        ),
        pn.aes(x='channel', y='mean')
    ) +
    # Hide boxplot outliers: every point is already drawn by the jitter layer
    pn.geom_boxplot(outlier_size=0, alpha=.5) +
    pn.geom_jitter(pn.aes(color='tile'), width=.1) +
    pn.scale_y_log10() +
    pn.scale_color_continuous(guide=pn.guide_legend(title='Tile Index')) +
    pn.facet_wrap('~region', ncol=facet_cols) +
    pn.theme(figure_size=(24, 6 * facet_rows), axis_text_x=pn.element_text(rotation=90)) +
    pn.ggtitle('Mean Intensity Distributions Across Tiles') +
    pn.xlab('Channel Name') +
    pn.ylab('Mean Intensity')
)

### Execution Time Breakdown

In [None]:
# Build a wide table with one column per operation, holding the first execution
# time found for each (region, tile) pair
df = pd.concat(
    [op_df.groupby(['region', 'tile'])['time'].first().rename(k) for k, op_df in data.items()],
    axis=1
)
df.columns.name = 'operation'

# Reshape to long form: one row per region / tile / operation with its time
df = df.stack().rename('time').reset_index()

# Order operations along the x axis by their median execution time (ascending)
operation_order = df.groupby('operation')['time'].median().sort_values().index
(
    pn.ggplot(
        df.assign(operation=pd.Categorical(
            df['operation'],
            categories=operation_order,
            ordered=True
        )),
        pn.aes(x='operation', y='time')
    ) +
    pn.geom_boxplot() +
    pn.geom_jitter(alpha=.1) +
    pn.xlab('Operation') +
    pn.ylab('Execution Time (Seconds)') +
    # Title typo fixed: "Distrubtions" -> "Distributions"
    pn.ggtitle('Execution Time Distributions Across Tiles') +
    pn.theme(figure_size=(16, 4))
)