In [None]:
from IPython.lib.deepreload import reload
%load_ext autoreload
%autoreload 1

In [None]:
%reload_ext autoreload

In [None]:
import logging
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as plt_colors
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import scipy.stats as st
import holoviews as hv
hv.extension('bokeh')
from holoviews import dim
from IPython.display import Markdown, display
from IPython.core.display import HTML

import matplotlib
fontsize = 16
matplotlib.rc('xtick', labelsize=fontsize)     
matplotlib.rc('ytick', labelsize=fontsize)
matplotlib.rc('axes', labelsize=fontsize, titlesize=fontsize)

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from bokeh.models.formatters import DatetimeTickFormatter

formatter = DatetimeTickFormatter(months='%b %Y')

from counts_analysis.c_utils import COUNTS_CSV, CLASSES, set_settings, set_counts_v2, rename_columns

#== Load Datasets ==#
# Read the counts table from the path stored under COUNTS_CSV['counts'] and
# pass it through the project's rename_columns helper.
df = pd.read_csv(COUNTS_CSV['counts'])
df = rename_columns(df)
# Dataset without problematic classes (Gyrodinium, Pseudo-nitzschia chain)
# NOTE(review): df_ is built before 'datetime' is parsed on the next line, so its
# 'datetime' column keeps the raw CSV values -- confirm that is intended wherever df_ is used.
df_ = df[df['class'].isin(CLASSES)].reset_index(drop=True)
df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d')
# Independent working copy used by the analysis cells.
data = df.copy()

def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

#=== Set count forms & settings ===#
# Each call returns a collection of column names for one count form. Later cells
# index them as [0], [1], [2] under the Lab-micro, SPC-Lab and SPC-Pier headings --
# presumably that is the order set_counts_v2 returns; TODO confirm.
# COUNT
volumetric_counts = set_counts_v2('cells/mL', micro_default=True)

raw_counts = set_counts_v2('count', micro_default=False)
raw_counts_pred = set_counts_v2('count', micro_default=False, automated=True)

vol_time_counts = set_counts_v2('count', micro_default=True)

class_percentage_counts = set_counts_v2('class percentage', micro_default=False) 

rel_counts = set_counts_v2('relative abundance', micro_default=False)
# Replace the first entry with the cells/mL-based Lab-micro column name; the
# remaining entries are kept as returned.
rel_counts = ['Lab-micro cells/mL relative abundance'] + list(rel_counts[1:])

In [None]:
def compute_volumetric(spc_camera, data):
    """
    Add a ``'<spc_camera> cells/mL'`` column computed from ``'<spc_camera> count'``.

    The count column is divided by 160 when ``spc_camera`` contains ``'Pier'``
    and by 60 otherwise. ``data`` is modified in place and also returned.

    Usage::

        compute_volumetric(spc_camera='Auto-Pier', data=data)
        compute_volumetric(spc_camera='SPC-Pier', data=data)

    """
    divisor = 160 if 'Pier' in spc_camera else 60
    count_column = f'{spc_camera} count'
    volumetric_column = f'{spc_camera} cells/mL'
    data[volumetric_column] = data[count_column] / divisor
    return data

def compute_class_percentage(raw_count, data):
    """Add a column giving each row's share (in percent) of its date's total.

    Args:
        raw_count: name of the count column in ``data``. When it contains
            ``'cells/mL'`` the result column is
            ``'Lab-micro cells/mL class percentage'``; otherwise it is
            ``'<prefix> class percentage'`` where ``<prefix>`` is the text
            before ``' count'``.
        data: frame with a ``'datetime'`` column and the ``raw_count`` column.
            It is modified in place.

    Returns:
        ``data`` with the percentage column added.
    """
    if 'cells/mL' in raw_count:
        relative_column = 'Lab-micro cells/mL class percentage'
    else:
        relative_column = '{} class percentage'.format(raw_count.split(" count")[0])
    # transform('sum') broadcasts every date's total back onto its own rows and
    # keeps data's index, so the assignment below aligns on every pandas version
    # (groupby.apply prepends the group key to the index on pandas >= 2.0).
    totals = data.groupby('datetime')[raw_count].transform('sum')
    percentages = data[raw_count] / totals * 100.0
    # Dates whose total is zero keep their raw values instead of becoming 0/0 = NaN.
    data[relative_column] = percentages.where(totals != 0, data[raw_count])
    return data

def compute_relative_abundance(raw_count, data):
    """Add a column giving each row's share (in percent) of its class's total.

    Args:
        raw_count: name of the count column in ``data``. When it contains
            ``'cells/mL'`` the result column is
            ``'Lab-micro cells/mL relative abundance'``; otherwise it is
            ``'<prefix> relative abundance'`` where ``<prefix>`` is the text
            before ``' count'``.
        data: frame with a ``'class'`` column and the ``raw_count`` column.
            It is modified in place.

    Returns:
        ``data`` with the relative-abundance column added.
    """
    if 'cells/mL' in raw_count:
        relative_column = 'Lab-micro cells/mL relative abundance'
    else:
        relative_column = '{} relative abundance'.format(raw_count.split(" count")[0])
    # transform('sum') broadcasts every class's total back onto its own rows and
    # keeps data's index, so the assignment below aligns on every pandas version
    # (groupby.apply prepends the group key to the index on pandas >= 2.0).
    totals = data.groupby('class')[raw_count].transform('sum')
    percentages = data[raw_count] / totals * 100.0
    # Classes whose total is zero keep their raw values instead of becoming 0/0 = NaN.
    data[relative_column] = percentages.where(totals != 0, data[raw_count])
    return data

def preprocess_raw_counts(count_form, raw_counts, data):
    """Apply the transformation selected by ``count_form`` to every entry of ``raw_counts``.

    Args:
        count_form: one of ``'volumetric'``, ``'class percentage'`` or
            ``'relative abundance'``.
        raw_counts: iterable of names handed, one at a time, to the selected
            compute function as its first argument.
        data: frame the compute functions add their derived columns to.

    Returns:
        The frame returned by the last compute call (``data`` itself when
        ``raw_counts`` is empty).

    Raises:
        ValueError: if ``count_form`` is not one of the supported values.
    """
    if count_form == 'volumetric':
        # NOTE(review): compute_volumetric appends ' count' to its first argument
        # itself, so this branch expects camera names (e.g. 'SPC-Pier') rather
        # than full count-column names -- confirm against callers.
        compute_fn = compute_volumetric
    elif count_form == 'class percentage':
        compute_fn = compute_class_percentage
    elif count_form == 'relative abundance':
        compute_fn = compute_relative_abundance
    else:
        # Previously an unknown form fell through and failed later with an
        # UnboundLocalError on compute_fn; fail early with a clear message.
        raise ValueError(
            "Unknown count_form {!r}; expected 'volumetric', 'class percentage' "
            "or 'relative abundance'".format(count_form)
        )

    for rc in raw_counts:
        data = compute_fn(rc, data)
    return data

#todo maybe load dataset???

# Start again from a fresh copy of df so the derived columns are added to a clean frame.
data = df.copy()

# Add a per-date class-percentage column and a per-class relative-abundance
# column for every entry of raw_counts.
data = preprocess_raw_counts('class percentage', raw_counts, data)
data = preprocess_raw_counts('relative abundance', raw_counts, data)

# Split the rows into the three classes treated as dominant and all remaining
# ("rare") classes; both subsets get a fresh 0..n-1 index.
dominant_cls = ['Lingulodinium polyedra', 'Prorocentrum micans', 'Pseudo-nitzschia chain']
dominant_cls_df = data[data['class'].isin(dominant_cls)].reset_index(drop=True)
rare_cls_df = data[~data['class'].isin(dominant_cls)].reset_index(drop=True)
rare_cls = sorted(rare_cls_df['class'].unique())

# Set verbose to True to preview the first rows of each count form.
verbose = False
if verbose:
    display(data[['class', 'datetime'] + volumetric_counts].head(9))

    display(data[['class', 'datetime'] + raw_counts].head(9))

    display(data[['class', 'datetime'] + vol_time_counts].head(9))
    
    display(data[['class', 'datetime'] + class_percentage_counts].head(9))
    
    display(data[['class', 'datetime'] + rel_counts].head(9))


In [None]:
# Count form analysed by the cells below; they index it as COUNT_FORM[0], [1], [2]
# under the Lab-micro, SPC-Lab and SPC-Pier headings respectively.
COUNT_FORM = raw_counts

# ================ Lab-Micro ================

In [None]:
# Column analysed throughout the Lab-micro section.
SMPL_METHOD = COUNT_FORM[0]
# Needed for the .hvplot accessor used on the pandas objects below.
import hvplot.pandas
title = 'Total Counts'
color = 'blue'

printmd(f'## [{SMPL_METHOD}] {title} Time Series & Distribution')
# Per-date total of SMPL_METHOD shown three ways: time series, box plot, histogram.
x = data.groupby('datetime')[SMPL_METHOD].sum().hvplot.line(rot=30, color=color, xformatter=formatter).opts(height=300, width=600, tools=['hover'])

y = data.groupby('datetime')[SMPL_METHOD].sum().hvplot.box(rot=0, color=color).opts(tools=['hover'])

z = data.groupby('datetime')[SMPL_METHOD].sum().hvplot.hist(rot=0, color=color).opts(tools=['hover'])

# The same per-date totals as a one-row table, followed by their summary statistics.
display(data.groupby('datetime')[SMPL_METHOD].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD].sum().describe().to_frame().transpose())

# Last expression of the cell: the three plots side by side.
hv.Layout(x + y + z).cols(3)

#### Analysis
- Median (131) seems to be reliable value for understanding total counts collected over 26 days
- #TODO what's making up most of these high consistent counts???

In [None]:
# NOTE(review): a bare groupby selection has no aggregation, so this cell only
# shows the GroupBy object's repr; the aggregated values are displayed in the next cell.
data.groupby('datetime')['Lab-micro volume counted (ml)']

In [None]:
"""
META DATA
"""
from counts_analysis.gtruth_analysis import plot_meta_time_series
printmd(f'## [{SMPL_METHOD}] Meta Data')
printmd('### Lab-micro Volume Counted (mL) Time Series')
display(data.groupby('datetime')['Lab-micro volume counted (ml)'].agg('mean').to_frame().transpose())
display(data.groupby('datetime')['Lab-micro volume counted (ml)'].agg('mean').describe().transpose())
plot_meta_time_series('Lab-micro volume counted (ml)', data, logged=False)

#### Analysis of Volume Counted
- Data points of high volumes counted: 08-05, 09-03, 09-16
- #TODO figure out how total counts correlate with this

In [None]:
printmd('### Lab-micro Cell Detection Count Limit Time Series')
display(data.groupby('datetime')['Lab-micro cell count detection limit'].unique().to_frame().transpose())
display(data.groupby('datetime')['Lab-micro cell count detection limit'].unique().describe().transpose())
plot_meta_time_series('Lab-micro cell count detection limit', data, logged=False)

#### Analysis of Cell Detection Count Limit
- Low cell points correspond with volume counted ~ 08-05 (131), 09-03 (136), 09-16 (136)

In [None]:
"""
CLASS DISTRIBUTIONS
"""
printmd(f'## [{SMPL_METHOD}] Class Distribution')
from counts_analysis.gtruth_analysis import plot_total_distribution, plot_class_distribution_over_period
display(data.groupby('class')[SMPL_METHOD].sum().to_frame().transpose())
display(data.groupby('class')[SMPL_METHOD].sum().describe().to_frame().transpose())

plot_total_distribution(SMPL_METHOD, data=data, logged=False)

In [None]:
printmd('#### Total Counts Distribution for Dominant Classes Only')
display(dominant_cls_df.groupby('class')[SMPL_METHOD].sum().describe().to_frame().transpose())

printmd('#### Total Counts Distribution for Rare Classes Only')
display(rare_cls_df.groupby('class')[SMPL_METHOD].sum().describe().to_frame().transpose())

#### Analysis of Total Class Distribution
- Rare classes seem to be Akashiwo, Ceratiums, Chattonella, & Gyrodinium from Lab-micro perspective
- Median count per class after 26 days is 55. Very wide IQR & Range (21-1527; 1-1919).
- Clear there is a difference in distribution for DOMINANT (median: 1798) vs RARE classes (median: 25)

In [None]:
printmd('### Average Distribution over 26 Days for Class Counts')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD, data=data); plt.show();

# plot_class_distribution_over_period(stats_descriptor='median', smpl_technique=SMPL_METHOD, data=data); plt.show();

#### Analysis for Average Distribution over 26 Days for Class Counts
- Seems like it's obvious here that the median is zero for the rare classes. Averages are also fairly close.

In [None]:
"""
DOMINANT X RARE CLASS AVG DISTRIBUTION
"""
printmd('#### Dominant Class Time Series')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD, data=dominant_cls_df); plt.show();

smpl_technique = COUNT_FORM[0]
stacked = dominant_cls_df.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
display(stacked[dominant_cls].apply(pd.Series.value_counts).fillna(0).astype(int).transpose())
stacked.hvplot.hist(y=dominant_cls, bins=[0,1,5,10,15,20,50,100,150,185,245,255,385,400,990,1050], alpha=0.5, subplots=True)

In [None]:
# Fraction of sampled days on which each dominant class had a zero count.
# Comparing against 0 directly avoids the KeyError that the former
# value_counts()...[0] lookup raised when no class had a zero-count day, and the
# number of days is taken from the table instead of a hard-coded 26.
non_detected_days = (stacked[dominant_cls] == 0).sum() / len(stacked)
display(non_detected_days.to_frame().transpose())
display(non_detected_days.describe().to_frame().transpose())
# The plotted values are fractions in [0, 1], so the labels say so.
non_detected_days.hvplot.bar(rot=30).opts(title='Fraction of Zero Count Days per Species', ylabel=f'Fraction of Days (N={len(stacked)})')

In [None]:
printmd('#### Rare Class Time Series')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD, data=rare_cls_df); plt.show();

smpl_technique = COUNT_FORM[0]
stacked = rare_cls_df.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
display(stacked[rare_cls].apply(pd.Series.value_counts).fillna(0).astype(int).transpose())
stacked.hvplot.hist(y=rare_cls, bins=[0,1,3,5,7,10,15,20,25], alpha=0.5, subplots=True)

In [None]:
# Fraction of sampled days on which each rare class had a zero count.
# Comparing against 0 directly avoids the KeyError that the former
# value_counts()...[0] lookup raised when no class had a zero-count day, and the
# number of days is taken from the table instead of a hard-coded 26.
non_detected_days = (stacked[rare_cls] == 0).sum() / len(stacked)
display(non_detected_days.to_frame().transpose())
display(non_detected_days.describe().to_frame().transpose())
# The plotted values are fractions in [0, 1], so the labels say so.
non_detected_days.hvplot.bar(rot=30).opts(title='Fraction of Zero Count Days per Species', ylabel=f'Fraction of Days (N={len(stacked)})')

#### Analysis for Dominant x Rare Class
- Across the 6 rare classes, the average percentage of zero-count days is 75% (std dev 17%). This is quite high for the number of species and dates, and it backs up what we saw in the total distribution of the dominant species.

In [None]:
"""
CLASS TIME SERIES
"""
from counts_analysis.gtruth_analysis import plot_class_time_series
title = 'Class'
printmd(f'### [{SMPL_METHOD}] {title} Time Series')
plot_class_time_series(SMPL_METHOD, data, logged=False)
printmd(f'### [{SMPL_METHOD}] {title} Time Series (Logged)')
plot_class_time_series(SMPL_METHOD, data, logged=True)

In [None]:
printmd('#### Dominant Class Time Series')
plot_class_time_series(SMPL_METHOD, dominant_cls_df, logged=False)

In [None]:
printmd('#### Rare Class Time Series')
plot_class_time_series(SMPL_METHOD, rare_cls_df, logged=False)

#### Analysis for Class Time Series
- From here it seems like there are two peaks in the microscopy data, when all of the counts showed up. It seems to be a new date (2019-08-26).
- Also of the elevated period, 2019-05-28, the ability to find Akashiwo which previously had a huge elevation on 2019-05-23 was worse.
- It seems the ceratium classes appeared together

In [None]:
SMPL_METHOD = COUNT_FORM[0]
import hvplot.pandas
title = 'Total Counts x Dominant Cls x Rare Cls'
color = 'blue'

printmd(f'## [{SMPL_METHOD}] {title} Time Series & Distribution')
x = data.groupby('datetime')[SMPL_METHOD].sum().hvplot.line(rot=30, color=color, label='Total Counts', xformatter=formatter).opts(height=300, width=600, tools=['hover'])

y = dominant_cls_df.groupby('datetime')[SMPL_METHOD].sum().hvplot.line(rot=30, color='red', label='Dominant Class', xformatter=formatter).opts(height=300, width=600, tools=['hover'])

z = rare_cls_df.groupby('datetime')[SMPL_METHOD].sum().hvplot.line(rot=30, color='orange', label='Rare Class', xformatter=formatter).opts(height=300, width=600, tools=['hover'])

printmd('Total Distribution')
# display(data.groupby('datetime')[SMPL_METHOD].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD].sum().describe().to_frame().transpose())

printmd('Dominant Class Totals Distribution')
# display(dominant_cls_df.groupby('datetime')[SMPL_METHOD].sum().to_frame().transpose())
display(dominant_cls_df.groupby('datetime')[SMPL_METHOD].sum().describe().to_frame().transpose())

printmd('Rare Class Totals Distribution')
# display(rare_cls_df.groupby('datetime')[SMPL_METHOD].sum().to_frame().transpose())
display(rare_cls_df.groupby('datetime')[SMPL_METHOD].sum().describe().to_frame().transpose())

hv.Layout(x * y * z)

#### Analysis of Total x Dominant x Rare
- From this perspective it's very clear that the dominant class groups make up the total samples from the microscopy samples

## Interaction of Classes

In [None]:
"""
CLASS HEATMAP
"""
from counts_analysis.gtruth_analysis import plot_heatmap
title = 'Class x Datetime x '
printmd(f'## [{SMPL_METHOD}] {title} HeatMap')
lab_micro_heatmap = plot_heatmap(SMPL_METHOD, data)
lab_micro_heatmap

In [None]:
from counts_analysis.gtruth_analysis import plot_heatmap

title = 'Class x Datetime x [Dominant x Rare]'
printmd(f'## [{SMPL_METHOD}] {title} HeatMap')

sample_technique = SMPL_METHOD
counts_df = dominant_cls_df.copy()
counts_df = counts_df.sort_values(by=['class', 'datetime'])
sdata = hv.Dataset(data=counts_df, kdims=['class', 'datetime'])
dominant_heatmap = sdata.to(hv.HeatMap, ['datetime', 'class'], sample_technique).opts(
    title=sample_technique, colorbar=True, width=1000, height=300, xrotation=60, tools=['hover'], shared_axes=True)

# heatmap_data
dominant_heatmap

In [None]:
counts_df = rare_cls_df.copy()
counts_df = counts_df.sort_values(by=['class', 'datetime'])
sdata = hv.Dataset(data=counts_df, kdims=['class', 'datetime'])
rare_heatmap = sdata.to(hv.HeatMap, ['datetime', 'class'], sample_technique).opts(
    title=sample_technique, colorbar=True, width=1000, height=300, xrotation=60, tools=['hover'], shared_axes=True)
rare_heatmap

#### Analysis of Dominant x Rare Class HeatMap
- 2019-09-03 had a high count of Chattonella which corresponds to the change in cell volume.
- 2019-08-26 also seemed like a high peak of rare species

In [None]:
formatter = DatetimeTickFormatter(months='%b %Y')

def plot_stacked_bar_chart(smpl_technique, data, verbose=False):
    """Plot one stacked bar per date with one segment per class.

    NOTE(review): this notebook defines ``plot_stacked_bar_chart`` in three
    separate cells; whichever cell ran last is the definition in effect.

    Args:
        smpl_technique: name of the column in ``data`` whose values are stacked;
            also used as the plot title.
        data: long-format frame with 'datetime', 'class' and ``smpl_technique``
            columns.
        verbose: when True, print the pivoted table's summary and first rows
            (debugging aid; off by default so the cell output is just the plot).

    Returns:
        The stacked bar chart produced by ``hvplot``.
    """
    # Pivot to one row per date and one column per class.
    stacked = data.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
    stacked['datetime'] = pd.to_datetime(stacked['datetime'], format="%Y-%m-%d")
    if verbose:
        # DataFrame.info() prints its report itself and returns None, so
        # wrapping it in print() only added a stray "None" line.
        stacked.info()
        print(stacked.head())
    plot = stacked.hvplot.bar(x='datetime', y=sorted(list(data['class'].unique())), stacked=True, rot=30).opts(width=1000, height=600, title=smpl_technique)
    return plot

display(data.groupby('datetime')[SMPL_METHOD].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD].sum().describe().to_frame().transpose())

micro = plot_stacked_bar_chart(COUNT_FORM[0], data)
micro.opts(logy=True)

In [None]:
micro_class_percentage = plot_stacked_bar_chart(class_percentage_counts[0], data)
micro_class_percentage

"""
CLASS HEATMAP
"""
from counts_analysis.gtruth_analysis import plot_heatmap
title = 'Class x Datetime x '
lab_micro_heatmap = plot_heatmap(SMPL_METHOD, data)

micro_table = data.groupby('datetime')[SMPL_METHOD].sum().to_frame().transpose().hvplot.table(width=1750)

hv.Layout(micro_class_percentage + lab_micro_heatmap + micro_table).cols(1)

In [None]:
printmd('#### Average Pseudo-nitzschia Class Percentage Distribution')
display(data[data['class'] == 'Pseudo-nitzschia chain'][class_percentage_counts[0]].describe().to_frame().transpose())

printmd('#### Average Prorocentrum micans Class Percentage Distribution')
display(data[data['class'] == 'Prorocentrum micans'][class_percentage_counts[0]].describe().to_frame().transpose())

printmd('#### Average Lingulodinium polyedra Class Percentage Distribution')
display(data[data['class'] == 'Lingulodinium polyedra'][class_percentage_counts[0]].describe().to_frame().transpose())

# Sum the three per-class means straight from the data (dominant_cls holds exactly
# the three classes summarised above) instead of hand-copying them from the
# tables, so the figure cannot go stale when the dataset changes.
print('Sum of averaged class percentage between all 3 species: ',
      data[data['class'].isin(dominant_cls)].groupby('class')[class_percentage_counts[0]].mean().sum())

#### Analysis of Stacked Bar Chart
- From here it seems like a majority of the percentage of counts are from the Pseudo-nitzschia chain during ambient days, followed by Prorocentrum micans.
- For almost all samples, almost 96% of the time these species were the Dominant Species. [54, 26, 15]
- Seems like there's a difference in detecting Prorocentrum micans vs Lingulodinium polyedras
- During the ambient period, it seems like the Ceratium Falcatiforme or Fusus popped up a lot with the Prorocentrum micans. Same with Chattonella
- Cochlodinium showed up several days before the spike in Prorocentrum micans

In [None]:
title = 'Raw Data'
printmd(f'## [{SMPL_METHOD}] {title}')
data[['class', 'datetime', 'Lab-micro volume counted (ml)', 'Lab-micro cell count detection limit', 'Lab-micro cells/mL', 'Lab-micro count']].head(27)

# ================ SPC-Lab ================

In [None]:
SMPL_METHOD_1 = COUNT_FORM[1]
import hvplot.pandas
title = 'Total Counts'
color = 'orange'

printmd(f'## [{SMPL_METHOD_1}] {title} Time Series & Distribution')
x = data.groupby('datetime')[SMPL_METHOD_1].sum().hvplot.line(rot=30, color=color, xformatter=formatter).opts(height=300, width=600, tools=['hover'])

y = data.groupby('datetime')[SMPL_METHOD_1].sum().hvplot.box(rot=0, color=color).opts(tools=['hover'])

z = data.groupby('datetime')[SMPL_METHOD_1].sum().hvplot.hist(rot=0, color=color).opts(tools=['hover'])

display(data.groupby('datetime')[SMPL_METHOD_1].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

hv.Layout(x + y + z).cols(3)

#### Analysis of Total Counts
- Total Counts overall are much lower than Lab-micro. [Lab-micro median (131) vs SPC-Lab median (18)] MUCH lower. 

In [None]:
"""
CLASS DISTRIBUTIONS
"""
printmd(f'## [{SMPL_METHOD_1}] Class Distribution')
from counts_analysis.gtruth_analysis import plot_total_distribution, plot_class_distribution_over_period

display(data.groupby('class')[SMPL_METHOD_1].sum().to_frame().transpose())
display(data.groupby('class')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

plot_total_distribution(SMPL_METHOD_1, data=data, logged=False)

#### Analysis for Accumulated Class Count Distribution
- Overall it seems like there's more rare classes accounted for even though the total counts are lower.
- Prorocentrum micans make up 50% of the accumulated class counts

In [None]:
printmd('#### Total Counts Distribution for Dominant Classes Only')
display(dominant_cls_df.groupby('class')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

printmd('#### Total Counts Distribution for Rare Classes Only')
display(rare_cls_df.groupby('class')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

#### Analysis for Dominant vs Rare Accumulated Distribution
- Yes, it seems like there's a difference between the two groups of classes. The average is definitely much higher here.

In [None]:
printmd('### Average Distribution over 26 Days for Class Counts')

plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD_1, data=data); plt.show();

# plot_class_distribution_over_period(stats_descriptor='median', smpl_technique=SMPL_METHOD_1, data=data); plt.show();

#### Analysis
- From here the rare classes are still the same. Also same median zero abundance (raw count). 
- SPC-Lab seems to validate and agree with Lab-micro but doesn't accumulate as many counts. 

In [None]:
"""
DOMINANT X RARE CLASS AVG DISTRIBUTION
"""
printmd('#### Dominant Class Time Series')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD_1, data=dominant_cls_df); plt.show();

smpl_technique = COUNT_FORM[1]
stacked = dominant_cls_df.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
display(stacked[dominant_cls].apply(pd.Series.value_counts).fillna(0).astype(int).transpose())
stacked.hvplot.hist(y=dominant_cls, alpha=0.5, subplots=True)

In [None]:
# Fraction of sampled days on which each dominant class had a zero count.
# Comparing against 0 directly avoids the KeyError that the former
# value_counts()...[0] lookup raised when no class had a zero-count day, and the
# number of days is taken from the table instead of a hard-coded 26.
non_detected_days = (stacked[dominant_cls] == 0).sum() / len(stacked)
display(non_detected_days.to_frame().transpose())
display(non_detected_days.describe().to_frame().transpose())
# The plotted values are fractions in [0, 1], so the labels say so.
non_detected_days.hvplot.bar(rot=30).opts(title='Fraction of Zero Count Days per Species', ylabel=f'Fraction of Days (N={len(stacked)})')

In [None]:
printmd('#### Rare Class Time Series')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD_1, data=rare_cls_df); plt.show();

smpl_technique = COUNT_FORM[1]
stacked = rare_cls_df.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
display(stacked[rare_cls].apply(pd.Series.value_counts).fillna(0).astype(int).transpose())
stacked.hvplot.hist(y=rare_cls, bins=[0,1,3,5,7,10,15,20,25], alpha=0.5, subplots=True)

In [None]:
# Fraction of sampled days on which each rare class had a zero count.
# Comparing against 0 directly avoids the KeyError that the former
# value_counts()...[0] lookup raised when no class had a zero-count day, and the
# number of days is taken from the table instead of a hard-coded 26.
non_detected_days = (stacked[rare_cls] == 0).sum() / len(stacked)
display(non_detected_days.to_frame().transpose())
display(non_detected_days.describe().to_frame().transpose())
# The plotted values are fractions in [0, 1], so the labels say so.
non_detected_days.hvplot.bar(rot=30).opts(title='Fraction of Zero Count Days per Species', ylabel=f'Fraction of Days (N={len(stacked)})')

#### Analysis for Dominant x Rare Classes Distributions
- Okay, the average is definitely a little bit lower here (SPC-Lab 62% (std 26%) vs Lab-micro 75% (std 17%)).
- However, this could be due to the Gyrodinium dropping the average.
- Species against each other, they actually have very similar capture rates of not detecting these species.
- Both failed to capture Ceratium furca about 50% of the time and Akashiwo more than 80% of the time. Cochlodinium: 65% vs 69%.

In [None]:
"""
CLASS TIME SERIES
"""
from counts_analysis.gtruth_analysis import plot_class_time_series
title = 'Class'
printmd(f'### [{SMPL_METHOD_1}] {title} Time Series')
plot_class_time_series(SMPL_METHOD_1, data, logged=False)
printmd(f'### [{SMPL_METHOD_1}] {title} Time Series (Logged)')
plot_class_time_series(SMPL_METHOD_1, data, logged=True)

In [None]:
printmd('#### Rare Class Time Series')
plot_class_time_series(SMPL_METHOD_1, rare_cls_df, logged=False)

# filtered_data = rare_cls_df[rare_cls_df['datetime'] <= '2019-06-03']
# plot_class_time_series(SMPL_METHOD_1, filtered_data, logged=False)

#### Analysis of Class time Series
- Seems there was a consistent peak for the Lab around 08-26 and 09-03, which was one of the dates the cell detection count limit changed. 9-16 did not appear though.
- Also it seems only the bloom affected the Prorocentrum micans & Lingulodinium polyedra. No pseudo-nitzschia chains. 
- The rare species collected by SPC-Lab did not seem to experience an inflation compared to Lab-micro

Question: what would be the reason why we're not able to detect some of these rare species that often?

In [None]:
# This SPC-Lab cell uses SMPL_METHOD_1 (the SPC-Lab column set at the top of this
# section) instead of rebinding SMPL_METHOD. SMPL_METHOD is the Lab-micro column
# for the cells above, and overwriting it here made later cells and re-runs of
# the Lab-micro cells silently pick up the SPC-Lab column.
import hvplot.pandas
title = 'Total Counts x Dominant Cls x Rare Cls'
color = 'blue'

printmd(f'## [{SMPL_METHOD_1}] {title} Time Series & Distribution')
# Per-date totals for all classes, the dominant classes only and the rare classes only.
x = data.groupby('datetime')[SMPL_METHOD_1].sum().hvplot.line(rot=30, color=color, label='Total Counts', xformatter=formatter).opts(height=300, width=600, tools=['hover'])

y = dominant_cls_df.groupby('datetime')[SMPL_METHOD_1].sum().hvplot.line(rot=30, color='red', label='Dominant Class', xformatter=formatter).opts(height=300, width=600, tools=['hover'])

z = rare_cls_df.groupby('datetime')[SMPL_METHOD_1].sum().hvplot.line(rot=30, color='orange', label='Rare Class', xformatter=formatter).opts(height=300, width=600, tools=['hover'])

printmd('Total Distribution')
# display(data.groupby('datetime')[SMPL_METHOD_1].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

printmd('Dominant Class Totals Distribution')
# display(dominant_cls_df.groupby('datetime')[SMPL_METHOD_1].sum().to_frame().transpose())
display(dominant_cls_df.groupby('datetime')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

printmd('Rare Class Totals Distribution')
# display(rare_cls_df.groupby('datetime')[SMPL_METHOD_1].sum().to_frame().transpose())
display(rare_cls_df.groupby('datetime')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

# Overlay the three lines on one set of axes.
hv.Layout(x * y * z)

#### Analysis
- From this perspective, it seems that the SPC-Lab does in fact validate the Microscopy in dominant classes making up the total counts.
- #TODO write justification for low counts

In [None]:
def plot_stacked_bar_chart(smpl_technique, data, verbose=False):
    """Plot one stacked bar per date with one segment per class.

    NOTE(review): this notebook defines ``plot_stacked_bar_chart`` in three
    separate cells; whichever cell ran last is the definition in effect.

    Args:
        smpl_technique: name of the column in ``data`` whose values are stacked;
            also used as the plot title.
        data: long-format frame with 'datetime', 'class' and ``smpl_technique``
            columns.
        verbose: when True, print the plot's dimensions (debugging aid; off by
            default so the cell output is just the plot).

    Returns:
        The stacked bar chart produced by ``hvplot``.
    """
    # Pivot to one row per date and one column per class.
    stacked = data.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
    plot = stacked.hvplot.bar(x='datetime', y=sorted(list(data['class'].unique())), stacked=True, rot=30).opts(width=1000, height=600, title=smpl_technique)
    if verbose:
        # dimensions is a method; printing it uncalled only showed the bound-method repr.
        print(plot.dimensions())
    return plot

display(data.groupby('datetime')[SMPL_METHOD_1].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD_1].sum().describe().to_frame().transpose())

micro = plot_stacked_bar_chart(COUNT_FORM[1], data)
micro.opts(logy=True)

In [None]:
lab_class_percentage = plot_stacked_bar_chart(class_percentage_counts[1], data)

"""
CLASS HEATMAP
"""
from counts_analysis.gtruth_analysis import plot_heatmap
title = 'Class x Datetime x '
spc_lab_heatmap = plot_heatmap(SMPL_METHOD_1, data)

lab_table = data.groupby('datetime')[SMPL_METHOD_1].sum().to_frame().transpose().hvplot.table(width=1750)

hv.Layout(lab_class_percentage + spc_lab_heatmap + lab_table).cols(1)

#### Analysis for Stacked Bar Chart
- From this aspect it seems that the lab system would illustrate largely what the Lab-micro samples would look like if the formaldehyde did not dissolve the naked dinoflagellates.
- Of the dominant species, the lab system definitely had a harder time identifying Lingulodinium polyedras and Pseudo-nitzschia chains. Also it seems sampling is an issue for the system.

# ================ SPC-Pier ================

In [None]:
# Column analysed throughout the SPC-Pier section.
SMPL_METHOD_2 = COUNT_FORM[2]
import hvplot.pandas
title = 'Total Counts'
color = 'green'

# The heading names SMPL_METHOD_2, the column plotted below. It previously used
# SMPL_METHOD, which still held the value left over from an earlier section.
printmd(f'## [{SMPL_METHOD_2}] {title} Time Series & Distribution')
# Per-date total of SMPL_METHOD_2 shown three ways: time series, box plot, histogram.
x = data.groupby('datetime')[SMPL_METHOD_2].sum().hvplot.line(rot=30, color=color, xformatter=formatter).opts(height=300, width=600, tools=['hover'])

y = data.groupby('datetime')[SMPL_METHOD_2].sum().hvplot.box(rot=0, color=color).opts(tools=['hover'])

z = data.groupby('datetime')[SMPL_METHOD_2].sum().hvplot.hist(rot=0, color=color).opts(tools=['hover'])

# The same per-date totals as a one-row table, followed by their summary statistics.
display(data.groupby('datetime')[SMPL_METHOD_2].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD_2].sum().describe().to_frame().transpose())


hv.Layout(x + y + z).cols(3)

#### Analysis of Total Counts
- Okay definitely seems like the pier was undercounting. These low total counts could be due to hard_to_id images, which has been a large issue for trying to fairly compare abundances.
- Another reason could be due to the high pseudo-nitzschia chain counts that makes up most of the Microscopy counts

- It is important to note that during the days of the blooms, that the Pier was able to pick up comparable numbers to the Lab-micro.

In [None]:
"""
CLASS DISTRIBUTIONS
"""
printmd(f'## [{SMPL_METHOD_2}] Class Distribution')
from counts_analysis.gtruth_analysis import plot_total_distribution, plot_class_distribution_over_period

display(data.groupby('class')[SMPL_METHOD_2].sum().to_frame().transpose())
display(data.groupby('class')[SMPL_METHOD_2].sum().describe().to_frame().transpose())

plot_total_distribution(SMPL_METHOD_2, data=data, logged=False)

#### Analysis of Accumulated Class Counts Distribution
- Pretty similar to the SPC-Lab in terms of distribution, except for absent Pseudo-nitzschia chains in the Pier
- The Pier system definitely seems to have about double the counts of the SPC-Lab.
- #TODO determine if it's correct to compare the SPC-Lab with the SPC-Pier under these circumstances of sampling 17 min before AND after vs just sequential...

In [None]:
printmd('### Average Distribution over 26 Days for Class Counts')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD_2, data=data); plt.show();

# plot_class_distribution_over_period(stats_descriptor='median', smpl_technique=SMPL_METHOD_2, data=data); plt.show();

#### Analysis of Average Distributions
- Hmmmm seems like the medians for the Rare classes are still all zero... so not good

In [None]:
"""
DOMINANT X RARE CLASS AVG DISTRIBUTION
"""
printmd('#### Dominant Class Time Series')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD_2, data=dominant_cls_df); plt.show();

smpl_technique = COUNT_FORM[2]
stacked = dominant_cls_df.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
display(stacked[dominant_cls].apply(pd.Series.value_counts).fillna(0).astype(int).transpose())
stacked.hvplot.hist(y=dominant_cls, bins=[0,1,5,10,15,20,50,100,150,185,245,255,385,400,990,1050], alpha=0.5, subplots=True)

In [None]:
# Fraction of sampled days on which each dominant class had a zero count.
# Comparing against 0 directly avoids the KeyError that the former
# value_counts()...[0] lookup raised when no class had a zero-count day, and the
# number of days is taken from the table instead of a hard-coded 26.
non_detected_days = (stacked[dominant_cls] == 0).sum() / len(stacked)
display(non_detected_days.to_frame().transpose())
display(non_detected_days.describe().to_frame().transpose())
# The plotted values are fractions in [0, 1], so the labels say so.
non_detected_days.hvplot.bar(rot=30).opts(title='Fraction of Zero Count Days per Species', ylabel=f'Fraction of Days (N={len(stacked)})')

#### Analysis of Dominant Cls Distribution
- Using imaging systems, you're subjected to missing the collection of species DUE to hard_to_id images for the most part (examples are pseudo-nitzschia chains; lingulodinium polyedra)
- Comparing this to the microscopy, there's definitely better precision in capturing these frequent classes. This is an area where the imaging systems need to do better.

In [None]:
printmd('#### Rare Class Time Series')
plot_class_distribution_over_period(stats_descriptor='mean', smpl_technique=SMPL_METHOD_2, data=rare_cls_df); plt.show();

smpl_technique = COUNT_FORM[2]
stacked = rare_cls_df.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
display(stacked[rare_cls].apply(pd.Series.value_counts).fillna(0).astype(int).transpose())
stacked.hvplot.hist(y=rare_cls, bins=[0,1,3,5,7,10,15,20,25], alpha=0.5, subplots=True)

In [None]:
# Fraction of sampled days on which each rare class had a zero count.
# Comparing against 0 directly avoids the KeyError that the former
# value_counts()...[0] lookup raised when no class had a zero-count day, and the
# number of days is taken from the table instead of a hard-coded 26.
non_detected_days = (stacked[rare_cls] == 0).sum() / len(stacked)
display(non_detected_days.to_frame().transpose())
display(non_detected_days.describe().to_frame().transpose())
# The plotted values are fractions in [0, 1], so the labels say so.
non_detected_days.hvplot.bar(rot=30).opts(title='Fraction of Zero Count Days per Species', ylabel=f'Fraction of Days (N={len(stacked)})')

#### Analysis of Zero Count Days
- Okay, given this plot, the SPC-Pier seems to have the upper hand in detecting these rare species on a regular basis (relative to lab-micro) 51% (25%) VS 75% (std 17%).
- Even for comparing species for species, the SPC-Pier does do better in detecting these species.
- So in reflection of the SPC-Pier vs Micro, it seems that the SPC-Pier would be the ideal choice for sampling a higher diversity of classes

In [None]:
"""
CLASS TIME SERIES
"""
from counts_analysis.gtruth_analysis import plot_class_time_series
title = 'Class'
printmd(f'### [{SMPL_METHOD_2}] {title} Time Series')
plot_class_time_series(SMPL_METHOD_2, data, logged=False)
printmd(f'### [{SMPL_METHOD_2}] {title} Time Series (Logged)')
plot_class_time_series(SMPL_METHOD_2, data, logged=True)

In [None]:
printmd('#### Rare Class Time Series')
plot_class_time_series(SMPL_METHOD_2, rare_cls_df, logged=False)

#### Analysis for Class Time Series
- Okay definitely clear that pier is better in sampling these species. Again however, could be subjected to the Gyrodinium.

In [None]:
"""
CLASS HEATMAP
"""
from counts_analysis.gtruth_analysis import plot_heatmap
title = 'Class x Datetime x '
printmd(f'## [{SMPL_METHOD_2}] {title} HeatMap')
# Bind the SPC-Pier heatmap to its own name. Assigning it to lab_micro_heatmap
# overwrote the Lab-micro heatmap built earlier in the notebook.
spc_pier_heatmap = plot_heatmap(SMPL_METHOD_2, data)
spc_pier_heatmap

In [None]:
def plot_stacked_bar_chart(smpl_technique, data):
    """Build a stacked bar chart of one count column, split by class, over datetime.

    Args:
        smpl_technique: name of the column in `data` to plot; also used as the plot title.
        data: long-form frame with 'datetime' and 'class' columns plus `smpl_technique`.

    Returns:
        The hvplot bar object (1000 x 600) with one stacked segment per class.
    """
    class_names = sorted(data['class'].unique())
    # Wide table: one row per datetime, one column per class.
    wide = (
        data.set_index(['datetime', 'class'])[smpl_technique]
        .unstack()
        .reset_index()
    )
    bars = wide.hvplot.bar(x='datetime', y=class_names, stacked=True, rot=30)
    plot = bars.opts(width=1000, height=600, title=smpl_technique)
    print(plot.dimensions)
    return plot

# Total of SMPL_METHOD_2 per datetime shown as one wide row, followed by summary statistics of those totals.
display(data.groupby('datetime')[SMPL_METHOD_2].sum().to_frame().transpose())
display(data.groupby('datetime')[SMPL_METHOD_2].sum().describe().to_frame().transpose())

# Stacked bar chart for COUNT_FORM[2], displayed with a log-scaled y axis.
# NOTE(review): the chart is bound to `micro` although it uses COUNT_FORM[2]; later cells bind
# `micro` to index 0 of their count lists - confirm the name is intended here.
micro = plot_stacked_bar_chart(COUNT_FORM[2], data)
micro.opts(logy=True)

In [None]:
# Stacked class-percentage bars for the third entry of class_percentage_counts.
pier_class_percentage = plot_stacked_bar_chart(class_percentage_counts[2], data)

"""
CLASS HEATMAP
"""
# NOTE(review): repeated mid-notebook import; `title` below is assigned but not used in this cell.
from counts_analysis.gtruth_analysis import plot_heatmap
title = 'Class x Datetime x '
spc_pier_heatmap = plot_heatmap(SMPL_METHOD_2, data)

# Per-datetime totals of SMPL_METHOD_2 rendered as a single-row table.
pier_table = data.groupby('datetime')[SMPL_METHOD_2].sum().to_frame().transpose().hvplot.table(width=1750)

# Stack bars, heatmap and table vertically in one column.
hv.Layout(pier_class_percentage + spc_pier_heatmap + pier_table).cols(1)

# SPC-Pier, SPC-Lab, Lab-micro Comparison

In [None]:
COUNTS = COUNT_FORM
import hvplot.pandas  # registers the `.hvplot` accessor on pandas objects
title = 'Total Counts'
SMPL_METHOD = 'SPC-Pier, SPC-Lab, Lab-micro Comparison'
printmd(f'## [{SMPL_METHOD}] {title} Time Series & Distribution')

# Per-datetime totals of every column in COUNTS, computed once and reused by all views below
# (the original recomputed this groupby for each plot and table).
daily_totals = data.groupby('datetime')[COUNTS].sum()

# Line plot over time, box plot and histogram of the same daily totals.
x = daily_totals.hvplot.line(ylabel='Count', xformatter=formatter, line_width=3).opts(height=500, width=1200, tools=['hover'], fontscale=1.5)

y = daily_totals.hvplot.box(rot=0).opts(tools=['hover'])

z = daily_totals.hvplot.hist(rot=0).opts(tools=['hover'])

# Rows = count columns, columns = datetimes.
display(daily_totals.transpose())

# Summary statistics per count column, with the grand total appended as a 'sum' column.
sum_counts = pd.DataFrame(daily_totals.transpose().sum(axis=1), columns=['sum'])
data_desc = pd.concat([daily_totals.describe().transpose(), sum_counts], axis=1)
display(data_desc)

hv.Layout(x + y + z).cols(3)

In [None]:
printmd(f'## [{SMPL_METHOD}] Class Distribution')
# Plot sampling techniques for each class against each other

current_palette_7 = sns.color_palette("muted", 3)
sns.set_palette(current_palette_7)

y = 'raw count'
sm = data[['class', 'datetime'] + COUNT_FORM]
# Long form: one row per (class, datetime, count column). `var_name` must be a scalar for
# ordinary (non-MultiIndex) columns; the original passed a one-element list.
sm = sm.melt(id_vars=['class', 'datetime'], var_name='setting', value_name=y)
sm = sm.sort_values('class')

In [None]:
# plt.figure(figsize=(10, 6))

# sns.barplot(x='class', y=y, hue='setting', data=sm)
# plt.xlabel('Class')
# plt.xticks(rotation=30)
# if 'relative' in y:
#     plt.ylim(0, 100)
# plt.tight_layout()

def plot_compared_methods_class_distribution(COUNT_FORM, data, y_label, ax=None):
    """Draw a grouped bar plot comparing several count columns for every class.

    Args:
        COUNT_FORM: sequence of column names in `data` to compare (one bar group member each).
        data: frame containing 'class', 'datetime' and every column named in COUNT_FORM.
        y_label: label applied to the y axis.
        ax: optional matplotlib Axes to draw on; when None the current pyplot axes are used.

    Returns:
        None. The plot is drawn as a side effect.
    """
    y = 'raw count'
    # list(...) lets callers pass a tuple as well as a list of column names.
    sm = data[['class', 'datetime'] + list(COUNT_FORM)]
    # `var_name` is passed as a scalar, as documented for pandas.melt.
    sm = sm.melt(id_vars=['class', 'datetime'], var_name='setting', value_name=y)
    # Sorting by class makes the bar order match the sorted tick labels set below.
    sm = sm.sort_values('class')

    # Explicit None check: do not rely on the truthiness of an Axes object.
    if ax is not None:
        sns.barplot(x='class', y=y, hue='setting', data=sm, ax=ax)
        ax.set_ylabel(y_label)
        ax.set_xticklabels(labels=sorted(sm['class'].unique()), rotation=30)
    else:
        sns.barplot(x='class', y=y, hue='setting', data=sm)
        plt.ylabel(y_label)
        plt.xticks(rotation=30)

# Two side-by-side panels per figure: rare_cls_df on the left axis, dominant_cls_df on the right.
printmd('#### Absolute Counts')
fig,ax = plt.subplots(1, 2, figsize=(25, 6))
plot_compared_methods_class_distribution(raw_counts, rare_cls_df, ax=ax[0], y_label='Absolute Counts')    
plot_compared_methods_class_distribution(raw_counts, dominant_cls_df, ax=ax[1], y_label='Absolute Counts')
plt.show()

# Same layout, using the class-percentage columns instead of the raw counts.
printmd('#### Class Percentage')
fig,ax = plt.subplots(1, 2, figsize=(25, 6))
plot_compared_methods_class_distribution(class_percentage_counts, rare_cls_df, ax=ax[0], y_label='Class Percentage')    
plot_compared_methods_class_distribution(class_percentage_counts, dominant_cls_df, ax=ax[1], y_label='Class Percentage')
plt.show()

In [None]:
# One per-class time-series plot (logged=False) for every column listed in COUNT_FORM.
printmd(f'## [{SMPL_METHOD}] Class Time Series')
for smpl_technique in COUNT_FORM:
    printmd(f'### {smpl_technique}')
    plot_class_time_series(smpl_technique, data, logged=False)

## Zero counts for Dominant & Rare Species

In [None]:
"""
RARE SPECIES
"""
# NOTE: rebinding the notebook-global COUNTS here affects every later cell that reads COUNTS.
COUNTS = raw_counts_pred

# Rare classes = every class that is not listed in dominant_cls.
rare_cls_df = data[~data['class'].isin(dominant_cls)].reset_index(drop=True)
rare_cls = sorted(rare_cls_df['class'].unique())

def get_non_detected_days(data, smpl_technique, classes, n_days=26):
    """Compute, per class, the share of days on which the class was detected.

    Despite its name, the returned value is a detection rate:
    ``1 - (number of days with a value of 0) / n_days``.

    Args:
        data: long-form frame with 'datetime' and 'class' columns plus `smpl_technique`.
        smpl_technique: name of the count column to evaluate; also printed as a heading.
        classes: class labels (columns of the pivoted table) to include.
        n_days: denominator used for the rate. Defaults to 26, the value that was
            previously hard-coded.

    Returns:
        pandas Series indexed by class and named 0 (callers rename that column),
        holding the detection rate of each class.
    """
    printmd(smpl_technique)
    # Wide table: one row per datetime, one column per class.
    stacked = data.set_index(['datetime', 'class'])[smpl_technique].unstack().reset_index()
    # Count zero-valued days directly. The previous value_counts()[0] lookup raised
    # KeyError whenever none of the classes had a zero count.
    zero_days = stacked[classes].eq(0).sum()
    # Keep the Series name 0 so existing `.to_frame().rename({0: ...}, axis=1)` callers still work.
    non_detected_days = (1.0 - zero_days / n_days).rename(0)
    display(non_detected_days.to_frame().transpose())
    display(non_detected_days.describe().to_frame().transpose())
    return non_detected_days

# Per-class values from get_non_detected_days for each of the three count columns in COUNTS.
# NOTE(review): get_non_detected_days returns 1 - (zero-count days / 26), i.e. a detection rate,
# so the bar titles below ("Number of Zero Count Days") do not match the plotted quantity.
non_detected_days = get_non_detected_days(rare_cls_df, COUNTS[0], rare_cls)
micro = non_detected_days.hvplot.bar(rot=30).opts(title='Number of Zero Count Days per Species', ylabel='Percentage of Days (N=26)')

non_detected_days1 = get_non_detected_days(rare_cls_df, COUNTS[1], rare_cls)
lab = non_detected_days1.hvplot.bar(rot=30).opts(title='Number of Zero Count Days per Species', ylabel='Percentage of Days (N=26)')

non_detected_days2 = get_non_detected_days(rare_cls_df, COUNTS[2], rare_cls)
pier = non_detected_days2.hvplot.bar(rot=30).opts(title='Number of Zero Count Days per Species', ylabel='Percentage of Days (N=26)')

# The three hvplot bars above are only consumed by this disabled layout.
# hv.Layout(micro+lab+pier).cols(1)

# Each returned Series is named 0, so rename that single column to the method label.
detection_col = ['Lab-micro', 'SPC-Lab', 'SPC-Pier']
m = non_detected_days.to_frame().rename({0:detection_col[0]}, axis=1)
l = non_detected_days1.to_frame().rename({0:detection_col[1]}, axis=1)
p = non_detected_days2.to_frame().rename({0:detection_col[2]}, axis=1)

# Join the three methods on 'class' (the class labels are carried in the index at this point),
# then copy the index into a regular 'class' column for plotting.
rare_zeros = m.merge(l, on='class')
rare_zeros = rare_zeros.merge(p, on='class')
rare_zeros['class'] = rare_zeros.index
rare_zeros = rare_zeros.reset_index(drop=True)

def plot_compared_methods_zero_counts(COUNT_FORM, data, y_label, ax=None):
    """Draw a grouped bar plot of per-class detection rates for several sampling methods.

    NOTE: this function is defined again in later cells of this notebook; the
    most recently executed definition is the one in effect.

    Args:
        COUNT_FORM: sequence of column names in `data`, one per sampling technique.
        data: frame with a 'class' column plus every column named in COUNT_FORM.
        y_label: label applied to the y axis.
        ax: optional matplotlib Axes to draw on; when None the current pyplot axes are used.

    Returns:
        None. The plot (with its legend placed outside the axes) is drawn as a side effect.
    """
    y = 'Detection rate'
    var_name = 'sampling technique'
    # list(...) lets callers pass a tuple as well as a list of column names.
    sm = data[['class'] + list(COUNT_FORM)]
    # `var_name` is passed as a scalar, as documented for pandas.melt.
    sm = sm.melt(id_vars=['class'], var_name=var_name, value_name=y)
    sm = sm.sort_values(['class', var_name])

    # barplot returns the Axes it drew on (the current axes when ax is None).
    bar_ax = sns.barplot(x='class', y=y, hue=var_name, data=sm, ax=ax)
    bar_ax.set_ylabel(y_label)
    if ax is not None:
        bar_ax.set_xticklabels(labels=sorted(sm['class'].unique()), rotation=90)
    else:
        plt.xticks(rotation=30, ha='right')
    # Attach the legend to the axes that were drawn on; plt.legend() would target the
    # *current* axes, which is not necessarily the `ax` supplied by the caller.
    bar_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# Three-colour palette; each plot below selects the palette entries matching the columns it
# compares, so a given method keeps the same colour across the pairwise and combined plots.
current_palette_7 = sns.color_palette("muted", 3)
sns.set_palette(current_palette_7[:2])
    
# Lab-micro vs SPC-Lab
plot_compared_methods_zero_counts(detection_col[:2], rare_zeros, y_label='Detection Rate'); plt.show();

# Lab-micro vs SPC-Pier
sns.set_palette([current_palette_7[0], current_palette_7[2]])
plot_compared_methods_zero_counts([detection_col[0], detection_col[2]], rare_zeros, y_label='Detection Rate'); plt.show();


# SPC-Lab vs SPC-Pier
sns.set_palette(current_palette_7[1:])
plot_compared_methods_zero_counts(detection_col[1:], rare_zeros, y_label='Detection Rate'); plt.show();

# All three methods together
sns.set_palette(current_palette_7)
plot_compared_methods_zero_counts(detection_col, rare_zeros, y_label='Detection Rate'); plt.show();

In [None]:
# Same computation as the rare-species cell, applied to dominant_cls_df / dominant_cls.
# NOTE(review): get_non_detected_days returns 1 - (zero-count days / 26), i.e. a detection rate,
# so the bar titles below ("Number of Zero Count Days") do not match the plotted quantity.
printmd('## Dominant Species (zero count days)')
non_detected_days = get_non_detected_days(dominant_cls_df, COUNTS[0], dominant_cls)
micro = non_detected_days.hvplot.bar(rot=30).opts(title='Number of Zero Count Days per Species', ylabel='Percentage of Days (N=26)')

non_detected_days1 = get_non_detected_days(dominant_cls_df, COUNTS[1], dominant_cls)
lab = non_detected_days1.hvplot.bar(rot=30).opts(title='Number of Zero Count Days per Species', ylabel='Percentage of Days (N=26)')

non_detected_days2 = get_non_detected_days(dominant_cls_df, COUNTS[2], dominant_cls)
pier = non_detected_days2.hvplot.bar(rot=30).opts(title='Number of Zero Count Days per Species', ylabel='Percentage of Days (N=26)')

# The three hvplot bars above are only consumed by this disabled layout.
# hv.Layout(micro+lab+pier).cols(1)

# Each returned Series is named 0, so rename that single column to the method label.
detection_col = ['Lab-micro', 'SPC-Lab', 'SPC-Pier']
m = non_detected_days.to_frame().rename({0:detection_col[0]}, axis=1)
l = non_detected_days1.to_frame().rename({0:detection_col[1]}, axis=1)
p = non_detected_days2.to_frame().rename({0:detection_col[2]}, axis=1)

# NOTE(review): the merged table keeps the name `rare_zeros` although it now holds the
# dominant classes; it overwrites the rare-species table built in the previous cell.
rare_zeros = m.merge(l, on='class')
rare_zeros = rare_zeros.merge(p, on='class')
rare_zeros['class'] = rare_zeros.index
rare_zeros = rare_zeros.reset_index(drop=True)

def plot_compared_methods_zero_counts(COUNT_FORM, data, y_label, ax=None):
    """Draw a grouped bar plot of per-class detection rates for several sampling methods.

    NOTE: this is a repeat of a definition made in an earlier cell of this
    notebook; the most recently executed definition is the one in effect.

    Args:
        COUNT_FORM: sequence of column names in `data`, one per sampling technique.
        data: frame with a 'class' column plus every column named in COUNT_FORM.
        y_label: label applied to the y axis.
        ax: optional matplotlib Axes to draw on; when None the current pyplot axes are used.

    Returns:
        None. The plot (with its legend placed outside the axes) is drawn as a side effect.
    """
    y = 'Detection rate'
    var_name = 'sampling technique'
    # list(...) lets callers pass a tuple as well as a list of column names.
    sm = data[['class'] + list(COUNT_FORM)]
    # `var_name` is passed as a scalar, as documented for pandas.melt.
    sm = sm.melt(id_vars=['class'], var_name=var_name, value_name=y)
    sm = sm.sort_values(['class', var_name])

    # barplot returns the Axes it drew on (the current axes when ax is None).
    bar_ax = sns.barplot(x='class', y=y, hue=var_name, data=sm, ax=ax)
    bar_ax.set_ylabel(y_label)
    if ax is not None:
        bar_ax.set_xticklabels(labels=sorted(sm['class'].unique()), rotation=90)
    else:
        plt.xticks(rotation=30, ha='right')
    # Attach the legend to the axes that were drawn on; plt.legend() would target the
    # *current* axes, which is not necessarily the `ax` supplied by the caller.
    bar_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# Three-colour palette; each plot below selects the palette entries matching the columns it
# compares, so a given method keeps the same colour across the pairwise and combined plots.
# `rare_zeros` here is the table built earlier in this cell from dominant_cls_df.
current_palette_7 = sns.color_palette("muted", 3)
sns.set_palette(current_palette_7[:2])
    
# Lab-micro vs SPC-Lab
plot_compared_methods_zero_counts(detection_col[:2], rare_zeros, y_label='Detection Rate'); plt.show();

# Lab-micro vs SPC-Pier
sns.set_palette([current_palette_7[0], current_palette_7[2]])
plot_compared_methods_zero_counts([detection_col[0], detection_col[2]], rare_zeros, y_label='Detection Rate'); plt.show();


# SPC-Lab vs SPC-Pier
sns.set_palette(current_palette_7[1:])
plot_compared_methods_zero_counts(detection_col[1:], rare_zeros, y_label='Detection Rate'); plt.show();

# All three methods together
sns.set_palette(current_palette_7)
plot_compared_methods_zero_counts(detection_col, rare_zeros, y_label='Detection Rate'); plt.show();

In [None]:
# NOTE(review): this cell rebuilds the merged table from non_detected_days, non_detected_days1 and
# non_detected_days2 without recomputing them, so its content depends on which earlier cell
# (rare or dominant species) assigned those names last.
detection_col = ['Lab-micro', 'SPC-Lab', 'SPC-Pier']
m = non_detected_days.to_frame().rename({0:detection_col[0]}, axis=1)
l = non_detected_days1.to_frame().rename({0:detection_col[1]}, axis=1)
p = non_detected_days2.to_frame().rename({0:detection_col[2]}, axis=1)

# Join the three methods on 'class', then copy the index into a regular 'class' column.
rare_zeros = m.merge(l, on='class')
rare_zeros = rare_zeros.merge(p, on='class')
rare_zeros['class'] = rare_zeros.index
rare_zeros = rare_zeros.reset_index(drop=True)

def plot_compared_methods_zero_counts(COUNT_FORM, data, y_label, ax=None):
    """Draw a grouped bar plot of per-class detection rates for several sampling methods.

    NOTE: this redefines a function of the same name from earlier cells of this
    notebook. This variant rotates the x tick labels by 90 degrees when no
    `ax` is supplied, where the earlier definitions use 30 degrees.

    Args:
        COUNT_FORM: sequence of column names in `data`, one per sampling technique.
        data: frame with a 'class' column plus every column named in COUNT_FORM.
        y_label: label applied to the y axis.
        ax: optional matplotlib Axes to draw on; when None the current pyplot axes are used.

    Returns:
        None. The plot (with its legend placed outside the axes) is drawn as a side effect.
    """
    y = 'Detection rate'
    var_name = 'sampling technique'
    # list(...) lets callers pass a tuple as well as a list of column names.
    sm = data[['class'] + list(COUNT_FORM)]
    # `var_name` is passed as a scalar, as documented for pandas.melt.
    sm = sm.melt(id_vars=['class'], var_name=var_name, value_name=y)
    sm = sm.sort_values(['class', var_name])

    # barplot returns the Axes it drew on (the current axes when ax is None).
    bar_ax = sns.barplot(x='class', y=y, hue=var_name, data=sm, ax=ax)
    bar_ax.set_ylabel(y_label)
    if ax is not None:
        bar_ax.set_xticklabels(labels=sorted(sm['class'].unique()), rotation=90)
    else:
        plt.xticks(rotation=90)
    # Attach the legend to the axes that were drawn on; plt.legend() would target the
    # *current* axes, which is not necessarily the `ax` supplied by the caller.
    bar_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# Three-colour palette; each plot below selects the palette entries matching the columns it
# compares, so a given method keeps the same colour across the pairwise and combined plots.
current_palette_7 = sns.color_palette("muted", 3)
sns.set_palette(current_palette_7[:2])
    
# Lab-micro vs SPC-Lab
plot_compared_methods_zero_counts(detection_col[:2], rare_zeros, y_label='Detection Rate'); plt.show();

# Lab-micro vs SPC-Pier
sns.set_palette([current_palette_7[0], current_palette_7[2]])
plot_compared_methods_zero_counts([detection_col[0], detection_col[2]], rare_zeros, y_label='Detection Rate'); plt.show();


# SPC-Lab vs SPC-Pier
sns.set_palette(current_palette_7[1:])
plot_compared_methods_zero_counts(detection_col[1:], rare_zeros, y_label='Detection Rate'); plt.show();

# All three methods together
sns.set_palette(current_palette_7)
plot_compared_methods_zero_counts(detection_col, rare_zeros, y_label='Detection Rate'); plt.show();

In [None]:
%%output backend='bokeh'
def plot_stacked_bar_chart(smpl_technique, data):
    """Build a log-scaled stacked bar chart of one count column, split by class, over datetime.

    Redefines the function of the same name from an earlier cell: this variant
    requests `logy=True`, uses a 750 x 600 canvas and redims 'Variable' to (-10, 90).

    Args:
        smpl_technique: name of the column in `data` to plot; also used as the plot title.
        data: long-form frame with 'datetime' and 'class' columns plus `smpl_technique`.

    Returns:
        The resulting hvplot/holoviews object.
    """
    class_names = sorted(data['class'].unique())
    # Wide table: one row per datetime, one column per class.
    wide = (
        data.set_index(['datetime', 'class'])[smpl_technique]
        .unstack()
        .reset_index()
    )
    bars = wide.hvplot.bar(x='datetime', y=class_names, stacked=True, rot=30, logy=True)
    sized = bars.opts(width=750, height=600, title=smpl_technique)
    plot = sized.redim(Variable=hv.Dimension('Variable', range=(-10, 90)))
    print(plot.dimensions)
    return plot

# One stacked bar chart per entry of COUNT_FORM, stacked vertically in a single column.
micro = plot_stacked_bar_chart(COUNT_FORM[0], data)
lab = plot_stacked_bar_chart(COUNT_FORM[1], data)
pier = plot_stacked_bar_chart(COUNT_FORM[2], data)
hv.Layout(micro + lab + pier).cols(1)

In [None]:
def plot_stacked_bar_chart(smpl_technique, data):
    """Build a stacked bar chart of one column, split by class, labelled as class percentage.

    Redefines the function of the same name from earlier cells: this variant
    uses a 1200 x 600 canvas, a 'Class Percentage' y label, fontscale 1.5 and
    redims 'Variable' to (-10, 90).

    Args:
        smpl_technique: name of the column in `data` to plot; also used as the plot title.
        data: long-form frame with 'datetime' and 'class' columns plus `smpl_technique`.

    Returns:
        The resulting hvplot/holoviews object.
    """
    class_names = sorted(data['class'].unique())
    # Wide table: one row per datetime, one column per class.
    wide = (
        data.set_index(['datetime', 'class'])[smpl_technique]
        .unstack()
        .reset_index()
    )
    bars = wide.hvplot.bar(x='datetime', y=class_names, stacked=True, rot=30)
    sized = bars.opts(width=1200, height=600, title=smpl_technique, ylabel='Class Percentage', fontscale=1.5)
    plot = sized.redim(Variable=hv.Dimension('Variable', range=(-10, 90)))
    print(plot.dimensions)
    return plot

# Class-percentage stacked bars, one per entry of class_percentage_counts.
micro = plot_stacked_bar_chart(class_percentage_counts[0], data)
lab = plot_stacked_bar_chart(class_percentage_counts[1], data)
pier = plot_stacked_bar_chart(class_percentage_counts[2], data)

# NOTE(review): the heatmaps read the notebook-global COUNTS, whose value depends on which
# earlier cell assigned it last - confirm it holds the intended count columns here.
lab_micro_heatmap = plot_heatmap(COUNTS[0], data)
spc_lab_heatmap = plot_heatmap(COUNTS[1], data)
spc_pier_heatmap = plot_heatmap(COUNTS[2], data)

# Per-datetime totals of each raw-count column, each rendered as a single-row table.
micro_table = data.groupby('datetime')[raw_counts[0]].sum().to_frame().transpose().hvplot.table(width=1750)
lab_table = data.groupby('datetime')[raw_counts[1]].sum().to_frame().transpose().hvplot.table(width=1750)
pier_table = data.groupby('datetime')[raw_counts[2]].sum().to_frame().transpose().hvplot.table(width=1750)

# hv.Layout(lab_micro_heatmap + spc_lab_heatmap + spc_pier_heatmap).cols(1)

# hv.Layout(micro + lab + pier).cols(1)

# Final view: for each method in turn, its bars, heatmap and totals table, all in one column.
hv.Layout(micro + lab_micro_heatmap + micro_table + lab + spc_lab_heatmap + lab_table + pier+ spc_pier_heatmap + pier_table).cols(1)

#### Analysis of Class Percentages across 3 methods
- Class percentages could definitely be skewed by the low total counts from the SPCS. So this might not be a good comparison.
- But we could be hopeful that ... (TODO: this thought is unfinished; complete the sentence.)


In [None]:
# Re-apply the matplotlib font sizes (16, the same value as `fontsize` set in the first cell).
matplotlib.rc('xtick', labelsize=16)     
matplotlib.rc('ytick', labelsize=16)
matplotlib.rc('axes', labelsize=16, titlesize=16)

# Three-colour "Set2" palette, applied in reversed order.
current_palette_7 = sns.color_palette("Set2", 3)
sns.set_palette(current_palette_7[::-1])

# NOTE(review): import sits mid-notebook; a later cell defines a local function that is also
# named plot_correlation and replaces this imported one once that cell has run.
from counts_analysis.gtruth_analysis import plot_correlation

plot_correlation(data, COUNT_FORM)

In [None]:
# NOTE(review): this rebinds the notebook-global COUNT_FORM, so any cell run after this one
# (including earlier cells when re-run) sees the class-percentage columns instead.
COUNT_FORM = class_percentage_counts
plot_correlation(data, COUNT_FORM)

In [None]:
def plot_correlation(data, counts, automated=False):
    """Scatter-plot the three count columns against each other, overall and per class.

    The first panel overlays the three pairwise scatters for all rows of
    `data`; each following panel repeats them for a single class and adds a
    line from `best_fit` for every pair.

    NOTE: this definition replaces the `plot_correlation` imported from
    `counts_analysis.gtruth_analysis` earlier in the notebook.

    Args:
        data: frame with a 'class' column plus the three columns named in `counts`.
        counts: sequence of three column names; the legend labels treat them as
            Lab-micro, Lab and Pier, in that order.
        automated: when True the legend prefix is 'Auto', otherwise 'SPC'.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    from validate_exp.v_utils import best_fit

    NUM_COLS = 3
    count_type = 'Auto' if automated else 'SPC'
    # (x column, y column, legend label) for each pairwise comparison.
    # NOTE(review): the third label hard-codes 'SPC-Lab' even when automated=True - confirm intent.
    pairs = [
        (counts[0], counts[1], f'{count_type}-Lab (Y) - Lab-micro (X)'),
        (counts[0], counts[2], f'{count_type}-Pier (Y) - Lab-micro (X)'),
        (counts[1], counts[2], f'{count_type}-Pier (Y) - SPC-Lab (X)'),
    ]

    classes = sorted(data['class'].unique())
    # One panel for the overall view plus one per class. Keep the original 4 rows as a
    # minimum and grow beyond that instead of raising IndexError when there are many classes.
    n_rows = max(4, -(-(len(classes) + 1) // NUM_COLS))
    fig, ax = plt.subplots(n_rows, NUM_COLS, figsize=(10, 2.5 * n_rows))

    # Panel (0, 0): all classes together.
    overall_ax = ax[0, 0]
    for x_col, y_col, label in pairs:
        sns.scatterplot(x=data[x_col], y=data[y_col], ax=overall_ax, label=label)
    overall_ax.set_xlabel('Count (X)')
    overall_ax.set_ylabel('Count (Y)')

    # Remaining panels: one per class, filled row by row starting at panel index 1.
    for panel, cls in enumerate(classes, start=1):
        row, col = divmod(panel, NUM_COLS)
        ax_idx = ax[row, col]
        cls_df = data[data['class'] == cls]

        # Scatters first, then fit lines, matching the original drawing order.
        for x_col, y_col, label in pairs:
            sns.scatterplot(x=cls_df[x_col], y=cls_df[y_col], ax=ax_idx, label=label)
        for x_col, y_col, _ in pairs:
            Xfit, Yfit = best_fit(cls_df[x_col], cls_df[y_col], False, verbose=False)
            ax_idx.plot(Xfit, Yfit)

        ax_idx.set_xlabel('Count (X)')
        ax_idx.set_ylabel('Count (Y)')

        # Square the panel: both axes run from 0 to the larger of the two upper limits.
        _, ymax = ax_idx.get_ylim()
        _, xmax = ax_idx.get_xlim()
        max_val = max(xmax, ymax)
        ax_idx.set_ylim(0, max_val)
        ax_idx.set_xlim(0, max_val)

        ax_idx.set_title(cls)
        #     set_plotting_opts(ax_idx, logged=LOGGED)
    plt.tight_layout()
    plt.show()

# Rebind the notebook-global COUNT_FORM to the raw counts and plot with the function defined above.
COUNT_FORM = raw_counts
plot_correlation(data, COUNT_FORM)

In [None]:
# NOTE: rebinding the notebook-global COUNTS affects every later cell that reads COUNTS.
COUNTS = raw_counts
# Work on a copy so the datetime conversion below does not touch `data`.
data_ = data.copy()
# NOTE(review): the first cell already converts df['datetime'] with the same format before
# `data = df.copy()`, so this conversion looks redundant - confirm before removing.
data_['datetime'] = pd.to_datetime(data_['datetime'], format='%Y-%m-%d')
# Per-datetime totals of each raw-count column, plotted with the shared month/year tick formatter.
data_.groupby('datetime')[COUNTS].sum().hvplot(xformatter=formatter)