In [None]:
from IPython.lib.deepreload import reload
%load_ext autoreload
%autoreload 1

In [None]:
%reload_ext autoreload

In [None]:
import logging
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as plt_colors
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import scipy.stats as st
import holoviews as hv
hv.extension('bokeh')
from holoviews import dim
from IPython.display import Markdown, display
from IPython.core.display import HTML

import matplotlib
matplotlib.rc('xtick', labelsize=14)     
matplotlib.rc('ytick', labelsize=14)
matplotlib.rc('axes', labelsize=14, titlesize=14)

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import hvplot.pandas

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

from counts_analysis.c_utils import COUNTS_CSV, CLASSES, set_settings, set_counts_v2, rename_columns

#== Load Datasets ==#
# Read the counts CSV and pass it straight through rename_columns.
df = rename_columns(pd.read_csv(COUNTS_CSV['counts']))
# Dataset restricted to CLASSES, i.e. without the problematic classes
# (Gyrodinium, Pseudo-nitzschia chain)
df_ = df.loc[df['class'].isin(CLASSES)].reset_index(drop=True)
# Working copy used by the rest of the notebook.
data = df.copy()

def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

#=== Set count forms & settings ===#
# Each name below holds the group of column names that set_counts_v2 returns
# for one count form; later cells index the data frame with them.
# COUNT
volumetric_counts = set_counts_v2('cells/mL', micro_default=True)

raw_counts = set_counts_v2('count', micro_default=False)
raw_counts_pred = set_counts_v2('count', micro_default=False, automated=True)

vol_time_counts = set_counts_v2('count', micro_default=True)

class_percentage_counts = set_counts_v2('class percentage', micro_default=False)

# For the next two groups the first entry is replaced by an explicit column
# name; every remaining entry is kept as returned.
rel_counts = [
    'Lab-micro count relative abundance',
    *set_counts_v2('relative abundance', micro_default=False)[1:],
]

class_percentage_counts_pred = [
    'Lab-micro class percentage',
    *set_counts_v2('class percentage', micro_default=False, automated=True)[1:],
]

In [None]:
def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space.

    Every frame is rendered through its Styler as an inline table carrying
    its caption; the tables are separated by three non-breaking spaces and
    shown as one HTML output.

    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions, matched to ``dfs`` by position.
            If the lists differ in length the surplus entries are ignored
            (``zip`` semantics).
    """
    spacer = "\xa0\xa0\xa0"
    # Pair by position instead of building a dict keyed on the caption: a
    # dict silently drops all but one table when two captions are equal.
    fragments = [
        frame.style
             .set_table_attributes("style='display:inline'")
             .set_caption(caption)
             ._repr_html_() + spacer
        for caption, frame in zip(captions, dfs)
    ]
    display(HTML("".join(fragments)))

In [None]:
from counts_analysis.plot_class_summary import plot_summary_both_count_forms, plot_class_summary

def compute_volumetric(spc_camera, data):
    """Add a ``'<spc_camera> cells/mL'`` column computed from the raw count.

    The ``'<spc_camera> count'`` column is divided by 160 when the camera
    name contains ``'Pier'`` and by 60 otherwise.  ``data`` is modified in
    place and the same object is returned.

    Usage

    >>> compute_volumetric('Auto-Pier', data)
    >>> compute_volumetric('SPC-Pier', data)

    """
    normalization_factor = 160 if 'Pier' in spc_camera else 60
    source_column = f'{spc_camera} count'
    target_column = f'{spc_camera} cells/mL'
    data[target_column] = data[source_column] / normalization_factor
    return data

def compute_class_percentage(raw_count, data):
    """Add a column expressing ``raw_count`` as a percentage within each datetime.

    Rows are grouped by the ``'datetime'`` column and every value of
    ``raw_count`` is divided by its group's total and multiplied by 100.
    Rows whose group total is zero keep their original value, so no division
    by zero ends up in the result.

    The output column is ``'Lab-micro cells/mL class percentage'`` when
    ``raw_count`` contains ``'cells/mL'``; otherwise it is the part of
    ``raw_count`` before ``' count'`` followed by ``' class percentage'``.

    Args:
        raw_count: name of the count column to normalise.
        data: pandas.DataFrame with ``raw_count`` and ``'datetime'`` columns.
            It is modified in place.

    Returns:
        The same ``data`` object with the percentage column added.
    """
    if 'cells/mL' in raw_count:
        # NOTE(review): every cells/mL input writes to this one fixed column
        # name, so a second cells/mL column would overwrite the first --
        # confirm that is intended.
        relative_column = 'Lab-micro cells/mL class percentage'
    else:
        relative_column = '{} class percentage'.format(raw_count.split(" count")[0])

    values = data[raw_count]
    # transform() always returns a result indexed like `data`; groupby.apply
    # prepends the group key to the index on recent pandas, which makes the
    # column assignment fail.
    group_totals = data.groupby('datetime')[raw_count].transform('sum')
    percentages = values / group_totals * 100.0
    data[relative_column] = percentages.where(group_totals != 0, values)
    return data

def compute_relative_abundance(raw_count, data):
    """Add a column expressing ``raw_count`` as a percentage within each class.

    Rows are grouped by the ``'class'`` column and every value of
    ``raw_count`` is divided by its group's total and multiplied by 100.
    Rows whose group total is zero keep their original value, so no division
    by zero ends up in the result.

    The output column is ``'Lab-micro cells/mL relative abundance'`` when
    ``raw_count`` contains ``'cells/mL'``; otherwise it is the part of
    ``raw_count`` before ``' count'`` followed by ``' relative abundance'``.

    Args:
        raw_count: name of the count column to normalise.
        data: pandas.DataFrame with ``raw_count`` and ``'class'`` columns.
            It is modified in place.

    Returns:
        The same ``data`` object with the relative-abundance column added.
    """
    if 'cells/mL' in raw_count:
        # NOTE(review): every cells/mL input writes to this one fixed column
        # name, so a second cells/mL column would overwrite the first --
        # confirm that is intended.
        relative_column = 'Lab-micro cells/mL relative abundance'
    else:
        relative_column = '{} relative abundance'.format(raw_count.split(" count")[0])

    values = data[raw_count]
    # transform() always returns a result indexed like `data`; groupby.apply
    # prepends the group key to the index on recent pandas, which makes the
    # column assignment fail.
    group_totals = data.groupby('class')[raw_count].transform('sum')
    percentages = values / group_totals * 100.0
    data[relative_column] = percentages.where(group_totals != 0, values)
    return data

def preprocess_raw_counts(count_form, raw_counts, data):
    """Derive one count form for every entry of ``raw_counts``.

    Args:
        count_form: ``'volumetric'``, ``'class percentage'`` or
            ``'relative abundance'``; selects compute_volumetric,
            compute_class_percentage or compute_relative_abundance.
        raw_counts: iterable of values handed one by one to the selected
            function as its first argument (for ``'volumetric'`` that
            argument is the ``spc_camera`` name, for the other two forms it
            is a count column name).
        data: pandas.DataFrame passed to, and returned from, each call.

    Returns:
        The data frame returned by the last compute call (``data`` itself
        when ``raw_counts`` is empty).

    Raises:
        ValueError: if ``count_form`` is not one of the supported forms.
    """
    supported_forms = ('volumetric', 'class percentage', 'relative abundance')
    # Fail with a readable message instead of the UnboundLocalError that an
    # unmatched if/elif chain would produce further down.
    if count_form not in supported_forms:
        raise ValueError(
            'Unknown count_form {!r}; expected one of {}'.format(count_form, supported_forms)
        )

    compute_fn = {
        'volumetric': compute_volumetric,
        'class percentage': compute_class_percentage,
        'relative abundance': compute_relative_abundance,
    }[count_form]

    for rc in raw_counts:
        data = compute_fn(rc, data)
    return data

# TODO: decide whether the dataset should be (re)loaded here.

# Start from a fresh copy of df, then add the derived count columns.
data = preprocess_raw_counts('class percentage', raw_counts, df.copy())
data = preprocess_raw_counts('relative abundance', raw_counts, data)

data = preprocess_raw_counts('class percentage', raw_counts_pred, data)

In [None]:
def plot_class_summary(counts, data, relative=False):
    """ Plot the three-method summary: histogram, time series and correlation.

    Usage

    >>> plot_class_summary(rc_counts, cls_df)

    Args:
        counts: sequence of three column names in ``data`` (list or tuple).
            The correlation panel pairs them as (0, 1), (0, 2) and (1, 2)
            with the fixed labels 'lab - micro', 'pier - micro' and
            'pier - lab' -- presumably the order is (micro, lab, pier);
            verify against the caller.
        data: pandas.DataFrame holding the ``counts`` columns plus
            'datetime' and 'class'.
        relative: when True, titles and axis labels read
            'Relative Abundance' instead of 'Count'.

    Returns:
        hv.Layout with the three panels arranged in three columns.
        get_linear_fit is called once per pair, so its regression summaries
        are printed as a side effect.

    """
    # pandas treats a tuple as a single key when indexing a frame or a
    # groupby, so normalise to a list once and use it everywhere.
    counts = list(counts)

    fontscale = 1.5
    title_pre = 'Compared Counts' if not relative else '[Relative Abundance]'
    xy = 'Count' if not relative else 'Relative Abundance'
    max_val = max(data[counts].max()) + 10

    # Per-datetime totals feed both the histogram and the time series.
    daily_totals = data.groupby('datetime')[counts].sum()

    # histogram
    rot = 0 if not relative else 5
    bx = daily_totals.hvplot.hist(y=counts,
                                  group_label='Sampling Technique',
                                  value_label=xy,
                                  label='{} Distribution'.format(title_pre),
                                  rot=rot)\
        .opts(tools=['hover'], width=400, height=500, show_legend=False, fontscale=fontscale)

    # time series
    ts = daily_totals.hvplot.line(rot=30, value_label='Total Count', group_label='Sampling Techniques', label=f'{title_pre} Time Series').\
        opts(height=500, width=800, legend_position='top_right', fontscale=fontscale)

    # correlation plot: one scatter + linear fit + two bands per pair of methods
    dot_size, alpha = 8, 0.6
    line_width = 5
    pair_specs = [
        (counts[0], counts[1], 'lab - micro', 'blue'),
        (counts[0], counts[2], 'pier - micro', 'gold'),
        (counts[1], counts[2], 'pier - lab', 'red'),
    ]

    corr = None
    for x_col, y_col, pair_label, pair_color in pair_specs:
        scatter = hv.Scatter(data, x_col, [y_col, 'datetime', 'class'], label=pair_label)\
            .opts(size=dot_size, alpha=alpha, tools=['hover'], fontscale=fontscale, color=pair_color)
        reg, predict_ci, predict_mean_ci = get_linear_fit([x_col, y_col], data, color=pair_color,
                                                          fontscale=fontscale, line_width=line_width)
        pair_overlay = scatter * reg * predict_ci * predict_mean_ci
        corr = pair_overlay if corr is None else corr * pair_overlay

    corr = corr.opts(xlabel=xy , ylabel=xy,
                     title=f'{title_pre} Correlation', xlim=(0, max_val), ylim=(0, max_val), tools=['hover'], width=650, height=500, legend_position='right')

    cls_plot = hv.Layout(bx + ts + corr).cols(3)

    return cls_plot

In [None]:
def _plot_class_summary(cls, x, y, x_relative=False, y_relative=True, classifier=False):
    """Show the summary tables and build the comparison plots for one class.

    Args:
        cls: class label; rows of ``x`` and ``y`` whose 'class' column equals
            it are kept.
        x: DataFrame providing the counts that are plotted.
        y: DataFrame providing the second set of counts; it is only shown in
            the tables, never plotted.
        x_relative: forwarded as ``relative`` to the plotting helpers.
        y_relative: carried along with ``y`` but not read by any helper.
        classifier: when True the module-level ``raw_counts_pred`` and
            ``class_percentage_counts_pred`` column groups are used,
            otherwise ``raw_counts`` and ``class_percentage_counts``.

    Returns:
        hv.Layout in three columns (shared_axes=False): the three-method
        summary followed by the three pairwise summaries.

    Side effects: prints a Markdown heading, displays two pairs of tables,
    and each linear fit prints its regression summary.
    """
    def plot_class_summary_2method(counts, data, relative=False, color=None):
        """ Plot the histogram, time series and correlation of two count columns

        Usage

        >>> plot_class_summary_2method([x_count[0], x_count[1]], cls_df, color=['blue', 'orange', 'blue'])

        Args:
            counts: two column names; counts[0] is the x axis of the scatter
            data: DataFrame with those columns plus 'datetime' and 'class'
            relative: switches the title prefix to '[Relative Abundance]';
                otherwise the prefix is the enclosing function's ``cls``
            color: sequence of colours; the first two colour the histogram
                and the time series, the last one the scatter and the fit.
                Must not be None, since it is sliced and indexed below.

        Returns:
            hv.Layout with the three panels in three columns

        """
        fontscale = 1.5
        title_pre = '[{}]'.format(cls) if not relative else '[Relative Abundance]'
        # NOTE: xy is assigned but not used in this helper.
        xy = 'Count' if not relative else 'Relative Abundance'
        max_val = max(data[list(counts)].max()) + 10

        # histogram of the per-datetime sums
        rot = 0 if not relative else 5
        bx = data.groupby('datetime')[counts].sum().hvplot.hist(y=list(counts),
                                                               label='{} Distribution'.format(title_pre),
                                                                xlabel='Count Value', ylabel='Frequency of Count',
                                                               rot=rot,
                                                               color=color[:2])\
            .opts(tools=['hover'], width=400, height=500, fontscale=fontscale, show_legend=False)

        # time series of the per-datetime sums
        ts = data.groupby('datetime')[counts].sum().hvplot.line(rot=30, value_label='Count', group_label='Sampling Techniques', label=f'{title_pre} Time Series', color=color[:2]).\
            opts(height=500, width=800, legend_position='top_right', fontscale=fontscale)

        # correlation plot
        dot_size, alpha = 8, 0.6
        # NOTE: line_width is assigned but the literal 5 is what gets passed below.
        line_width = 5

        sc1 = hv.Scatter(data, counts[0], [counts[1], 'datetime', 'class'], label='Raw Data').opts(size=dot_size, alpha=alpha, tools=['hover'], color=color[-1], fontscale=fontscale)
        # get_linear_fit is defined in a later cell; it must have been run
        # before this function is called.
        reg, predict_ci, predict_mean_ci = get_linear_fit(counts, data, color=color[-1], fontscale=1.5, line_width=5)

        # Both axes share the range 0..max_val.
        corr = (sc1*reg * predict_ci * predict_mean_ci).opts(xlabel=counts[0] , ylabel=counts[1],
                                                title=f'{title_pre} Correlation', xlim=(0, max_val), ylim=(0, max_val), tools=['hover'], width=650, height=500, legend_position='right')

        cls_plot = hv.Layout(bx + ts + corr).cols(3)

        return cls_plot
    
    def filter_class(cls, x_data, y_data):
        """Return the rows of both frames whose 'class' equals ``cls``, re-indexed from 0."""
        x_cls_df = x_data[x_data['class'] == cls].reset_index(drop=True)
        y_cls_df = y_data[y_data['class'] == cls].reset_index(drop=True)
        return x_cls_df, y_cls_df

    def plot_summary(x, y):
        """Display the tables for ``x`` and ``y`` and build the plots from ``x``.

        ``x`` and ``y`` are (count columns, DataFrame, relative flag) triples.
        """
        x_count, x_data, x_relative = x
        # y_relative is unpacked but not used afterwards.
        y_count, y_data, y_relative = y
        datetime_col = ['datetime']
        display_side_by_side([x_data[datetime_col + list(x_count)], y_data[datetime_col + list(y_count)]], ['raw', 'class percentage'])
        # NOTE(review): the day count in this heading is hard-coded, not
        # derived from the data -- confirm it matches the dataset in use.
        printmd('### Sum total over N=26 days')
        # describe() output with an extra 'sum' row appended, for each frame.
        sum_counts = pd.DataFrame(x_data[list(x_count)].sum(), columns=['sum']).T
        x_desc = pd.concat([x_data[list(x_count)].describe(), sum_counts])
        sum_counts = pd.DataFrame(y_data[list(y_count)].sum(), columns=['sum']).T
        y_desc = pd.concat([y_data[list(y_count)].describe(), sum_counts])

        display_side_by_side([x_desc, y_desc], ['raw descriptors', 'relative descriptors'])
        
        
        # Pairwise summaries for column pairs (0, 1), (0, 2) and (1, 2).
        plot1 = plot_class_summary_2method([x_count[0], x_count[1]], x_data, relative=x_relative, color=['blue', 'orange', 'blue'])
        plot2 = plot_class_summary_2method([x_count[0], x_count[2]], x_data, relative=x_relative, color=['blue', 'green', 'gold'])
        plot3 = plot_class_summary_2method([x_count[1], x_count[2]], x_data, relative=x_relative, color=['orange', 'green', 'red'])
        
        # Summary of all three columns; resolves to the module-level plot_class_summary.
        plot4 = plot_class_summary(x_count, x_data, relative=x_relative)

        return hv.Layout(plot4 + plot1 + plot2 + plot3 ).cols(3).opts(shared_axes=False)

    x_df, y_df = filter_class(cls, x, y)
    printmd(f'# {cls} | Classifier Counts ({classifier})')
    # Choose the column groups for classifier or manual counts.
    if classifier:
        x_counts, y_counts = raw_counts_pred, class_percentage_counts_pred
    else:
        x_counts, y_counts = raw_counts, class_percentage_counts
    return plot_summary((x_counts, x_df, x_relative), (y_counts, y_df, y_relative))

# Notebook-wide defaults read by the per-class cells further down.
printmd('# Sample of Running `_plot_class_summary()`')
x_relative, y_relative = False, True
cls = 'Akashiwo'

DATASET, CLASSIFIER = data, False
# Example call, left disabled here:
# _plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
def get_linear_fit(counts, data, color, fontscale=1.5, line_width=5):
    """Fit ``counts[1] ~ counts[0]`` by OLS and build the HoloViews fit elements.

    As a side effect a Markdown heading with ``counts`` and the statsmodels
    regression summary are printed.

    Args:
        counts: two column names of ``data``; counts[0] is the regressor and
            counts[1] the response.
        data: pandas.DataFrame holding both columns.
        color: colour applied to the fitted line and both bands.
        fontscale: font scale option of the fitted line.
        line_width: width of the fitted line.

    Returns:
        Tuple ``(reg, predict_ci, predict_mean_ci)``:
            reg: hv.Curve of the fitted values ('Linear Fit').
            predict_ci: hv.Area spanning the 95% prediction interval for a
                new observation (the wider band).
            predict_mean_ci: hv.Area spanning the 95% confidence interval of
                the fitted mean (the narrower band).
    """
    x = data[counts[0]].values
    y = data[counts[1]].values
    X = sm.add_constant(x)
    res = sm.OLS(y, X).fit()
    printmd(f'### {counts}')
    print(res.summary())

    # summary_table returns (table, array, column names); only the array is
    # needed. Its columns 2, 4:6 and 6:8 hold the fitted values, the mean
    # confidence bounds and the prediction bounds respectively.
    reg_data = summary_table(res, alpha=0.05)[1]

    # Curve and Area connect points in row order, so order everything by x;
    # with unsorted x the bands fold back over themselves.
    order = np.argsort(x, kind='stable')
    xs = X[order, 1]
    fittedvalues = reg_data[order, 2]
    predict_mean_ci_low, predict_mean_ci_upp = reg_data[order, 4:6].T
    predict_ci_low, predict_ci_upp = reg_data[order, 6:8].T

    reg = hv.Curve(list(zip(xs, fittedvalues)), label='Linear Fit').opts(tools=['hover'], color=color, fontscale=fontscale, line_width=line_width, alpha=0.8)
    # The labels used to be swapped: the 6:8 columns are the prediction
    # interval and the 4:6 columns the confidence interval of the mean.
    predict_ci = hv.Area((xs, predict_ci_low, predict_ci_upp), vdims=['y', 'y2'], label='95% Prediction Band').opts(alpha=0.2, color=color)
    predict_mean_ci = hv.Area((xs, predict_mean_ci_low, predict_mean_ci_upp), vdims=['y', 'y2'], label='95% Confidence Band').opts(alpha=0.4, color=color)

    return reg, predict_ci, predict_mean_ci

def plot_class_summary_2method(counts, data, relative=False, color=None):
    """ Plot the correlation of two count columns with a linear fit and its bands

    Usage

    >>> plot_class_summary_2method([raw_counts[0], raw_counts[1]], cls_df, color=['blue', 'orange', 'black'])

    Args:
        counts: two column names; counts[0] is the x axis, counts[1] the y axis
        data: DataFrame with those columns plus 'datetime' and 'class'
        relative: when True the title prefix is '[Relative Abundance]';
            otherwise it is built from the module-level variable ``cls``,
            which therefore has to be set before calling
        color: sequence of colours of which only the last entry is used (for
            the scatter, the fitted line and the bands); 'blue' is used when
            it is None or empty

    Returns:
        hv.Layout holding the single correlation overlay

    """
    # The declared default (None) cannot be indexed, so fall back to a fixed colour.
    fit_color = 'blue' if color is None or len(color) == 0 else color[-1]

    fontscale = 1.5
    title_pre = '[{}]'.format(cls) if not relative else '[Relative Abundance]'
    max_val = max(data[list(counts)].max()) + 10

    # correlation plot
    dot_size = 8

    sc1 = hv.Scatter(data, counts[0], [counts[1], 'datetime', 'class'], label='Raw Data').opts(size=dot_size, tools=['hover'], color=fit_color, fontscale=fontscale)
    reg, predict_ci, predict_mean_ci = get_linear_fit(counts, data, color=fit_color, fontscale=1.5, line_width=5)

    # Both axes share the range 0..max_val.
    corr = (sc1*reg * predict_ci * predict_mean_ci).opts(xlabel=counts[0] , ylabel=counts[1],
                                            title=f'{title_pre} Correlation', xlim=(0, max_val), ylim=(0, max_val), tools=['hover'], width=650, height=500, legend_position='right')

    cls_plot = hv.Layout(corr).cols(3)

    return cls_plot

# Correlation of the first two raw-count columns for a single class.
cls = 'Prorocentrum micans'
x_data = data.loc[data['class'] == cls].reset_index(drop=True)
plot_class_summary_2method(
    [raw_counts[0], raw_counts[1]],
    x_data,
    relative=False,
    color=['blue', 'orange', 'black'],
)

In [None]:
import numpy as np
from matplotlib import pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

# Matplotlib cross-check of the OLS fit between the first two raw-count
# columns of x_data.
x = x_data[raw_counts[0]].values
y = x_data[raw_counts[1]].values
X = sm.add_constant(x)
res = sm.OLS(y, X).fit()
print(res.summary())

# Take only the numeric array from summary_table. Unpacking the first element
# into `st` at module level would overwrite the `scipy.stats as st` alias.
reg_data = summary_table(res, alpha=0.05)[1]

# Lines are drawn in row order, so order every plotted series by x.
order = np.argsort(x, kind='stable')
X = X[order, 1]
fittedvalues = reg_data[order, 2]
predict_mean_se  = reg_data[order, 3]
predict_mean_ci_low, predict_mean_ci_upp = reg_data[order, 4:6].T
predict_ci_low, predict_ci_upp = reg_data[order, 6:8].T

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="Raw Data")
ax.plot(X, fittedvalues, 'r-', label='Linear Fit')
# Columns 6:8 are the prediction interval and 4:6 the confidence interval of
# the mean; the two legend labels used to be the wrong way round.
ax.plot(X, predict_ci_low, 'b--')
ax.plot(X, predict_ci_upp, 'b--', label='95% Prediction Band')
ax.plot(X, predict_mean_ci_low, 'g--')
ax.plot(X, predict_mean_ci_upp, 'g--', label='95% Confidence Band')
ax.legend(loc='best');
plt.show()

In [None]:
# Inspection cell: show the regressor values next to the upper bound held in
# predict_ci_upp, both left over from the previous cell.
X, predict_ci_upp

In [None]:
# Seaborn regression plots with bootstrapped bands for every pair of the
# three raw-count columns of a single class.
cls = 'Lingulodinium polyedra'
x_data = data[data['class'] == cls].reset_index(drop=True)
ci = 95
# Fixed seed so the bootstrapped bands are identical on every re-run.
bootstrap_seed = 0
sns.lmplot(x=raw_counts[0], y=raw_counts[1], data=x_data, ci=ci, fit_reg=True, n_boot=10000, seed=bootstrap_seed)
sns.lmplot(x=raw_counts[0], y=raw_counts[2], data=x_data, ci=ci, fit_reg=True, n_boot=10000, seed=bootstrap_seed)
sns.lmplot(x=raw_counts[1], y=raw_counts[2], data=x_data, ci=ci, fit_reg=True, n_boot=10000, seed=bootstrap_seed);

In [None]:
# Per-class summaries with classifier=False: each of the following cells sets
# the module-level `cls` and calls _plot_class_summary on DATASET.
cls = 'Akashiwo'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
cls = 'Ceratium falcatiforme or fusus'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
cls = 'Ceratium furca'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
cls = 'Chattonella'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
cls = 'Cochlodinium'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
cls = 'Gyrodinium'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
cls = 'Lingulodinium polyedra'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
# Disabled cell-level plot options (log axes), kept for reference:
# %%opts Scatter [tools=['hover'], width=600, height=600, legend_position='right', logx=True, logy=True, xlim=(-1, None), ylim=(-1, None)]
# %%opts Slope [logx=True, logy=True, xlim=(-1, None), ylim=(-1, None)]

cls = 'Prorocentrum micans'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

In [None]:
cls = 'Pseudo-nitzschia chain'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=False)

## Classifier

In [None]:
# From here on the per-class cells pass classifier=CLASSIFIER, i.e. True.
# NOTE(review): 'Chattonella' has a cell in the classifier=False group but
# none below -- confirm the omission is intentional.
CLASSIFIER = True

In [None]:
cls = 'Akashiwo'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
cls = 'Ceratium falcatiforme or fusus'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
cls = 'Ceratium furca'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
cls = 'Cochlodinium'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
cls = 'Gyrodinium'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
cls = 'Lingulodinium polyedra'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
cls = 'Prorocentrum micans'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)

In [None]:
cls = 'Pseudo-nitzschia chain'
_plot_class_summary(cls, DATASET, DATASET, x_relative=False, y_relative=True, classifier=CLASSIFIER)