In [None]:
from IPython.lib.deepreload import reload
%load_ext autoreload
%autoreload 1

In [None]:
import logging
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as plt_colors
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import scipy.stats as st
import holoviews as hv
hv.extension('bokeh')
from holoviews import dim
from IPython.display import Markdown, display
from IPython.core.display import HTML

import matplotlib
matplotlib.rc('xtick', labelsize=14)     
matplotlib.rc('ytick', labelsize=14)
matplotlib.rc('axes', labelsize=14, titlesize=14)

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from counts_analysis.c_utils import COUNTS_CSV, CLASSES, set_settings, set_counts

#== Load Datasets ==#
# The CSV path is taken from the 'counts' entry of the project-level COUNTS_CSV.
df = pd.read_csv(COUNTS_CSV['counts'])
# Dataset without problematic classes (Gyrodinium, Pseudo-nitzschia chain)
# Keeps only rows whose 'class' is listed in CLASSES and renumbers the index from 0.
df_ = df[df['class'].isin(CLASSES)].reset_index(drop=True)
# Independent copy of the full (unfiltered) dataframe.
data = df.copy()

def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

#=== Set count forms & settings ===#
# COUNT
# SETTING
# NOTE(review): set_counts / set_settings are imported from counts_analysis.c_utils
# and are not visible here. Each set_counts result is presumably a sequence of
# dataframe column names (it is sliced, concatenated and used to select columns
# further down) -- confirm against c_utils.
# Original raw counts
volumetric_counts = set_counts('gtruth', 'cells/mL', micro_default=True)
rc_counts = set_counts('gtruth', 'raw count', micro_default=True)
rc_counts_pred = set_counts('predicted', 'raw count', micro_default=True)

# Same 'raw count' forms, but requested with micro_default=False.
raw_counts = set_counts('gtruth', 'raw count', micro_default=False)
raw_counts_pred = set_counts('predicted', 'raw count', micro_default=False)

rc_settings = set_settings(rc_counts)
print('Example of setting\n{}'.format(rc_settings))
# Relative abundance
rel_counts = set_counts('gtruth', 'relative abundance', micro_default=False)
# Replace the first entry with the micro relative-abundance column name; keep the rest.
rel_counts = ['micro cells/mL relative abundance'] + list(rel_counts[1:])
# Classifier predicted counts
rel_counts_pred = set_counts('predicted', 'relative abundance', micro_default=False)
# Same first-entry replacement as for rel_counts above.
rel_counts_pred = ['micro cells/mL relative abundance'] + list(rel_counts_pred[1:])

#=== Set classifier gtruth vs predictions
# Each list holds two column names: '<source> gtruth raw count' and '<source> predicted raw count'.
lab_gtruth_pred = ['lab {} raw count'.format(lbl) for lbl in ['gtruth', 'predicted']]
pier_gtruth_pred = ['pier {} raw count'.format(lbl) for lbl in ['gtruth', 'predicted']]

In [None]:
def compute_relative_abundance(raw_count, data):
    """Add a per-class relative-abundance column (in percent) derived from ``raw_count``.

    For every group of rows sharing the same 'class', each value of the
    ``raw_count`` column is divided by that group's total and multiplied by 100.
    Groups whose total is zero keep their values unchanged, so no division by
    zero takes place.

    Args:
        raw_count: Name of the count column to normalise. If it contains
            'micro' the result is written to 'micro cells/mL relative abundance';
            otherwise to '<word0> <word1> relative abundance', built from the
            first two whitespace-separated words of ``raw_count``.
        data: DataFrame with a 'class' column and the ``raw_count`` column.
            It is modified in place.

    Returns:
        The same ``data`` object, with the relative-abundance column added or
        overwritten.
    """
    if 'micro' in raw_count:
        relative_column = 'micro cells/mL relative abundance'
    else:
        words = raw_count.split()
        relative_column = f'{words[0]} {words[1]} relative abundance'

    def _percent_of_class_total(counts):
        total = counts.sum()
        # A class with a zero total keeps its values instead of dividing by zero.
        return counts / total * 100.0 if total != 0 else counts

    # transform() always returns a result aligned with the original row index.
    # groupby(...).apply() prepends the group keys to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    data[relative_column] = data.groupby('class')[raw_count].transform(_percent_of_class_total)
    return data

def filter_classes(df, classes):
    """Return the rows of ``df`` whose 'class' is NOT in ``classes``, reindexed from 0."""
    excluded = df['class'].isin(classes)
    kept = df.loc[~excluded]
    return kept.reset_index(drop=True)

def load_absl_counts_dataset(data):
    """Return an independent copy of ``data``; no columns are added or changed."""
    return data.copy()

def load_baseline_dataset(data):
    """Return an independent copy of ``data``; no columns are added or changed."""
    return data.copy()

def load_rel_class_sum_dataset(data):
    """Copy ``data`` and add per-class relative-abundance columns.

    Every column named in the notebook-level ``rc_counts`` and
    ``rc_counts_pred`` is passed through ``compute_relative_abundance``.
    The input frame itself is left untouched.
    """
    result = data.copy()

    # Ground-truth raw-count columns first, then the predicted ones.
    raw_columns = list(rc_counts + rc_counts_pred)
    for column in raw_columns:
        result = compute_relative_abundance(column, result)
    return result

def load_seasonal_dataset(data):
    """Copy ``data``, add per-class relative abundances and split it by date.

    Returns:
        A ``(seasonal, nonseasonal)`` pair: rows whose 'datetime' is one of the
        three hard-coded dates below, and all remaining rows.
    """
    frame = data.copy()

    # Relative abundance for every ground-truth and predicted raw-count column.
    for column in list(rc_counts + rc_counts_pred):
        frame = compute_relative_abundance(column, frame)

    # Rows sampled on these dates form the seasonal subset.
    seasonal_dates = ['2019-05-23', '2019-05-28', '2019-06-03']
    is_seasonal = frame['datetime'].isin(seasonal_dates)
    return frame[is_seasonal], frame[~is_seasonal]

# Build one dataframe per analysis variant. Each loader receives its own copy of
# the full dataframe, so none of them can modify `df`.
absl_counts = load_absl_counts_dataset(df.copy())
baseline = load_baseline_dataset(df.copy())
rel_class_sum = load_rel_class_sum_dataset(df.copy())
seasonal, nonseasonal = load_seasonal_dataset(df.copy())

In [None]:
# NOTE(review): scratch calculation -- the five values are hard-coded and their
# origin is not shown anywhere in this notebook; label them or remove the cell.
np.mean([0.80, 0.59, 0.56, 0.43, 0.70])

In [None]:
"""
# Raw Counts Dataset
"""
# Error analysis of the raw-count columns, run on a fresh copy of the baseline frame.
y = baseline.copy()
COUNTS = raw_counts
dataset_type = 'RAW_COUNTS'.upper()
printmd(f'# {dataset_type} ERROR ANALYSIS')
printmd('Camera Counts')
# Preview the first 10 rows of the columns under analysis.
display(y[['class', 'datetime'] + list(COUNTS)].head(10))
# printmd('Automated Classifier Counts')
# display(y[['class', 'datetime'] + list(rc_counts_pred)].head(10))

#=== plot distributions ===#
from counts_analysis.plot_class_summary import plot_summary_sampling_class_dist
# printmd('Original Relative Abundance')
# plot_summary_sampling_class_dist(df, rel_counts, False)
printmd(f'### {dataset_type} Camera Distribution')
plot_summary_sampling_class_dist(y, COUNTS, True, relative=False)

# printmd(f'### {dataset_type} Automated Classifier Counts Distribution')
# plot_summary_sampling_class_dist(y, rc_counts_pred, True, relative=False)

from validate_exp.stat_fns import mase, investigate_mase, pearson, concordance_correlation_coefficient

# ms = investigate_mase(y.groupby('class').get_group('Prorocentrum micans'), gtruth=rc_counts[0], pred=rc_counts[1])
# ms['scaled_error'] = ms['error'] / ms['naive']
# display(ms)
# print(np.mean(ms['scaled_error']))

# Set evaluation metric
stat = mase

# Set settings
# One settings object per count form, keyed by a label for that form.
settings_ = [set_settings(count) for count in [raw_counts, raw_counts_pred]]
count_forms = dict(zip(['raw_counts', 'raw_counts_pred'], settings_))

from eval_counts import compare_count_forms

# Evaluate count forms
printmd(f'# {dataset_type} MASE')
settings_score = compare_count_forms(count_forms, stat, y)

# Evaluate count forms
# The Pearson result overwrites the MASE result stored in settings_score above.
printmd(f'# {dataset_type} Pearson')
settings_score = compare_count_forms(count_forms, pearson, y)

In [None]:
"""
# Volumetric Counts Dataset
"""
# Error analysis of the volumetric (cells/mL) columns, run on a fresh copy of
# the baseline frame.
y = baseline.copy()
dataset_type = 'VOLUMETRIC_COUNTS'.upper()
printmd(f'# {dataset_type} ERROR ANALYSIS')
printmd('Camera Counts')
# Preview the first 10 rows of the columns under analysis.
display(y[['class', 'datetime'] + list(volumetric_counts)].head(10))
# printmd('Automated Classifier Counts')
# display(y[['class', 'datetime'] + list(rc_counts_pred)].head(10))

#=== plot distributions ===#
from counts_analysis.plot_class_summary import plot_summary_sampling_class_dist
# printmd('Original Relative Abundance')
# plot_summary_sampling_class_dist(df, rel_counts, False)
printmd(f'### {dataset_type} Camera Distribution')
# Plot the volumetric columns: the heading, the preview and the metrics in this
# cell all refer to volumetric_counts, whereas this call used to plot rc_counts.
plot_summary_sampling_class_dist(y, volumetric_counts, True, relative=False)

# printmd(f'### {dataset_type} Automated Classifier Counts Distribution')
# plot_summary_sampling_class_dist(y, rc_counts_pred, True, relative=False)

from validate_exp.stat_fns import mase, investigate_mase, pearson, concordance_correlation_coefficient

# ms = investigate_mase(y.groupby('class').get_group('Prorocentrum micans'), gtruth=rc_counts[0], pred=rc_counts[1])
# ms['scaled_error'] = ms['error'] / ms['naive']
# display(ms)
# print(np.mean(ms['scaled_error']))

# Set evaluation metric
stat = mase

# Set settings
# A single count form is evaluated here, keyed as 'volumetric'.
settings_ = [set_settings(count) for count in [volumetric_counts]]
count_forms = dict(zip(['volumetric'], settings_))

from eval_counts import compare_count_forms

# Evaluate count forms
printmd(f'# {dataset_type} MASE')
settings_score = compare_count_forms(count_forms, stat, y)

# Evaluate count forms
# The Pearson result overwrites the MASE result stored in settings_score above.
printmd(f'# {dataset_type} Pearson')
settings_score = compare_count_forms(count_forms, pearson, y)

In [None]:
"""
# Absolute Counts Dataset
"""
# Error analysis of the rc_counts / rc_counts_pred columns.
# NOTE(review): this cell reads `baseline`, not the `absl_counts` frame built
# earlier; both loaders only copy the input, so the data is the same.
y = baseline.copy()
dataset_type = 'ABSL_COUNTS'.upper()
printmd(f'# {dataset_type} ERROR ANALYSIS')
printmd('Camera Counts')
display(y[['class', 'datetime'] + list(rc_counts)].head(10))
printmd('Automated Classifier Counts')
display(y[['class', 'datetime'] + list(rc_counts_pred)].head(10))

#=== plot distributions ===#
from counts_analysis.plot_class_summary import plot_summary_sampling_class_dist
# printmd('Original Relative Abundance')
# plot_summary_sampling_class_dist(df, rel_counts, False)
printmd(f'### {dataset_type} Camera Distribution')
plot_summary_sampling_class_dist(y, rc_counts, True, relative=False)

printmd(f'### {dataset_type} Automated Classifier Counts Distribution')
plot_summary_sampling_class_dist(y, rc_counts_pred, True, relative=False)

from validate_exp.stat_fns import mase, investigate_mase, pearson, concordance_correlation_coefficient

# Per-row MASE breakdown for one class, using the first two rc_counts columns as
# ground truth and prediction. The result is computed but not shown (the
# display/print lines below are commented out).
ms = investigate_mase(y.groupby('class').get_group('Prorocentrum micans'), gtruth=rc_counts[0], pred=rc_counts[1])
ms['scaled_error'] = ms['error'] / ms['naive']
# display(ms)
# print(np.mean(ms['scaled_error']))

# Set evaluation metric
stat = mase

# Set settings
# One settings object per count form, keyed by a label for that form.
settings_ = [set_settings(count) for count in [rc_counts, rc_counts_pred]]
count_forms = dict(zip(['raw', 'raw predicted'], settings_))

from eval_counts import compare_count_forms

# Evaluate count forms
printmd(f'# {dataset_type} MASE')
settings_score = compare_count_forms(count_forms, stat, y)

# Evaluate count forms
# The Pearson result overwrites the MASE result stored in settings_score above.
printmd(f'# {dataset_type} Pearson')
settings_score = compare_count_forms(count_forms, pearson, y)

In [None]:
"""
# BASELINE Relative Abundance
"""
# Error analysis of the relative-abundance columns as they come from the CSV:
# `baseline` is a plain copy of the loaded frame, so nothing is recomputed here.
y = baseline.copy()
dataset_type = 'BASELINE'.upper()
printmd(f'# {dataset_type} ERROR ANALYSIS')
printmd('Camera Counts')
display(y[['class', 'datetime'] + list(rel_counts)].head(10))
printmd('Automated Classifier Counts')
display(y[['class', 'datetime'] + list(rel_counts_pred)].head(10))

#=== plot distributions ===#
from counts_analysis.plot_class_summary import plot_summary_sampling_class_dist
# printmd('Original Relative Abundance')
# plot_summary_sampling_class_dist(df, rel_counts, False)
printmd(f'### {dataset_type} Camera Distribution')
plot_summary_sampling_class_dist(y, rel_counts, False, relative=True)

printmd(f'### {dataset_type} Automated Classifier Counts Distribution')
plot_summary_sampling_class_dist(y, rel_counts_pred, False, relative=True)

from validate_exp.stat_fns import mase, investigate_mase, pearson, concordance_correlation_coefficient

# Per-row MASE breakdown for one class, using the first two rel_counts columns
# as ground truth and prediction. Computed but not shown (display lines are
# commented out).
ms = investigate_mase(y.groupby('class').get_group('Prorocentrum micans'), gtruth=rel_counts[0], pred=rel_counts[1])
ms['scaled_error'] = ms['error'] / ms['naive']
# display(ms)
# print(np.mean(ms['scaled_error']))

# Set evaluation metric
stat = mase

# Set settings
# One settings object per count form, keyed by a label for that form.
settings_ = [set_settings(count) for count in [rel_counts, rel_counts_pred]]
count_forms = dict(zip(['relative', 'relative predicted'], settings_))

from eval_counts import compare_count_forms

# Evaluate count forms
printmd(f'# {dataset_type} MASE')
settings_score = compare_count_forms(count_forms, stat, y)

# Evaluate count forms
# The Pearson result overwrites the MASE result stored in settings_score above.
printmd(f'# {dataset_type} Pearson')
settings_score = compare_count_forms(count_forms, pearson, y)

In [None]:
"""
# Class Relative Abundance
"""
# Error analysis on `rel_class_sum`, the frame whose relative-abundance columns
# were produced by compute_relative_abundance (normalised within each class).
# NOTE(review): presumably those recomputed column names coincide with the ones
# listed in rel_counts / rel_counts_pred -- confirm against set_counts.
y = rel_class_sum.copy()
dataset_type = 'class relative abundance'.upper()
printmd(f'# {dataset_type} ERROR ANALYSIS')
printmd('Camera Counts')
display(y[['class', 'datetime'] + list(rel_counts)].head(10))
printmd('Automated Classifier Counts')
display(y[['class', 'datetime'] + list(rel_counts_pred)].head(10))

#=== plot distributions ===#
from counts_analysis.plot_class_summary import plot_summary_sampling_class_dist
# printmd('Original Relative Abundance')
# plot_summary_sampling_class_dist(df, rel_counts, False)
printmd(f'### {dataset_type} Camera Distribution')
plot_summary_sampling_class_dist(y, rel_counts, False, relative=True)

printmd(f'### {dataset_type} Automated Classifier Counts Distribution')
plot_summary_sampling_class_dist(y, rel_counts_pred, False, relative=True)

from validate_exp.stat_fns import mase, investigate_mase, pearson, concordance_correlation_coefficient

# Per-row MASE breakdown for one class, using the first two rel_counts columns
# as ground truth and prediction. Computed but not shown (display lines are
# commented out).
ms = investigate_mase(y.groupby('class').get_group('Prorocentrum micans'), gtruth=rel_counts[0], pred=rel_counts[1])
ms['scaled_error'] = ms['error'] / ms['naive']
# display(ms)
# print(np.mean(ms['scaled_error']))

# Set evaluation metric
stat = mase

# Set settings
# One settings object per count form, keyed by a label for that form.
settings_ = [set_settings(count) for count in [rel_counts, rel_counts_pred]]
count_forms = dict(zip(['relative', 'relative predicted'], settings_))

from eval_counts import compare_count_forms

# Evaluate count forms
printmd(f'# {dataset_type} MASE')
settings_score = compare_count_forms(count_forms, stat, y)

# Evaluate count forms
# The Pearson result overwrites the MASE result stored in settings_score above.
printmd(f'# {dataset_type} Pearson')
settings_score = compare_count_forms(count_forms, pearson, y)

## Additional Example (Seasonal data)

Seasonal data + Filtered Classes (exclude Gyrodinium Chattonella and Pseudo-nitzschia chain)

In [None]:
"""
# Seasonal (Class Relative Abundance)
"""
# Error analysis restricted to `seasonal`: the rows whose 'datetime' is one of the
# three dates hard-coded in load_seasonal_dataset, with per-class relative
# abundances recomputed.
y = seasonal.copy()
dataset_type = 'seasonal'.upper()
printmd(f'# {dataset_type} ERROR ANALYSIS')
printmd('Camera Counts')
display(y[['class', 'datetime'] + list(rel_counts)].head(10))
printmd('Automated Classifier Counts')
display(y[['class', 'datetime'] + list(rel_counts_pred)].head(10))

#=== plot distributions ===#
from counts_analysis.plot_class_summary import plot_summary_sampling_class_dist
# printmd('Original Relative Abundance')
# plot_summary_sampling_class_dist(df, rel_counts, False)
printmd(f'### {dataset_type} Camera Distribution')
plot_summary_sampling_class_dist(y, rel_counts, False, relative=True)

printmd(f'### {dataset_type} Automated Classifier Counts Distribution')
plot_summary_sampling_class_dist(y, rel_counts_pred, False, relative=True)

from validate_exp.stat_fns import mase, investigate_mase, pearson, concordance_correlation_coefficient

# Per-row MASE breakdown for the 'Akashiwo' class; unlike the other cells the
# result is displayed and its mean scaled error printed.
# NOTE(review): this cell compares rel_counts[1] against rel_counts[2], whereas
# the other analysis cells use indices [0] and [1] -- confirm this is intended.
ms = investigate_mase(y.groupby('class').get_group('Akashiwo'), gtruth=rel_counts[1], pred=rel_counts[2])
ms['scaled_error'] = ms['error'] / ms['naive']
display(ms)
print(np.mean(ms['scaled_error']))

# Set evaluation metric
stat = mase

# Set settings
# One settings object per count form, keyed by a label for that form.
settings_ = [set_settings(count) for count in [rel_counts, rel_counts_pred]]
count_forms = dict(zip(['relative', 'relative predicted'], settings_))

from eval_counts import compare_count_forms

# Evaluate count forms
printmd(f'# {dataset_type} MASE')
settings_score = compare_count_forms(count_forms, stat, y)

# Evaluate count forms
# The Pearson result overwrites the MASE result stored in settings_score above.
printmd(f'# {dataset_type} Pearson')
settings_score = compare_count_forms(count_forms, pearson, y)

## Additional Example 2 (Non Seasonal)

In [None]:
"""
# NonSeasonal (Class Relative Abundance)
"""
# Error analysis restricted to `nonseasonal`: every row whose 'datetime' is NOT
# one of the three dates hard-coded in load_seasonal_dataset, with per-class
# relative abundances recomputed.
y = nonseasonal.copy()
dataset_type = 'nonseasonal'.upper()
printmd(f'# {dataset_type} ERROR ANALYSIS')
printmd('Camera Counts')
display(y[['class', 'datetime'] + list(rel_counts)].head(10))
printmd('Automated Classifier Counts')
display(y[['class', 'datetime'] + list(rel_counts_pred)].head(10))

#=== plot distributions ===#
from counts_analysis.plot_class_summary import plot_summary_sampling_class_dist
# printmd('Original Relative Abundance')
# plot_summary_sampling_class_dist(df, rel_counts, False)
printmd(f'### {dataset_type} Camera Distribution')
# NOTE(review): the third positional argument is True here but False in the
# classifier plot below and in every other relative-abundance cell -- confirm
# whether this difference is intended.
plot_summary_sampling_class_dist(y, rel_counts, True, relative=True)

printmd(f'### {dataset_type} Automated Classifier Counts Distribution')
plot_summary_sampling_class_dist(y, rel_counts_pred, False, relative=True)

from validate_exp.stat_fns import mase, investigate_mase, pearson, concordance_correlation_coefficient

# Per-row MASE breakdown for one class, using the first two rel_counts columns
# as ground truth and prediction. Computed but not shown (display lines are
# commented out).
ms = investigate_mase(y.groupby('class').get_group('Prorocentrum micans'), gtruth=rel_counts[0], pred=rel_counts[1])
ms['scaled_error'] = ms['error'] / ms['naive']
# display(ms)
# print(np.mean(ms['scaled_error']))

# Set evaluation metric
stat = mase

# Set settings
# One settings object per count form, keyed by a label for that form.
settings_ = [set_settings(count) for count in [rel_counts, rel_counts_pred]]
count_forms = dict(zip(['relative', 'relative predicted'], settings_))

from eval_counts import compare_count_forms

# Evaluate count forms
printmd(f'# {dataset_type} MASE')
settings_score = compare_count_forms(count_forms, stat, y)

# Evaluate count forms
# The Pearson result overwrites the MASE result stored in settings_score above.
printmd(f'# {dataset_type} Pearson')
settings_score = compare_count_forms(count_forms, pearson, y)

# Helper Visualization Modules

In [None]:
""" PLOT SUMMARY OF A SINGLE CLS DISTRIBUTION

[CLASS] ORIGINAL ABOVE
[CLASS] EXPERIMENTAL BELOW
"""
from counts_analysis.plot_class_summary import plot_summary_both_count_forms, plot_class_summary

def filter_class(cls, x_data, y_data):
    """Return the rows of both frames whose 'class' equals ``cls``.

    Each result is reindexed from 0; the output order matches the input order
    (``x_data`` first, then ``y_data``).
    """
    def rows_of_class(frame):
        return frame[frame['class'] == cls].reset_index(drop=True)

    return rows_of_class(x_data), rows_of_class(y_data)
                     
def plot_summary(rel_counts, x_data, y_data):
    """Combine the class-summary plots of ``x_data`` and ``y_data`` into one layout.

    The plots for ``x_data`` come first, followed by those for ``y_data``; the
    layout is arranged in 3 columns with axis sharing switched off.
    """
    first_panels = plot_class_summary(rel_counts, x_data, relative=True)
    second_panels = plot_class_summary(rel_counts, y_data, relative=True)
    combined = hv.Layout(first_panels + second_panels)
    return combined.cols(3).opts(shared_axes=False)

# `filtered_data` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel. `df_` is the filtered frame built in the setup
# cell (rows whose 'class' is in CLASSES), so it is used as the comparison dataset.
# NOTE(review): confirm df_ is the intended "experimental" dataset, and that the
# selected class is present in it (otherwise the second set of plots is empty).
x_df, y_df = filter_class('Ceratium falcatiforme or fusus', df, df_)
plot_summary(rel_counts, x_df, y_df) # PLOT SUMMARY ~ plot distribution, time series, correlation of a class

In [None]:
from counts_analysis.plot_daily_counts_summary import plot_summary_daily_counts

def plot_daily_count(date, data):
    """Plot the daily counts summary for the rows of ``data`` sampled on ``date``.

    The rows are sorted by 'class' and plotted with the notebook-level
    ``rc_counts`` and ``rel_counts`` column lists.
    """
    by_date = data.groupby('datetime')
    day_rows = by_date.get_group(date)
    plot_summary_daily_counts(day_rows.sort_values(by='class'), rc_counts, rel_counts)
    
"""[DATE] ORIGINAL DATASET"""
# Daily summary for a single sampling date on the full dataset.
plot_daily_count('2019-09-11', df)

# NOTE(review): the string below is the last expression in the cell, so the
# notebook echoes it as the cell's output. The disabled call refers to
# `filtered_data`, which is not defined anywhere in this notebook.
"""[DATE] EXPERIMENTAL DATASET"""
# plot_daily_count('2019-09-11', filtered_data)