In [4]:
from pathlib import Path
from bokeh.plotting import output_notebook, show, figure, ColumnDataSource, output_file
from bokeh.models import HoverTool, glyphs, FactorRange
from bokeh.transform import jitter
from bokeh.layouts import gridplot
from src.utils import get_data_dir
import pandas as pd
import numpy as np
from collections import Counter

# Convenient access to the data directories. 'data' is resolved against the
# current working directory, so the result depends on where the kernel runs.
data_root = Path('data').resolve()
DIRS = get_data_dir(str(data_root))

# Send bokeh output to a standalone HTML file under DIRS.ana.
# (Call output_notebook() here instead to render the plots inline.)
report_path = DIRS.ana / 'class_harmon.html'
output_file(str(report_path))

# Class Harmonization

In [2]:
# Load the class harmonization table and clean it up in a single chain:
#   1. upper-case every column whose name starts with 'jc'
#   2. drop the four smc* columns
#   3. drop whatever column is first after step 2
#      (presumably the index column written by to_csv -- verify)
#   4. drop every row that still contains a missing value
csv_path = str(DIRS.ana / 'class_harmon.csv')
src = (
    pd.read_csv(csv_path)
    .rename(columns=lambda name: name.upper() if name.startswith('jc') else name)
    .drop(['smc0', 'smc10', 'smc20', 'smc30'], axis=1)
    .pipe(lambda frame: frame.drop(frame.columns[0], axis=1))
    .dropna(axis=0, how='any')
)

# Scatterplot data prep: long format with one row per (tile, region, class).
# Each class gets a fixed marker colour; anything not listed stays white.
class_palette = {
    'JC0': '#e66101',
    'JC10': '#fdb863',
    'JC20': '#b2abd2',
    'JC30': '#5e3c99',
}
melted = src.melt(id_vars=['tile', 'region'], var_name='jc_class', value_name='score')
melted['colors'] = melted['jc_class'].map(class_palette).fillna('#ffffff')
# Order rows by region, then class.
melted = melted.sort_values(by=['region', 'jc_class'], ascending=[True, True])

# Boxplot data prep: per-region summary statistics of every score column.
#
# The statistics are computed on the score columns only, i.e. everything
# except the identifier columns that the scatterplot prep passes to melt() as
# id_vars. Reducing over the whole group frame would also touch 'tile' and
# 'region': on pandas >= 2.0 quantile()/mean() raise a TypeError on string
# columns, and a numeric 'tile' column would show up as an extra row that has
# no matching factor on the plot's x-axis.
id_cols = ['tile', 'region']
score_cols = [col for col in src.columns if col not in id_cols]

frames = []
for region, region_df in src.groupby('region'):
    scores = region_df[score_cols]

    # One row per score column, one column per quartile.
    stats = scores.quantile(q=[0.25, 0.5, 0.75]).T
    stats.columns = ['q1', 'q2', 'q3']
    stats['iqr'] = stats.q3 - stats.q1

    # Three alternative whisker definitions are kept side by side:
    # Tukey fences (1.5 * IQR), the 2.5 % / 97.5 % quantiles, and min / max.
    stats['tukey_lower_whisker'] = stats.q1 - 1.5 * stats.iqr
    stats['tukey_upper_whisker'] = stats.q3 + 1.5 * stats.iqr
    stats['q_lower_whisker'] = scores.quantile(q=0.025)
    stats['q_upper_whisker'] = scores.quantile(q=0.975)
    stats['min_whisker'] = scores.min()
    stats['max_whisker'] = scores.max()
    stats['means'] = scores.mean()

    # The groupby key is the region shared by every row of this group.
    stats['region'] = region

    frames.append(stats)

# Index: score column name (repeated per region); 'region' holds the group label.
box = pd.concat(frames)

In [5]:
# Scatterplot: one jittered 'x' marker per tile on a nested (region, class)
# categorical x-axis, coloured by class.
scatter_source = ColumnDataSource(dict(
    x=list(zip(melted['region'], melted['jc_class'])),
    y=melted['score'],
    id=melted['tile'],
    colors=melted['colors'],
))
scatter_hover = HoverTool(tooltips=[
    ('Region/Class', '@x'),
    ('Tile', '@id'),
    ('JC-Score', '@y'),
])

# Every (region, class) combination, in order of first appearance in `melted`.
regions = melted['region'].unique()
classes = melted['jc_class'].unique()
factors = [(region, jc_class) for region in regions for jc_class in classes]

# NOTE(review): plot_width/plot_height were replaced by width/height in
# Bokeh 3.x -- adjust if the environment is upgraded.
scatter = figure(
    x_range=FactorRange(*factors),
    plot_width=950,
    plot_height=600,
    tools=[scatter_hover, 'pan', 'wheel_zoom', 'save', 'reset', 'box_zoom'],
    title="Jaccard score per forest cover class",
)

# Horizontal jitter spreads the markers that share a categorical x position.
jittered_x = jitter('x', width=0.6, range=scatter.x_range)
scatter.x(x=jittered_x, y='y', color='colors', source=scatter_source)

scatter.xgrid.grid_line_color = None
scatter.xaxis.axis_label = "Region/Class"
scatter.yaxis.axis_label = "Jaccard score"
# Start slightly below zero on the y-axis.
scatter.y_range.start = -0.01

# Boxplot: per (region, class) summary from `box`, sharing both axis ranges
# with the scatterplot so that panning/zooming stays in sync.
source = ColumnDataSource(dict(
    x=list(zip(box['region'], box.index)),
    q1=box['q1'],
    q2=box['q2'],
    q3=box['q3'],
    iqr=box['iqr'],
    lw=box['min_whisker'],   # whiskers extend to the minimum ...
    uw=box['max_whisker'],   # ... and the maximum
    means=box['means'],
))
hover = HoverTool(tooltips=[
    ("Region/Class", "@x"),
    ("Q1", "@q1"),
    ("Q2", "@q2"),
    ("Q3", "@q3"),
    ("IQR", "@iqr"),
    ("lWhisker", "@lw"),
    ("uWhisker", "@uw"),
    ("Mean", "@means"),
])

plot = figure(
    x_range=scatter.x_range,
    y_range=scatter.y_range,
    plot_width=950,
    plot_height=300,
    tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset', 'box_zoom'],
)

# Box: lower half (Q1..Q2) and upper half (Q2..Q3) in different fill colours.
for box_bottom, box_top, box_fill in (('q1', 'q2', '#f7f7f7'),
                                      ('q2', 'q3', '#67a9cf')):
    plot.vbar(x='x', width=0.7, bottom=box_bottom, top=box_top,
              line_color='black', fill_color=box_fill, fill_alpha=0.7,
              source=source)

# Whisker caps: very flat rectangles at the lower and upper whisker values.
for whisker in ('lw', 'uw'):
    plot.rect(x='x', y=whisker, width=0.2, height=0.001,
              line_color="black", source=source)

# Stems connecting each whisker with the box edge.
for stem_start, stem_end in (('lw', 'q1'), ('q3', 'uw')):
    plot.segment(x0='x', y0=stem_start, x1='x', y1=stem_end,
                 line_color='black', source=source)

# Mean marker (cross).
plot.x(x='x', y='means', color='#ef8a62', size=10, source=source)

plot.xgrid.grid_line_color = None
plot.xaxis.axis_label = "Region/Class"
plot.yaxis.axis_label = "Jaccard score"
# The y-range object is the one passed in from the scatterplot above.
plot.y_range.start = -0.01

# Display both figures stacked vertically: scatterplot on top, boxplot below.
layout = gridplot([[scatter], [plot]])
show(layout)