In [1]:
from pathlib import Path
from bokeh.plotting import output_notebook, show, figure
from src.utils import get_data_dir
import pandas as pd
import numpy as np


# Render Bokeh plots inline in the notebook instead of in a separate HTML file.
output_notebook()

# Resolve the 'data' folder (relative to the current working directory) to an
# absolute path and hand it to the project helper as a plain string.
# NOTE(review): DIRS is later used as `DIRS.ana / <file name>`, so it presumably
# exposes path-like attributes -- confirm against src.utils.get_data_dir.
data_root = Path('data').resolve()
DIRS = get_data_dir(str(data_root))

# Pandas demo

In [62]:
# Load the 'class_harmon.csv' table from the `ana` directory and preview the
# first rows. The preview shows two unnamed, index-like leading columns
# ('Unnamed: 0.1' and 'Unnamed: 0') that come from the file itself.
csv_path = DIRS.ana / 'class_harmon.csv'
src = pd.read_csv(str(csv_path))
src.head()

Unnamed: 0.1,Unnamed: 0,tile,region,jc0,smc0,jc10,smc10,jc20,smc20,jc30,smc30
0,0,10N_114E,Asia,0.8391,0.9758,0.8382,0.9757,0.8353,0.9753,0.8318,0.9749
1,1,20N_084E,Asia,0.6087,0.9916,0.5962,0.9916,0.5611,0.9912,0.5083,0.9903
2,2,05N_048E,Africa,0.0293,0.9995,0.0,0.9994,0.0,0.9994,0.0,0.9994
3,3,05N_018E,Africa,0.6003,0.6057,0.6006,0.6062,0.6107,0.6229,0.6266,0.6483
4,4,20N_114E,Asia,0.5078,0.9985,0.5024,0.9986,0.496,0.9986,0.4863,0.9986


In [63]:
# Keep the tile/region identifiers and the four jc* columns; the smc* columns
# and the unnamed index-like columns are left behind in `src`.
jc_columns = ['tile', 'region', 'jc0', 'jc10', 'jc20', 'jc30']
data = src[jc_columns]
data.head()

Unnamed: 0,tile,region,jc0,jc10,jc20,jc30
0,10N_114E,Asia,0.8391,0.8382,0.8353,0.8318
1,20N_084E,Asia,0.6087,0.5962,0.5611,0.5083
2,05N_048E,Africa,0.0293,0.0,0.0,0.0
3,05N_018E,Africa,0.6003,0.6006,0.6107,0.6266
4,20N_114E,Asia,0.5078,0.5024,0.496,0.4863


In [64]:
# Summary statistics for the numeric columns. `count` only counts non-missing
# values, so columns with a lower count than the others contain NaNs.
# TODO: clean up the NaN values and check why they are NaN.
data.describe()

Unnamed: 0,jc0,jc10,jc20,jc30
count,286.0,284.0,283.0,283.0
mean,0.508069,0.511012,0.495707,0.484802
std,0.322871,0.327037,0.338859,0.341198
min,0.0,0.0,0.0,0.0
25%,0.205725,0.23005,0.1117,0.0726
50%,0.6028,0.60315,0.6038,0.5834
75%,0.7828,0.78755,0.78955,0.7816
max,0.9787,0.9792,0.9796,0.9798


In [65]:
# Split the table into one frame per region label (exact string match on the
# `region` column), then summarise the Americas subset.
america = data.loc[data['region'] == 'Americas']
asia = data.loc[data['region'] == 'Asia']
africa = data.loc[data['region'] == 'Africa']
oceania = data.loc[data['region'] == 'Oceania']
sevenseas = data.loc[data['region'] == 'Seven seas (open ocean)']

america.describe()

Unnamed: 0,jc0,jc10,jc20,jc30
count,79.0,79.0,79.0,79.0
mean,0.603339,0.605072,0.603666,0.601632
std,0.267238,0.267397,0.268865,0.271532
min,0.0,0.0,0.0,0.0
25%,0.42395,0.44285,0.46085,0.4522
50%,0.6663,0.6717,0.6611,0.6561
75%,0.79755,0.79895,0.79545,0.796
max,0.9787,0.9792,0.9796,0.9798


# Bokeh demo

In [69]:
# Use the Americas subset as the input for the box-plot cells below.
# NOTE: this rebinds `data`, which until now held the table for all regions.
# Re-running the region-split cell after this one would filter the Americas
# subset instead of the full table, so run the notebook top to bottom.
data = america

In [117]:
# Per-column box-plot statistics for `data`: one row per numeric column.
#
# `data` still carries the string columns `tile` and `region`. They are dropped
# up front because pandas >= 2.0 raises on quantile()/mean() over string
# columns, and min()/max() over a mixed frame return object-dtype Series, which
# would make the whisker columns object dtype instead of float.
numeric = data.select_dtypes(include='number')

# Quartiles: quantile() yields one row per requested quantile, so transpose to
# get one row per data column.
boxplot = numeric.quantile(q=[0.25, 0.5, 0.75]).T
boxplot.columns = ['q1', 'q2', 'q3']
boxplot['iqr'] = boxplot.q3 - boxplot.q1

# Three alternative whisker definitions, kept side by side.
# The Series on the right-hand side are aligned on the column names.
# 1) Tukey fences: 1.5 * IQR beyond the quartiles.
boxplot['tukey_lower_whisker'] = boxplot.q1 - 1.5 * boxplot.iqr
boxplot['tukey_upper_whisker'] = boxplot.q3 + 1.5 * boxplot.iqr
# 2) Central 95 % of the values (2.5 % and 97.5 % quantiles).
boxplot['q_lower_whisker'] = numeric.quantile(q=0.025)
boxplot['q_upper_whisker'] = numeric.quantile(q=0.975)
# 3) Full range of the values.
boxplot['min_whisker'] = numeric.min()
boxplot['max_whisker'] = numeric.max()

# `mean` is kept as a notebook-level name in case other cells read it.
mean = numeric.mean()
boxplot['means'] = mean

boxplot

Unnamed: 0,q1,q2,q3,iqr,tukey_lower_whisker,tukey_upper_whisker,q_lower_whisker,q_upper_whisker,min_whisker,max_whisker,means
jc0,0.42395,0.6663,0.79755,0.3736,-0.13645,1.35795,0.0,0.92798,0,0.9787,0.603339
jc10,0.44285,0.6717,0.79895,0.3561,-0.0913,1.3331,0.0,0.92918,0,0.9792,0.605072
jc20,0.46085,0.6611,0.79545,0.3346,-0.04105,1.29735,0.0,0.93023,0,0.9796,0.603666
jc30,0.4522,0.6561,0.796,0.3438,-0.0635,1.3117,0.0,0.931365,0,0.9798,0.601632


In [127]:
# Box plot of the per-column statistics in `boxplot` (one box per row).
# The categorical x-axis is taken from the table's index so that the axis and
# the glyphs cannot get out of sync.
categories = list(boxplot.index)
plot = figure(x_range=categories, title='Box plot (whiskers at min/max)')

# box: lower half spans q1..q2 (median), upper half spans q2..q3
plot.vbar(x=categories, width=0.7, bottom=boxplot.q1, top=boxplot.q2,
          line_color='black', fill_color='#f7f7f7', fill_alpha=0.7)
plot.vbar(x=categories, width=0.7, bottom=boxplot.q2, top=boxplot.q3,
          line_color='black', fill_color='#67a9cf', fill_alpha=0.7)

# whiskers: very thin rectangles used as horizontal caps at min and max
plot.rect(x=categories, y=boxplot.min_whisker, width=0.2, height=0.001, line_color="black")
plot.rect(x=categories, y=boxplot.max_whisker, width=0.2, height=0.001, line_color="black")

# stems: vertical lines joining each whisker cap to the nearest box edge
plot.segment(x0=categories, y0=boxplot.min_whisker, x1=categories, y1=boxplot.q1,
             line_color='black')
plot.segment(x0=categories, y0=boxplot.q3, x1=categories, y1=boxplot.max_whisker,
             line_color='black')

# mean cross: scatter(marker='x') replaces the deprecated per-marker method plot.x()
plot.scatter(x=categories, y=boxplot.means, marker='x', color='#ef8a62', size=15)

show(plot)