In [5]:
from pathlib import Path
from bokeh.plotting import output_notebook, show, figure
from src.utils import get_data_dir
import pandas as pd
import numpy as np


# Render Bokeh figures inline in the notebook instead of a separate HTML file.
output_notebook()

# Hand the absolute path of the local `data` folder to the project helper.
data_root = Path('data').resolve()
DIRS = get_data_dir(str(data_root))

# Pandas demo

In [189]:
# Read `class_harmon.csv` from the `DIRS.ana` directory and preview the first rows.
csv_path = DIRS.ana / 'class_harmon.csv'
src = pd.read_csv(str(csv_path))
src.head()

Unnamed: 0.1,Unnamed: 0,tile,region,jc0,smc0,jc10,smc10,jc20,smc20,jc30,smc30
0,0,10N_114E,Asia,0.8391,0.9758,0.8382,0.9757,0.8353,0.9753,0.8318,0.9749
1,1,20N_084E,Asia,0.6087,0.9916,0.5962,0.9916,0.5611,0.9912,0.5083,0.9903
2,2,05N_048E,Africa,0.0293,0.9995,0.0,0.9994,0.0,0.9994,0.0,0.9994
3,3,05N_018E,Africa,0.6003,0.6057,0.6006,0.6062,0.6107,0.6229,0.6266,0.6483
4,4,20N_114E,Asia,0.5078,0.9985,0.5024,0.9986,0.496,0.9986,0.4863,0.9986


In [190]:
# Keep the tile identifier, its region and the four `jc*` columns only.
selected_columns = ['tile', 'region', 'jc0', 'jc10', 'jc20', 'jc30']
data = src[selected_columns]
data.head()

Unnamed: 0,tile,region,jc0,jc10,jc20,jc30
0,10N_114E,Asia,0.8391,0.8382,0.8353,0.8318
1,20N_084E,Asia,0.6087,0.5962,0.5611,0.5083
2,05N_048E,Africa,0.0293,0.0,0.0,0.0
3,05N_018E,Africa,0.6003,0.6006,0.6107,0.6266
4,20N_114E,Asia,0.5078,0.5024,0.496,0.4863


In [191]:
# Summary statistics for the numeric columns of `data`; `count` is the number of
# non-missing values per column, so it can differ between columns.
data.describe()

Unnamed: 0,jc0,jc10,jc20,jc30
count,286.0,284.0,283.0,283.0
mean,0.508069,0.511012,0.495707,0.484802
std,0.322871,0.327037,0.338859,0.341198
min,0.0,0.0,0.0,0.0
25%,0.205725,0.23005,0.1117,0.0726
50%,0.6028,0.60315,0.6038,0.5834
75%,0.7828,0.78755,0.78955,0.7816
max,0.9787,0.9792,0.9796,0.9798


In [192]:
# Split the table into one frame per value of the `region` column.
region_label = data['region']
america = data.loc[region_label == 'Americas']
asia = data.loc[region_label == 'Asia']
africa = data.loc[region_label == 'Africa']
oceania = data.loc[region_label == 'Oceania']
sevenseas = data.loc[region_label == 'Seven seas (open ocean)']

# Summary statistics for the Americas subset only.
america.describe()

Unnamed: 0,jc0,jc10,jc20,jc30
count,79.0,79.0,79.0,79.0
mean,0.603339,0.605072,0.603666,0.601632
std,0.267238,0.267397,0.268865,0.271532
min,0.0,0.0,0.0,0.0
25%,0.42395,0.44285,0.46085,0.4522
50%,0.6663,0.6717,0.6611,0.6561
75%,0.79755,0.79895,0.79545,0.796
max,0.9787,0.9792,0.9796,0.9798


# Bokeh demo

In [195]:
# NOTE(review): self-assignment is a no-op that only requires `data` to exist already;
# presumably a placeholder for swapping in a regional subset — confirm or delete.
data = data

In [196]:
# Per-column box-plot statistics, one row per numeric column of `data`.
#
# `data` also carries the string columns `tile` and `region`. Restrict the
# computation to numeric columns explicitly: on pandas >= 2.0 `quantile()` and
# `mean()` no longer drop non-numeric columns silently and raise instead, and
# `min()`/`max()` on a mixed frame return object-dtype results, which would make
# the `min_whisker`/`max_whisker` columns object-typed as well.
numeric = data.select_dtypes(include='number')

# Quartiles: transpose so that each statistic becomes a column.
boxplot = numeric.quantile(q=[0.25, 0.5, 0.75]).T
boxplot.columns = ['q1', 'q2', 'q3']
boxplot['iqr'] = boxplot.q3 - boxplot.q1

# Three alternative whisker definitions:
#   tukey_*  -> quartile -/+ 1.5 * IQR (may fall outside the observed value range)
#   q_*      -> 2.5 % / 97.5 % quantiles
#   min/max  -> observed extremes
boxplot['tukey_lower_whisker'] = boxplot.q1 - 1.5 * boxplot.iqr
boxplot['tukey_upper_whisker'] = boxplot.q3 + 1.5 * boxplot.iqr
boxplot['q_lower_whisker'] = numeric.quantile(q=0.025)
boxplot['q_upper_whisker'] = numeric.quantile(q=0.975)
boxplot['min_whisker'] = numeric.min()
boxplot['max_whisker'] = numeric.max()

# `mean` is kept as a standalone name because the original cell defined it too.
mean = numeric.mean()
boxplot['mean'] = mean

boxplot

Unnamed: 0,q1,q2,q3,iqr,tukey_lower_whisker,tukey_upper_whisker,q_lower_whisker,q_upper_whisker,min_whisker,max_whisker,mean
jc0,0.205725,0.6028,0.7828,0.577075,-0.659887,1.648413,0.0,0.94895,0,0.9787,0.508069
jc10,0.23005,0.60315,0.78755,0.5575,-0.6062,1.6238,0.0,0.948678,0,0.9792,0.511012
jc20,0.1117,0.6038,0.78955,0.67785,-0.905075,1.806325,0.0,0.947995,0,0.9796,0.495707
jc30,0.0726,0.5834,0.7816,0.709,-0.9909,1.8451,0.0,0.949375,0,0.9798,0.484802


In [171]:
# Synthetic demo data: 2000 standard-normal scores randomly assigned to six
# categories, with the categories shifted upwards by 0, 0, 1, 1, 2, 2 respectively.
SEED = 42  # fixed so that Restart & Run All reproduces the same data and figure
np.random.seed(SEED)

cats = list("abcdef")
yy = np.random.randn(2000)
g = np.random.choice(cats, 2000)
for i, label in enumerate(cats):
    # Integer division pairs the categories up: (a, b) -> +0, (c, d) -> +1, (e, f) -> +2.
    yy[g == label] += i // 2
df = pd.DataFrame(dict(score=yy, group=g))

# Quartiles, inter-quartile range and Tukey fences (quartile -/+ 1.5 * IQR)
# for every category.
groups = df.groupby('group')
q1, q2, q3 = (groups.quantile(q=level) for level in (0.25, 0.5, 0.75))
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

# Collect, per category, the scores that fall outside that category's fences.
def outliers(group):
    """Return the `score` values of one group lying above `upper` or below `lower`.

    `group.name` is the category label of the group being processed; it selects
    the matching row of the `upper` / `lower` fence tables.
    """
    cat = group.name
    scores = group['score']
    above_fence = scores > upper.loc[cat, 'score']
    below_fence = scores < lower.loc[cat, 'score']
    return scores[above_fence | below_fence]


out = groups.apply(outliers).dropna()

# Prepare outlier data for plotting: one (category, value) coordinate pair per outlier.
# Both lists are always defined, so they exist even when nothing was flagged.
outx = []
outy = []
if not out.empty:
    # `out` only has index entries for categories that actually produced outliers,
    # so `out.loc[cat]` raises KeyError for any other category. Check membership
    # on the first index level (the category label) instead of indexing blindly.
    cats_with_outliers = set(out.index.get_level_values(0))
    for cat in cats:
        if cat in cats_with_outliers:
            for value in out[cat]:
                outx.append(cat)
                outy.append(value)

# PLOTTING
p = figure(background_fill_color="#EFE8E2", title="", x_range=cats)

# Clamp the fences to the observed extremes so that a stem never extends past
# the smallest / largest value of its category.
qmin = groups.quantile(q=0.00)
qmax = groups.quantile(q=1.00)
upper['score'] = [min(extreme, fence) for extreme, fence in zip(qmax['score'], upper['score'])]
lower['score'] = [max(extreme, fence) for extreme, fence in zip(qmin['score'], lower['score'])]

# stems: vertical lines from each fence to the adjacent quartile
for fence, quartile in ((upper, q3), (lower, q1)):
    p.segment(x0=cats, y0=fence['score'], x1=cats, y1=quartile['score'], line_color="black")

# boxes: one bar between median and upper quartile, one between lower quartile and median
p.vbar(x=cats, width=0.7, top=q2['score'], bottom=q3['score'],
       fill_color="#E08E79", line_color="black")
p.vbar(x=cats, width=0.7, top=q1['score'], bottom=q2['score'],
       fill_color="#3B8686", line_color="black")

# whiskers (almost-0 height rects simpler than segments)
for fence in (lower, upper):
    p.rect(x=cats, y=fence['score'], width=0.2, height=0.01, line_color="black")

# outliers
if not out.empty:
    p.circle(x=outx, y=outy, size=6, color="#F38630", fill_alpha=0.6)

# grid and axis cosmetics
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = "white"
p.grid.grid_line_width = 2
p.xaxis.major_label_text_font_size = "12pt"

show(p)