In [1]:
from jp_doodle import dual_canvas
from IPython.display import display
import numpy as np

In [2]:
def floatarray(seq):
    """Return ``seq`` converted to a NumPy array of 64-bit floats.

    The builtin ``float`` is used as the dtype: ``np.float`` was merely an
    alias for it, was deprecated in NumPy 1.20 and removed in NumPy 1.24,
    where referencing it raises ``AttributeError``.
    """
    return np.array(seq, dtype=float)

class BoxPlot:
    """A five-number summary rendered as straight line segments.

    The plot is laid out along the direction vector ``dxdy``: a statistic
    ``v`` is placed at ``origin + dxdy * v``.  Four segments are drawn:
    minimum to first quartile (thin), first to third quartile (thick), a
    short mark centred on the median (thicker), and third quartile to
    maximum (thin).
    """

    def __init__(
        self,
        minimum,
        maximum,
        quartiles,
        xy_position=(0,0),
        dxdy=(0,1),
        color="black",
        thin_line=1,
        thick_line=5,
        thicker_line=15,
        dmedian=0.01,
        ):
        """Store the statistics, placement and line styling.

        minimum, maximum -- extreme values (converted with ``float``).
        quartiles -- sequence of exactly three values: 25%, 50%, 75%.
        xy_position -- default origin used by ``draw``.
        dxdy -- direction vector along which the values are laid out.
        color -- colour passed to every drawn line.
        thin_line, thick_line, thicker_line -- widths for the whiskers,
            the box and the median mark respectively.
        dmedian -- half-length of the median mark as a fraction of
            ``maximum - minimum``.
        """
        # Summary statistics.
        self.quartiles = floatarray(quartiles)
        self.minimum = float(minimum)
        self.maximum = float(maximum)
        self.q25, self.q50, self.q75 = self.quartiles
        # Placement.
        self.xy = floatarray(xy_position)
        self.dxdy = floatarray(dxdy)
        # Appearance.
        self.color = color
        self.thin, self.thick, self.thicker = thin_line, thick_line, thicker_line
        self.dmedian = dmedian

    def draw(self, on_frame, at_xy=None):
        """Draw the plot by calling ``on_frame.line`` once per segment.

        on_frame -- object providing ``line(x0, y0, x1, y1, color=..., lineWidth=...)``.
        at_xy -- origin for this drawing; the stored ``xy`` is used when omitted.
        """
        origin = floatarray(self.xy) if at_xy is None else at_xy
        direction = self.dxdy

        def point_at(value):
            # Position of a statistic along the plot direction.
            return origin + direction * value

        centre = point_at(self.q50)
        # The median mark extends this far on each side of the median.
        half_mark = direction * ((self.maximum - self.minimum) * self.dmedian)
        lower_box, upper_box = point_at(self.q25), point_at(self.q75)
        segments = (
            (self.thin, point_at(self.minimum), lower_box),
            (self.thick, lower_box, upper_box),
            (self.thicker, centre - half_mark, centre + half_mark),
            (self.thin, upper_box, point_at(self.maximum)),
        )
        for width, (start_x, start_y), (end_x, end_y) in segments:
            on_frame.line(start_x, start_y, end_x, end_y, color=self.color, lineWidth=width)

In [3]:
# Three sample plots laid out along y (the default direction), along x,
# and along the diagonal.
B = BoxPlot(minimum=10, maximum=90, quartiles=(30, 40, 70))
B2 = BoxPlot(minimum=20, maximum=80, quartiles=(30, 65, 70), dxdy=(1, 0), color="green")
B3 = BoxPlot(minimum=14, maximum=95, quartiles=(20, 45, 70), dxdy=(1, 1), color="blue")

In [4]:
# Canvas on which the sample plots are drawn.
swatch = dual_canvas.swatch(
    pixels=300,
    model_height=120,
)

DualCanvasWidget(status='deferring flush until render')

In [5]:
# Draw the three sample plots, then add reference axes.
B.draw(swatch)
B2.draw(swatch)
B3.draw(swatch)
swatch.lower_left_axes(
    x_anchor=-10,
    y_anchor=-10,
    color="pink",
    min_x=-20,
    max_x=100,
    min_y=-20,
    max_y=100,
)

# Fit the drawing to the canvas with a margin.
swatch.fit(margin=10)

In [6]:
def random_box_plot(color):
    """Build a BoxPlot from five random values drawn from [0, 100).

    The sorted draws supply, in order, the minimum, the three quartile
    marks and the maximum.
    """
    samples = list(np.random.random(5) * 100)
    samples.sort()
    lowest, *middle, highest = samples
    return BoxPlot(lowest, highest, middle, color=color)
    

In [7]:
# Ten random green box plots placed side by side, 10 model units apart.
swatch = dual_canvas.swatch(
    pixels=300,
    model_height=120,
)

swatch.lower_left_axes(
    x_anchor=-10,
    y_anchor=-10,
    color="pink",
    min_x=-20,
    max_x=100,
    min_y=-20,
    max_y=100,
)

for i, x in enumerate(range(0, 100, 10)):
    b = random_box_plot("green")
    b.draw(swatch, at_xy=(x, 0))

swatch.fit(margin=10)


DualCanvasWidget(status='deferring flush until render')

In [24]:
# Layout for the demo grid: dx0 separates plots inside one group, dx1
# separates groups (group width plus two empty slots), dy separates rows.
dx0 = floatarray((20, 0))
x0Labels = ["man", "woman", "teen", "infant"]
colors = ["red", "green", "blue", "cyan", "magenta"]

dx1 = (2 + len(x0Labels)) * dx0
x1Labels = ["NY", "NJ", "CA", "TX"]

dy = floatarray((0, 120))
yLabels = ["meat", "vegetables", "fruit"]

In [28]:
# Grid of random box plots: one row per yLabel, one group per x1Label,
# one coloured plot per x0Label inside each group.
swatch = dual_canvas.swatch(pixels=800, model_height=1200)

# The colour legend sits one row above the topmost data row.
guide_origin = dy * (len(yLabels))

for (ix0, x0Label) in enumerate(x0Labels):
    x0 = dx0 * ix0
    p = guide_origin + x0
    color = colors[ix0]
    swatch.text(p[0], p[1], x0Label, degrees=90, color=color)

# Column (group) labels are drawn exactly once; previously they were
# emitted inside the row loop, re-drawing each label once per row at
# the same position.
for (ix1, x1Label) in enumerate(x1Labels):
    x1 = ix1 * dx1
    swatch.text(x1[0], -20, x1Label)

for (iy, yLabel) in enumerate(yLabels):
    y1 = iy * dy
    swatch.text(-20, y1[1], yLabel, degrees=90)
    for (ix1, x1Label) in enumerate(x1Labels):
        x1 = ix1 * dx1
        for (ix0, x0Label) in enumerate(x0Labels):
            color = colors[ix0]
            x0 = dx0 * ix0
            # Plot origin = row offset + group offset + offset inside the group.
            p = y1 + x1 + x0
            b = random_box_plot(color)
            b.draw(swatch, at_xy=p)

swatch.fit(margin=20)

DualCanvasWidget(status='deferring flush until render')

In [None]:
# example dataset

In [10]:
# List the contents of the dataset directory (IPython line magic).
%ls student_performance/

README.txt       student-merge.R  student.txt*
student-mat.csv* student-por.csv* student.zip


In [11]:
from jp_doodle import data_tables

# Path of the CSV file used throughout the rest of the notebook.
fn = "student_performance/student-por.csv"

data_tables.widen_notebook()

In [12]:
import csv

# Read the semicolon-delimited file: the first row holds the column
# names, the remaining rows the records.  The ``with`` block closes the
# file once it has been read (it used to stay open), and ``newline=""``
# is what the csv module asks for when opening files for a reader.
with open(fn, newline="") as f:
    reader = csv.reader(f, delimiter=";")
    headers = next(reader)
    body = list(reader)

# Display the data as a table.
data_tables.Table1(headers, body)

Table1(status='deferring flush until render')

In [12]:
# Print the attribute description shipped with the dataset; the ``with``
# block closes the file afterwards instead of leaking the handle.
with open("student_performance/student.txt") as description_file:
    print(description_file.read())

# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: "U" - urban or "R" - rural)
5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or 

In [13]:
# For every column, gather the set of distinct values found in the body.
values = {}
for column in headers:
    values[column] = set()

for record in body:
    for column, cell in zip(headers, record):
        values[column].add(cell)

In [14]:
# Show the distinct values collected for each column.
values

{'school': {'GP', 'MS'},
 'sex': {'F', 'M'},
 'age': {'15', '16', '17', '18', '19', '20', '21', '22'},
 'address': {'R', 'U'},
 'famsize': {'GT3', 'LE3'},
 'Pstatus': {'A', 'T'},
 'Medu': {'0', '1', '2', '3', '4'},
 'Fedu': {'0', '1', '2', '3', '4'},
 'Mjob': {'at_home', 'health', 'other', 'services', 'teacher'},
 'Fjob': {'at_home', 'health', 'other', 'services', 'teacher'},
 'reason': {'course', 'home', 'other', 'reputation'},
 'guardian': {'father', 'mother', 'other'},
 'traveltime': {'1', '2', '3', '4'},
 'studytime': {'1', '2', '3', '4'},
 'failures': {'0', '1', '2', '3'},
 'schoolsup': {'no', 'yes'},
 'famsup': {'no', 'yes'},
 'paid': {'no', 'yes'},
 'activities': {'no', 'yes'},
 'nursery': {'no', 'yes'},
 'higher': {'no', 'yes'},
 'internet': {'no', 'yes'},
 'romantic': {'no', 'yes'},
 'famrel': {'1', '2', '3', '4', '5'},
 'freetime': {'1', '2', '3', '4', '5'},
 'goout': {'1', '2', '3', '4', '5'},
 'Dalc': {'1', '2', '3', '4', '5'},
 'Walc': {'1', '2', '3', '4', '5'},
 'health':

________

In [29]:
# Layout for the dataset grid: one plot per sex inside each group, one
# group per x1 label, one row per y label.
dx0 = floatarray((20, 0))
x0Labels = "F M".split()
colors = ["red", "green", "blue", "cyan", "magenta"]

dx1 = (2 + len(x0Labels)) * dx0
x1Labels = "at_home health other services teacher".split()

dy = floatarray((0, 120))
yLabels = "at_home health other services teacher".split()

In [36]:
import pandas as pd
import numpy as np
# Load the same semicolon-separated file into a DataFrame.
df = pd.read_csv(fn, sep=";")

In [37]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [112]:
def q25(x):
    """Return the 25% quantile of ``x``.

    The function name doubles as the column label when this is passed
    to ``agg``, so it must not be renamed.
    """
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile
def q75(x):
    """Return the 75% quantile of ``x``.

    The function name doubles as the column label when this is passed
    to ``agg``, so it must not be renamed.
    """
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile
def qmin(x):
    """Return the smallest value of ``x`` (lower end of the plot).

    An alternative considered earlier was the lower fence
    ``q25 - 1.5 * (q75 - q25)``; the plain minimum is used instead.
    """
    smallest = x.min()
    return smallest
def qmax(x):
    """Return the largest value of ``x`` (upper end of the plot).

    An alternative considered earlier was the upper fence
    ``q75 + 1.5 * (q75 - q25)``; the plain maximum is used instead.
    """
    largest = x.max()
    return largest

# Five-number summary of the G3 column for every
# (mother's job, father's job, sex) combination.
df_stat = (
    df.groupby(['Mjob', 'Fjob', 'sex'])
    .agg({'G3': [qmin, q25, 'median', q75, qmax]})
    .reset_index()
)
df_stat.head()

Unnamed: 0_level_0,Mjob,Fjob,sex,G3,G3,G3,G3,G3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,qmin,q25,median,q75,qmax
0,at_home,at_home,F,0,10.0,12.0,14.0,18
1,at_home,at_home,M,8,10.0,11.0,13.0,18
2,at_home,health,F,16,16.0,16.0,16.0,16
3,at_home,health,M,6,6.0,6.0,6.0,6
4,at_home,other,F,0,10.0,11.0,13.0,18


In [113]:
# Look up the statistics of a single group.  ``GroupBy.groups`` maps each
# key to index *labels*, so the rows are selected with the label-based
# ``.loc`` rather than the positional ``.iloc``.
df_stat.loc[df_stat.groupby(['Mjob','Fjob','sex']).groups['at_home','health','F']]['G3']

Unnamed: 0,qmin,q25,median,q75,qmax
2,16,16.0,16.0,16.0,16


In [121]:
def get_stat(g1, g2, g3, color):
    """Return a BoxPlot for one (Mjob, Fjob, sex) group of ``df_stat``.

    g1, g2, g3 -- values matched against the 'Mjob', 'Fjob' and 'sex'
        columns respectively.
    color -- colour handed to the BoxPlot.

    The first matching row of the 'G3' statistics supplies, in column
    order, the minimum, the three quartile marks and the maximum.  When
    no row matches, all five statistics are zero.

    Rows are selected with a boolean mask and ``.loc``, so the lookup
    does not depend on the frame's index being 0..n-1 and the frame is
    not regrouped on every call.
    """
    matches = (
        (df_stat['Mjob'] == g1)
        & (df_stat['Fjob'] == g2)
        & (df_stat['sex'] == g3)
    )
    rows = df_stat.loc[matches]['G3'].values
    if len(rows):
        stat = rows[0]
    else:
        # No data for this combination: fall back to an all-zero summary.
        stat = np.zeros(5)
    return BoxPlot(stat[0], stat[-1], stat[1:-1], color=color)

In [125]:
# Grid of G3 box plots: one row per yLabel (passed to get_stat as the
# Mjob value), one group per x1Label (the Fjob value) and one coloured
# plot per x0Label (the sex value) inside each group.
swatch = dual_canvas.swatch(pixels=800, model_height=1200)

# The colour legend sits one row above the topmost data row.
guide_origin = dy * (len(yLabels))

for (ix0, x0Label) in enumerate(x0Labels):
    x0 = dx0 * ix0
    p = guide_origin + x0
    color = colors[ix0]
    swatch.text(p[0], p[1], x0Label, degrees=90, color=color)

# Column (group) labels are drawn exactly once; previously they were
# emitted inside the row loop, re-drawing each label once per row at
# the same position.
for (ix1, x1Label) in enumerate(x1Labels):
    x1 = ix1 * dx1
    swatch.text(x1[0], -20, x1Label)

for (iy, yLabel) in enumerate(yLabels):
    y1 = iy * dy
    swatch.text(-20, y1[1], yLabel, degrees=90)
    for (ix1, x1Label) in enumerate(x1Labels):
        x1 = ix1 * dx1
        for (ix0, x0Label) in enumerate(x0Labels):
            color = colors[ix0]
            x0 = dx0 * ix0
            # Plot origin = row offset + group offset + offset inside the group.
            p = y1 + x1 + x0
            b = get_stat(yLabel, x1Label, x0Label, color)
            b.draw(swatch, at_xy=p)

swatch.fit(margin=10)

DualCanvasWidget(status='deferring flush until render')