In [None]:
import numpy as np
import pandas as pd

from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, Slope, Span, Whisker
from bokeh.palettes import Category10
from bokeh.plotting import figure, output_notebook, show

import util

# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [None]:
# Load the training table, using its `id` column as the row index.
data = pd.read_csv('data/train.csv', index_col='id')

# Names of the feature columns: cat0..cat9 and cont0..cont13
# (presumably the categorical and continuous features, going by the prefixes).
cat_colz = [f'cat{idx}' for idx in range(10)]
cont_colz = [f'cont{idx}' for idx in range(14)]

### Category Means

In [None]:
# Scatter the mean target of every level of every categorical column.
# Each column in `cat_colz` occupies one integer x position (its index in the
# list); the levels of that column are jittered horizontally around it so the
# markers do not overlap.
n_cat_cols = len(cat_colz)

cat_fig = figure(height=400,
                 width=600,
                 x_range=(-0.5, n_cat_cols - 0.5),
                 x_axis_label='Category (jittered)',
                 y_axis_label='Mean Target')

# Dashed reference line at the overall target mean. A Span is exactly
# horizontal, whereas a Slope with a small non-zero gradient drifts away from
# the mean across the x-range.
mean_line = Span(location=data.target.mean(),
                 dimension='width',
                 line_width=2,
                 line_dash='dashed',
                 level='underlay')
cat_fig.add_layout(mean_line)

# Axis labels, tick markers, etc.
cat_fig.xaxis[0].ticker.desired_num_ticks = n_cat_cols  # one major tick per column
cat_fig.xaxis[0].ticker.num_minor_ticks = 0
cat_fig.xaxis.axis_label_text_font_size = "16pt"
cat_fig.yaxis.axis_label_text_font_size = "16pt"
cat_fig.xaxis.major_label_text_font_size = "12pt"
cat_fig.yaxis.major_label_text_font_size = "12pt"

# Locally seeded generator so the jitter (and hence the figure) is identical
# on every Restart & Run All, without touching NumPy's global random state.
jitter_rng = np.random.default_rng(0)

for i, col in enumerate(cat_colz):
    # Per-level summary of the target for this column.
    # NOTE(review): sem_target is computed but not drawn anywhere in this cell.
    cat_stats = (
        data[[col, 'target']]
        .groupby(col)
        .agg(mean_target=('target', 'mean'),
             sem_target=('target', 'sem'),
             counts=('target', 'count'))
    )

    # Uniform jitter in [-1/6, 1/6) around the column's integer x position.
    cat_stats['jitter_x'] = i + (jitter_rng.random(len(cat_stats)) - 0.5) / 3

    # Marker scale: log(level count) / log(total count), which lies in [0, 1].
    # NOTE(review): a level with a single row gets log(1) = 0, i.e. marker
    # size 0, and is therefore invisible.
    cat_stats['log_percent'] = np.log(cat_stats.counts) / np.log(cat_stats.counts.sum())

    # Draw the largest markers first so smaller ones end up on top of them.
    cat_stats = cat_stats.sort_values('log_percent', ascending=False)

    # scatter() is the supported way to draw screen-sized circle markers;
    # figure.circle(size=...) is deprecated in recent Bokeh releases.
    cat_fig.scatter(cat_stats.jitter_x, cat_stats.mean_target,
                    marker='circle',
                    size=15 * cat_stats.log_percent,
                    color=Category10[10][i % 10],
                    alpha=0.8,
                    line_color='black')

show(cat_fig)