In [1]:
import pandas as pd
import numpy as np
import bokeh
from scipy.stats import gaussian_kde 
from bokeh.palettes import Blues9
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook, show
from bokeh.sampledata.autompg import autompg as df
from math import sin
from random import random
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, Range1d, CustomJS, TextInput, TabPanel, Tabs, CategoricalColorMapper, WMTSTileSource, CustomJS, TextInput
from bokeh.palettes import plasma, d3
from bokeh.transform import transform
from bokeh.layouts import column


# Route Bokeh output to the notebook so plots can be displayed inline.
output_notebook()

# Fetch Bokeh's sample datasets into the local Bokeh data directory; files
# whose checksum already matches are skipped (see the cell output).
import bokeh.sampledata
bokeh.sampledata.download()



Using data directory: /Users/masontaylor/.bokeh/data
Skipping 'CGM.csv' (checksum match)
Skipping 'US_Counties.zip' (checksum match)
Skipping 'us_cities.json' (checksum match)
Skipping 'unemployment09.csv' (checksum match)
Skipping 'AAPL.csv' (checksum match)
Skipping 'FB.csv' (checksum match)
Skipping 'GOOG.csv' (checksum match)
Skipping 'IBM.csv' (checksum match)
Skipping 'MSFT.csv' (checksum match)
Skipping 'WPP2012_SA_DB03_POPULATION_QUINQUENNIAL.zip' (checksum match)
Skipping 'gapminder_fertility.csv' (checksum match)
Skipping 'gapminder_population.csv' (checksum match)
Skipping 'gapminder_life_expectancy.csv' (checksum match)
Skipping 'gapminder_regions.csv' (checksum match)
Skipping 'world_cities.zip' (checksum match)
Skipping 'airports.json' (checksum match)
Skipping 'movies.db.zip' (checksum match)
Skipping 'airports.csv' (checksum match)
Skipping 'routes.csv' (checksum match)
Skipping 'haarcascade_frontalface_default.xml' (checksum match)
Skipping 'SampleSuperstore.csv.zip' (

In [2]:
def select_school(trends, new):
    """Return the rows of ``trends`` whose ``INSTNM`` contains ``new``.

    The comparison is a case-insensitive, literal substring test.

    Parameters
    ----------
    trends : pandas.DataFrame
        Frame with an ``INSTNM`` column.
    new : str
        Search text. An empty string matches every row.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``trends``, in their original order.
    """
    # Missing names become '' (and every value is coerced to str) so a NaN or
    # None in INSTNM no longer raises AttributeError on ``.lower()``.
    names = trends['INSTNM'].fillna('').astype(str).str.lower()
    # regex=False keeps the search literal: text such as "(" or "." is matched
    # as typed instead of being interpreted as a pattern.
    matches = names.str.contains(new.lower(), regex=False)
    return trends[matches]
    

In [3]:
def update(trends, text_input, scatter_source):
    """Replace the scatter source's data with the schools matching a search.

    Parameters
    ----------
    trends : pandas.DataFrame
        Frame with ``INSTNM``, ``CTOTALT``, ``noncs_stat`` and ``cs_stat``
        columns.
    text_input : str or object with a ``value`` attribute
        The search text, or a widget-like object whose ``value`` holds it.
    scatter_source : object with an assignable ``data`` attribute
        Data source feeding the scatter glyph; its ``data`` is overwritten.
    """
    # Callers may hand over the widget itself; select_school() needs the plain
    # string, so unwrap ``.value`` when present and pass strings through as-is.
    query = getattr(text_input, 'value', text_input)
    matched = select_school(trends, query)

    scatter_source.data = dict(
        x=matched['noncs_stat'].tolist(),
        y=matched['cs_stat'].tolist(),
        desc=matched['INSTNM'].tolist(),
        sizes=(matched['CTOTALT'] * 1000).tolist(),
        radii=matched['CTOTALT'].tolist(),
    )

In [4]:
def demo_graph(demo, unis, completions):
    """Build, save and show the CS / non-CS plot for one demographic group.

    Reads ``../out/trends_<demo lower-cased>.csv``, joins it with ``unis`` and
    ``completions`` on ``UNITID``, draws the KDE contours and the per-school
    scatter on one figure, writes ``<demo>_bokeh_plot.html`` and shows it.

    Parameters
    ----------
    demo : str
        Group name; used in the input file name, the plot title and the
        output file name.
    unis : pandas.DataFrame
        Frame with ``UNITID``, ``INSTNM`` and ``INSTSIZE`` columns.
    completions : pandas.DataFrame
        Frame with ``UNITID`` and ``CTOTALT`` columns.
    """
    trends = pd.read_csv('../out/trends_%s.csv' % demo.lower())
    uni_stats = pd.merge(trends, unis[['UNITID', 'INSTNM', 'INSTSIZE']], on='UNITID')
    uni_stats = pd.merge(uni_stats, completions[['UNITID', 'CTOTALT']], on='UNITID')
    # Stored in thousands; scatter()/update() multiply by 1000 again for the
    # hover "Size" value and use the scaled number as the marker size.
    uni_stats['CTOTALT'] = uni_stats['CTOTALT'] / 1000

    text_input = TextInput(value="default", title="University:")
    text_input.js_on_change("value", CustomJS(code="""
    console.log('text_input: value=' + this.value, this.toString())
    """))

    hover = HoverTool(tooltips=[
        ('Name', '@desc'),
        ("Non-CS stat", "@x"),
        ("CS stat", "@y"),
        ("Size", "@sizes")
    ])
    p = figure(
        width=800, height=800,
        x_axis_label="Non-CS stat", y_axis_label="CS stat",
        background_fill_color="#fafafa", toolbar_location=None,
        title="%s CS stat and Non-CS stat" % demo,
        tools=[hover],
    )
    p.title.text_font_size = '20pt'
    p.xaxis.axis_label_text_font_size = "20pt"
    p.yaxis.axis_label_text_font_size = "20pt"

    graph_kde(uni_stats, p)
    scatter_source = scatter(uni_stats, p)

    # Filter the merged frame (the raw `trends` has no INSTNM/CTOTALT columns)
    # and pass the new text itself rather than the widget object.
    # NOTE: on_change callbacks run in Python, so they only fire when the
    # document is served by a Bokeh server; the standalone HTML written below
    # only runs the CustomJS callback registered above.
    text_input.on_change(
        'value',
        lambda attr, old, new: update(uni_stats, new, scatter_source),
    )

    output_file('%s_bokeh_plot.html' % demo)
    # Lay the search box out above the plot so it is actually rendered.
    show(column(text_input, p))
    

In [5]:
def kde(x, y, N):
    """Evaluate a 2-D Gaussian kernel density estimate on an N-by-N grid.

    Parameters
    ----------
    x, y : array-like with ``.min()`` / ``.max()``
        Sample coordinates of equal length.
    N : int
        Number of grid points along each axis.

    Returns
    -------
    X, Y, Z : numpy.ndarray
        ``X`` and ``Y`` hold the grid coordinates spanning the samples'
        bounding box; ``Z`` holds the estimated density at each grid point
        and has the same shape as ``X``.
    """
    # A complex step makes np.mgrid produce that many evenly spaced points
    # per axis, with the stop value included.
    n_points = N * 1j
    X, Y = np.mgrid[x.min():x.max():n_points, y.min():y.max():n_points]

    estimator = gaussian_kde(np.vstack([x, y]))
    grid_points = np.vstack([X.ravel(), Y.ravel()])
    Z = estimator(grid_points).reshape(X.shape)
    return X, Y, Z

In [6]:
def scatter(trends, p):
    """Draw one translucent circle marker per school on ``p``.

    Parameters
    ----------
    trends : pandas.DataFrame
        Frame with ``noncs_stat``, ``cs_stat``, ``INSTNM`` and ``CTOTALT``
        columns.
    p : bokeh figure
        Figure the markers are added to.

    Returns
    -------
    ColumnDataSource
        The source backing the markers, with columns ``x``, ``y``, ``desc``,
        ``sizes`` and ``radii``; reassigning its ``data`` updates the glyph.
    """
    source = ColumnDataSource(data=dict(
        x=trends['noncs_stat'].tolist(),
        y=trends['cs_stat'].tolist(),
        desc=trends['INSTNM'].tolist(),
        sizes=(trends['CTOTALT'] * 1000).tolist(),
        radii=trends['CTOTALT'].tolist(),
    ))
    # scatter() draws circle markers by default; circle(size=...) is
    # deprecated from Bokeh 3.4 onwards. The unused colour mapper was dropped:
    # it was never attached to the glyph and its min()/max() calls raised on
    # an empty frame.
    p.scatter('x', 'y', size='radii', source=source,
              fill_color='blue', line_color='black', alpha=0.3)
    return source

In [7]:
def graph_kde(trends, p):
    """Overlay filled KDE contours of ``noncs_stat`` vs ``cs_stat`` on ``p``.

    Also styles the figure's grid and fixes both axis ranges to [-1, 1].

    Parameters
    ----------
    trends : pandas.DataFrame
        Frame with ``noncs_stat`` and ``cs_stat`` columns.
    p : bokeh figure
        Figure to draw on; modified in place.
    """
    grid_x, grid_y, density = kde(trends['noncs_stat'], trends['cs_stat'], 300)

    # Grid lines are moved to the "overlay" level and drawn in translucent black.
    for prop, setting in (
        ("level", "overlay"),
        ("grid_line_color", "black"),
        ("grid_line_alpha", 0.4),
        ("minor_grid_line_alpha", 0.4),
    ):
        setattr(p.grid, prop, setting)

    # NOTE(review): a line with the single vertex (0, 0) -- presumably draws
    # nothing visible; confirm what was intended here.
    p.line(0, 0, line_width=4)

    p.x_range = Range1d(-1, 1)
    p.y_range = Range1d(-1, 1)

    # Ten evenly spaced levels across the density range; the first one (the
    # density minimum) is skipped and the remaining nine go to contour().
    contour_levels = np.linspace(density.min(), density.max(), 10)[1:]
    reversed_blues = Blues9[::-1]
    p.contour(grid_x, grid_y, density, contour_levels,
              fill_color=reversed_blues, line_color=reversed_blues)

In [8]:
def graph(demos=('Women', 'Black', 'Hispanic')):
    """Load the IPEDS tables and draw one plot per demographic group.

    Parameters
    ----------
    demos : iterable of str, optional
        Group names passed one at a time to ``demo_graph``. Defaults to
        Women, Black and Hispanic.
    """
    unis = pd.read_csv('../data/IPEDS/university_info/hd2020.csv', encoding='latin-1')
    completions_raw = pd.read_csv('../data/IPEDS/completions/c2020_a.csv', encoding='latin-1')

    # Pick CTOTALT *before* summing: aggregating every column first is wasted
    # work, and with pandas >= 2.0 (numeric_only defaults to False) any
    # non-numeric column would be concatenated or raise during the sum.
    completions = (
        completions_raw
        .groupby('UNITID')['CTOTALT']
        .sum()
        .reset_index()
    )

    for demo in demos:
        demo_graph(demo, unis, completions)
        
        

In [9]:
# Run the full pipeline: load the data and draw the plot for each group.
graph()