In [30]:
import pandas as pd
import numpy as np
import bokeh
from scipy.stats import gaussian_kde
from bokeh.models import Range1d

from bokeh.palettes import Blues9
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook, show
from bokeh.sampledata.autompg import autompg as df

# bokeh.io.output_notebook()


def kde(x, y, N):
    """Evaluate a 2-D Gaussian kernel density estimate on an N-by-N grid.

    Parameters
    ----------
    x, y : array-like exposing ``.min()`` / ``.max()``
        Sample coordinates. They are stacked into a two-row dataset and
        handed to ``scipy.stats.gaussian_kde``.
    N : int
        Number of grid points along each axis.

    Returns
    -------
    X, Y, Z : numpy.ndarray
        ``X`` and ``Y`` are the ``(N, N)`` grid coordinates spanning the
        observed range of ``x`` and ``y`` (endpoints included); ``Z`` holds
        the estimated density at each grid node.
    """
    x_lo, x_hi = x.min(), x.max()
    y_lo, y_hi = y.min(), y.max()

    # A complex "step" makes np.mgrid treat N as a point count, not a stride.
    n_points = N * 1j
    X, Y = np.mgrid[x_lo:x_hi:n_points, y_lo:y_hi:n_points]

    density = gaussian_kde(np.vstack([x, y]))
    grid_nodes = np.vstack([X.ravel(), Y.ravel()])
    Z = density(grid_nodes).T.reshape(X.shape)

    return X, Y, Z


# demos = ['women', 'black', 'hispanic', 'native american']
# %matplotlib inline

In [2]:
# Display the installed Bokeh version so the environment used for the plots is recorded.
bokeh.__version__

'3.2.0'

In [12]:
def codes_to_vals(unis, encodings_path='../../data/hd2020.xlsx', sheet_name='Frequencies'):
    """Replace coded values in ``unis`` with their text labels.

    The data-dictionary sheet is expected to have ``varname``, ``codevalue``
    and ``valuelabel`` columns. For every distinct ``varname`` a
    ``{int(codevalue): valuelabel}`` mapping is built and applied to the
    column of the same name in ``unis``.

    Variables are skipped (best effort) when ``unis`` has no such column or
    when the code values cannot be cast to ``int``. Only those expected
    failures are swallowed; ``KeyboardInterrupt`` and other unrelated errors
    propagate.

    Parameters
    ----------
    unis : pandas.DataFrame
        Table whose coded columns are translated. It is modified in place.
    encodings_path : str, optional
        Excel workbook holding the data dictionary.
        NOTE(review): the default is the hd2020 workbook, while the loading
        cell in this notebook reads hd2021.csv -- confirm the codes agree.
    sheet_name : str, optional
        Sheet that contains the code/label table.

    Returns
    -------
    pandas.DataFrame
        The same ``unis`` object, returned for convenience.
    """
    encodings = pd.read_excel(encodings_path, sheet_name=sheet_name)
    for varname in encodings['varname'].unique():
        try:
            labels = encodings.loc[encodings['varname'] == varname].set_index('codevalue')['valuelabel']
            # Codes may be read as floats/strings; the data columns are matched on ints.
            labels.index = labels.index.astype(int)
            unis[varname] = unis[varname].replace(labels.to_dict())
        except (KeyError, TypeError, ValueError):
            # KeyError: column absent from ``unis``.
            # TypeError / ValueError: code values that are not integer-like.
            continue
    return unis

In [23]:
# Trend tables, one per demographic plus the overall one; the first CSV column is the index.
women_trends = pd.read_csv('../../out/trends_women.csv', index_col=0)
black_trends = pd.read_csv('../../out/trends_black.csv', index_col=0)
hispanic_trends = pd.read_csv('../../out/trends_hispanic.csv', index_col=0)
trend = pd.read_csv('../../out/trends.csv', index_col=0)

# University info table, with coded columns translated to labels by codes_to_vals.
unis = codes_to_vals(
    pd.read_csv('../../data/IPEDS/university_info/hd2021.csv', encoding='latin-1', index_col=0)
)

# Index-on-index merges (default inner join): keep universities present in both tables.
women_unis_stats = unis.merge(women_trends, left_index=True, right_index=True)
black_unis_stats = unis.merge(black_trends, left_index=True, right_index=True)
hispanic_unis_stats = unis.merge(hispanic_trends, left_index=True, right_index=True)

In [39]:
# Inspect the institution-size column.
# NOTE(review): `unis_stats` is not assigned in any cell visible here, so this
# line relies on leftover kernel state; presumably one of the merged
# *_unis_stats frames was meant -- confirm before a Restart & Run All.
unis_stats['INSTSIZE']

UNITID
100654       5,000 - 9,999
100663    20,000 and above
100706       5,000 - 9,999
100724       1,000 - 4,999
100751    20,000 and above
                ...       
453215         Under 1,000
458919       5,000 - 9,999
458964       1,000 - 4,999
459994     10,000 - 19,999
475121       5,000 - 9,999
Name: INSTSIZE, Length: 988, dtype: object

In [17]:
# NOTE(review): `unis_stats` is not assigned in any cell visible here, and the
# recorded NameError mentions `cs_stat` rather than a name used on this line,
# so the saved output looks stale -- re-run or remove this cell.
unis_stats['cs_stat']

NameError: name 'cs_stat' is not defined

In [105]:
# Ordinal rank for each institution-size label that codes_to_vals writes into
# INSTSIZE. The ordering is assumed to mirror the original numeric codes 1-5
# -- TODO confirm against the data dictionary.
_INSTSIZE_RANK = {
    'Under 1,000': 1,
    '1,000 - 4,999': 2,
    '5,000 - 9,999': 3,
    '10,000 - 19,999': 4,
    '20,000 and above': 5,
}


def _marker_sizes(instsize, scale):
    """Turn an INSTSIZE column (numeric codes or text labels) into marker sizes."""
    if pd.api.types.is_numeric_dtype(instsize):
        # Still-coded column: keep the plain ``code * scale`` behaviour.
        return (instsize * scale).values
    ranks = instsize.map(_INSTSIZE_RANK)
    # Entries that are not known labels: use them as-is if numeric, otherwise
    # fall back to the smallest rank so the point is still drawn.
    ranks = ranks.fillna(pd.to_numeric(instsize, errors='coerce')).fillna(1)
    return (ranks * scale).values


def graph(demo, trends, size_scale=100):
    """Plot CS vs. non-CS stats for one demographic and write it to HTML.

    Draws filled KDE contours of (``noncs_stat``, ``cs_stat``) with one
    circle per row on top, sized by institution size. The figure is saved
    to ``<demo>_bokeh_plot.html`` and opened with ``show``.

    Parameters
    ----------
    demo : str
        Label used in the plot title and in the output file name.
    trends : pandas.DataFrame
        Must provide ``noncs_stat``, ``cs_stat`` and ``INSTSIZE`` columns.
        ``INSTSIZE`` may hold numeric codes or the text labels produced by
        ``codes_to_vals``; multiplying the labels directly would repeat the
        strings instead of producing sizes, so labels are ranked first.
    size_scale : number, optional
        Multiplier turning a size rank/code into a marker size.
        NOTE(review): marker ``size`` is presumably in screen pixels, which
        would make the default of 100 very large on an 800 px canvas --
        confirm and pass a smaller value if so.
    """
    p = figure(width=800, height=800, x_axis_label="Non-CS stat", y_axis_label="CS stat",
               background_fill_color="#fafafa", tools="", toolbar_location=None,
               title="%s CS stat and Non-CS stat" % demo)
    p.grid.level = "overlay"
    p.grid.grid_line_color = "black"
    p.grid.grid_line_alpha = 0.4
    p.grid.minor_grid_line_alpha = 0.4
    # NOTE(review): this supplies only the single point (0, 0), so it likely
    # renders no visible segment; presumably a reference line was intended.
    p.line(0, 0, line_width=4)
    p.x_range = Range1d(-1, 1)
    p.y_range = Range1d(-1, 1)

    # Estimate the density from complete (x, y) pairs only, so missing values
    # cannot propagate into the kernel estimate.
    points = trends[['noncs_stat', 'cs_stat']].dropna()
    x, y, z = kde(points['noncs_stat'], points['cs_stat'], 300)

    palette = Blues9[::-1]
    levels = np.linspace(np.min(z), np.max(z), 10)
    # Skip the lowest level so the near-zero density background stays unfilled.
    p.contour(x, y, z, levels[1:], fill_color=palette, line_color=palette)
    p.circle(trends['noncs_stat'], trends['cs_stat'],
             size=_marker_sizes(trends['INSTSIZE'], size_scale),
             fill_alpha=1, line_color='black', alpha=0.5)

    output_file('%s_bokeh_plot.html' % demo)
    show(p)

In [106]:
trends = [women_unis_stats, black_unis_stats, hispanic_unis_stats]
names = ['Women', 'Black', 'Hispanic']

# Pair each label with its frame. The loop variable is deliberately not called
# `trend`, which would overwrite the DataFrame loaded from trends.csv.
for demo_name, demo_stats in zip(names, trends):
    graph(demo_name, demo_stats)