In [1]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
import bokeh
import json
import us
import matplotlib.pyplot as plt
import seaborn as sns
#import warnings

from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import ColumnDataSource, figure, show, output_file
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, NumeralTickFormatter, Slider, HoverTool, Select, Div
from bokeh.palettes import brewer

from bokeh.io.doc import curdoc
from bokeh.layouts import WidgetBox, row, column, gridplot

from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler

from ipywidgets import interact, interact_manual, Dropdown, IntSlider

#warnings.filterwarnings('ignore')
output_notebook()

In [2]:
# Project root directory.
# To run on another machine, set the FINAL_PROJECT_PATH environment variable
# instead of editing this cell; when it is unset, the original local checkout
# path below is used.
path = os.environ.get(
    'FINAL_PROJECT_PATH',
    r'C:\Users\engel\Documents\GitHub\final-project-final-project-bowen-and-natasia',
)

# Retrieve Shape File

In [3]:
# County boundary shapefile, downloaded from:
# https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html
# The folder and the .shp file inside it share the same stem.
county_layer = 'cb_2020_us_county_20m'
county_shp = os.path.join(path+'/raw_data', county_layer, county_layer + '.shp')
county = gpd.read_file(county_shp)

In [4]:
# Retrieve the shapefile of colleges and universities.
uni_layer = 'Colleges_and_Universities'
universities_shp = os.path.join(path+'/raw_data', uni_layer + '-shp', uni_layer + '.shp')
universities = gpd.read_file(universities_shp)

## Read & Manipulate Data for Plotting
Read the data that we have already cleaned. The data cleaning process and regression analysis can be found in a separate .py file.

In [5]:
# Load the cleaned CSVs from the refined_data folder.
refined_dir = path+'/refined_data'

# income and population data
df = pd.read_csv(os.path.join(refined_dir, 'df_income_pop.csv'))

# university funding data
uni_df = pd.read_csv(os.path.join(refined_dir, 'uni_fund_df.csv'))

In [6]:
# Build the GEOID join key by zero-padding COUNTYFIPS to width 5.
# NOTE(review): this assumes COUNTYFIPS is an integer column; a float such as
# 1001.0 would format as '1001.0' rather than '01001' -- confirm the dtype.
df['GEOID'] = df['COUNTYFIPS'].map('{:05}'.format)

In [7]:
# Remove the separate state/county id columns.
# The frame is reassigned (instead of dropped in place) and errors='ignore' is
# used so that re-running this cell does not raise KeyError once the columns
# are already gone.
df = df.drop(columns=['state_id', 'county_id'], errors='ignore')

In [8]:
# Attach the county geometries to the yearly county-level income and population
# rows. The inner join on GEOID keeps only counties present in both tables.
county_with_data = county.merge(df, on='GEOID', how='inner')
df_shape = county_with_data.set_geometry('geometry')

In [9]:
# Join the university funding data onto the university point geometries.
# IPEDSID is cast str -> int first (presumably so the key dtype matches uni_df
# -- verify), then the result is reprojected to the CRS of the county layer.
universities['IPEDSID'] = universities['IPEDSID'].astype(str).astype(int)
uni_shape = (
    universities
    .merge(uni_df, on='IPEDSID', how='inner')
    .set_geometry('geometry')
    .to_crs(df_shape.crs)
)

In [10]:
# Keep only the universities that appear in the funding data (the top
# universities), then narrow to the identifying columns.
keep_cols = ['IPEDSID', 'NAME', 'COUNTYFIPS']
in_fund_data = universities['IPEDSID'].isin(uni_df['IPEDSID'])
uni_filter = universities.loc[in_fund_data].copy()
uni_filter = uni_filter[keep_cols]

In [11]:
# Combine all data: university fund, population, and income.
# Step 1: attach each university's name and county FIPS to its funding rows.
all_data = uni_df.merge(uni_filter, on=['IPEDSID'], how='inner')

# Step 2: cast the join keys to float (presumably to match the key dtypes in
# df_shape -- verify) before matching on them.
all_data = all_data.astype({'year': float, 'COUNTYFIPS': float})

# Step 3: match every funding row with the population and income data of the
# county it belongs to, for the same year and state.
all_data = all_data.merge(df_shape, on=['year', 'state', 'COUNTYFIPS'], how='inner')

In [12]:
# Total university funding per county and year.
# The county's population and income columns are part of the group key so that
# they are carried through into the aggregated table.
county_keys = ['year','COUNTYFIPS','state', 'county','total_population',
               'total_native', 'total_born_in_state','total_born_out_state',
               'total_born_outside_US','total_foreign_born','income_past12m']
fund_by_county = all_data.groupby(county_keys)['fund'].sum()
fund_county = fund_by_county.reset_index()

fund_county['year'] = fund_county['year'].astype(int)

# Share of the county population that is foreign born.
foreign_born = fund_county['total_foreign_born']
fund_county['share_foreigner'] = (foreign_born/fund_county['total_population'])

## Interactive Geographic Map Using Bokeh
Here we use Bokeh to display county-level population information on hover and to show income discrepancies using a choropleth (density) map. We also display where the top 50 universities are located in each state, along with their funding level, which is shown on hover.

Our code is heavily inspired by Jim King's post on his website: https://jimking100.github.io/2019-09-04-Post-3/

In [13]:
def json_data(selectedYear, selectedState):
    """Return the county data for one year and one state as a GeoJSON string.

    Rows are taken from ``df_shape`` where ``year`` equals ``selectedYear`` and
    ``state`` equals ``selectedState``.
    """
    matches = (df_shape['year'] == selectedYear) & (df_shape['state'] == selectedState)
    county_rows = df_shape[matches].copy().reset_index(drop=True)

    # Parse the serialised frame and dump it again to get the string that is returned.
    return json.dumps(json.loads(county_rows.to_json()))

In [14]:
def json_data_uni(selectedYear, selectedState):
    """Serialise the universities of a given year and state to a GeoJSON string.

    The selection is made on ``uni_shape`` by comparing its ``year`` and
    ``state`` columns with the two arguments.
    """
    year_match = uni_shape['year'] == selectedYear
    state_match = uni_shape['state'] == selectedState
    selection = uni_shape[year_match & state_match].copy().reset_index(drop=True)

    # to_json() yields text; it is parsed and then dumped again before returning.
    parsed = json.loads(selection.to_json())
    return json.dumps(parsed)

In [15]:
"""The disadvantage of using a static scale is some county like Los Angelous has a overly dense population which pulled
the scale's upper limit really high, so in state-level views many counties with fewer than 100,000 people become
insignificant; even big counties like Cook County are dwarfed by the abnormally dense populations of cities like New
York and Los Angeles."""

# Formatting for the data in the plots, one tuple per plottable field:
#   (field, colour-scale minimum, colour-scale maximum, tick format, display label)
# `field` must match the data column name exactly (including case), because
# make_plot uses the same string both to look up this row and to pick the
# column that colours the map.
format_data = [('total_population', 0, 12000000,'0,0', 'Total Population'),
               ('total_native', 0, 6750000,'0,0', 'Total Native'),
               ('total_born_in_state', 0, 5200000,'0,0', 'Total Born in State'),
               ('total_born_out_state', 0, 2000000,'0,0', 'Total Born out State'),
               # Column is spelled 'total_born_outside_US' in the data; the label
               # previously duplicated the 'Total Born out State' entry.
               ('total_born_outside_US', 0, 130000,'0,0', 'Total Born Outside US'),
               ('total_foreign_born', 0, 3500000,'0,0', 'Total Foreigner'),
               ('income_past12m', 0, 80000,'$0,0', 'Income in Past 12 Months')
              ]

# Create a DataFrame from the list of tuples.
format_df = pd.DataFrame(format_data, columns = ['field' , 'min_range', 'max_range' , 'format', 'verbage'])

In [19]:
# Define the callback function: update_plot
def update_plot(attr, old, new):
    # The input yr is the year selected from the slider
    yr = slider.value
    st = select_st.value
    new_data = json_data(yr, st)
    new_uni = json_data_uni(yr, st)
    
    # Update the data
    geosource.geojson = new_data
    geosource_uni.geojson = new_uni

    # The input cr is the criteria selected from the select box
    input_field = select.value
    #input_field = format_df.loc[format_df['verbage'] == input_field, 'field'].iloc[0]
    
    # Update the plot based on the changed inputs
    p = make_plot(input_field, yr, st)
        
    # Update the layout, clear the old document and display the new document
    layout = column(p, select, select_st, slider)
    curdoc().clear()
    curdoc().add_root(layout)

In [20]:
# Create a plotting function
def make_plot(field_name, yr, st):  
    
    # Set the format of the colorbar
    min_range = format_df.loc[format_df['field'] == field_name, 'min_range'].iloc[0]
    max_range = format_df.loc[format_df['field'] == field_name, 'max_range'].iloc[0]
    field_format = format_df.loc[format_df['field'] == field_name, 'format'].iloc[0]

    # Instantiate LinearColorMapper that linearly maps numbers in a range, into a sequence of colors.
    color_mapper = LinearColorMapper(palette = palette, low = min_range, high = max_range)
    
    # Create color bar.
    format_tick = NumeralTickFormatter(format=field_format)
    color_bar = ColorBar(color_mapper=color_mapper, label_standoff=18, border_line_color=None, location = (0, 0))

    # Create figure object.
    verbage = format_df.loc[format_df['field'] == field_name, 'verbage'].iloc[0]

    p = figure(title = verbage, 
               plot_height = 650, plot_width = 850)
    
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.axis.visible = False

    # Add hover tool
    hover = [ ('County','@NAME'),('State', '@state'),('Total population', '@total_population{,}'),
              ('Born instate', '@total_born_in_state{,}'),('Born outstate', '@total_born_out_state{,}'),]
    
    hover_uni = [ ('School','@NAME'),('Fund', '@fund{,}')]

    
    # Add patch renderer to figure. 
    r1 = p.patches('xs','ys', source = geosource, fill_color = {'field' : field_name, 'transform' : color_mapper},
                   line_color = 'black', line_width = 0.25, fill_alpha = 1)

    # Specify color bar layout.
    p.add_layout(color_bar, 'right')

    # Add the hover tool to the graph
    p.add_tools(HoverTool(renderers=[r1], tooltips=hover))
    
    r2 = p.circle('x','y', color = 'red', source=geosource_uni, size=10, fill_alpha = 0.7)
    p.add_tools(HoverTool(renderers=[r2], tooltips=hover_uni))
    
    return p

In [21]:
# --- Widgets ---
# Year slider
slider = Slider(title = 'Year',start = 2010, end = 2019, step = 1, value = 2015)

# Criterion used to colour the counties
select = Select(title='Select Criteria:', value='income_past12m', options=['income_past12m', 'total_population'])

# Target state; the options are the states present in the university data
state_options = uni_shape['state'].unique().tolist()
select_st = Select(title='Select Target State:', value='Illinois', options=state_options)

# --- Initial state, read from the widget defaults ---
yr, st, input_field = slider.value, select_st.value, select.value

# GeoJSON sources holding the features to plot for the initial year and state
geosource = GeoJSONDataSource(geojson = json_data(yr, st))
geosource_uni = GeoJSONDataSource(geojson = json_data_uni(yr, st))

# Sequential blue palette, reversed so that the darkest blue marks the highest value.
palette = brewer['Blues'][8][::-1]

# Draw the initial plot
p = make_plot(input_field, yr, st)

# Stack the plot above the three widgets
layout = column(p, select, select_st, slider)

def modify_doc(doc):
    """Add the layout to ``doc`` and route every widget's value change to ``update_plot``."""
    doc.add_root(column(layout))
    for widget in (slider, select, select_st):
        widget.on_change('value', update_plot)
    
    
# Wrap modify_doc in a FunctionHandler, build a Bokeh Application from it and
# display that application with show().
handler = FunctionHandler(modify_doc)
app = Application(handler)
show(app)