In [1]:
import pandas as pd
import numpy as np
import bokeh
import os
import geopandas as gpd
import us
import ipywidgets as widgets
#import warnings


from ipywidgets import interact, interact_manual, Dropdown, IntSlider
from bokeh.io import push_notebook, output_notebook, show
from bokeh.models import Panel, CustomJS, Div, Select, HoverTool, NumeralTickFormatter, LinearAxis, Range1d
from bokeh.layouts import gridplot, column, row, WidgetBox
from bokeh.plotting import figure, show, output_notebook, ColumnDataSource
from bokeh.io.doc import curdoc

# Configure Bokeh so that show() renders figures inline in notebook output cells.
output_notebook()
#warnings.filterwarnings('ignore')

In [2]:
# Project root directory. Set the DATA_II_PROJECT_PATH environment variable to
# point at a local checkout instead of editing this cell; the literal below is
# only the fallback used when the variable is not set.
# Locations used previously:
#   C:/Users/ShrekTheOger/Documents/GitHub/Data-II-Project
#   /Users/bowenli/Documents/GitHub/Data-II-Project
path = os.environ.get(
    'DATA_II_PROJECT_PATH',
    r'C:\Users\engel\Documents\GitHub\final-project-final-project-bowen-and-natasia',
)

In [3]:
# Load the refined CSV inputs from <path>/refined_data.

# Income and population data.
df = pd.read_csv(os.path.join(path+'/refined_data', 'df_income_pop.csv'))

# University funding data.
uni_df = pd.read_csv(os.path.join(path+'/refined_data', 'uni_fund_df.csv'))

In [4]:
# Note to Bowen: a more succinct way to create GEOID.
# Format COUNTYFIPS as a zero-padded, 5-wide string to use as the GEOID key.
df['GEOID'] = df['COUNTYFIPS'].map('{:05}'.format)
# Rebind rather than drop in place, and ignore columns that are already gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=['state_id', 'county_id'], errors='ignore')

In [5]:
# County boundary shapefile (cb_2020_us_county_20m); source:
# https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html
county_shp = os.path.join(
    path+'/raw_data',
    'cb_2020_us_county_20m',
    'cb_2020_us_county_20m.shp',
)
county = gpd.read_file(county_shp)

In [6]:
# Attach county geometries to the yearly county-level income and population
# rows; the inner join on GEOID keeps only counties present in both tables.
df_shape = (
    county
    .merge(df, on='GEOID', how='inner')
    .set_geometry('geometry')
)

In [7]:
# Retrieve the shapefile of colleges and universities.
universities_shp = os.path.join(
    path+'/raw_data',
    'Colleges_and_Universities-shp',
    'Colleges_and_Universities.shp',
)
universities = gpd.read_file(universities_shp)

In [8]:
# Cast the shapefile's IPEDSID to int (via str) before joining it to uni_df.
universities['IPEDSID'] = universities['IPEDSID'].astype(str).astype(int)

# Join the funding data onto the university geometries, then reproject the
# result to the CRS of the county layer.
uni_shape = (
    universities
    .merge(uni_df, on='IPEDSID', how='inner')
    .set_geometry('geometry')
    .to_crs(df_shape.crs)
)

In [9]:
# Keep only the universities that appear in uni_df (the top universities),
# reduced to the id, name and county FIPS columns.
uni_filter = (
    universities
    .loc[universities['IPEDSID'].isin(uni_df['IPEDSID'])]
    .copy()
    [['IPEDSID', 'NAME', 'COUNTYFIPS']]
)

In [10]:
# All data: university funding matched with the population and income data of
# the county each university is located in.
#   1. attach each funded university's name and county FIPS,
#   2. cast the join keys `year` and `COUNTYFIPS` to float,
#   3. inner-join the county table on year, state and county FIPS.
all_data = (
    uni_df
    .merge(uni_filter, on=['IPEDSID'], how='inner')
    .astype({'year': float, 'COUNTYFIPS': float})
    .merge(df_shape, on=['year', 'state', 'COUNTYFIPS'], how='inner')
)

In [11]:
# Sum the funding per county and year to see how large each county's fund is.
# The population and income columns are part of the grouping keys, so they are
# carried through to the result alongside the summed `fund`.
fund_county = (
    all_data
    .groupby(['year', 'COUNTYFIPS', 'state', 'county', 'total_population',
              'total_native', 'total_born_in_state', 'total_born_out_state',
              'total_born_outside_US', 'total_foreign_born', 'income_past12m'])['fund']
    .sum()
    .reset_index()
    .astype({'year': int})
)

In [12]:
# States present in the county-level funding table.
uni_state = fund_county['state'].unique()

# Dictionary for the ipywidgets dropdowns: state -> list of its counties.
options = dict(
    (state, all_data.loc[all_data['state'] == state, 'county'].unique().tolist())
    for state in uni_state
)

In [13]:
# Labels offered in the variable dropdown. 'Income in Past 12 Month' used to be
# listed here as well and is currently left out.
variables = [
    'Total Born in State',
    'Total Born out State',
    'Total Born outside US',
    'Total Foreigner',
]

# Create the widgets: state, county (starts without options) and variable.
first_widget, second_widget, third_widget = (
    Dropdown(options=options.keys()),
    Dropdown(),
    Dropdown(options=variables),
)

In [14]:
# Formatting metadata for the plotted series, one tuple per data column:
# (field, min_range, max_range, format, verbage).
format_data = [
    ('total_born_in_state', 0, 1,'0,0', 'Total Born in State'),
    ('total_born_out_state', 0, 1,'0,0', 'Total Born out State'),
    ('total_born_outside_US', 0, 1,'0,0', 'Total Born outside US'),
    ('total_foreign_born', 0, 1,'0,0', 'Total Foreigner'),
    ('income_past12m', 0, 100000,'$0,0', 'Income in Past 12 Months'),
]

# Turn the records into a lookup table (e.g. display label -> column name).
format_df = pd.DataFrame.from_records(
    format_data,
    columns=['field', 'min_range', 'max_range', 'format', 'verbage'],
)

In [15]:
def data_plot_pop(state, county, variable):
    """Return one county's yearly share of total population for `variable`.

    Rows of the module-level `fund_county` table matching `state` and `county`
    are selected, `variable` is divided by `total_population`, and the result
    is returned as a frame with columns `year` and `variable`, sorted by year
    in ascending order with a fresh 0..n-1 index.
    """
    in_county = (fund_county['state'] == state) & (fund_county['county'] == county)
    rows = fund_county[in_county]

    shares = pd.DataFrame({
        'year': rows['year'],
        variable: rows[variable] / rows['total_population'],
    })
    return shares.sort_values('year', ascending=True).reset_index(drop=True)

In [16]:
def data_plot_income(state, county):
    """Return one county's yearly income and funding figures.

    Rows of the module-level `fund_county` table matching `state` and `county`
    are selected and returned with the columns `year`, `income_past12m` and
    `fund`, sorted by year in ascending order with a fresh 0..n-1 index.
    """
    in_county = (fund_county['state'] == state) & (fund_county['county'] == county)
    wanted = ['year', 'income_past12m', 'fund']

    return (
        fund_county.loc[in_county, wanted]
        .sort_values('year', ascending=True)
        .reset_index(drop=True)
    )

In [17]:
def plot_line_pop(variable, source):
    """Build a line-plus-marker figure of a population share by year.

    Parameters
    ----------
    variable : str
        Column of `source` plotted on the y axis. It must match an entry in
        the `field` column of `format_df`; the corresponding `verbage` value
        is used in the figure title.
    source : ColumnDataSource
        Data source that provides a `year` column and the `variable` column.

    Returns
    -------
    The Bokeh figure, not yet shown.

    Raises
    ------
    IndexError
        If `variable` has no matching row in `format_df`.
    """
    var_title = format_df.loc[format_df['field'] == variable, 'verbage'].iloc[0]
    # '{0.0000}' renders the hovered value with four decimals.
    tool = [('value', '@'+variable+'{0.0000}'), ('year','@year')]
    # `height` is used instead of `plot_height`, which Bokeh deprecated in 2.4
    # and removed in 3.0; `figure` accepts `height` in both lines of releases.
    plot = figure(title=f'Share of Total Population: {var_title}', x_axis_label='Year', y_axis_label='natural units',
                  tooltips=tool, height=400)

    # Line for the trend, grey markers for the individual yearly observations.
    plot.line(x='year', y=variable, source=source, color='blue')
    plot.circle(x='year', y=variable, source=source, color='grey')

    return plot

In [18]:
def plot_line_income(source):
    """Build a line-plus-marker figure of `income_past12m` by year.

    Parameters
    ----------
    source : ColumnDataSource
        Data source that provides `year` and `income_past12m` columns.

    Returns
    -------
    The Bokeh figure, not yet shown.
    """
    tool = [('value', '@income_past12m'), ('year','@year')]
    # `height` is used instead of `plot_height`, which Bokeh deprecated in 2.4
    # and removed in 3.0; `figure` accepts `height` in both lines of releases.
    plot = figure(title="Income in Past 12 Months", x_axis_label='Year', y_axis_label='in USD',
                  tooltips=tool, height=400)

    # Line for the trend, grey markers for the individual yearly observations.
    plot.line(x='year', y='income_past12m', source=source, color='green')
    plot.circle(x='year', y='income_past12m', source=source, color='grey')

    # Show y-axis ticks as whole dollar amounts.
    plot.yaxis.formatter = NumeralTickFormatter(format="$0")

    return plot

In [19]:
def plot_line_fund(source):
    """Build a line-plus-marker figure of `fund` (R&D funding) by year.

    Parameters
    ----------
    source : ColumnDataSource
        Data source that provides `year` and `fund` columns.

    Returns
    -------
    The Bokeh figure, not yet shown.
    """
    tool = [('value', '@fund'), ('year','@year')]
    # `height` is used instead of `plot_height`, which Bokeh deprecated in 2.4
    # and removed in 3.0; `figure` accepts `height` in both lines of releases.
    plot = figure(title="R&D Fund", x_axis_label='Year', y_axis_label='in USD',
                  tooltips=tool, height=400)

    # Line for the trend, grey markers for the individual yearly observations.
    plot.line(x='year', y='fund', source=source, color='magenta')
    plot.circle(x='year', y='fund', source=source, color='grey')

    # Show y-axis ticks as whole dollar amounts.
    plot.yaxis.formatter = NumeralTickFormatter(format="$0")

    return plot

In [20]:
# https://docs.bokeh.org/en/latest/docs/user_guide/data.html
#def update(*args):
#    second_widget.options = options[first_widget.value]

#first_widget.observe(update)

#variable = format_df.loc[format_df['verbage'] == third_widget.value, 'field'].iloc[0]
#initial_data = data_plot_pop('Michigan', 'Washtenaw County', variable)

#source = ColumnDataSource(data=initial_data)
    
#tool = [('value', '@'+variable+'{0.0000}'), ('year','@year')]
#plot = figure(title="Share of Total Population", x_axis_label='Year', y_axis_label='natural units', 
#              tooltips=tool, plot_height=400)

#r = plot.line(x='year', y=variable , source=source, color='blue')
#r = plot.circle(x='year', y=variable , source=source, color='grey')

#def update_plot(state, county, variable):
#    var = format_df.loc[format_df['verbage'] == variable, 'field'].iloc[0]
#    update_data = data_plot_pop(state, county, var)
    
#    source.data = update_data
#    tool = [('value', '@'+var+'{0.0000}'), ('year','@year')]
#    plot = figure(title=f"Share of Total Population: {variable}", x_axis_label='Year', y_axis_label='natural units', 
#              tooltips=tool, plot_height=400)
#    r = plot.line(x='year', y=var, source=source, color='blue')
#    r = plot.circle(x='year', y=var, source=source, color='grey')
    
#    show(plot)
#interact(update_plot, state=first_widget, county=second_widget, variable=third_widget);

In [21]:
# https://docs.bokeh.org/en/latest/docs/user_guide/data.html
def update(*args):
    """Make the county dropdown list the counties of the selected state."""
    second_widget.options = options[first_widget.value]

# Observe only the `value` trait; without `names` the handler is invoked for
# every trait change on the state dropdown, not just a new selection.
first_widget.observe(update, names='value')

# The county dropdown is created without options, so populate it once up
# front; otherwise the first render receives county=None and plots nothing.
if not second_widget.options and first_widget.value in options:
    update()

# Default series and figure for the initial state/county. The interactive
# callback below builds its own source and figure on every change.
initial_data = data_plot_pop('Michigan', 'Washtenaw County', 'total_born_outside_US')
source = ColumnDataSource(data=initial_data)

plot = plot_line_pop('total_born_outside_US', source)

def update_plot(state, county, variable):
    """Render the population-share plot for the current widget selection.

    `variable` is the display label chosen in the dropdown; it is translated
    to the underlying column name through `format_df` before plotting.
    """
    if county is None:
        # No county selected yet: there is nothing to plot.
        return
    var = format_df.loc[format_df['verbage'] == variable, 'field'].iloc[0]
    update_data = data_plot_pop(state, county, var)
    # Local names differ from the module-level `source`/`plot` on purpose, so
    # the callback does not look as if it were updating those objects.
    selection_source = ColumnDataSource(data=update_data)
    selection_plot = plot_line_pop(var, selection_source)

    show(selection_plot)
interact(update_plot, state=first_widget, county=second_widget, variable=third_widget);

interactive(children=(Dropdown(description='state', options=('Alabama', 'Arizona', 'California', 'Colorado', '…

In [22]:
# https://docs.bokeh.org/en/latest/docs/user_guide/data.html
# https://docs.bokeh.org/en/latest/docs/first_steps/first_steps_4.html

#def update(*args):
#    second_widget.options = options[first_widget.value]

#first_widget.observe(update)
#initial_data = data_plot_income('Michigan', 'Washtenaw County')

#source = ColumnDataSource(data=initial_data)

# plot income 
#tool_income = [('value', '@income_past12m'), ('year','@year')]
#plot_income = figure(title="Income in Past 12 Months", x_axis_label='Year', y_axis_label='in USD', 
#                     tooltips=tool_income, plot_height=400)

#r_income = plot_income.line(x='year', y='income_past12m', source=source, color='green')
#r_income = plot_income.circle(x='year', y='income_past12m', source=source, color='grey')

# plot fund
#tool_fund = [('value', '@fund'), ('year','@year')]
#plot_fund = figure(title="R&D Fund", x_axis_label='Year', y_axis_label='in USD', 
#                   tooltips=tool_fund, plot_height=400)

#r_fund = plot_fund.line(x='year', y='fund', source=source, color='magenta')
#r_fund = plot_fund.circle(x='year', y='fund', source=source, color='grey')

#plot_income.yaxis.formatter = NumeralTickFormatter(format="$0")
#plot_fund.yaxis.formatter = NumeralTickFormatter(format="$0")

#def update_plot_income(state, county):
#    update_data = data_plot_income(state, county)
#    source.data = update_data
    
#    show(column(plot_income, plot_fund))
#interact(update_plot_income, state=first_widget, county=second_widget);

In [24]:
# https://docs.bokeh.org/en/latest/docs/user_guide/data.html
# https://docs.bokeh.org/en/latest/docs/first_steps/first_steps_4.html

def update(*args):
    """Make the county dropdown list the counties of the selected state."""
    second_widget.options = options[first_widget.value]

# Observe only the `value` trait; without `names` the handler is invoked for
# every trait change on the state dropdown, not just a new selection.
first_widget.observe(update, names='value')

# Populate the county dropdown if it is still empty, so the first render does
# not receive county=None. An existing selection is left untouched.
if not second_widget.options and first_widget.value in options:
    update()

initial_data = data_plot_income('Michigan', 'Washtenaw County')

# One data source shared by both figures; the callback swaps its contents.
source = ColumnDataSource(data=initial_data)

# plot income
plot_income = plot_line_income(source)

# plot fund
plot_fund = plot_line_fund(source)

def update_plot_income(state, county):
    """Load the selected county's income/fund series and show both figures."""
    if county is None:
        # No county selected yet: keep the current data and draw nothing.
        return
    update_data = data_plot_income(state, county)
    # Replace the data of the shared source in place (rather than creating a
    # new ColumnDataSource) so the two existing figures pick up the change.
    source.data = update_data

    show(column(plot_income, plot_fund))
interact(update_plot_income, state=first_widget, county=second_widget);

interactive(children=(Dropdown(description='state', index=7, options=('Alabama', 'Arizona', 'California', 'Col…