In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
#!pip install duckdb
import duckdb

In [2]:
# Directory holding the sandbox parquet files read by the cells below.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
ddir = r"/Users/katherinewang/SIGD/My Drive/MAPS/sandbox_data"

In [3]:
# In-memory DuckDB connection (not used by the cells visible in this section).
con = duckdb.connect()

# Read both parquet files from `ddir` and convert them to pandas DataFrames.
fe_path = '{}/freshmen-enrollment.parquet'.format(ddir)
cb_sest_path = '{}/census_state_est.parquet'.format(ddir)
fe = pq.read_table(fe_path).to_pandas()
cb_sest = pq.read_table(cb_sest_path).to_pandas()

# Institution sector labels (control, then level).
slist = [
    'Private, 2-year',
    'Private, 4-year or above',
    'Proprietary, 2-year',
    'Proprietary, 4-year or above',
    'Public, 2-year',
    'Public, 4-year or above',
]

# Race/ethnicity label -> integer code.
r7map = {
    'american_indian_or_alaska_native': 3,
    'asian': 4,
    'black_or_african_american': 2,
    'hispanic': 7,
    'native_hawaiian_or_other_pacific_islander': 5,
    'nonresident_alien': 9,
    'race_ethnicity_unknown': 8,
    'two_or_more_races': 6,
    'white': 1,
}

# Gender label -> integer code.
gmap = {
    'men': 1,
    'women': 2,
}

# State / area name -> numeric code. The codes correspond to Census FIPS codes.
st_map = {
    'Total': 0,
    # States and the District of Columbia
    'Alabama': 1,
    'Alaska': 2,
    'Arizona': 4,
    'Arkansas': 5,
    'California': 6,
    'Colorado': 8,
    'Connecticut': 9,
    'Delaware': 10,
    'District of Columbia': 11,
    'Florida': 12,
    'Georgia': 13,
    'Hawaii': 15,
    'Idaho': 16,
    'Illinois': 17,
    'Indiana': 18,
    'Iowa': 19,
    'Kansas': 20,
    'Kentucky': 21,
    'Louisiana': 22,
    'Maine': 23,
    'Maryland': 24,
    'Massachusetts': 25,
    'Michigan': 26,
    'Minnesota': 27,
    'Mississippi': 28,
    'Missouri': 29,
    'Montana': 30,
    'Nebraska': 31,
    'Nevada': 32,
    'New Hampshire': 33,
    'New Jersey': 34,
    'New Mexico': 35,
    'New York': 36,
    'North Carolina': 37,
    'North Dakota': 38,
    'Ohio': 39,
    'Oklahoma': 40,
    'Oregon': 41,
    'Pennsylvania': 42,
    'Rhode Island': 44,
    'South Carolina': 45,
    'South Dakota': 46,
    'Tennessee': 47,
    'Texas': 48,
    'Utah': 49,
    'Vermont': 50,
    'Virginia': 51,
    'Washington': 53,
    'West Virginia': 54,
    'Wisconsin': 55,
    'Wyoming': 56,
    # EFCSTATE-specific codes
    'State unknown': 57,
    'Outlying areas total': 89,
    'American Samoa': 60,
    'Federated States of Micronesia': 64,
    'Guam': 66,
    'Marshall Islands': 68,
    'Northern Marianas': 69,
    'Palau': 70,
    'Puerto Rico': 72,
    'Virgin Islands': 78,
    'Foreign countries': 90,
    'Residence not reported': 98,
}

In [4]:
# Column guide for `fe` (freshmen enrollment):
#   ST_FIPS  - state where the institution is located.
#   EFCSTATE - state of residence when the student was first admitted,
#              based on the address at the time of application.
#   status   - IN if the student is in-state (ST_FIPS == EFCSTATE), otherwise OUT.
#   efres01  - first-time degree/certificate-seeking undergraduate students.
#   efres02  - first-time degree/certificate-seeking undergraduate students who
#              graduated from high school in the past 12 months.
fe

Unnamed: 0,YEAR,STABBR,ST_FIPS,sector,EFCSTATE,status,efres01,efres02
0,2011,AK,2,"Private, 4-year or above",2,IN,10.0,7.0
1,2011,AK,2,"Private, 4-year or above",6,OUT,1.0,0.0
2,2011,AK,2,"Private, 4-year or above",12,OUT,1.0,1.0
3,2011,AK,2,"Private, 4-year or above",19,OUT,1.0,1.0
4,2011,AK,2,"Private, 4-year or above",27,OUT,1.0,0.0
...,...,...,...,...,...,...,...,...
74939,2020,WY,56,"Public, 4-year or above",53,OUT,19.0,17.0
74940,2020,WY,56,"Public, 4-year or above",55,OUT,8.0,8.0
74941,2020,WY,56,"Public, 4-year or above",56,IN,782.0,732.0
74942,2020,WY,56,"Public, 4-year or above",57,OUT,12.0,10.0


In [5]:
# Keep only the rows whose YEAR is 2019.
is_2019 = fe['YEAR'] == 2019
df_2019 = fe.loc[is_2019]

## Choropleth Maps

In [6]:
import json
import plotly.express as px
import plotly.io as pio
# Make fig.show() open figures in Chrome rather than rendering them inline.
pio.renderers.default = "chrome"

In [7]:
def get_key(val):
    """Reverse lookup in ``st_map``: return the first key whose value equals ``val``.

    Entries are checked in the dictionary's insertion order. When no value
    matches, the string "key doesn't exist" is returned instead of raising.
    """
    matching_keys = (key for key, value in st_map.items() if val == value)
    return next(matching_keys, "key doesn't exist")

In [8]:
# Load the state-boundary GeoJSON. The context manager closes the file handle
# as soon as parsing finishes (the previous bare open() never closed it).
with open('./usstates.json', 'r') as geojson_file:
    us_states = json.load(geojson_file)

# Copy each feature's STATE property to its top-level 'id', which is the field
# the choropleth cells match against the DataFrame's 'id' column.
# NOTE(review): the DataFrame ids come from integer ST_FIPS values; if STATE is
# stored as a zero-padded string (e.g. "01") the two will not match - confirm.
for feature in us_states['features']:
    feature['id'] = feature['properties']['STATE']

In [9]:
# temp1: number of df_2019 rows per institution state.
# temp2: number of df_2019 rows per (state, sector, residency status).
# NOTE(review): .size() counts rows, not students - the efres01/efres02
# enrollment columns are not used here. Confirm that a row-count ratio is
# what the maps below are meant to show.
temp1 = df_2019.groupby(['ST_FIPS']).size().reset_index(name='sum')
temp2 = df_2019.groupby(['ST_FIPS', 'sector', 'status']).size().reset_index(name='counts')

# Attach the per-state total to every group row, compute each group's share of
# that total, and rename the state column to 'id' for the map cells.
df = (
    temp1.merge(temp2, on='ST_FIPS')
    .assign(ratio=lambda merged: merged['counts'] / merged['sum'])
    .rename(columns={'ST_FIPS': 'id'})
)

# Human-readable state name for hover labels.
df['state_name'] = df['id'].apply(get_key)
df.head()

Unnamed: 0,id,sum,sector,status,counts,ratio,state_name
0,1,148,"Private, 4-year or above",IN,1,0.006757,Alabama
1,1,148,"Private, 4-year or above",OUT,45,0.304054,Alabama
2,1,148,"Public, 2-year",IN,1,0.006757,Alabama
3,1,148,"Public, 2-year",OUT,44,0.297297,Alabama
4,1,148,"Public, 4-year or above",IN,1,0.006757,Alabama


In [10]:
# Split the ratio table by residency status.
df_in = df.loc[df['status'] == 'IN']
df_out = df.loc[df['status'] == 'OUT']

In [11]:
# In-state rows for the 'Private, 4-year or above' sector only.
is_in_state = df['status'] == 'IN'
is_private_4yr = df['sector'] == 'Private, 4-year or above'
testdf = df[is_in_state & is_private_4yr]

In [15]:
# Choropleth of testdf's 'ratio' column, keyed on 'id' against the GeoJSON features.
fig_in = px.choropleth(
    testdf,
    geojson=us_states,
    locations='id',
    color='ratio',
    hover_name='state_name',
)
# Hide the base map layers and restrict the view to the USA scope, with black
# country and subunit outlines.
fig_in.update_geos(
    visible=False,
    resolution=110,
    scope='usa',
    showcountries=True,
    countrycolor="Black",
    showsubunits=True,
    subunitcolor="Black",
)
fig_in.show()

In [13]:
# Choropleth of df_out's 'ratio' column, keyed on 'id' against the GeoJSON features.
# NOTE(review): df_out is filtered on status only, so it holds one row per
# (state, sector); several rows therefore share the same location id.
fig_out = px.choropleth(
    df_out,
    geojson=us_states,
    locations='id',
    color='ratio',
    hover_name='state_name',
)
# Hide the base map layers and restrict the view to the USA scope, with black
# country and subunit outlines.
fig_out.update_geos(
    visible=False,
    resolution=110,
    scope='usa',
    showcountries=True,
    countrycolor="Black",
    showsubunits=True,
    subunitcolor="Black",
)
fig_out.show()

In [16]:
# Same df_out 'ratio' data drawn on a tile-based Mapbox layer.
# NOTE(review): df_out is filtered on status only, so it holds one row per
# (state, sector); several rows therefore share the same location id.
fig_out_map = px.choropleth_mapbox(
    df_out,
    geojson=us_states,
    locations='id',
    color='ratio',
    hover_name='state_name',
)
# Light basemap, with the initial view set to lat 38 / lon -99 at zoom 3.5.
fig_out_map.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=3.5,
    mapbox_center={"lat": 38, "lon": -99},
)
fig_out_map.show()