# Final Project
Logan Cooper

In [21]:
import pandas as pd
import numpy as np
import gc
from tqdm.auto import tqdm
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import OLSResults

## Data

### Data Import

#### NCDB Building Age Data (2000)

In [None]:
ncdb_dtypes = {
    'tract_id': str,
    'built_1999_2000': int,
    'built_1995_1998': int,
    'built_1990_1994': int,
    'built_1980_1989': int,
    'built_1970_79': int,
    'built_1960_69': int,
    'built_1950_59': int,
    'built_1940_49': int,
    'built_1939_earlier': int
}
ncdb_data = pd.read_csv('./data/NCDB_2000.csv', dtype=ncdb_dtypes)
ncdb_data

In [None]:
ncdb_prop = ncdb_data[['built_1999_2000', 'built_1995_1998', 'built_1990_1994', 'built_1980_1989', 'built_1970_79', 'built_1960_69', 'built_1950_59', 'built_1940_49', 'built_1939_earlier']].div(ncdb_data.sum(axis=1), axis=0)
ncdb_prop['tract_id'] = ncdb_data['tract_id']
ncdb_data = ncdb_prop
ncdb_data

In [None]:
del ncdb_prop
gc.collect()

#### Tract Level Data (2019)

In [None]:
# import & rename cols
tract_data_2019 = pd.read_json('./data/tract_data_2019.json', dtype=False)
year_ranges = ['2014_later', '2010_2013', '2000_2009', '1990_1999', '1980_1989', '1970_1979', '1960_1969', '1950_1959', '1940_1949', '1939_earlier']
built_ranges = [f'built_{el}' for el in year_ranges]
tract_data_2019.columns = ['name', 'median_income', 'num_pub_trans', 'population'] + built_ranges + ['state', 'county', 'tract']

# trim out building dates
tract_data_2019['tract_id'] = tract_data_2019['state'] + tract_data_2019['county'] + tract_data_2019['tract']
tract_data_2019['pub_trans_gt_10pct'] = ((tract_data_2019['num_pub_trans'] / tract_data_2019['population']) >= 0.1).astype(int)

tract_data_2019_trim = tract_data_2019.drop(built_ranges + ['num_pub_trans'], axis=1)
tract_data_2019_trim

In [None]:
tract_data_2019_trim['pub_trans_gt_10pct'].value_counts()

#### MSA-Level Income Data (2019)

In [None]:
msa_data = pd.read_json('./data/msa_data.json')
msa_data.columns = ['name', 'median_income', 'msa_code']
msa_data['msa_code'] = msa_data['msa_code'].astype(str)
msa_data

#### MSA-Tract Crosswalk

In [None]:
msa_lookup = pd.read_excel('./data/msa_codes.xls', 
                           converters={'CBSA Code': str, 
                                       'CBSA Title': str, 
                                       'Metropolitan/Micropolitan Statistical Area': str, 
                                       'FIPS State Code': str,
                                       'FIPS County Code': str}
                           ).drop(range(1916,1920))

msa_lookup

#### Conversion for 2000 => 2010

In [None]:
tract_conversion = pd.read_csv('./data/us2010trf.txt')
tract_conversion

In [None]:
tract_conversion.columns

In [None]:
tract_conversion = tract_conversion[['GEOID00', 'GEOID10', 'AREA10']]
tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)
tract_conversion

#### Tract-School District Crosswalk

In [None]:
school_districts = pd.read_excel('./data/grf19_lea_tract.xlsx')
# school_districts.drop(['NAME_LEA19', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
school_districts['LEAID'] = school_districts['LEAID'].astype(str)
school_districts['TRACT'] = school_districts['TRACT'].astype(str)
school_districts

## Data Combination

#### Trim Tract Level Data to MSAs Only

In [None]:
only_metros = msa_lookup[msa_lookup['Metropolitan/Micropolitan Statistical Area'] == 'Metropolitan Statistical Area']
msa_tracts = pd.merge(left=tract_data_2019_trim, right=only_metros, left_on=['state', 'county'], right_on=['FIPS State Code', 'FIPS County Code'])
msa_tracts

In [None]:
msa_tracts.columns

In [None]:
msa_tracts.drop(['name', 'CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code', 'FIPS County Code'], axis=1, inplace=True)
msa_tracts

In [None]:
del only_metros
gc.collect()

#### Merge MSA-Level Data

In [None]:
msa_tracts = pd.merge(left=msa_tracts, right=msa_data, left_on='CBSA Code', right_on='msa_code', suffixes=('_tract', '_msa'))
msa_tracts['income'] = msa_tracts['median_income_tract'] / msa_tracts['median_income_msa']
msa_tracts.drop(['CBSA Code', 'name', 'median_income_tract', 'median_income_msa'], axis=1, inplace=True)
msa_tracts

In [None]:
del msa_data
gc.collect()

#### Convert 2000-Tracts and Merge

In [None]:
ages_2019 = pd.merge(left=ncdb_data, right=tract_conversion, left_on='tract_id', right_on='GEOID00')
ages_2019.drop(['tract_id'], axis=1, inplace=True)
ages_2019.rename({'AREA10': 'area', 'GEOID10': 'tract_id_2010', 'GEOID00': 'tract_id_2000'}, inplace=True, axis=1)

ages_2019

In [None]:
tract_counts = ages_2019['tract_id_2010'].value_counts()
tract_multiples = tract_counts[tract_counts > 1].index
tract_multiples

In [None]:
a = ages_2019[ages_2019['tract_id_2010'] == '53033990100']
a

In [None]:
l = len(a)
a = a.groupby('tract_id_2010').sum()
a['area'] //= l
a.reset_index()

In [None]:
for tid in tqdm(tract_multiples):
    matches = ages_2019[ages_2019['tract_id_2010'] == tid]
    l = len(matches)
    a = matches.groupby('tract_id_2010').sum(numeric_only=True)
    a['area'] //= l

    ages_2019 = ages_2019[ages_2019['tract_id_2010'] != tid] # drop all rows with tid
    ages_2019 = pd.concat((ages_2019, a.reset_index()), axis=0, ignore_index=True) #re-add summed

ages_2019

In [None]:
ages_2019.drop('tract_id_2000', axis=1, inplace=True)
ages_2019

In [None]:
msa_tracts = pd.merge(left=msa_tracts, right=ages_2019, left_on='tract_id', right_on='tract_id_2010')
msa_tracts.drop(['tract_id'], axis=1, inplace=True)

msa_tracts = msa_tracts[(msa_tracts['population'] >= 1) & (msa_tracts['income'] > 0) & (msa_tracts['area'] > 0)] # drop weird tracts

msa_tracts['pop_density'] = msa_tracts['population'] / msa_tracts['area']
msa_tracts

In [None]:
del ages_2019
gc.collect()

#### Add School District

In [None]:
msa_tracts = pd.merge(left=msa_tracts, right=school_districts, left_on='tract_id_2010', right_on='TRACT')
msa_tracts.drop(['NAME_LEA19', 'TRACT', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
msa_tracts

In [None]:
del school_districts
gc.collect()

In [None]:
msa_tracts.to_csv('./data/msa_tracts.csv', index=False)

### Calculating Distances

#### Finding Central Business District

In [None]:
msa_tracts = pd.read_csv('./data/msa_tracts.csv', index_col=False)
msa_tracts

In [None]:
msa_tracts['msa_code'].value_counts()

In [None]:
cbd_candidates = msa_tracts[['msa_code', 'tract_id_2010', 'pop_density']]
cbds = cbd_candidates.groupby('msa_code').max()
cbds.drop('pop_density', axis=1, inplace=True)
cbds.reset_index(inplace=True)
cbds

In [None]:
msa_tracts = pd.merge(left=msa_tracts, right=cbds, left_on='msa_code', right_on='msa_code', suffixes=('', 'cbd'))
msa_tracts.rename({'tract_id_2010cbd': 'cbd'}, inplace=True, axis=1)
msa_tracts['cbd'] = msa_tracts['cbd'].astype(str)
msa_tracts['tract_id_2010'] = msa_tracts['tract_id_2010'].astype(str)
msa_tracts

#### Calculating Distances

In [None]:
msa_tracts['distance'] = None
msa_tracts

In [None]:
msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]['distance'] = 0

In [None]:
msa_tracts_with_dist = None

In [None]:
tract_distances = pd.read_csv('./data/sf12010tractdistance50miles.csv', dtype={'county1': str,'tract1': str, 'county2': str,'tract2': str}, chunksize=5000)
for chunk in tract_distances:
    chunk['tid1'] = chunk['county1'] + chunk['tract1']
    chunk.drop(['county1', 'tract1'], axis=1, inplace=True)
    chunk['tid2'] = chunk['county2'] + chunk['tract2']
    chunk.drop(['county2', 'tract2'], axis=1, inplace=True)
    chunk.rename({'mi_to_tract': 'distance'}, axis=1, inplace=True)
    m = pd.merge(left=msa_tracts, right=chunk, left_on=['cbd', 'tract_id_2010'], right_on=['tid1', 'tid2'])
    if msa_tracts_with_dist is None:
        msa_tracts_with_dist = m
    else:
        msa_tracts_with_dist = pd.concat((m, msa_tracts_with_dist))

msa_tracts_with_dist

In [None]:
msa_tracts_with_dist.drop(['distance_x', 'tid1', 'tid2'], axis=1, inplace=True)
msa_tracts_with_dist.rename({'distance_y': 'distance'}, axis=1, inplace=True)
msa_tracts_with_dist = msa_tracts_with_dist[msa_tracts_with_dist['distance'] <= 40]
msa_tracts_with_dist

In [None]:
cbd_tracts = msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]
cbd_tracts['distance'] = 0
msa_tracts_with_dist = pd.concat((msa_tracts_with_dist, cbd_tracts))
msa_tracts_with_dist

In [None]:
msa_tracts_with_dist.drop(['population', 'state', 'county', 'tract'], axis=1, inplace=True)
msa_tracts_with_dist.to_csv('./data/msa_tracts_dist.csv', index=False)

## Modelling

In [9]:
data = pd.read_csv('./data/msa_tracts_dist.csv', index_col=False)
data.dropna(inplace=True)
data

Unnamed: 0,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density,LEAID,cbd,distance
0,0,16220,0.541786,0.054229,0.072841,0.043549,0.268277,0.623324,0.173009,0.160484,0.157274,0.447013,56025000200,5066966,6.601584e-04,5604510,56025001800,31.971663
1,0,16220,0.794956,0.018470,0.024802,0.005805,0.036939,0.113456,0.110290,0.614776,0.036939,0.038522,56025000400,2740366,1.521695e-03,5604510,56025001800,33.572169
2,0,16220,0.709026,0.014100,0.013536,0.003384,0.000000,0.053582,0.081782,0.633390,0.135364,0.064862,56025000800,3998718,9.520551e-04,5604510,56025001800,30.489648
3,0,16220,1.041356,0.094605,0.123017,0.097254,0.541895,1.062368,0.241217,0.205956,0.170210,0.463477,56025001401,2900004318,1.975859e-06,5604510,56025001800,37.594932
4,0,16220,0.821130,0.003820,0.010984,0.026743,0.250716,0.509074,0.126074,0.072588,0.000000,0.000000,56025000501,3298941,1.545648e-03,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43566,0,48140,1.039428,0.021973,0.082504,0.065506,0.089138,0.139303,0.058458,0.063018,0.065506,0.414594,55073002300,633184236,1.119737e-05,5515900,55073002300,0.000000
43567,0,43100,0.760709,0.010526,0.051629,0.022891,0.008187,0.061654,0.200692,0.122473,0.167657,1.354290,55117011400,2413711,1.040721e-03,5513650,55117011400,0.000000
43568,0,16220,1.330880,0.073113,0.199201,0.083380,0.315820,0.657364,0.238800,0.142132,0.108942,0.181247,56025001800,10864029883,4.307794e-07,5604510,56025001800,0.000000
43569,0,16940,1.303751,0.084999,0.182103,0.086109,0.384136,0.551984,0.142617,0.135040,0.156078,0.276933,56021002000,2730553549,3.608426e-06,5601980,56021002000,0.000000


In [19]:
reg_1 = smf.ols(formula="income ~ distance + C(msa_code)", data=data).fit()
reg_1.summary()

0,1,2,3
Dep. Variable:,income,R-squared:,0.038
Model:,OLS,Adj. R-squared:,0.031
Method:,Least Squares,F-statistic:,5.319
Date:,"Thu, 13 Apr 2023",Prob (F-statistic):,1e-182
Time:,17:01:09,Log-Likelihood:,-23998.0
No. Observations:,43569,AIC:,48640.0
Df Residuals:,43247,BIC:,51430.0
Df Model:,321,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0434,0.045,22.979,0.000,0.954,1.132
C(msa_code)[T.10420],-0.0088,0.052,-0.167,0.867,-0.112,0.094
C(msa_code)[T.10500],-0.1219,0.082,-1.488,0.137,-0.282,0.039
C(msa_code)[T.10540],-0.0514,0.077,-0.670,0.503,-0.202,0.099
C(msa_code)[T.10580],-0.1077,0.052,-2.056,0.040,-0.210,-0.005
C(msa_code)[T.10740],-0.0463,0.054,-0.863,0.388,-0.152,0.059
C(msa_code)[T.10780],-0.1310,0.083,-1.584,0.113,-0.293,0.031
C(msa_code)[T.10900],-0.0814,0.054,-1.503,0.133,-0.188,0.025
C(msa_code)[T.11020],-0.0544,0.084,-0.651,0.515,-0.218,0.109

0,1,2,3
Omnibus:,8161.848,Durbin-Watson:,1.093
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21092.616
Skew:,1.028,Prob(JB):,0.0
Kurtosis:,5.719,Cond. No.,9410.0


All other models had to be trained non-locally. See `reg.py` for more details.

In [23]:
OLSResults.load('./models/model2.pickle').summary()

: 

: 