# Final Project
Logan Cooper

In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm.auto import tqdm
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import OLSResults

## Data

### Data Import

#### NCDB Building Age Data (2000)

In [64]:
ncdb_dtypes = {
    'tract_id': str,
    'workers_taking_transit': int,
    'prop_taking_transit': float,
    'built_1999_2000': int,
    'built_1995_1998': int,
    'built_1990_1994': int,
    'built_1980_1989': int,
    'built_1970_79': int,
    'built_1960_69': int,
    'built_1950_59': int,
    'built_1940_49': int,
    'built_1939_earlier': int
}
ncdb_data = pd.read_csv('./data/NCDB_2000.csv', dtype=ncdb_dtypes)
ncdb_data

Unnamed: 0,tract_id,workers_taking_transit,prop_taking_transit,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier
0,1001020100,0,0.000000,28,71,67,137,214,94,59,33,39
1,1001020200,0,0.000000,21,47,39,102,220,83,75,52,119
2,1001020300,0,0.000000,24,176,72,150,389,328,31,54,39
3,1001020400,11,0.005186,11,78,103,159,541,639,306,24,10
4,1001020500,0,0.000000,119,581,504,431,491,115,36,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
65438,56043000200,2,0.001519,2,133,58,301,398,72,144,48,189
65439,56043000301,0,0.000000,0,0,29,30,91,60,518,305,241
65440,56043000302,0,0.000000,7,29,27,152,487,82,152,39,60
65441,56045951100,57,0.045820,64,101,81,280,412,124,139,113,180


In [65]:
ncdb_data['pub_trans_gt_10pct'] = 0
ncdb_data['pub_trans_gt_10pct'][ncdb_data['prop_taking_transit'] >= 0.1] = 1
ncdb_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ncdb_data['pub_trans_gt_10pct'][ncdb_data['prop_taking_transit'] >= 0.1] = 1


Unnamed: 0,tract_id,workers_taking_transit,prop_taking_transit,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct
0,1001020100,0,0.000000,28,71,67,137,214,94,59,33,39,0
1,1001020200,0,0.000000,21,47,39,102,220,83,75,52,119,0
2,1001020300,0,0.000000,24,176,72,150,389,328,31,54,39,0
3,1001020400,11,0.005186,11,78,103,159,541,639,306,24,10,0
4,1001020500,0,0.000000,119,581,504,431,491,115,36,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65438,56043000200,2,0.001519,2,133,58,301,398,72,144,48,189,0
65439,56043000301,0,0.000000,0,0,29,30,91,60,518,305,241,0
65440,56043000302,0,0.000000,7,29,27,152,487,82,152,39,60,0
65441,56045951100,57,0.045820,64,101,81,280,412,124,139,113,180,0


In [66]:
ncdb_data['pub_trans_gt_10pct'].value_counts()

0    55450
1     9993
Name: pub_trans_gt_10pct, dtype: int64

In [67]:
ncdb_prop = ncdb_data[['built_1999_2000', 'built_1995_1998', 'built_1990_1994', 'built_1980_1989', 'built_1970_1979', 'built_1960_1969', 'built_1950_1959', 'built_1940_1949', 'built_1939_earlier']].div(ncdb_data.sum(axis=1), axis=0)
ncdb_prop['tract_id'] = ncdb_data['tract_id']
ncdb_prop['pub_trans_gt_10pct'] = ncdb_data['pub_trans_gt_10pct']
ncdb_data = ncdb_prop
ncdb_data

  ncdb_prop = ncdb_data[['built_1999_2000', 'built_1995_1998', 'built_1990_1994', 'built_1980_1989', 'built_1970_1979', 'built_1960_1969', 'built_1950_1959', 'built_1940_1949', 'built_1939_earlier']].div(ncdb_data.sum(axis=1), axis=0)


Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,tract_id,pub_trans_gt_10pct
0,0.037736,0.095687,0.090296,0.184636,0.288410,0.126685,0.079515,0.044474,0.052561,1001020100,0
1,0.027704,0.062005,0.051451,0.134565,0.290237,0.109499,0.098945,0.068602,0.156992,1001020200,0
2,0.019002,0.139351,0.057007,0.118765,0.307997,0.259699,0.024545,0.042755,0.030879,1001020300,0
3,0.005845,0.041445,0.054729,0.084484,0.287459,0.339531,0.162593,0.012752,0.005313,1001020400,0
4,0.052262,0.255160,0.221344,0.189284,0.215635,0.050505,0.015810,0.000000,0.000000,1001020500,0
...,...,...,...,...,...,...,...,...,...,...,...
65438,0.001485,0.098738,0.043059,0.223459,0.295471,0.053452,0.106904,0.035635,0.140312,56043000200,0
65439,0.000000,0.000000,0.022763,0.023548,0.071429,0.047096,0.406593,0.239403,0.189168,56043000301,0
65440,0.006763,0.028019,0.026087,0.146860,0.470531,0.079227,0.146860,0.037681,0.057971,56043000302,0
65441,0.041262,0.065117,0.052223,0.180523,0.265627,0.079946,0.089617,0.072854,0.116051,56045951100,0


In [68]:
del ncdb_prop
gc.collect()

2555

#### Tract Level Data (2019)

In [69]:
# import & rename cols
tract_data_2019 = pd.read_json('./data/tract_data_2019.json', dtype=False)
tract_data_2019.rename({
    'B19019_001E': 'median_income',
    'B01003_001E': 'population'
}, inplace=True, axis=1)

tract_data_2019['tract_id'] = tract_data_2019['state'] + tract_data_2019['county'] + tract_data_2019['tract']
tract_data_2019

Unnamed: 0,median_income,population,state,county,tract,tract_id
0,37030.0,4781.0,01,073,001100,01073001100
1,36066.0,1946.0,01,073,001400,01073001400
2,27159.0,4080.0,01,073,002000,01073002000
3,38721.0,5291.0,01,073,003802,01073003802
4,18525.0,2533.0,01,073,004000,01073004000
...,...,...,...,...,...,...
72872,87794.0,4187.0,56,021,001902,56021001902
72873,-666666666.0,0.0,56,021,980801,56021980801
72874,85194.0,7513.0,56,025,001602,56025001602
72875,120564.0,3724.0,56,025,001603,56025001603


#### MSA-Level Income Data (2019)

In [70]:
msa_data = pd.read_json('./data/msa_data.json')
msa_data.columns = ['median_income', 'msa_code']
msa_data['msa_code'] = msa_data['msa_code'].astype(str)
msa_data

Unnamed: 0,median_income,msa_code
0,38175,13720
1,61278,13740
2,54295,13780
3,57447,13820
4,71398,13900
...,...,...
933,36403,36660
934,37955,36700
935,58368,36740
936,47466,22700


#### MSA-Tract Crosswalk

In [71]:
msa_lookup = pd.read_excel('./data/msa_codes.xls', 
                           converters={'CBSA Code': str, 
                                       'CBSA Title': str, 
                                       'Metropolitan/Micropolitan Statistical Area': str, 
                                       'FIPS State Code': str,
                                       'FIPS County Code': str}
                           ).drop(range(1916,1920))

msa_lookup

Unnamed: 0,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,10100,"Aberdeen, SD",Micropolitan Statistical Area,46,013
1,10100,"Aberdeen, SD",Micropolitan Statistical Area,46,045
2,10140,"Aberdeen, WA",Micropolitan Statistical Area,53,027
3,10180,"Abilene, TX",Metropolitan Statistical Area,48,059
4,10180,"Abilene, TX",Metropolitan Statistical Area,48,253
...,...,...,...,...,...
1911,49700,"Yuba City, CA",Metropolitan Statistical Area,06,101
1912,49700,"Yuba City, CA",Metropolitan Statistical Area,06,115
1913,49740,"Yuma, AZ",Metropolitan Statistical Area,04,027
1914,49780,"Zanesville, OH",Micropolitan Statistical Area,39,119


#### Conversion for 2000 => 2010

In [72]:
tract_conversion = pd.read_csv('./data/us2010trf.txt')
tract_conversion

Unnamed: 0,STATE00,COUNTY00,TRACT00,GEOID00,POP00,HU00,PART00,AREA00,AREALAND00,STATE10,...,AREAPCT00PT,AREALANDPCT00PT,AREAPCT10PT,AREALANDPCT10PT,POP10PT,POPPCT00,POPPCT10,HU10PT,HUPCT00,HUPCT10
0,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,99.99,100.00,100.00,100.00,1912,99.95,100.00,752,99.87,100.00
1,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,0.01,0.00,0.01,0.00,0,0.00,0.00,0,0.00,0.00
2,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,0.00,0.00,0.00,0.00,1,0.05,0.01,1,0.13,0.02
3,1,1,20200,1001020200,2170,822,W,3346351,3340505,1,...,100.00,100.00,100.00,100.00,2170,100.00,100.00,822,100.00,100.00
4,1,1,20300,1001020300,3373,1326,W,5358328,5349274,1,...,100.00,100.00,100.00,100.00,3373,100.00,100.00,1326,100.00,100.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110849,72,153,750601,72153750601,5315,2106,P,11040535,11035996,72,...,99.79,99.79,100.00,100.00,5315,100.00,100.00,2106,100.00,100.00
110850,72,153,750601,72153750601,5315,2106,P,11040535,11035996,72,...,0.21,0.21,0.13,0.13,0,0.00,0.00,0,0.00,0.00
110851,72,153,750602,72153750602,3141,1450,P,17773730,17520777,72,...,0.04,0.04,0.05,0.05,0,0.00,0.00,0,0.00,0.00
110852,72,153,750602,72153750602,3141,1450,P,17773730,17520777,72,...,0.06,0.07,0.04,0.05,0,0.00,0.00,0,0.00,0.00


In [73]:
tract_conversion.columns

Index(['STATE00', 'COUNTY00', 'TRACT00', 'GEOID00', 'POP00', 'HU00', 'PART00',
       'AREA00', 'AREALAND00', 'STATE10', 'COUNTY10', 'TRACT10', 'GEOID10',
       'POP10', 'HU10', 'PART10', 'AREA10', 'AREALAND10', 'AREAPT',
       'AREALANDPT', 'AREAPCT00PT', 'AREALANDPCT00PT', 'AREAPCT10PT',
       'AREALANDPCT10PT', 'POP10PT', 'POPPCT00', 'POPPCT10', 'HU10PT',
       'HUPCT00', 'HUPCT10'],
      dtype='object')

In [74]:
tract_conversion = tract_conversion[['GEOID00', 'GEOID10', 'AREALAND10']]
tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)
tract_conversion

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)


Unnamed: 0,GEOID00,GEOID10,AREALAND10
0,1001020100,1001020100,9809944
1,1001020100,1001020600,8020366
2,1001020100,1001020802,190810921
3,1001020200,1001020200,3340505
4,1001020300,1001020300,5349274
...,...,...,...
110849,72153750601,72153750601,11012462
110850,72153750601,72153750602,17526535
110851,72153750602,72055960900,13138821
110852,72153750602,72055961100,24581786


In [75]:
# AREALAND10 is in square meters, want it in square miles
tract_conversion['AREALAND10'] = tract_conversion['AREALAND10'] * 3.861e-7
tract_conversion = tract_conversion[tract_conversion['AREALAND10'] > 0]
tract_conversion

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['AREALAND10'] = tract_conversion['AREALAND10'] * 3.861e-7


Unnamed: 0,GEOID00,GEOID10,AREALAND10
0,1001020100,1001020100,3.787619
1,1001020100,1001020600,3.096663
2,1001020100,1001020802,73.672097
3,1001020200,1001020200,1.289769
4,1001020300,1001020300,2.065355
...,...,...,...
110849,72153750601,72153750601,4.251912
110850,72153750601,72153750602,6.766995
110851,72153750602,72055960900,5.072899
110852,72153750602,72055961100,9.491028


#### Tract-School District Crosswalk

In [76]:
school_districts = pd.read_excel('./data/grf19_lea_tract.xlsx')
# school_districts.drop(['NAME_LEA19', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
school_districts['LEAID'] = school_districts['LEAID'].astype(str)
school_districts['TRACT'] = school_districts['TRACT'].astype(str)
school_districts

Unnamed: 0,LEAID,NAME_LEA19,TRACT,COUNT,LANDAREA,WATERAREA
0,100001,Fort Rucker School District,1031010300,2,23.428498,0.000000
1,100001,Fort Rucker School District,1045020000,2,66.513225,1.081745
2,100003,Maxwell AFB School District,1101000900,3,3.356590,0.143795
3,100003,Maxwell AFB School District,1101001000,3,0.001526,0.000000
4,100003,Maxwell AFB School District,1101006000,3,0.003588,0.000000
...,...,...,...,...,...,...
113515,7800030,Virgin Islands Department of Education,78030960900,32,3.147245,1.173777
113516,7800030,Virgin Islands Department of Education,78030961000,32,0.812847,0.910606
113517,7800030,Virgin Islands Department of Education,78030961100,32,1.356638,0.000000
113518,7800030,Virgin Islands Department of Education,78030961200,32,0.392958,0.309706


## Data Combination

#### Trim Tract Level Data to MSAs Only

In [77]:
only_metros = msa_lookup[msa_lookup['Metropolitan/Micropolitan Statistical Area'] == 'Metropolitan Statistical Area']
msa_tracts = pd.merge(left=tract_data_2019, right=only_metros, left_on=['state', 'county'], right_on=['FIPS State Code', 'FIPS County Code'])
msa_tracts

Unnamed: 0,median_income,population,state,county,tract,tract_id,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,37030.0,4781.0,01,073,001100,01073001100,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
1,36066.0,1946.0,01,073,001400,01073001400,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
2,27159.0,4080.0,01,073,002000,01073002000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
3,38721.0,5291.0,01,073,003802,01073003802,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
4,18525.0,2533.0,01,073,004000,01073004000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
...,...,...,...,...,...,...,...,...,...,...,...
60863,73795.0,4105.0,56,021,001401,56021001401,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60864,65192.0,2671.0,56,021,001402,56021001402,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60865,98949.0,5088.0,56,021,001901,56021001901,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60866,87794.0,4187.0,56,021,001902,56021001902,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021


In [78]:
msa_tracts = msa_tracts[(msa_tracts['median_income'] >= 0) & (msa_tracts['population'] > 0)]
msa_tracts

Unnamed: 0,median_income,population,state,county,tract,tract_id,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,37030.0,4781.0,01,073,001100,01073001100,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
1,36066.0,1946.0,01,073,001400,01073001400,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
2,27159.0,4080.0,01,073,002000,01073002000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
3,38721.0,5291.0,01,073,003802,01073003802,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
4,18525.0,2533.0,01,073,004000,01073004000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
...,...,...,...,...,...,...,...,...,...,...,...
60862,79224.0,8092.0,56,021,000501,56021000501,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60863,73795.0,4105.0,56,021,001401,56021001401,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60864,65192.0,2671.0,56,021,001402,56021001402,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60865,98949.0,5088.0,56,021,001901,56021001901,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021


In [79]:
msa_tracts.columns

Index(['median_income', 'population', 'state', 'county', 'tract', 'tract_id',
       'CBSA Code', 'CBSA Title', 'Metropolitan/Micropolitan Statistical Area',
       'FIPS State Code', 'FIPS County Code'],
      dtype='object')

In [80]:
msa_tracts.drop(['CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code', 'FIPS County Code'], axis=1, inplace=True)
msa_tracts

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msa_tracts.drop(['CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code', 'FIPS County Code'], axis=1, inplace=True)


Unnamed: 0,median_income,population,state,county,tract,tract_id,CBSA Code
0,37030.0,4781.0,01,073,001100,01073001100,13820
1,36066.0,1946.0,01,073,001400,01073001400,13820
2,27159.0,4080.0,01,073,002000,01073002000,13820
3,38721.0,5291.0,01,073,003802,01073003802,13820
4,18525.0,2533.0,01,073,004000,01073004000,13820
...,...,...,...,...,...,...,...
60862,79224.0,8092.0,56,021,000501,56021000501,16940
60863,73795.0,4105.0,56,021,001401,56021001401,16940
60864,65192.0,2671.0,56,021,001402,56021001402,16940
60865,98949.0,5088.0,56,021,001901,56021001901,16940


In [81]:
del only_metros
gc.collect()

113646

#### Merge MSA-Level Data

In [82]:
msa_tracts = pd.merge(left=msa_tracts, right=msa_data, left_on='CBSA Code', right_on='msa_code', suffixes=('_tract', '_msa'))
msa_tracts['income'] = msa_tracts['median_income_tract'] / msa_tracts['median_income_msa']
msa_tracts.drop(['CBSA Code', 'median_income_tract', 'median_income_msa'], axis=1, inplace=True)
msa_tracts

Unnamed: 0,population,state,county,tract,tract_id,msa_code,income
0,4781.0,01,073,001100,01073001100,13820,0.644594
1,1946.0,01,073,001400,01073001400,13820,0.627813
2,4080.0,01,073,002000,01073002000,13820,0.472766
3,5291.0,01,073,003802,01073003802,13820,0.674030
4,2533.0,01,073,004000,01073004000,13820,0.322471
...,...,...,...,...,...,...,...
60028,8092.0,56,021,000501,56021000501,16940,1.184038
60029,4105.0,56,021,001401,56021001401,16940,1.102899
60030,2671.0,56,021,001402,56021001402,16940,0.974324
60031,5088.0,56,021,001901,56021001901,16940,1.478837


In [83]:
del msa_data
gc.collect()

0

#### Convert 2000-Tracts and Merge

In [84]:
ages_2019 = pd.merge(left=ncdb_data, right=tract_conversion, left_on='tract_id', right_on='GEOID00')
ages_2019.drop(['tract_id'], axis=1, inplace=True)
ages_2019.rename({'AREALAND10': 'area', 'GEOID10': 'tract_id_2010', 'GEOID00': 'tract_id_2000'}, inplace=True, axis=1)

ages_2019

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2000,tract_id_2010,area
0,0.037736,0.095687,0.090296,0.184636,0.288410,0.126685,0.079515,0.044474,0.052561,0,1001020100,1001020100,3.787619
1,0.037736,0.095687,0.090296,0.184636,0.288410,0.126685,0.079515,0.044474,0.052561,0,1001020100,1001020600,3.096663
2,0.037736,0.095687,0.090296,0.184636,0.288410,0.126685,0.079515,0.044474,0.052561,0,1001020100,1001020802,73.672097
3,0.027704,0.062005,0.051451,0.134565,0.290237,0.109499,0.098945,0.068602,0.156992,0,1001020200,1001020200,1.289769
4,0.019002,0.139351,0.057007,0.118765,0.307997,0.259699,0.024545,0.042755,0.030879,0,1001020300,1001020300,2.065355
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108049,0.006763,0.028019,0.026087,0.146860,0.470531,0.079227,0.146860,0.037681,0.057971,0,56043000302,56043000302,5.894123
108050,0.041262,0.065117,0.052223,0.180523,0.265627,0.079946,0.089617,0.072854,0.116051,0,56045951100,46033965100,1524.284426
108051,0.041262,0.065117,0.052223,0.180523,0.265627,0.079946,0.089617,0.072854,0.116051,0,56045951100,56011950200,2008.869910
108052,0.041262,0.065117,0.052223,0.180523,0.265627,0.079946,0.089617,0.072854,0.116051,0,56045951100,56045951100,2355.294496


In [85]:
tract_counts = ages_2019['tract_id_2010'].value_counts()
tract_multiples = tract_counts[tract_counts > 1].index
tract_multiples

Index(['42003563800', '6037404600', '49049010900', '42101980900',
       '42003459201', '6037532400', '42021012500', '42071010300',
       '42069110600', '6059099502',
       ...
       '48113001701', '48029181811', '12086010617', '10003013501',
       '47157000200', '48029181819', '47157022700', '48113005902',
       '12021000700', '48113002200'],
      dtype='object', length=23047)

In [86]:
for tid in tqdm(tract_multiples):
    matches = ages_2019[ages_2019['tract_id_2010'] == tid]
    l = len(matches)
    a = matches.groupby('tract_id_2010').sum(numeric_only=True)

    ages_2019 = ages_2019[ages_2019['tract_id_2010'] != tid] # drop all rows with tid
    ages_2019 = pd.concat((ages_2019, a.reset_index()), axis=0, ignore_index=True) #re-add summed

ages_2019

  0%|          | 0/23047 [00:00<?, ?it/s]

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2000,tract_id_2010,area
0,0.027704,0.062005,0.051451,0.134565,0.290237,0.109499,0.098945,0.068602,0.156992,0,1001020200,1001020200,1.289769
1,0.019002,0.139351,0.057007,0.118765,0.307997,0.259699,0.024545,0.042755,0.030879,0,1001020300,1001020300,2.065355
2,0.005845,0.041445,0.054729,0.084484,0.287459,0.339531,0.162593,0.012752,0.005313,0,1001020400,1001020400,2.464362
3,0.040059,0.088279,0.112760,0.186202,0.250742,0.186202,0.061573,0.052671,0.021513,0,1001020700,1001020700,8.651831
4,0.038363,0.126172,0.100597,0.210571,0.230179,0.087809,0.109122,0.031543,0.065644,0,1001021000,1001021000,149.364715
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72734,0.237318,0.404862,0.349710,0.669083,0.201248,0.027389,0.018493,0.003735,0.000000,0,,48029181819,2.849005
72735,0.000000,0.000000,0.023209,0.107158,0.583301,0.753464,0.358744,0.040816,0.110489,0,,47157022700,6.382621
72736,0.002428,0.008256,0.025465,0.210164,0.225682,0.406517,0.540599,0.280201,0.112439,1,,48113005902,1.517324
72737,0.241934,0.097976,0.140601,0.505781,0.561657,0.310837,0.138054,0.000000,0.003160,0,,12021000700,0.758079


In [87]:
ages_2019.drop('tract_id_2000', axis=1, inplace=True)
ages_2019

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,area
0,0.027704,0.062005,0.051451,0.134565,0.290237,0.109499,0.098945,0.068602,0.156992,0,1001020200,1.289769
1,0.019002,0.139351,0.057007,0.118765,0.307997,0.259699,0.024545,0.042755,0.030879,0,1001020300,2.065355
2,0.005845,0.041445,0.054729,0.084484,0.287459,0.339531,0.162593,0.012752,0.005313,0,1001020400,2.464362
3,0.040059,0.088279,0.112760,0.186202,0.250742,0.186202,0.061573,0.052671,0.021513,0,1001020700,8.651831
4,0.038363,0.126172,0.100597,0.210571,0.230179,0.087809,0.109122,0.031543,0.065644,0,1001021000,149.364715
...,...,...,...,...,...,...,...,...,...,...,...,...
72734,0.237318,0.404862,0.349710,0.669083,0.201248,0.027389,0.018493,0.003735,0.000000,0,48029181819,2.849005
72735,0.000000,0.000000,0.023209,0.107158,0.583301,0.753464,0.358744,0.040816,0.110489,0,47157022700,6.382621
72736,0.002428,0.008256,0.025465,0.210164,0.225682,0.406517,0.540599,0.280201,0.112439,1,48113005902,1.517324
72737,0.241934,0.097976,0.140601,0.505781,0.561657,0.310837,0.138054,0.000000,0.003160,0,12021000700,0.758079


In [88]:
msa_tracts = pd.merge(left=msa_tracts, right=ages_2019, left_on='tract_id', right_on='tract_id_2010')
msa_tracts.drop(['tract_id'], axis=1, inplace=True)

msa_tracts = msa_tracts[(msa_tracts['population'] >= 1) & (msa_tracts['income'] > 0) & (msa_tracts['area'] > 0)] # drop weird tracts

msa_tracts['pop_density'] = msa_tracts['population'] / msa_tracts['area']
msa_tracts

Unnamed: 0,population,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,area,pop_density
0,2523.0,13,053,020100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,126.645601,19.921734
1,3167.0,13,053,020201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,2.489124,1272.335089
2,1678.0,13,053,020203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,124.663210,13.460266
3,2455.0,13,053,020205,17980,1.046814,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020205,2.998328,818.789597
4,2350.0,13,263,960200,17980,0.841710,0.084334,0.286097,0.261026,0.363381,0.373735,0.248965,0.128307,0.090842,0.159009,0,13263960200,325.228696,7.225685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47676,8092.0,56,021,000501,16940,1.184038,0.037420,0.149682,0.031051,0.121019,0.197850,0.362659,0.081210,0.012341,0.006768,0,56021000501,2.358055,3431.641507
47677,4105.0,56,021,001401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,1.541849,2662.387090
47678,2671.0,56,021,001402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,4.889134,546.313507
47679,5088.0,56,021,001901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,585.414185,8.691282


In [89]:
del ages_2019
gc.collect()

20

#### Add School District

In [90]:
msa_tracts = pd.merge(left=msa_tracts, right=school_districts, left_on='tract_id_2010', right_on='TRACT')
msa_tracts.drop(['NAME_LEA19', 'TRACT', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
msa_tracts

Unnamed: 0,population,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,area,pop_density,LEAID
0,2523.0,13,053,020100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,126.645601,19.921734,1301050
1,3167.0,13,053,020201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,2.489124,1272.335089,1300002
2,3167.0,13,053,020201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,2.489124,1272.335089,1313053
3,1678.0,13,053,020203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,124.663210,13.460266,1300002
4,1678.0,13,053,020203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,124.663210,13.460266,1313053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,4105.0,56,021,001401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,1.541849,2662.387090,5601980
64444,2671.0,56,021,001402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,4.889134,546.313507,5601980
64445,5088.0,56,021,001901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,585.414185,8.691282,5601980
64446,5088.0,56,021,001901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,585.414185,8.691282,5604120


In [91]:
del school_districts
gc.collect()

0

In [92]:
msa_tracts.to_csv('./data/msa_tracts.csv', index=False)

### Calculating Distances

#### Finding Central Business District

In [4]:
msa_tracts = pd.read_csv('./data/msa_tracts.csv', index_col=False)
msa_tracts.rename({'msa_code_tract': 'msa_code'}, axis=1, inplace=True)
msa_tracts.drop(['area', 'population'], inplace=True, axis=1)
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID
0,13,53,20100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,19.921734,1301050
1,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,1272.335089,1300002
2,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,1272.335089,1313053
3,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,13.460266,1300002
4,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,13.460266,1313053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,56,21,1401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,2662.387090,5601980
64444,56,21,1402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,546.313507,5601980
64445,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,8.691282,5601980
64446,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,8.691282,5604120


In [5]:
msa_tracts['msa_code'].value_counts()

35620    5339
16980    4167
19100    1909
19820    1694
37980    1615
         ... 
16220      18
45540      17
15680      17
25980      16
16180      13
Name: msa_code, Length: 321, dtype: int64

In [6]:
cbd_candidates = msa_tracts[['msa_code', 'tract_id_2010', 'pop_density']]
cbds = cbd_candidates.groupby('msa_code').max()
cbds.drop('pop_density', axis=1, inplace=True)
cbds.reset_index(inplace=True)
cbds

Unnamed: 0,msa_code,tract_id_2010
0,10180,48441013600
1,10420,39153534100
2,10500,13321950600
3,10540,41043030904
4,10580,36095740800
...,...,...
316,49180,37197050502
317,49340,25027761400
318,49420,53077940006
319,49620,42133024002


In [7]:
msa_tracts = pd.merge(left=msa_tracts, right=cbds, left_on='msa_code', right_on='msa_code', suffixes=('', 'cbd'))
msa_tracts.rename({'tract_id_2010cbd': 'cbd'}, inplace=True, axis=1)
msa_tracts['cbd'] = msa_tracts['cbd'].astype(str)
msa_tracts['tract_id_2010'] = msa_tracts['tract_id_2010'].astype(str)
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd
0,13,53,20100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,19.921734,1301050,13263960300
1,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,1272.335089,1300002,13263960300
2,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,1272.335089,1313053,13263960300
3,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,13.460266,1300002,13263960300
4,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,13.460266,1313053,13263960300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,56,21,1401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,2662.387090,5601980,56021002000
64444,56,21,1402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,546.313507,5601980,56021002000
64445,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,8.691282,5601980,56021002000
64446,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,8.691282,5604120,56021002000


#### Calculating Distances

In [8]:
msa_tracts['distance'] = None
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,13,53,20100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,19.921734,1301050,13263960300,
1,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,1272.335089,1300002,13263960300,
2,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,1272.335089,1313053,13263960300,
3,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,13.460266,1300002,13263960300,
4,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,13.460266,1313053,13263960300,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,56,21,1401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,2662.387090,5601980,56021002000,
64444,56,21,1402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,546.313507,5601980,56021002000,
64445,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,8.691282,5601980,56021002000,
64446,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,8.691282,5604120,56021002000,


In [9]:
msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]['distance'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]['distance'] = 0


In [10]:
msa_tracts_with_dist = None

In [11]:
tract_distances = pd.read_csv('./data/sf12010tractdistance50miles.csv', dtype={'county1': str,'tract1': str, 'county2': str,'tract2': str}, chunksize=5000)
for chunk in tract_distances:
    chunk['tid1'] = chunk['county1'] + chunk['tract1']
    chunk.drop(['county1', 'tract1'], axis=1, inplace=True)
    chunk['tid2'] = chunk['county2'] + chunk['tract2']
    chunk.drop(['county2', 'tract2'], axis=1, inplace=True)
    chunk.rename({'mi_to_tract': 'distance'}, axis=1, inplace=True)
    m = pd.merge(left=msa_tracts, right=chunk, left_on=['cbd', 'tract_id_2010'], right_on=['tid1', 'tid2'])
    if msa_tracts_with_dist is None:
        msa_tracts_with_dist = m
    else:
        msa_tracts_with_dist = pd.concat((m, msa_tracts_with_dist))

msa_tracts_with_dist

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,...,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance_x,distance_y,tid1,tid2
0,56,25,200,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,...,0.443310,0,56025000200,880.861184,5604510,56025001800,,31.971663,56025001800,56025000200
1,56,25,400,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,...,0.038360,0,56025000400,3941.192819,5604510,56025001800,,33.572169,56025001800,56025000400
2,56,25,800,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,...,0.064679,0,56025000800,2600.662094,5604510,56025001800,,30.489648,56025001800,56025000800
3,56,25,1401,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,...,0.459754,0,56025001401,1.710672,5604510,56025001800,,37.594932,56025001800,56025001401
4,56,25,501,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,...,0.000000,0,56025000501,4003.231099,5604510,56025001800,,33.958160,56025001800,56025000501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,10,1,40202,20100,1.217304,0.020000,0.096585,0.102926,0.179511,0.177560,...,0.165365,0,10001040202,1373.545001,1001620,10001043400,,21.649411,10001043400,10001040202
50,10,1,40203,20100,1.096651,0.013051,0.181892,0.155791,0.168842,0.132953,...,0.077488,0,10001040203,237.404589,1001620,10001043400,,22.112634,10001043400,10001040203
51,10,1,40900,20100,0.485405,0.032945,0.091084,0.019380,0.184107,0.109495,...,0.239339,0,10001040900,4880.272353,1000190,10001043400,,13.923058,10001043400,10001040900
52,10,1,41300,20100,0.628600,0.000000,0.012264,0.024528,0.090564,0.066979,...,0.250937,0,10001041300,2475.679905,1000190,10001043400,,12.789227,10001043400,10001041300


In [12]:
msa_tracts_with_dist.drop(['distance_x', 'tid1', 'tid2'], axis=1, inplace=True)
msa_tracts_with_dist.rename({'distance_y': 'distance'}, axis=1, inplace=True)
msa_tracts_with_dist = msa_tracts_with_dist[msa_tracts_with_dist['distance'] <= 40]
msa_tracts_with_dist

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,56,25,200,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,0.171655,0.159039,0.155930,0.443310,0,56025000200,880.861184,5604510,56025001800,31.971663
1,56,25,400,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,0.109826,0.612190,0.036784,0.038360,0,56025000400,3941.192819,5604510,56025001800,33.572169
2,56,25,800,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,0.081552,0.631608,0.134983,0.064679,0,56025000800,2600.662094,5604510,56025001800,30.489648
3,56,25,1401,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,0.239784,0.204458,0.168851,0.459754,0,56025001401,1.710672,5604510,56025001800,37.594932
4,56,25,501,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,0.125594,0.072312,0.000000,0.000000,0,56025000501,4003.231099,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,10,1,40202,20100,1.217304,0.020000,0.096585,0.102926,0.179511,0.177560,0.118048,0.079512,0.045853,0.165365,0,10001040202,1373.545001,1001620,10001043400,21.649411
50,10,1,40203,20100,1.096651,0.013051,0.181892,0.155791,0.168842,0.132953,0.123980,0.118271,0.027732,0.077488,0,10001040203,237.404589,1001620,10001043400,22.112634
51,10,1,40900,20100,0.485405,0.032945,0.091084,0.019380,0.184107,0.109495,0.125968,0.137596,0.052325,0.239339,0,10001040900,4880.272353,1000190,10001043400,13.923058
52,10,1,41300,20100,0.628600,0.000000,0.012264,0.024528,0.090564,0.066979,0.119808,0.277351,0.133959,0.250937,0,10001041300,2475.679905,1000190,10001043400,12.789227


In [13]:
cbd_tracts = msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]
cbd_tracts['distance'] = 0
msa_tracts_with_dist = pd.concat((msa_tracts_with_dist, cbd_tracts))
msa_tracts_with_dist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbd_tracts['distance'] = 0


Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,56,25,200,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,0.171655,0.159039,0.155930,0.443310,0,56025000200,880.861184,5604510,56025001800,31.971663
1,56,25,400,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,0.109826,0.612190,0.036784,0.038360,0,56025000400,3941.192819,5604510,56025001800,33.572169
2,56,25,800,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,0.081552,0.631608,0.134983,0.064679,0,56025000800,2600.662094,5604510,56025001800,30.489648
3,56,25,1401,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,0.239784,0.204458,0.168851,0.459754,0,56025001401,1.710672,5604510,56025001800,37.594932
4,56,25,501,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,0.125594,0.072312,0.000000,0.000000,0,56025000501,4003.231099,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64317,55,73,2300,48140,1.039428,0.021865,0.082096,0.065181,0.088696,0.138614,0.058168,0.062706,0.065181,0.412541,0,55073002300,29.006610,5515900,55073002300,0.000000
64356,55,117,11400,43100,0.760709,0.010333,0.049751,0.022005,0.008037,0.059128,0.192227,0.118828,0.162784,1.310886,0,55117011400,1526.951415,5513650,55117011400,0.000000
64418,56,25,1800,16220,1.330880,0.072723,0.197701,0.083088,0.314001,0.653162,0.237053,0.141210,0.108111,0.179792,0,56025001800,0.562065,5604510,56025001800,0.000000
64426,56,21,2000,16940,1.303751,0.171925,0.284788,0.116650,0.474119,0.667689,0.195365,0.199707,0.234291,0.641497,0,56021002000,3.116781,5601980,56021002000,0.000000


In [14]:
msa_tracts_with_dist.drop(['state', 'county', 'tract'], axis=1, inplace=True)
msa_tracts_with_dist.to_csv('./data/msa_tracts_dist.csv', index=False)

## Modelling

The smallest of these models runs OLS with ~43,000 data points and 300 fixed effects. Therefore, I wasn't able to run most of them locally. Instead, I ran the models on the Duke Economics Computing Cluster and downloaded the saved models. The process for this can be seen in `reg.py`. Note that the results below omit the several thousand fixed effects.

In [2]:
data = pd.read_csv('./data/msa_tracts_dist.csv', index_col=False)
data.dropna(inplace=True)
data

Unnamed: 0,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,0.171655,0.159039,0.155930,0.443310,0,56025000200,6.601584e-04,5604510,56025001800,31.971663
1,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,0.109826,0.612190,0.036784,0.038360,0,56025000400,1.521695e-03,5604510,56025001800,33.572169
2,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,0.081552,0.631608,0.134983,0.064679,0,56025000800,9.520551e-04,5604510,56025001800,30.489648
3,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,0.239784,0.204458,0.168851,0.459754,0,56025001401,1.975859e-06,5604510,56025001800,37.594932
4,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,0.125594,0.072312,0.000000,0.000000,0,56025000501,1.545648e-03,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43566,48140,1.039428,0.021865,0.082096,0.065181,0.088696,0.138614,0.058168,0.062706,0.065181,0.412541,0,55073002300,1.119737e-05,5515900,55073002300,0.000000
43567,43100,0.760709,0.010333,0.049751,0.022005,0.008037,0.059128,0.192227,0.118828,0.162784,1.310886,0,55117011400,1.040721e-03,5513650,55117011400,0.000000
43568,16220,1.330880,0.072723,0.197701,0.083088,0.314001,0.653162,0.237053,0.141210,0.108111,0.179792,0,56025001800,4.307794e-07,5604510,56025001800,0.000000
43569,16940,1.303751,0.171925,0.284788,0.116650,0.474119,0.667689,0.195365,0.199707,0.234291,0.641497,0,56021002000,3.608426e-06,5601980,56021002000,0.000000


In [38]:
import re
fixed_effects = re.compile(r'C\([A-Za-z_]+\).+\n')

def print_model_output(model_num: int) -> str:
    with open(f'./models/model-{model_num}-summary.txt', 'r') as f:
        print(fixed_effects.sub('', f.read()))

print_model_output(1)

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     5.319
Date:                Sat, 15 Apr 2023   Prob (F-statistic):          9.96e-183
Time:                        15:48:59   Log-Likelihood:                -23998.
No. Observations:               43570   AIC:                         4.864e+04
Df Residuals:                   43248   BIC:                         5.144e+04
Df Model:                         321                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept                1.0434 

In [39]:
print_model_output(2)

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.371
Model:                            OLS   Adj. R-squared:                  0.295
Method:                 Least Squares   F-statistic:                     4.853
Date:                Sat, 15 Apr 2023   Prob (F-statistic):               0.00
Time:                        15:50:56   Log-Likelihood:                -14727.
No. Observations:               43570   AIC:                         3.891e+04
Df Residuals:                   38840   BIC:                         7.998e+04
Df Model:                        4729                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               1.1288    

In [40]:
print_model_output(3)

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.385
Model:                            OLS   Adj. R-squared:                  0.310
Method:                 Least Squares   F-statistic:                     5.133
Date:                Sat, 15 Apr 2023   Prob (F-statistic):               0.00
Time:                        15:52:53   Log-Likelihood:                -14263.
No. Observations:               43570   AIC:                         3.799e+04
Df Residuals:                   38839   BIC:                         7.906e+04
Df Model:                        4730                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               1.1268    

In [41]:
print_model_output(4)

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.442
Model:                            OLS   Adj. R-squared:                  0.374
Method:                 Least Squares   F-statistic:                     6.490
Date:                Sat, 15 Apr 2023   Prob (F-statistic):               0.00
Time:                        15:54:50   Log-Likelihood:                -12135.
No. Observations:               43570   AIC:                         3.375e+04
Df Residuals:                   38831   BIC:                         7.489e+04
Df Model:                        4738                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               1.0262    