# Final Project
Logan Cooper

In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm.auto import tqdm
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import OLSResults

## Data

### Data Import

#### NCDB Building Age Data (2000)

In [2]:
ncdb_dtypes = {
    'tract_id': str,
    'workers_taking_transit': int,
    'prop_taking_transit': float,
    'built_1999_2000': int,
    'built_1995_1998': int,
    'built_1990_1994': int,
    'built_1980_1989': int,
    'built_1970_79': int,
    'built_1960_69': int,
    'built_1950_59': int,
    'built_1940_49': int,
    'built_1939_earlier': int
}
ncdb_data = pd.read_csv('./data/NCDB_2000.csv', dtype=ncdb_dtypes)
ncdb_data

Unnamed: 0,tract_id,workers_taking_transit,prop_taking_transit,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier
0,1001020100,0,0.000000,28,71,67,137,214,94,59,33,39
1,1001020200,0,0.000000,21,47,39,102,220,83,75,52,119
2,1001020300,0,0.000000,24,176,72,150,389,328,31,54,39
3,1001020400,11,0.005186,11,78,103,159,541,639,306,24,10
4,1001020500,0,0.000000,119,581,504,431,491,115,36,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
65438,56043000200,2,0.001519,2,133,58,301,398,72,144,48,189
65439,56043000301,0,0.000000,0,0,29,30,91,60,518,305,241
65440,56043000302,0,0.000000,7,29,27,152,487,82,152,39,60
65441,56045951100,57,0.045820,64,101,81,280,412,124,139,113,180


In [3]:
ncdb_data[ncdb_data['tract_id'].str.startswith('6')]

Unnamed: 0,tract_id,workers_taking_transit,prop_taking_transit,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier
2970,6001400100,124,0.091988,48,230,401,56,76,54,95,69,122
2971,6001400200,258,0.232014,0,0,10,18,26,62,31,94,658
2972,6001400300,700,0.226904,7,29,8,49,255,179,206,206,1628
2973,6001400400,450,0.204360,0,13,0,43,88,139,222,146,1229
2974,6001400500,395,0.213629,0,4,5,40,57,133,216,226,920
...,...,...,...,...,...,...,...,...,...,...,...,...
10014,6115040800,0,0.000000,22,54,127,209,231,182,132,57,135
10015,6115040901,5,0.005688,43,29,183,201,153,99,69,52,61
10016,6115040902,18,0.006529,7,123,46,176,303,685,436,67,7
10017,6115041000,6,0.002308,49,140,394,548,629,237,177,120,158


In [4]:
ncdb_data['pub_trans_gt_10pct'] = 0
ncdb_data['pub_trans_gt_10pct'][ncdb_data['prop_taking_transit'] >= 0.1] = 1
ncdb_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ncdb_data['pub_trans_gt_10pct'][ncdb_data['prop_taking_transit'] >= 0.1] = 1


Unnamed: 0,tract_id,workers_taking_transit,prop_taking_transit,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct
0,1001020100,0,0.000000,28,71,67,137,214,94,59,33,39,0
1,1001020200,0,0.000000,21,47,39,102,220,83,75,52,119,0
2,1001020300,0,0.000000,24,176,72,150,389,328,31,54,39,0
3,1001020400,11,0.005186,11,78,103,159,541,639,306,24,10,0
4,1001020500,0,0.000000,119,581,504,431,491,115,36,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65438,56043000200,2,0.001519,2,133,58,301,398,72,144,48,189,0
65439,56043000301,0,0.000000,0,0,29,30,91,60,518,305,241,0
65440,56043000302,0,0.000000,7,29,27,152,487,82,152,39,60,0
65441,56045951100,57,0.045820,64,101,81,280,412,124,139,113,180,0


In [5]:
ncdb_data[ncdb_data['tract_id'].str.startswith('6')]

Unnamed: 0,tract_id,workers_taking_transit,prop_taking_transit,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct
2970,6001400100,124,0.091988,48,230,401,56,76,54,95,69,122,0
2971,6001400200,258,0.232014,0,0,10,18,26,62,31,94,658,1
2972,6001400300,700,0.226904,7,29,8,49,255,179,206,206,1628,1
2973,6001400400,450,0.204360,0,13,0,43,88,139,222,146,1229,1
2974,6001400500,395,0.213629,0,4,5,40,57,133,216,226,920,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10014,6115040800,0,0.000000,22,54,127,209,231,182,132,57,135,0
10015,6115040901,5,0.005688,43,29,183,201,153,99,69,52,61,0
10016,6115040902,18,0.006529,7,123,46,176,303,685,436,67,7,0
10017,6115041000,6,0.002308,49,140,394,548,629,237,177,120,158,0


In [6]:
ncdb_data['pub_trans_gt_10pct'].value_counts()

0    55450
1     9993
Name: pub_trans_gt_10pct, dtype: int64

In [7]:
ncdb_data

Unnamed: 0,tract_id,workers_taking_transit,prop_taking_transit,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct
0,1001020100,0,0.000000,28,71,67,137,214,94,59,33,39,0
1,1001020200,0,0.000000,21,47,39,102,220,83,75,52,119,0
2,1001020300,0,0.000000,24,176,72,150,389,328,31,54,39,0
3,1001020400,11,0.005186,11,78,103,159,541,639,306,24,10,0
4,1001020500,0,0.000000,119,581,504,431,491,115,36,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65438,56043000200,2,0.001519,2,133,58,301,398,72,144,48,189,0
65439,56043000301,0,0.000000,0,0,29,30,91,60,518,305,241,0
65440,56043000302,0,0.000000,7,29,27,152,487,82,152,39,60,0
65441,56045951100,57,0.045820,64,101,81,280,412,124,139,113,180,0


#### Tract Level Data (2019)

In [8]:
# import & rename cols
tract_data_2019 = pd.read_json('./data/tract_data_2019.json', dtype=False)
tract_data_2019.rename({
    'B19019_001E': 'median_income',
    'B01003_001E': 'population'
}, inplace=True, axis=1)

tract_data_2019['tract_id'] = tract_data_2019['state'] + tract_data_2019['county'] + tract_data_2019['tract']
tract_data_2019

Unnamed: 0,median_income,population,state,county,tract,tract_id
0,37030.0,4781.0,01,073,001100,01073001100
1,36066.0,1946.0,01,073,001400,01073001400
2,27159.0,4080.0,01,073,002000,01073002000
3,38721.0,5291.0,01,073,003802,01073003802
4,18525.0,2533.0,01,073,004000,01073004000
...,...,...,...,...,...,...
72872,87794.0,4187.0,56,021,001902,56021001902
72873,-666666666.0,0.0,56,021,980801,56021980801
72874,85194.0,7513.0,56,025,001602,56025001602
72875,120564.0,3724.0,56,025,001603,56025001603


In [9]:
tract_data_2019['state'].value_counts().sort_index()

01    1181
02     167
04    1526
05     686
06    8057
08    1249
09     833
10     218
12    4245
13    1969
15     351
16     298
17    3123
18    1511
19     825
20     770
21    1115
22    1148
23     358
24    1406
25    1478
26    2813
27    1338
28     664
29    1393
30     271
31     532
32     687
33     295
34    2010
35     499
36    4918
37    2195
38     205
39    2952
40    1046
41     834
42    3218
44     244
45    1103
46     222
47    1497
48    5265
49     588
50     184
51    1907
53    1458
54     484
55    1409
56     132
Name: state, dtype: int64

#### MSA-Level Income Data (2019)

In [10]:
msa_data = pd.read_json('./data/msa_data.json')
msa_data.columns = ['median_income', 'msa_code']
msa_data['msa_code'] = msa_data['msa_code'].astype(str)
msa_data

Unnamed: 0,median_income,msa_code
0,38175,13720
1,61278,13740
2,54295,13780
3,57447,13820
4,71398,13900
...,...,...
933,36403,36660
934,37955,36700
935,58368,36740
936,47466,22700


#### MSA-Tract Crosswalk

In [11]:
msa_lookup = pd.read_excel('./data/msa_codes.xls', 
                           converters={'CBSA Code': str, 
                                       'CBSA Title': str, 
                                       'Metropolitan/Micropolitan Statistical Area': str, 
                                       'FIPS State Code': str,
                                       'FIPS County Code': str}
                           ).drop(range(1916,1920))

msa_lookup

Unnamed: 0,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,10100,"Aberdeen, SD",Micropolitan Statistical Area,46,013
1,10100,"Aberdeen, SD",Micropolitan Statistical Area,46,045
2,10140,"Aberdeen, WA",Micropolitan Statistical Area,53,027
3,10180,"Abilene, TX",Metropolitan Statistical Area,48,059
4,10180,"Abilene, TX",Metropolitan Statistical Area,48,253
...,...,...,...,...,...
1911,49700,"Yuba City, CA",Metropolitan Statistical Area,06,101
1912,49700,"Yuba City, CA",Metropolitan Statistical Area,06,115
1913,49740,"Yuma, AZ",Metropolitan Statistical Area,04,027
1914,49780,"Zanesville, OH",Micropolitan Statistical Area,39,119


#### Conversion for 2000 => 2010

In [12]:
tract_conversion = pd.read_csv('./data/us2010trf.txt')
tract_conversion

Unnamed: 0,STATE00,COUNTY00,TRACT00,GEOID00,POP00,HU00,PART00,AREA00,AREALAND00,STATE10,...,AREAPCT00PT,AREALANDPCT00PT,AREAPCT10PT,AREALANDPCT10PT,POP10PT,POPPCT00,POPPCT10,HU10PT,HUPCT00,HUPCT10
0,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,99.99,100.00,100.00,100.00,1912,99.95,100.00,752,99.87,100.00
1,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,0.01,0.00,0.01,0.00,0,0.00,0.00,0,0.00,0.00
2,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,0.00,0.00,0.00,0.00,1,0.05,0.01,1,0.13,0.02
3,1,1,20200,1001020200,2170,822,W,3346351,3340505,1,...,100.00,100.00,100.00,100.00,2170,100.00,100.00,822,100.00,100.00
4,1,1,20300,1001020300,3373,1326,W,5358328,5349274,1,...,100.00,100.00,100.00,100.00,3373,100.00,100.00,1326,100.00,100.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110849,72,153,750601,72153750601,5315,2106,P,11040535,11035996,72,...,99.79,99.79,100.00,100.00,5315,100.00,100.00,2106,100.00,100.00
110850,72,153,750601,72153750601,5315,2106,P,11040535,11035996,72,...,0.21,0.21,0.13,0.13,0,0.00,0.00,0,0.00,0.00
110851,72,153,750602,72153750602,3141,1450,P,17773730,17520777,72,...,0.04,0.04,0.05,0.05,0,0.00,0.00,0,0.00,0.00
110852,72,153,750602,72153750602,3141,1450,P,17773730,17520777,72,...,0.06,0.07,0.04,0.05,0,0.00,0.00,0,0.00,0.00


In [13]:
tract_conversion.columns

Index(['STATE00', 'COUNTY00', 'TRACT00', 'GEOID00', 'POP00', 'HU00', 'PART00',
       'AREA00', 'AREALAND00', 'STATE10', 'COUNTY10', 'TRACT10', 'GEOID10',
       'POP10', 'HU10', 'PART10', 'AREA10', 'AREALAND10', 'AREAPT',
       'AREALANDPT', 'AREAPCT00PT', 'AREALANDPCT00PT', 'AREAPCT10PT',
       'AREALANDPCT10PT', 'POP10PT', 'POPPCT00', 'POPPCT10', 'HU10PT',
       'HUPCT00', 'HUPCT10'],
      dtype='object')

In [14]:
tract_conversion = tract_conversion[['GEOID00', 'GEOID10', 'AREALAND10']]
tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)
tract_conversion

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)


Unnamed: 0,GEOID00,GEOID10,AREALAND10
0,1001020100,1001020100,9809944
1,1001020100,1001020600,8020366
2,1001020100,1001020802,190810921
3,1001020200,1001020200,3340505
4,1001020300,1001020300,5349274
...,...,...,...
110849,72153750601,72153750601,11012462
110850,72153750601,72153750602,17526535
110851,72153750602,72055960900,13138821
110852,72153750602,72055961100,24581786


In [15]:
# AREALAND10 is in square meters, want it in square miles
tract_conversion['AREALAND10'] = tract_conversion['AREALAND10'] * 3.861e-7
tract_conversion = tract_conversion[tract_conversion['AREALAND10'] > 0]
tract_conversion

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['AREALAND10'] = tract_conversion['AREALAND10'] * 3.861e-7


Unnamed: 0,GEOID00,GEOID10,AREALAND10
0,1001020100,1001020100,3.787619
1,1001020100,1001020600,3.096663
2,1001020100,1001020802,73.672097
3,1001020200,1001020200,1.289769
4,1001020300,1001020300,2.065355
...,...,...,...
110849,72153750601,72153750601,4.251912
110850,72153750601,72153750602,6.766995
110851,72153750602,72055960900,5.072899
110852,72153750602,72055961100,9.491028


#### Tract-School District Crosswalk

In [16]:
school_districts = pd.read_excel('./data/grf19_lea_tract.xlsx')
# school_districts.drop(['NAME_LEA19', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
school_districts['LEAID'] = school_districts['LEAID'].astype(str)
school_districts['TRACT'] = school_districts['TRACT'].astype(str)
school_districts

Unnamed: 0,LEAID,NAME_LEA19,TRACT,COUNT,LANDAREA,WATERAREA
0,100001,Fort Rucker School District,1031010300,2,23.428498,0.000000
1,100001,Fort Rucker School District,1045020000,2,66.513225,1.081745
2,100003,Maxwell AFB School District,1101000900,3,3.356590,0.143795
3,100003,Maxwell AFB School District,1101001000,3,0.001526,0.000000
4,100003,Maxwell AFB School District,1101006000,3,0.003588,0.000000
...,...,...,...,...,...,...
113515,7800030,Virgin Islands Department of Education,78030960900,32,3.147245,1.173777
113516,7800030,Virgin Islands Department of Education,78030961000,32,0.812847,0.910606
113517,7800030,Virgin Islands Department of Education,78030961100,32,1.356638,0.000000
113518,7800030,Virgin Islands Department of Education,78030961200,32,0.392958,0.309706


## Data Combination

#### Trim Tract Level Data to MSAs Only

In [17]:
only_metros = msa_lookup[msa_lookup['Metropolitan/Micropolitan Statistical Area'] == 'Metropolitan Statistical Area']
msa_tracts = pd.merge(left=tract_data_2019, right=only_metros, left_on=['state', 'county'], right_on=['FIPS State Code', 'FIPS County Code'])
msa_tracts

Unnamed: 0,median_income,population,state,county,tract,tract_id,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,37030.0,4781.0,01,073,001100,01073001100,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
1,36066.0,1946.0,01,073,001400,01073001400,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
2,27159.0,4080.0,01,073,002000,01073002000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
3,38721.0,5291.0,01,073,003802,01073003802,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
4,18525.0,2533.0,01,073,004000,01073004000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
...,...,...,...,...,...,...,...,...,...,...,...
60863,73795.0,4105.0,56,021,001401,56021001401,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60864,65192.0,2671.0,56,021,001402,56021001402,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60865,98949.0,5088.0,56,021,001901,56021001901,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60866,87794.0,4187.0,56,021,001902,56021001902,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021


In [18]:
msa_tracts = msa_tracts[(msa_tracts['median_income'] >= 0) & (msa_tracts['population'] > 0)]
msa_tracts

Unnamed: 0,median_income,population,state,county,tract,tract_id,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,37030.0,4781.0,01,073,001100,01073001100,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
1,36066.0,1946.0,01,073,001400,01073001400,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
2,27159.0,4080.0,01,073,002000,01073002000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
3,38721.0,5291.0,01,073,003802,01073003802,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
4,18525.0,2533.0,01,073,004000,01073004000,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
...,...,...,...,...,...,...,...,...,...,...,...
60862,79224.0,8092.0,56,021,000501,56021000501,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60863,73795.0,4105.0,56,021,001401,56021001401,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60864,65192.0,2671.0,56,021,001402,56021001402,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60865,98949.0,5088.0,56,021,001901,56021001901,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021


In [19]:
msa_tracts['state'].value_counts().sort_index()

01     843
02      98
04    1417
05     395
06    7766
08    1048
09     772
10     214
12    3996
13    1536
15     263
16     201
17    2679
18    1133
19     439
20     472
21     626
22     930
23     188
24    1345
25    1445
26    2225
27     974
28     293
29     996
30      82
31     314
32     607
33     174
34    1992
35     325
36    4404
37    1685
38      76
39    2378
40     683
41     656
42    2790
44     240
45     908
46      88
47    1125
48    4444
49     513
50      46
51    1599
53    1264
54     300
55    1008
56      38
Name: state, dtype: int64

In [20]:
msa_tracts.columns

Index(['median_income', 'population', 'state', 'county', 'tract', 'tract_id',
       'CBSA Code', 'CBSA Title', 'Metropolitan/Micropolitan Statistical Area',
       'FIPS State Code', 'FIPS County Code'],
      dtype='object')

In [21]:
msa_tracts.drop(['CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code', 'FIPS County Code'], axis=1, inplace=True)
msa_tracts

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msa_tracts.drop(['CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code', 'FIPS County Code'], axis=1, inplace=True)


Unnamed: 0,median_income,population,state,county,tract,tract_id,CBSA Code
0,37030.0,4781.0,01,073,001100,01073001100,13820
1,36066.0,1946.0,01,073,001400,01073001400,13820
2,27159.0,4080.0,01,073,002000,01073002000,13820
3,38721.0,5291.0,01,073,003802,01073003802,13820
4,18525.0,2533.0,01,073,004000,01073004000,13820
...,...,...,...,...,...,...,...
60862,79224.0,8092.0,56,021,000501,56021000501,16940
60863,73795.0,4105.0,56,021,001401,56021001401,16940
60864,65192.0,2671.0,56,021,001402,56021001402,16940
60865,98949.0,5088.0,56,021,001901,56021001901,16940


In [22]:
del only_metros
gc.collect()

113646

#### Merge MSA-Level Data

In [23]:
msa_tracts = pd.merge(left=msa_tracts, right=msa_data, left_on='CBSA Code', right_on='msa_code', suffixes=('_tract', '_msa'))
msa_tracts['income'] = msa_tracts['median_income_tract'] / msa_tracts['median_income_msa']
msa_tracts.drop(['CBSA Code', 'median_income_tract', 'median_income_msa'], axis=1, inplace=True)
msa_tracts

Unnamed: 0,population,state,county,tract,tract_id,msa_code,income
0,4781.0,01,073,001100,01073001100,13820,0.644594
1,1946.0,01,073,001400,01073001400,13820,0.627813
2,4080.0,01,073,002000,01073002000,13820,0.472766
3,5291.0,01,073,003802,01073003802,13820,0.674030
4,2533.0,01,073,004000,01073004000,13820,0.322471
...,...,...,...,...,...,...,...
60028,8092.0,56,021,000501,56021000501,16940,1.184038
60029,4105.0,56,021,001401,56021001401,16940,1.102899
60030,2671.0,56,021,001402,56021001402,16940,0.974324
60031,5088.0,56,021,001901,56021001901,16940,1.478837


In [24]:
msa_tracts['state'].value_counts().sort_index()

01     843
02      98
04    1417
05     395
06    7766
08    1048
09     772
10     214
12    3996
13    1536
15     263
16     201
17    2679
18    1133
19     439
20     472
21     626
22     930
23     188
24    1345
25    1445
26    2225
27     974
28     293
29     996
30      82
31     314
32     607
33     174
34    1992
35     325
36    4404
37    1685
38      76
39    2378
40     683
41     656
42    2790
44     240
45     908
46      88
47    1125
48    4444
49     513
50      46
51    1599
53    1264
54     300
55    1008
56      38
Name: state, dtype: int64

In [25]:
# msa_tracts[msa_tracts['tract_id'].str.startswith('0')]['tract_id'] = msa_tracts[msa_tracts['tract_id'].str.startswith('0')]['tract_id'].str[1:]

In [26]:
del msa_data
gc.collect()

0

#### Convert 2000-Tracts and Merge

In [27]:
ages_2019 = pd.merge(left=ncdb_data, right=tract_conversion, left_on='tract_id', right_on='GEOID00')
ages_2019.drop(['tract_id', 'workers_taking_transit', 'prop_taking_transit'], axis=1, inplace=True)
ages_2019.rename({'AREALAND10': 'area', 'GEOID10': 'tract_id_2010', 'GEOID00': 'tract_id_2000'}, inplace=True, axis=1)

ages_2019

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2000,tract_id_2010,area
0,28,71,67,137,214,94,59,33,39,0,1001020100,1001020100,3.787619
1,28,71,67,137,214,94,59,33,39,0,1001020100,1001020600,3.096663
2,28,71,67,137,214,94,59,33,39,0,1001020100,1001020802,73.672097
3,21,47,39,102,220,83,75,52,119,0,1001020200,1001020200,1.289769
4,24,176,72,150,389,328,31,54,39,0,1001020300,1001020300,2.065355
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108049,7,29,27,152,487,82,152,39,60,0,56043000302,56043000302,5.894123
108050,64,101,81,280,412,124,139,113,180,0,56045951100,46033965100,1524.284426
108051,64,101,81,280,412,124,139,113,180,0,56045951100,56011950200,2008.869910
108052,64,101,81,280,412,124,139,113,180,0,56045951100,56045951100,2355.294496


In [36]:
non_aggregated_tract_data = ages_2019[['tract_id_2010', 'area']].drop_duplicates()
non_aggregated_tract_data

Unnamed: 0,tract_id_2010,area
0,1001020100,3.787619
1,1001020600,3.096663
2,1001020802,73.672097
3,1001020200,1.289769
4,1001020300,2.065355
...,...,...
108045,56041975400,51.087489
108047,56043000301,0.769575
108048,56043000302,5.894123
108052,56045951100,2355.294496


In [37]:
summed_ages = ages_2019.groupby('tract_id_2010').sum(numeric_only=True).drop(['area'], axis=1).reset_index()
summed_ages

Unnamed: 0,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct
0,10001040100,74,238,301,502,450,126,46,33,134,0
1,10001040201,61,114,87,105,299,229,131,87,336,0
2,10001040202,41,198,211,368,364,242,163,94,339,0
3,10001040203,16,223,191,207,163,152,145,34,95,0
4,10001040501,175,527,413,404,786,789,294,89,126,0
...,...,...,...,...,...,...,...,...,...,...,...
72734,9015906100,26,197,249,769,742,519,327,166,800,0
72735,9015907100,27,133,139,472,502,454,334,209,1373,0
72736,9015907200,0,72,66,210,320,275,235,117,733,0
72737,9015907300,14,100,90,398,322,346,148,121,494,0


In [43]:
summed_ages['pub_trans_gt_10pct'] = (summed_ages['pub_trans_gt_10pct'] >= 1).astype(int)
summed_ages

Unnamed: 0,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct
0,10001040100,74,238,301,502,450,126,46,33,134,0
1,10001040201,61,114,87,105,299,229,131,87,336,0
2,10001040202,41,198,211,368,364,242,163,94,339,0
3,10001040203,16,223,191,207,163,152,145,34,95,0
4,10001040501,175,527,413,404,786,789,294,89,126,0
...,...,...,...,...,...,...,...,...,...,...,...
72734,9015906100,26,197,249,769,742,519,327,166,800,0
72735,9015907100,27,133,139,472,502,454,334,209,1373,0
72736,9015907200,0,72,66,210,320,275,235,117,733,0
72737,9015907300,14,100,90,398,322,346,148,121,494,0


In [44]:
summed_ages.describe()

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct
count,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0
mean,86.584075,255.289171,241.975831,490.126782,528.749804,370.581298,331.683045,186.090983,388.802293,0.152325
std,169.292508,410.71605,331.594439,578.283644,508.355353,351.77701,352.244631,214.831011,553.901444,0.359339
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,24.0,33.0,105.0,173.0,134.0,102.0,44.0,44.0,0.0
50%,28.0,106.0,121.0,298.0,389.0,271.0,228.0,124.0,188.0,0.0
75%,96.0,320.0,322.0,674.0,728.0,495.0,439.0,251.0,519.0,0.0
max,4918.0,8456.0,4639.0,11267.0,10034.0,8417.0,5537.0,8296.0,10131.0,1.0


In [45]:
msa_tracts['tract_id'] = msa_tracts['tract_id'].str.removeprefix('0')
msa_tracts['tract_id']

0         1073001100
1         1073001400
2         1073002000
3         1073003802
4         1073004000
            ...     
60028    56021000501
60029    56021001401
60030    56021001402
60031    56021001901
60032    56021001902
Name: tract_id, Length: 60033, dtype: object

In [46]:
ages_2019 = pd.merge(left=summed_ages, right=non_aggregated_tract_data)
ages_2019

Unnamed: 0,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,area
0,10001040100,74,238,301,502,450,126,46,33,134,0,48.164373
1,10001040201,61,114,87,105,299,229,131,87,336,0,3.757499
2,10001040202,41,198,211,368,364,242,163,94,339,0,12.333051
3,10001040203,16,223,191,207,163,152,145,34,95,0,22.927105
4,10001040501,175,527,413,404,786,789,294,89,126,0,4.040981
...,...,...,...,...,...,...,...,...,...,...,...,...
72734,9015906100,26,197,249,769,742,519,327,166,800,0,39.944555
72735,9015907100,27,133,139,472,502,454,334,209,1373,0,13.248126
72736,9015907200,0,72,66,210,320,275,235,117,733,0,13.477847
72737,9015907300,14,100,90,398,322,346,148,121,494,0,15.633260


In [47]:
ages_2019.describe()

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,area
count,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0,72739.0
mean,86.584075,255.289171,241.975831,490.126782,528.749804,370.581298,331.683045,186.090983,388.802293,0.152325,48.555597
std,169.292508,410.71605,331.594439,578.283644,508.355353,351.77701,352.244631,214.831011,553.901444,0.359339,542.923463
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003204
25%,5.0,24.0,33.0,105.0,173.0,134.0,102.0,44.0,44.0,0.0,0.69446
50%,28.0,106.0,121.0,298.0,389.0,271.0,228.0,124.0,188.0,0.0,1.890971
75%,96.0,320.0,322.0,674.0,728.0,495.0,439.0,251.0,519.0,0.0,13.309841
max,4918.0,8456.0,4639.0,11267.0,10034.0,8417.0,5537.0,8296.0,10131.0,1.0,85425.25371


In [49]:
msa_tracts = pd.merge(left=msa_tracts, right=ages_2019, left_on='tract_id', right_on='tract_id_2010')
msa_tracts.drop(['tract_id'], axis=1, inplace=True)

msa_tracts = msa_tracts[(msa_tracts['population'] >= 1) & (msa_tracts['income'] > 0) & (msa_tracts['area'] > 0)] # drop weird tracts

msa_tracts['pop_density'] = msa_tracts['population'] / msa_tracts['area']
msa_tracts

Unnamed: 0,population,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,area,pop_density
0,4781.0,01,073,001100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,4.412472,1083.519710
1,1946.0,01,073,001400,13820,0.627813,1073001400,15,27,30,290,422,894,1393,601,1126,0,0.865361,2248.772710
2,4080.0,01,073,002000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,1.391900,2931.244740
3,5291.0,01,073,003802,13820,0.674030,1073003802,46,76,22,184,603,968,2195,919,574,0,1.252927,4222.913159
4,2533.0,01,073,004000,13820,0.322471,1073004000,18,31,66,352,617,554,207,191,244,0,0.913001,2774.367116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60007,8092.0,56,021,000501,16940,1.184038,56021000501,94,376,78,304,497,911,204,31,17,0,2.358055,3431.641507
60008,4105.0,56,021,001401,16940,1.102899,56021001401,68,244,155,502,867,425,195,0,18,0,1.541849,2662.387090
60009,2671.0,56,021,001402,16940,0.974324,56021001402,68,244,155,502,867,425,195,0,18,0,4.889134,546.313507
60010,5088.0,56,021,001901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,585.414185,8.691282


In [50]:
msa_tracts['state'].value_counts().sort_index()

01     843
02      98
04    1410
05     395
06    7765
08    1048
09     772
10     214
12    3996
13    1536
15     263
16     201
17    2679
18    1133
19     439
20     472
21     626
22     930
23     188
24    1345
25    1445
26    2225
27     974
28     293
29     996
30      82
31     314
32     607
33     174
34    1992
35     325
36    4392
37    1685
38      76
39    2378
40     683
41     656
42    2790
44     240
45     908
46      88
47    1125
48    4444
49     513
50      46
51    1598
53    1264
54     300
55    1008
56      38
Name: state, dtype: int64

In [51]:
del ages_2019
gc.collect()

482

#### Add School District

In [52]:
msa_tracts = pd.merge(left=msa_tracts, right=school_districts, left_on='tract_id_2010', right_on='TRACT')
msa_tracts.drop(['NAME_LEA19', 'TRACT', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
msa_tracts

Unnamed: 0,population,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,area,pop_density,LEAID
0,4781.0,01,073,001100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,4.412472,1083.519710,100390
1,4781.0,01,073,001100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,4.412472,1083.519710,101920
2,1946.0,01,073,001400,13820,0.627813,1073001400,15,27,30,290,422,894,1393,601,1126,0,0.865361,2248.772710,100390
3,4080.0,01,073,002000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,1.391900,2931.244740,100390
4,4080.0,01,073,002000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,1.391900,2931.244740,101920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84869,4105.0,56,021,001401,16940,1.102899,56021001401,68,244,155,502,867,425,195,0,18,0,1.541849,2662.387090,5601980
84870,2671.0,56,021,001402,16940,0.974324,56021001402,68,244,155,502,867,425,195,0,18,0,4.889134,546.313507,5601980
84871,5088.0,56,021,001901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,585.414185,8.691282,5601980
84872,5088.0,56,021,001901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,585.414185,8.691282,5604120


In [53]:
del school_districts
gc.collect()

0

In [54]:
msa_tracts.to_csv('./data/msa_tracts.csv', index=False)

### Calculating Distances

#### Finding Central Business District

In [55]:
msa_tracts = pd.read_csv('./data/msa_tracts.csv', index_col=False)
msa_tracts.rename({'msa_code_tract': 'msa_code'}, axis=1, inplace=True)
msa_tracts.drop(['area', 'population'], inplace=True, axis=1)
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,pop_density,LEAID
0,1,73,1100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,1083.519710,100390
1,1,73,1100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,1083.519710,101920
2,1,73,1400,13820,0.627813,1073001400,15,27,30,290,422,894,1393,601,1126,0,2248.772710,100390
3,1,73,2000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,2931.244740,100390
4,1,73,2000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,2931.244740,101920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84869,56,21,1401,16940,1.102899,56021001401,68,244,155,502,867,425,195,0,18,0,2662.387090,5601980
84870,56,21,1402,16940,0.974324,56021001402,68,244,155,502,867,425,195,0,18,0,546.313507,5601980
84871,56,21,1901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,8.691282,5601980
84872,56,21,1901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,8.691282,5604120


In [56]:
msa_tracts['state'].value_counts().sort_index()

1      1317
2        98
4      2463
5       725
6     13725
8      1298
9       800
10      304
12     3996
13     1684
15      263
16      348
17     5578
18     1394
19      837
20      797
21      730
22      962
23      221
24     1345
25     1556
26     3438
27     1558
28      400
29     1734
30      325
31      528
32      607
33      206
34     2425
35      402
36     6131
37     1771
38      152
39     3632
40     1288
41     1058
42     2957
44      245
45     1052
46      188
47     1403
48     6627
49      594
50       48
51     1604
53     1998
54      300
55     1722
56       40
Name: state, dtype: int64

In [58]:
msa_tracts['msa_code'].value_counts()

35620    5339
16980    4167
31080    4146
19100    1909
19820    1694
         ... 
16220      18
45540      17
15680      17
25980      16
16180      13
Name: msa_code, Length: 384, dtype: int64

In [59]:
cbd_candidates = msa_tracts[['msa_code', 'tract_id_2010', 'pop_density']]
cbds = cbd_candidates.groupby('msa_code').max()
cbds.drop('pop_density', axis=1, inplace=True)
cbds.reset_index(inplace=True)
cbds

Unnamed: 0,msa_code,tract_id_2010
0,10180,48441013600
1,10420,39153534100
2,10500,13321950600
3,10540,41043030904
4,10580,36095740800
...,...,...
379,49420,53077940006
380,49620,42133024002
381,49660,42085033400
382,49700,6115041100


In [60]:
msa_tracts = pd.merge(left=msa_tracts, right=cbds, left_on='msa_code', right_on='msa_code', suffixes=('', 'cbd'))
msa_tracts.rename({'tract_id_2010cbd': 'cbd'}, inplace=True, axis=1)
msa_tracts['cbd'] = msa_tracts['cbd'].astype(str)
msa_tracts['tract_id_2010'] = msa_tracts['tract_id_2010'].astype(str)
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,pop_density,LEAID,cbd
0,1,73,1100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,1083.519710,100390,1117030900
1,1,73,1100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,1083.519710,101920,1117030900
2,1,73,1400,13820,0.627813,1073001400,15,27,30,290,422,894,1393,601,1126,0,2248.772710,100390,1117030900
3,1,73,2000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,2931.244740,100390,1117030900
4,1,73,2000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,2931.244740,101920,1117030900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84869,56,21,1401,16940,1.102899,56021001401,68,244,155,502,867,425,195,0,18,0,2662.387090,5601980,56021002000
84870,56,21,1402,16940,0.974324,56021001402,68,244,155,502,867,425,195,0,18,0,546.313507,5601980,56021002000
84871,56,21,1901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,8.691282,5601980,56021002000
84872,56,21,1901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,8.691282,5604120,56021002000


#### Calculating Distances

In [64]:
msa_tracts['distance'] = -1
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,pop_density,LEAID,cbd,distance
0,1,73,1100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,1083.519710,100390,1117030900,-1
1,1,73,1100,13820,0.644594,1073001100,41,168,118,788,1570,919,854,759,596,0,1083.519710,101920,1117030900,-1
2,1,73,1400,13820,0.627813,1073001400,15,27,30,290,422,894,1393,601,1126,0,2248.772710,100390,1117030900,-1
3,1,73,2000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,2931.244740,100390,1117030900,-1
4,1,73,2000,13820,0.472766,1073002000,13,24,24,172,490,746,717,520,388,0,2931.244740,101920,1117030900,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84869,56,21,1401,16940,1.102899,56021001401,68,244,155,502,867,425,195,0,18,0,2662.387090,5601980,56021002000,-1
84870,56,21,1402,16940,0.974324,56021001402,68,244,155,502,867,425,195,0,18,0,546.313507,5601980,56021002000,-1
84871,56,21,1901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,8.691282,5601980,56021002000,-1
84872,56,21,1901,16940,1.478837,56021001901,171,371,364,502,478,155,113,66,172,0,8.691282,5604120,56021002000,-1


In [77]:
msa_tracts_with_dist = None

In [78]:
tract_distances = pd.read_csv('./data/sf12010tractdistance50miles.csv', dtype={'county1': str,'tract1': str, 'county2': str,'tract2': str}, chunksize=5000)
for chunk in tract_distances:
    chunk['tid1'] = (chunk['county1'] + chunk['tract1']).str.removeprefix('0')
    chunk.drop(['county1', 'tract1'], axis=1, inplace=True)
    chunk['tid2'] = (chunk['county2'] + chunk['tract2']).str.removeprefix('0')
    chunk.drop(['county2', 'tract2'], axis=1, inplace=True)
    chunk.rename({'mi_to_tract': 'distance'}, axis=1, inplace=True)
    m = pd.merge(left=msa_tracts, right=chunk, left_on=['cbd', 'tract_id_2010'], right_on=['tid1', 'tid2'])
    if msa_tracts_with_dist is None:
        msa_tracts_with_dist = m
    else:
        msa_tracts_with_dist = pd.concat((m, msa_tracts_with_dist))

msa_tracts_with_dist

Unnamed: 0,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,...,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,pop_density,LEAID,cbd,distance_x,distance_y,tid1,tid2
0,56,25,200,16220,0.541786,56025000200,84,120,81,453,...,310,869,0,1761.722367,5604510,56025001800,-1,31.971663,56025001800,56025000200
1,56,25,400,16220,0.794956,56025000400,35,47,11,70,...,70,73,0,3941.192819,5604510,56025001800,-1,33.572169,56025001800,56025000400
2,56,25,800,16220,0.709026,56025000800,25,24,6,0,...,240,115,0,2600.662094,5604510,56025001800,-1,30.489648,56025001800,56025000800
3,56,25,1401,16220,1.041356,56025001401,187,248,218,1151,...,343,911,0,5.132017,5604510,56025001800,-1,37.594932,56025001800,56025001401
4,56,25,501,16220,0.821130,56025000501,8,23,56,525,...,0,0,0,4003.231099,5604510,56025001800,-1,33.958160,56025001800,56025000501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,1,3,11407,19300,0.944633,1003011407,875,4863,2993,5917,...,227,137,0,384.688053,100270,1003011602,-1,20.974950,1003011602,1003011407
25,1,3,11408,19300,1.788512,1003011408,688,4149,2646,5523,...,139,98,0,52.786264,100270,1003011602,-1,30.366451,1003011602,1003011408
26,1,3,11501,19300,0.772325,1003011501,298,891,818,993,...,111,194,0,465.047346,100270,1003011602,-1,13.369388,1003011602,1003011501
27,1,3,11502,19300,0.750274,1003011502,298,891,818,993,...,111,194,0,646.413800,100270,1003011602,-1,12.460361,1003011602,1003011502


In [79]:
msa_tracts_with_dist.drop(['distance_x', 'tid1', 'tid2'], axis=1, inplace=True)
msa_tracts_with_dist.rename({'distance_y': 'distance'}, axis=1, inplace=True)
msa_tracts_with_dist = msa_tracts_with_dist[msa_tracts_with_dist['distance'] <= 40]
msa_tracts_with_dist

Unnamed: 0,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,pop_density,LEAID,cbd,distance
0,56,25,200,16220,0.541786,56025000200,84,120,81,453,1089,328,324,310,869,0,1761.722367,5604510,56025001800,31.971663
1,56,25,400,16220,0.794956,56025000400,35,47,11,70,215,209,1165,70,73,0,3941.192819,5604510,56025001800,33.572169
2,56,25,800,16220,0.709026,56025000800,25,24,6,0,95,145,1123,240,115,0,2600.662094,5604510,56025001800,30.489648
3,56,25,1401,16220,1.041356,56025001401,187,248,218,1151,2209,502,440,343,911,0,5.132017,5604510,56025001800,37.594932
4,56,25,501,16220,0.821130,56025000501,8,23,56,525,1066,264,152,0,0,0,4003.231099,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,1,3,11407,19300,0.944633,1003011407,875,4863,2993,5917,2315,641,442,227,137,0,384.688053,100270,1003011602,20.974950
25,1,3,11408,19300,1.788512,1003011408,688,4149,2646,5523,1986,444,350,139,98,0,52.786264,100270,1003011602,30.366451
26,1,3,11501,19300,0.772325,1003011501,298,891,818,993,733,341,294,111,194,0,465.047346,100270,1003011602,13.369388
27,1,3,11502,19300,0.750274,1003011502,298,891,818,993,733,341,294,111,194,0,646.413800,100270,1003011602,12.460361


In [80]:
cbd_tracts = msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]
cbd_tracts['distance'] = 0
msa_tracts_with_dist = pd.concat((msa_tracts_with_dist, cbd_tracts))
msa_tracts_with_dist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbd_tracts['distance'] = 0


Unnamed: 0,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,pop_density,LEAID,cbd,distance
0,56,25,200,16220,0.541786,56025000200,84,120,81,453,1089,328,324,310,869,0,1761.722367,5604510,56025001800,31.971663
1,56,25,400,16220,0.794956,56025000400,35,47,11,70,215,209,1165,70,73,0,3941.192819,5604510,56025001800,33.572169
2,56,25,800,16220,0.709026,56025000800,25,24,6,0,95,145,1123,240,115,0,2600.662094,5604510,56025001800,30.489648
3,56,25,1401,16220,1.041356,56025001401,187,248,218,1151,2209,502,440,343,911,0,5.132017,5604510,56025001800,37.594932
4,56,25,501,16220,0.821130,56025000501,8,23,56,525,1066,264,152,0,0,0,4003.231099,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84743,55,73,2300,48140,1.039428,55073002300,53,199,158,215,336,141,152,158,1000,0,29.006610,5515900,55073002300,0.000000
84782,55,117,11400,43100,0.760709,55117011400,9,31,13,7,33,104,85,118,896,0,4580.854245,5513650,55117011400,0.000000
84844,56,25,1800,16220,1.330880,56025001800,156,419,180,672,1393,503,301,229,380,0,1.124130,5604510,56025001800,0.000000
84852,56,21,2000,16940,1.303751,56021002000,307,533,223,911,1298,371,377,442,1139,0,9.350342,5601980,56021002000,0.000000


In [81]:
msa_tracts_with_dist['state'].value_counts().sort_index()

1      1203
2        79
4       755
5       638
6     10307
8       589
9       756
10      218
12     2775
13     1068
15      262
16      189
17     2607
18      989
19      747
20      540
21      642
22      840
23      130
24      880
25      583
26     3078
27      884
28      296
29     1583
30      278
31      505
32      493
33      180
34      816
35      351
36     1879
37     1407
38      143
39     3280
40     1079
41      628
42     2528
44      225
45      933
46      171
47     1197
48     4214
49      590
50       48
51     1000
53     1428
54      283
55     1564
56       40
Name: state, dtype: int64

#### Regularize Housing Age Distribution

In [82]:
msa_tracts_with_dist.columns

Index(['state', 'county', 'tract', 'msa_code', 'income', 'tract_id_2010',
       'built_1999_2000', 'built_1995_1998', 'built_1990_1994',
       'built_1980_1989', 'built_1970_1979', 'built_1960_1969',
       'built_1950_1959', 'built_1940_1949', 'built_1939_earlier',
       'pub_trans_gt_10pct', 'pop_density', 'LEAID', 'cbd', 'distance'],
      dtype='object')

In [83]:
age_cats = ['built_1999_2000',
       'built_1995_1998', 'built_1990_1994', 'built_1980_1989',
       'built_1970_1979', 'built_1960_1969', 'built_1950_1959',
       'built_1940_1949', 'built_1939_earlier']

build_totals = msa_tracts_with_dist[age_cats].sum(axis=1)

msa_tracts_with_dist[age_cats] = msa_tracts_with_dist[age_cats].div(build_totals, axis=0)
msa_tracts_with_dist

Unnamed: 0,state,county,tract,msa_code,income,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,pop_density,LEAID,cbd,distance
0,56,25,200,16220,0.541786,56025000200,0.022963,0.032805,0.022143,0.123838,0.297704,0.089666,0.088573,0.084746,0.237562,0,1761.722367,5604510,56025001800,31.971663
1,56,25,400,16220,0.794956,56025000400,0.018470,0.024802,0.005805,0.036939,0.113456,0.110290,0.614776,0.036939,0.038522,0,3941.192819,5604510,56025001800,33.572169
2,56,25,800,16220,0.709026,56025000800,0.014100,0.013536,0.003384,0.000000,0.053582,0.081782,0.633390,0.135364,0.064862,0,2600.662094,5604510,56025001800,30.489648
3,56,25,1401,16220,1.041356,56025001401,0.030118,0.039942,0.035110,0.185376,0.355774,0.080850,0.070865,0.055242,0.146722,0,5.132017,5604510,56025001800,37.594932
4,56,25,501,16220,0.821130,56025000501,0.003820,0.010984,0.026743,0.250716,0.509074,0.126074,0.072588,0.000000,0.000000,0,4003.231099,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84743,55,73,2300,48140,1.039428,55073002300,0.021973,0.082504,0.065506,0.089138,0.139303,0.058458,0.063018,0.065506,0.414594,0,29.006610,5515900,55073002300,0.000000
84782,55,117,11400,43100,0.760709,55117011400,0.006944,0.023920,0.010031,0.005401,0.025463,0.080247,0.065586,0.091049,0.691358,0,4580.854245,5513650,55117011400,0.000000
84844,56,25,1800,16220,1.330880,56025001800,0.036853,0.098984,0.042523,0.158753,0.329081,0.118828,0.071108,0.054099,0.089771,0,1.124130,5604510,56025001800,0.000000
84852,56,21,2000,16940,1.303751,56021002000,0.054812,0.095162,0.039814,0.162650,0.231744,0.066238,0.067309,0.078914,0.203357,0,9.350342,5601980,56021002000,0.000000


In [84]:
msa_tracts_with_dist.drop(['state', 'county', 'tract'], axis=1, inplace=True)
msa_tracts_with_dist.to_csv('./data/msa_tracts_dist.csv', index=False)

## Modelling

The smallest of these models runs OLS with ~43,000 data points and 300 fixed effects. Therefore, I wasn't able to run most of them locally. Instead, I ran the models on the Duke Economics Computing Cluster and downloaded the saved models. The process for this can be seen in `reg.py`. Note that the results below omit the several thousand fixed effects.

In [None]:
data = pd.read_csv('./data/msa_tracts_dist.csv', index_col=False)
data.dropna(inplace=True)
data

In [None]:
data.describe()

In [None]:
import re
fixed_effects = re.compile(r'C\([A-Za-z_]+\).+\n')

def print_model_output(model_num: int) -> str:
    with open(f'./models/model-{model_num}-summary.txt', 'r') as f:
        print(fixed_effects.sub('', f.read()))

print_model_output(1)

In [None]:
print_model_output(2)

In [None]:
print_model_output(3)

In [None]:
print_model_output(4)

## Graphing

In [None]:
from matplotlib import pyplot as plt

data.plot(kind='scatter', x='distance', y='income')

In [None]:
plt.hexbin(x=data['distance'], y=data['income'], gridsize=(40, 10))

In [None]:
tract_counts = data.groupby('msa_code').count()['income']
tract_counts = tract_counts.reset_index().rename({'income': 'tract_counts'}, axis=1)
tract_counts

In [None]:
data = pd.merge(right=data, left=tract_counts, left_on='msa_code', right_on='msa_code')
data

In [None]:
data['dist_bin'] = data['distance'].round()
data

In [None]:
data[data['dist_bin'] == 1]

In [None]:
under_100 = data[data['tract_counts'] < 100]
btw_100_500 = data[(data['tract_counts'] >= 100) & (data['tract_counts'] < 500)]
btw_500_1000 = data[(data['tract_counts'] >= 500) & (data['tract_counts'] < 1000)]
larger_1000 = data[data['tract_counts'] >= 1000]

under_100.shape, btw_100_500.shape, btw_500_1000.shape, larger_1000.shape

In [None]:
def plot_relative_income(df: pd.DataFrame):
    df_dist_inc = df.groupby('dist_bin').mean()['income']
    plt.plot((df_dist_inc - df_dist_inc[0]) / df_dist_inc[0])
    plt.xlabel('Distance from CBD (mi)')
    plt.ylabel('Median Income Compared to Mile Zero')

plot_relative_income(under_100)

In [None]:
plot_relative_income(btw_100_500)

In [None]:
plot_relative_income(btw_500_1000)

In [None]:
plot_relative_income(larger_1000)

In [None]:
data.groupby('dist_bin').mean()[
    ['built_1999_2000', 
        'built_1995_1998', 
        'built_1990_1994', 
        'built_1980_1989', 
        'built_1970_1979', 
        'built_1960_1969', 
        'built_1950_1959', 
        'built_1940_1949', 
        'built_1939_earlier']
    ].plot(xlabel='Distance from CBD (mi)', ylabel='Mean Share of Housing Age')
# def plot_age_distribution(df):
    

## Mapping

In [None]:
import geopandas

ca_geo = geopandas.read_file('./data/CA_shapefile/tl_2010_06_tract00.shp')
ca_geo

In [None]:
ca_geo = ca_geo[['CTIDFP00', 'geometry']]

ca_geo['CTIDFP00'] = ca_geo['CTIDFP00'].astype(str).str.removeprefix('0')
data['tract_id_2010'] = data['tract_id_2010'].astype(str)

pd.merge(left=ca_geo, right=data, left_on='CTIDFP00', right_on='tract_id_2010')

In [None]:
ca_geo.plot()