# Final Project
Logan Cooper

In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm.auto import tqdm
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import OLSResults

## Data

### Data Import

#### NCDB Building Age Data (2000)

In [None]:
ncdb_dtypes = {
    'tract_id': str,
    'workers_taking_transit': int,
    'prop_taking_transit': float,
    'built_1999_2000': int,
    'built_1995_1998': int,
    'built_1990_1994': int,
    'built_1980_1989': int,
    'built_1970_79': int,
    'built_1960_69': int,
    'built_1950_59': int,
    'built_1940_49': int,
    'built_1939_earlier': int
}
ncdb_data = pd.read_csv('./data/NCDB_2000.csv', dtype=ncdb_dtypes)
ncdb_data

In [None]:
ncdb_data['pub_trans_gt_10pct'] = 0
ncdb_data['pub_trans_gt_10pct'][ncdb_data['prop_taking_transit'] >= 0.1] = 1
ncdb_data

In [None]:
ncdb_data['pub_trans_gt_10pct'].value_counts()

In [None]:
ncdb_prop = ncdb_data[['built_1999_2000', 'built_1995_1998', 'built_1990_1994', 'built_1980_1989', 'built_1970_1979', 'built_1960_1969', 'built_1950_1959', 'built_1940_1949', 'built_1939_earlier']].div(ncdb_data.sum(axis=1), axis=0)
ncdb_prop['tract_id'] = ncdb_data['tract_id']
ncdb_prop['pub_trans_gt_10pct'] = ncdb_data['pub_trans_gt_10pct']
ncdb_data = ncdb_prop
ncdb_data

In [None]:
del ncdb_prop
gc.collect()

#### Tract Level Data (2019)

In [None]:
# import & rename cols
tract_data_2019 = pd.read_json('./data/tract_data_2019.json', dtype=False)
tract_data_2019.rename({
    'B19019_001E': 'median_income',
    'B01003_001E': 'population'
}, inplace=True, axis=1)

tract_data_2019['tract_id'] = tract_data_2019['state'] + tract_data_2019['county'] + tract_data_2019['tract']
tract_data_2019

#### MSA-Level Income Data (2019)

In [None]:
msa_data = pd.read_json('./data/msa_data.json')
msa_data.columns = ['median_income', 'msa_code']
msa_data['msa_code'] = msa_data['msa_code'].astype(str)
msa_data

#### MSA-Tract Crosswalk

In [None]:
msa_lookup = pd.read_excel('./data/msa_codes.xls', 
                           converters={'CBSA Code': str, 
                                       'CBSA Title': str, 
                                       'Metropolitan/Micropolitan Statistical Area': str, 
                                       'FIPS State Code': str,
                                       'FIPS County Code': str}
                           ).drop(range(1916,1920))

msa_lookup

#### Conversion for 2000 => 2010

In [None]:
tract_conversion = pd.read_csv('./data/us2010trf.txt')
tract_conversion

In [None]:
tract_conversion.columns

In [None]:
tract_conversion = tract_conversion[['GEOID00', 'GEOID10', 'AREA10']]
tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)
tract_conversion

#### Tract-School District Crosswalk

In [None]:
school_districts = pd.read_excel('./data/grf19_lea_tract.xlsx')
# school_districts.drop(['NAME_LEA19', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
school_districts['LEAID'] = school_districts['LEAID'].astype(str)
school_districts['TRACT'] = school_districts['TRACT'].astype(str)
school_districts

## Data Combination

#### Trim Tract Level Data to MSAs Only

In [None]:
only_metros = msa_lookup[msa_lookup['Metropolitan/Micropolitan Statistical Area'] == 'Metropolitan Statistical Area']
msa_tracts = pd.merge(left=tract_data_2019, right=only_metros, left_on=['state', 'county'], right_on=['FIPS State Code', 'FIPS County Code'])
msa_tracts

In [None]:
msa_tracts = msa_tracts[(msa_tracts['median_income'] >= 0) & (msa_tracts['population'] > 0)]
msa_tracts

In [None]:
msa_tracts.columns

In [None]:
msa_tracts.drop(['CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code', 'FIPS County Code'], axis=1, inplace=True)
msa_tracts

In [None]:
del only_metros
gc.collect()

#### Merge MSA-Level Data

In [None]:
msa_tracts = pd.merge(left=msa_tracts, right=msa_data, left_on='CBSA Code', right_on='msa_code', suffixes=('_tract', '_msa'))
msa_tracts['income'] = msa_tracts['median_income_tract'] / msa_tracts['median_income_msa']
msa_tracts.drop(['CBSA Code', 'median_income_tract', 'median_income_msa'], axis=1, inplace=True)
msa_tracts

In [None]:
del msa_data
gc.collect()

#### Convert 2000-Tracts and Merge

In [None]:
ages_2019 = pd.merge(left=ncdb_data, right=tract_conversion, left_on='tract_id', right_on='GEOID00')
ages_2019.drop(['tract_id'], axis=1, inplace=True)
ages_2019.rename({'AREA10': 'area', 'GEOID10': 'tract_id_2010', 'GEOID00': 'tract_id_2000'}, inplace=True, axis=1)

ages_2019

In [None]:
tract_counts = ages_2019['tract_id_2010'].value_counts()
tract_multiples = tract_counts[tract_counts > 1].index
tract_multiples

In [None]:
a = ages_2019[ages_2019['tract_id_2010'] == '53033990100']
a

In [None]:
l = len(a)
a = a.groupby('tract_id_2010').sum()
a['area'] //= l
a.reset_index()

In [None]:
for tid in tqdm(tract_multiples):
    matches = ages_2019[ages_2019['tract_id_2010'] == tid]
    l = len(matches)
    a = matches.groupby('tract_id_2010').sum(numeric_only=True)
    a['area'] //= l

    ages_2019 = ages_2019[ages_2019['tract_id_2010'] != tid] # drop all rows with tid
    ages_2019 = pd.concat((ages_2019, a.reset_index()), axis=0, ignore_index=True) #re-add summed

ages_2019

In [None]:
ages_2019.drop('tract_id_2000', axis=1, inplace=True)
ages_2019

In [None]:
msa_tracts = pd.merge(left=msa_tracts, right=ages_2019, left_on='tract_id', right_on='tract_id_2010')
msa_tracts.drop(['tract_id'], axis=1, inplace=True)

msa_tracts = msa_tracts[(msa_tracts['population'] >= 1) & (msa_tracts['income'] > 0) & (msa_tracts['area'] > 0)] # drop weird tracts

msa_tracts['pop_density'] = msa_tracts['population'] / msa_tracts['area']
msa_tracts

In [None]:
del ages_2019
gc.collect()

#### Add School District

In [None]:
msa_tracts = pd.merge(left=msa_tracts, right=school_districts, left_on='tract_id_2010', right_on='TRACT')
msa_tracts.drop(['NAME_LEA19', 'TRACT', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
msa_tracts

In [None]:
del school_districts
gc.collect()

In [None]:
msa_tracts.to_csv('./data/msa_tracts.csv', index=False)

### Calculating Distances

#### Finding Central Business District

In [5]:
msa_tracts = pd.read_csv('./data/msa_tracts.csv', index_col=False)
msa_tracts.rename({'msa_code_tract': 'msa_code'}, axis=1, inplace=True)
msa_tracts.drop(['msa_code_msa', 'median_income', 'area', 'population'], inplace=True, axis=1)
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID
0,13,53,20100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,0.000015,1301050
1,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,0.000491,1300002
2,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,0.000491,1313053
3,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,0.000010,1300002
4,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,0.000010,1313053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,56,21,1401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,0.001028,5601980
64444,56,21,1402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,0.000211,5601980
64445,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,0.000003,5601980
64446,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,0.000003,5604120


In [6]:
msa_tracts['msa_code'].value_counts()

35620    5339
16980    4167
19100    1909
19820    1694
37980    1615
         ... 
16220      18
45540      17
15680      17
25980      16
16180      13
Name: msa_code, Length: 321, dtype: int64

In [7]:
cbd_candidates = msa_tracts[['msa_code', 'tract_id_2010', 'pop_density']]
cbds = cbd_candidates.groupby('msa_code').max()
cbds.drop('pop_density', axis=1, inplace=True)
cbds.reset_index(inplace=True)
cbds

Unnamed: 0,msa_code,tract_id_2010
0,10180,48441013600
1,10420,39153534100
2,10500,13321950600
3,10540,41043030904
4,10580,36095740800
...,...,...
316,49180,37197050502
317,49340,25027761400
318,49420,53077940006
319,49620,42133024002


In [8]:
msa_tracts = pd.merge(left=msa_tracts, right=cbds, left_on='msa_code', right_on='msa_code', suffixes=('', 'cbd'))
msa_tracts.rename({'tract_id_2010cbd': 'cbd'}, inplace=True, axis=1)
msa_tracts['cbd'] = msa_tracts['cbd'].astype(str)
msa_tracts['tract_id_2010'] = msa_tracts['tract_id_2010'].astype(str)
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd
0,13,53,20100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,0.000015,1301050,13263960300
1,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,0.000491,1300002,13263960300
2,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,0.000491,1313053,13263960300
3,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,0.000010,1300002,13263960300
4,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,0.000010,1313053,13263960300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,56,21,1401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,0.001028,5601980,56021002000
64444,56,21,1402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,0.000211,5601980,56021002000
64445,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,0.000003,5601980,56021002000
64446,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,0.000003,5604120,56021002000


#### Calculating Distances

In [9]:
msa_tracts['distance'] = None
msa_tracts

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,13,53,20100,17980,0.765298,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020100,0.000015,1301050,13263960300,
1,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,0.000491,1300002,13263960300,
2,13,53,20201,17980,1.353766,0.019542,0.014776,0.026215,0.114394,0.307911,0.156339,0.133460,0.073880,0.132030,0,13053020201,0.000491,1313053,13263960300,
3,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,0.000010,1300002,13263960300,
4,13,53,20203,17980,1.118623,0.070723,0.131311,0.172671,0.347464,0.497674,0.300433,0.197239,0.095927,0.159589,0,13053020203,0.000010,1313053,13263960300,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,56,21,1401,16940,1.102899,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001401,0.001028,5601980,56021002000,
64444,56,21,1402,16940,0.974324,0.027486,0.098626,0.062652,0.202910,0.350445,0.171787,0.078820,0.000000,0.007276,0,56021001402,0.000211,5601980,56021002000,
64445,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,0.000003,5601980,56021002000,
64446,56,21,1901,16940,1.478837,0.071102,0.154262,0.151351,0.208731,0.198752,0.064449,0.046985,0.027443,0.071518,0,56021001901,0.000003,5604120,56021002000,


In [10]:
msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]['distance'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]['distance'] = 0


In [11]:
msa_tracts_with_dist = None

In [12]:
tract_distances = pd.read_csv('./data/sf12010tractdistance50miles.csv', dtype={'county1': str,'tract1': str, 'county2': str,'tract2': str}, chunksize=5000)
for chunk in tract_distances:
    chunk['tid1'] = chunk['county1'] + chunk['tract1']
    chunk.drop(['county1', 'tract1'], axis=1, inplace=True)
    chunk['tid2'] = chunk['county2'] + chunk['tract2']
    chunk.drop(['county2', 'tract2'], axis=1, inplace=True)
    chunk.rename({'mi_to_tract': 'distance'}, axis=1, inplace=True)
    m = pd.merge(left=msa_tracts, right=chunk, left_on=['cbd', 'tract_id_2010'], right_on=['tid1', 'tid2'])
    if msa_tracts_with_dist is None:
        msa_tracts_with_dist = m
    else:
        msa_tracts_with_dist = pd.concat((m, msa_tracts_with_dist))

msa_tracts_with_dist

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,...,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance_x,distance_y,tid1,tid2
0,56,25,200,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,...,0.443310,0,56025000200,0.000660,5604510,56025001800,,31.971663,56025001800,56025000200
1,56,25,400,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,...,0.038360,0,56025000400,0.001522,5604510,56025001800,,33.572169,56025001800,56025000400
2,56,25,800,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,...,0.064679,0,56025000800,0.000952,5604510,56025001800,,30.489648,56025001800,56025000800
3,56,25,1401,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,...,0.459754,0,56025001401,0.000002,5604510,56025001800,,37.594932,56025001800,56025001401
4,56,25,501,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,...,0.000000,0,56025000501,0.001546,5604510,56025001800,,33.958160,56025001800,56025000501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,10,1,40202,20100,1.217304,0.020000,0.096585,0.102926,0.179511,0.177560,...,0.165365,0,10001040202,0.000519,1001620,10001043400,,21.649411,10001043400,10001040202
50,10,1,40203,20100,1.096651,0.013051,0.181892,0.155791,0.168842,0.132953,...,0.077488,0,10001040203,0.000089,1001620,10001043400,,22.112634,10001043400,10001040203
51,10,1,40900,20100,0.485405,0.032945,0.091084,0.019380,0.184107,0.109495,...,0.239339,0,10001040900,0.001602,1000190,10001043400,,13.923058,10001043400,10001040900
52,10,1,41300,20100,0.628600,0.000000,0.012264,0.024528,0.090564,0.066979,...,0.250937,0,10001041300,0.000956,1000190,10001043400,,12.789227,10001043400,10001041300


In [13]:
msa_tracts_with_dist.drop(['distance_x', 'tid1', 'tid2'], axis=1, inplace=True)
msa_tracts_with_dist.rename({'distance_y': 'distance'}, axis=1, inplace=True)
msa_tracts_with_dist = msa_tracts_with_dist[msa_tracts_with_dist['distance'] <= 40]
msa_tracts_with_dist

Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,56,25,200,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,0.171655,0.159039,0.155930,0.443310,0,56025000200,0.000660,5604510,56025001800,31.971663
1,56,25,400,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,0.109826,0.612190,0.036784,0.038360,0,56025000400,0.001522,5604510,56025001800,33.572169
2,56,25,800,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,0.081552,0.631608,0.134983,0.064679,0,56025000800,0.000952,5604510,56025001800,30.489648
3,56,25,1401,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,0.239784,0.204458,0.168851,0.459754,0,56025001401,0.000002,5604510,56025001800,37.594932
4,56,25,501,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,0.125594,0.072312,0.000000,0.000000,0,56025000501,0.001546,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,10,1,40202,20100,1.217304,0.020000,0.096585,0.102926,0.179511,0.177560,0.118048,0.079512,0.045853,0.165365,0,10001040202,0.000519,1001620,10001043400,21.649411
50,10,1,40203,20100,1.096651,0.013051,0.181892,0.155791,0.168842,0.132953,0.123980,0.118271,0.027732,0.077488,0,10001040203,0.000089,1001620,10001043400,22.112634
51,10,1,40900,20100,0.485405,0.032945,0.091084,0.019380,0.184107,0.109495,0.125968,0.137596,0.052325,0.239339,0,10001040900,0.001602,1000190,10001043400,13.923058
52,10,1,41300,20100,0.628600,0.000000,0.012264,0.024528,0.090564,0.066979,0.119808,0.277351,0.133959,0.250937,0,10001041300,0.000956,1000190,10001043400,12.789227


In [14]:
cbd_tracts = msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]
cbd_tracts['distance'] = 0
msa_tracts_with_dist = pd.concat((msa_tracts_with_dist, cbd_tracts))
msa_tracts_with_dist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbd_tracts['distance'] = 0


Unnamed: 0,state,county,tract,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,56,25,200,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,0.171655,0.159039,0.155930,0.443310,0,56025000200,6.601584e-04,5604510,56025001800,31.971663
1,56,25,400,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,0.109826,0.612190,0.036784,0.038360,0,56025000400,1.521695e-03,5604510,56025001800,33.572169
2,56,25,800,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,0.081552,0.631608,0.134983,0.064679,0,56025000800,9.520551e-04,5604510,56025001800,30.489648
3,56,25,1401,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,0.239784,0.204458,0.168851,0.459754,0,56025001401,1.975859e-06,5604510,56025001800,37.594932
4,56,25,501,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,0.125594,0.072312,0.000000,0.000000,0,56025000501,1.545648e-03,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64317,55,73,2300,48140,1.039428,0.021865,0.082096,0.065181,0.088696,0.138614,0.058168,0.062706,0.065181,0.412541,0,55073002300,1.119737e-05,5515900,55073002300,0.000000
64356,55,117,11400,43100,0.760709,0.010333,0.049751,0.022005,0.008037,0.059128,0.192227,0.118828,0.162784,1.310886,0,55117011400,1.040721e-03,5513650,55117011400,0.000000
64418,56,25,1800,16220,1.330880,0.072723,0.197701,0.083088,0.314001,0.653162,0.237053,0.141210,0.108111,0.179792,0,56025001800,4.307794e-07,5604510,56025001800,0.000000
64426,56,21,2000,16940,1.303751,0.171925,0.284788,0.116650,0.474119,0.667689,0.195365,0.199707,0.234291,0.641497,0,56021002000,3.608426e-06,5601980,56021002000,0.000000


In [16]:
msa_tracts_with_dist.drop(['state', 'county', 'tract'], axis=1, inplace=True)
msa_tracts_with_dist.to_csv('./data/msa_tracts_dist.csv', index=False)

## Modelling

The smallest of these models runs OLS with ~43,000 data points and 300 fixed effects. Therefore, I wasn't able to run most of them locally. Instead, I ran the models on the Duke Economics Computing Cluster and downloaded the saved models. The process for this can be seen in `reg.py`.

In [2]:
data = pd.read_csv('./data/msa_tracts_dist.csv', index_col=False)
data.dropna(inplace=True)
data

Unnamed: 0,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_1939_earlier,pub_trans_gt_10pct,tract_id_2010,pop_density,LEAID,cbd,distance
0,16220,0.541786,0.053984,0.072444,0.043224,0.266710,0.619336,0.171655,0.159039,0.155930,0.443310,0,56025000200,6.601584e-04,5604510,56025001800,31.971663
1,16220,0.794956,0.018392,0.024698,0.005780,0.036784,0.112979,0.109826,0.612190,0.036784,0.038360,0,56025000400,1.521695e-03,5604510,56025001800,33.572169
2,16220,0.709026,0.014061,0.013498,0.003375,0.000000,0.053431,0.081552,0.631608,0.134983,0.064679,0,56025000800,9.520551e-04,5604510,56025001800,30.489648
3,16220,1.041356,0.094313,0.122562,0.096865,0.540007,1.057863,0.239784,0.204458,0.168851,0.459754,0,56025001401,1.975859e-06,5604510,56025001800,37.594932
4,16220,0.821130,0.003806,0.010942,0.026641,0.249762,0.507135,0.125594,0.072312,0.000000,0.000000,0,56025000501,1.545648e-03,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43566,48140,1.039428,0.021865,0.082096,0.065181,0.088696,0.138614,0.058168,0.062706,0.065181,0.412541,0,55073002300,1.119737e-05,5515900,55073002300,0.000000
43567,43100,0.760709,0.010333,0.049751,0.022005,0.008037,0.059128,0.192227,0.118828,0.162784,1.310886,0,55117011400,1.040721e-03,5513650,55117011400,0.000000
43568,16220,1.330880,0.072723,0.197701,0.083088,0.314001,0.653162,0.237053,0.141210,0.108111,0.179792,0,56025001800,4.307794e-07,5604510,56025001800,0.000000
43569,16940,1.303751,0.171925,0.284788,0.116650,0.474119,0.667689,0.195365,0.199707,0.234291,0.641497,0,56021002000,3.608426e-06,5601980,56021002000,0.000000


In [4]:
## test for saving model summaries
# model = smf.ols(formula='income ~ distance + C(msa_code)', data=data).fit()

# with open('./results.txt', 'w') as f:
#     f.write(model.summary().as_text())