# Final Project
Logan Cooper

In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm.auto import tqdm

## Data

### Data Import

#### NCDB Building Age Data (2000)

In [55]:
ncdb_dtypes = {
    'tract_id': str,
    'built_1999_2000': int,
    'built_1995_1998': int,
    'built_1990_1994': int,
    'built_1980_1989': int,
    'built_1970_79': int,
    'built_1960_69': int,
    'built_1950_59': int,
    'built_1940_49': int,
    'built_1939_earlier': int
}
ncdb_data = pd.read_csv('./data/NCDB_2000.csv', dtype=ncdb_dtypes)
ncdb_data

Unnamed: 0,tract_id,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier
0,01001020100,28,71,67,137,214,94,59,33,39
1,01001020200,21,47,39,102,220,83,75,52,119
2,01001020300,24,176,72,150,389,328,31,54,39
3,01001020400,11,78,103,159,541,639,306,24,10
4,01001020500,119,581,504,431,491,115,36,0,0
...,...,...,...,...,...,...,...,...,...,...
65438,56043000200,2,133,58,301,398,72,144,48,189
65439,56043000301,0,0,29,30,91,60,518,305,241
65440,56043000302,7,29,27,152,487,82,152,39,60
65441,56045951100,64,101,81,280,412,124,139,113,180


#### Tract Level Data (2019)

In [56]:
# import & rename cols
tract_data_2019 = pd.read_json('./data/tract_data_2019.json', dtype=False)
year_ranges = ['2014_later', '2010_2013', '2000_2009', '1990_1999', '1980_1989', '1970_1979', '1960_1969', '1950_1959', '1940_1949', '1939_earlier']
built_ranges = [f'built_{el}' for el in year_ranges]
tract_data_2019.columns = ['name', 'median_income', 'num_pub_trans', 'population'] + built_ranges + ['state', 'county', 'tract']

# trim out building dates
tract_data_2019['tract_id'] = tract_data_2019['state'] + tract_data_2019['county'] + tract_data_2019['tract']
tract_data_2019['pub_trans_gt_10pct'] = ((tract_data_2019['num_pub_trans'] / tract_data_2019['population']) >= 0.1).astype(int)

tract_data_2019_trim = tract_data_2019.drop(built_ranges + ['num_pub_trans'], axis=1)
tract_data_2019_trim

Unnamed: 0,name,median_income,population,state,county,tract,tract_id,pub_trans_gt_10pct
0,"Census Tract 11, Jefferson County, Alabama",37030.0,4781.0,01,073,001100,01073001100,0
1,"Census Tract 14, Jefferson County, Alabama",36066.0,1946.0,01,073,001400,01073001400,0
2,"Census Tract 20, Jefferson County, Alabama",27159.0,4080.0,01,073,002000,01073002000,0
3,"Census Tract 38.02, Jefferson County, Alabama",38721.0,5291.0,01,073,003802,01073003802,0
4,"Census Tract 40, Jefferson County, Alabama",18525.0,2533.0,01,073,004000,01073004000,0
...,...,...,...,...,...,...,...,...
72872,"Census Tract 19.02, Laramie County, Wyoming",87794.0,4187.0,56,021,001902,56021001902,0
72873,"Census Tract 9808.01, Laramie County, Wyoming",-666666666.0,0.0,56,021,980801,56021980801,0
72874,"Census Tract 16.02, Natrona County, Wyoming",85194.0,7513.0,56,025,001602,56025001602,0
72875,"Census Tract 16.03, Natrona County, Wyoming",120564.0,3724.0,56,025,001603,56025001603,0


In [57]:
tract_data_2019_trim['pub_trans_gt_10pct'].value_counts()

0    68089
1     4788
Name: pub_trans_gt_10pct, dtype: int64

#### MSA-Level Income Data (2019)

In [58]:
msa_data = pd.read_json('./data/msa_data.json')
msa_data.columns = ['name', 'median_income', 'msa_code']
msa_data['msa_code'] = msa_data['msa_code'].astype(str)
msa_data

Unnamed: 0,name,median_income,msa_code
0,"Big Stone Gap, VA Micro Area",38175,13720
1,"Billings, MT Metro Area",61278,13740
2,"Binghamton, NY Metro Area",54295,13780
3,"Birmingham-Hoover, AL Metro Area",57447,13820
4,"Bismarck, ND Metro Area",71398,13900
...,...,...,...
933,"Opelousas, LA Micro Area",36403,36660
934,"Orangeburg, SC Micro Area",37955,36700
935,"Orlando-Kissimmee-Sanford, FL Metro Area",58368,36740
936,"Fort Dodge, IA Micro Area",47466,22700


#### MSA-Tract Crosswalk

In [59]:
msa_lookup = pd.read_excel('./data/msa_codes.xls', 
                           converters={'CBSA Code': str, 
                                       'CBSA Title': str, 
                                       'Metropolitan/Micropolitan Statistical Area': str, 
                                       'FIPS State Code': str,
                                       'FIPS County Code': str}
                           ).drop(range(1916,1920))

msa_lookup

Unnamed: 0,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,10100,"Aberdeen, SD",Micropolitan Statistical Area,46,013
1,10100,"Aberdeen, SD",Micropolitan Statistical Area,46,045
2,10140,"Aberdeen, WA",Micropolitan Statistical Area,53,027
3,10180,"Abilene, TX",Metropolitan Statistical Area,48,059
4,10180,"Abilene, TX",Metropolitan Statistical Area,48,253
...,...,...,...,...,...
1911,49700,"Yuba City, CA",Metropolitan Statistical Area,06,101
1912,49700,"Yuba City, CA",Metropolitan Statistical Area,06,115
1913,49740,"Yuma, AZ",Metropolitan Statistical Area,04,027
1914,49780,"Zanesville, OH",Micropolitan Statistical Area,39,119


#### Conversion for 2000 => 2010

In [60]:
tract_conversion = pd.read_csv('./data/us2010trf.txt')
tract_conversion

Unnamed: 0,STATE00,COUNTY00,TRACT00,GEOID00,POP00,HU00,PART00,AREA00,AREALAND00,STATE10,...,AREAPCT00PT,AREALANDPCT00PT,AREAPCT10PT,AREALANDPCT10PT,POP10PT,POPPCT00,POPPCT10,HU10PT,HUPCT00,HUPCT10
0,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,99.99,100.00,100.00,100.00,1912,99.95,100.00,752,99.87,100.00
1,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,0.01,0.00,0.01,0.00,0,0.00,0.00,0,0.00,0.00
2,1,1,20100,1001020100,1913,753,P,9846943,9810183,1,...,0.00,0.00,0.00,0.00,1,0.05,0.01,1,0.13,0.02
3,1,1,20200,1001020200,2170,822,W,3346351,3340505,1,...,100.00,100.00,100.00,100.00,2170,100.00,100.00,822,100.00,100.00
4,1,1,20300,1001020300,3373,1326,W,5358328,5349274,1,...,100.00,100.00,100.00,100.00,3373,100.00,100.00,1326,100.00,100.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110849,72,153,750601,72153750601,5315,2106,P,11040535,11035996,72,...,99.79,99.79,100.00,100.00,5315,100.00,100.00,2106,100.00,100.00
110850,72,153,750601,72153750601,5315,2106,P,11040535,11035996,72,...,0.21,0.21,0.13,0.13,0,0.00,0.00,0,0.00,0.00
110851,72,153,750602,72153750602,3141,1450,P,17773730,17520777,72,...,0.04,0.04,0.05,0.05,0,0.00,0.00,0,0.00,0.00
110852,72,153,750602,72153750602,3141,1450,P,17773730,17520777,72,...,0.06,0.07,0.04,0.05,0,0.00,0.00,0,0.00,0.00


In [61]:
tract_conversion.columns

Index(['STATE00', 'COUNTY00', 'TRACT00', 'GEOID00', 'POP00', 'HU00', 'PART00',
       'AREA00', 'AREALAND00', 'STATE10', 'COUNTY10', 'TRACT10', 'GEOID10',
       'POP10', 'HU10', 'PART10', 'AREA10', 'AREALAND10', 'AREAPT',
       'AREALANDPT', 'AREAPCT00PT', 'AREALANDPCT00PT', 'AREAPCT10PT',
       'AREALANDPCT10PT', 'POP10PT', 'POPPCT00', 'POPPCT10', 'HU10PT',
       'HUPCT00', 'HUPCT10'],
      dtype='object')

In [62]:
tract_conversion = tract_conversion[['GEOID00', 'GEOID10', 'AREA10']]
tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)
tract_conversion

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['GEOID00'] = tract_conversion['GEOID00'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tract_conversion['GEOID10'] = tract_conversion['GEOID10'].astype(str)


Unnamed: 0,GEOID00,GEOID10,AREA10
0,1001020100,1001020100,9846256
1,1001020100,1001020600,8080414
2,1001020100,1001020802,191488944
3,1001020200,1001020200,3346351
4,1001020300,1001020300,5358328
...,...,...,...
110849,72153750601,72153750601,11017001
110850,72153750601,72153750602,17779488
110851,72153750602,72055960900,13182129
110852,72153750602,72055961100,29215554


#### Tract-School District Crosswalk

In [63]:
school_districts = pd.read_excel('./data/grf19_lea_tract.xlsx')
# school_districts.drop(['NAME_LEA19', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
school_districts['LEAID'] = school_districts['LEAID'].astype(str)
school_districts['TRACT'] = school_districts['TRACT'].astype(str)
school_districts

Unnamed: 0,LEAID,NAME_LEA19,TRACT,COUNT,LANDAREA,WATERAREA
0,100001,Fort Rucker School District,1031010300,2,23.428498,0.000000
1,100001,Fort Rucker School District,1045020000,2,66.513225,1.081745
2,100003,Maxwell AFB School District,1101000900,3,3.356590,0.143795
3,100003,Maxwell AFB School District,1101001000,3,0.001526,0.000000
4,100003,Maxwell AFB School District,1101006000,3,0.003588,0.000000
...,...,...,...,...,...,...
113515,7800030,Virgin Islands Department of Education,78030960900,32,3.147245,1.173777
113516,7800030,Virgin Islands Department of Education,78030961000,32,0.812847,0.910606
113517,7800030,Virgin Islands Department of Education,78030961100,32,1.356638,0.000000
113518,7800030,Virgin Islands Department of Education,78030961200,32,0.392958,0.309706


## Data Combination

#### Trim Tract Level Data to MSAs Only

In [64]:
only_metros = msa_lookup[msa_lookup['Metropolitan/Micropolitan Statistical Area'] == 'Metropolitan Statistical Area']
msa_tracts = pd.merge(left=tract_data_2019_trim, right=only_metros, left_on=['state', 'county'], right_on=['FIPS State Code', 'FIPS County Code'])
msa_tracts

Unnamed: 0,name,median_income,population,state,county,tract,tract_id,pub_trans_gt_10pct,CBSA Code,CBSA Title,Metropolitan/Micropolitan Statistical Area,FIPS State Code,FIPS County Code
0,"Census Tract 11, Jefferson County, Alabama",37030.0,4781.0,01,073,001100,01073001100,0,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
1,"Census Tract 14, Jefferson County, Alabama",36066.0,1946.0,01,073,001400,01073001400,0,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
2,"Census Tract 20, Jefferson County, Alabama",27159.0,4080.0,01,073,002000,01073002000,0,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
3,"Census Tract 38.02, Jefferson County, Alabama",38721.0,5291.0,01,073,003802,01073003802,0,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
4,"Census Tract 40, Jefferson County, Alabama",18525.0,2533.0,01,073,004000,01073004000,0,13820,"Birmingham-Hoover, AL",Metropolitan Statistical Area,01,073
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60863,"Census Tract 14.01, Laramie County, Wyoming",73795.0,4105.0,56,021,001401,56021001401,0,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60864,"Census Tract 14.02, Laramie County, Wyoming",65192.0,2671.0,56,021,001402,56021001402,0,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60865,"Census Tract 19.01, Laramie County, Wyoming",98949.0,5088.0,56,021,001901,56021001901,0,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021
60866,"Census Tract 19.02, Laramie County, Wyoming",87794.0,4187.0,56,021,001902,56021001902,0,16940,"Cheyenne, WY",Metropolitan Statistical Area,56,021


In [65]:
msa_tracts.columns

Index(['name', 'median_income', 'population', 'state', 'county', 'tract',
       'tract_id', 'pub_trans_gt_10pct', 'CBSA Code', 'CBSA Title',
       'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code',
       'FIPS County Code'],
      dtype='object')

In [66]:
msa_tracts.drop(['name', 'CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'FIPS State Code', 'FIPS County Code'], axis=1, inplace=True)
msa_tracts

Unnamed: 0,median_income,population,state,county,tract,tract_id,pub_trans_gt_10pct,CBSA Code
0,37030.0,4781.0,01,073,001100,01073001100,0,13820
1,36066.0,1946.0,01,073,001400,01073001400,0,13820
2,27159.0,4080.0,01,073,002000,01073002000,0,13820
3,38721.0,5291.0,01,073,003802,01073003802,0,13820
4,18525.0,2533.0,01,073,004000,01073004000,0,13820
...,...,...,...,...,...,...,...,...
60863,73795.0,4105.0,56,021,001401,56021001401,0,16940
60864,65192.0,2671.0,56,021,001402,56021001402,0,16940
60865,98949.0,5088.0,56,021,001901,56021001901,0,16940
60866,87794.0,4187.0,56,021,001902,56021001902,0,16940


In [67]:
del only_metros
gc.collect()

113646

#### Merge MSA-Level Data

In [68]:
msa_tracts = pd.merge(left=msa_tracts, right=msa_data, left_on='CBSA Code', right_on='msa_code', suffixes=('_tract', '_msa'))
msa_tracts['income'] = msa_tracts['median_income_tract'] / msa_tracts['median_income_msa']
msa_tracts.drop(['CBSA Code', 'name', 'median_income_tract', 'median_income_msa'], axis=1, inplace=True)
msa_tracts

Unnamed: 0,population,state,county,tract,tract_id,pub_trans_gt_10pct,msa_code,income
0,4781.0,01,073,001100,01073001100,0,13820,0.644594
1,1946.0,01,073,001400,01073001400,0,13820,0.627813
2,4080.0,01,073,002000,01073002000,0,13820,0.472766
3,5291.0,01,073,003802,01073003802,0,13820,0.674030
4,2533.0,01,073,004000,01073004000,0,13820,0.322471
...,...,...,...,...,...,...,...,...
60863,4105.0,56,021,001401,56021001401,0,16940,1.102899
60864,2671.0,56,021,001402,56021001402,0,16940,0.974324
60865,5088.0,56,021,001901,56021001901,0,16940,1.478837
60866,4187.0,56,021,001902,56021001902,0,16940,1.312121


In [69]:
del msa_data
gc.collect()

0

#### Convert 2000-Tracts and Merge

In [39]:
ages_2019 = pd.merge(left=ncdb_data, right=tract_conversion, left_on='tract_id', right_on='GEOID00')
ages_2019.drop(['tract_id'], axis=1, inplace=True)
ages_2019.rename({'AREA10': 'area', 'GEOID10': 'tract_id_2010', 'GEOID00': 'tract_id_2000'}, inplace=True, axis=1)

ages_2019

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2000,tract_id_2010,area
0,74,238,301,502,450,126,46,33,134,10001040100,10001040100,124745852
1,61,114,87,105,299,229,131,87,336,10001040201,10001040201,9767793
2,41,198,211,368,364,242,163,94,339,10001040202,10001040202,32627551
3,16,223,191,207,163,152,145,34,95,10001040203,10001040203,60922673
4,0,20,35,81,105,56,88,12,196,10001040400,10001043202,342113295
...,...,...,...,...,...,...,...,...,...,...,...,...
90446,7,29,27,152,487,82,152,39,60,56043000302,56043000302,16222789
90447,64,101,81,280,412,124,139,113,180,56045951100,46033965100,3953338213
90448,64,101,81,280,412,124,139,113,180,56045951100,56011950200,5230796551
90449,64,101,81,280,412,124,139,113,180,56045951100,56045951100,6105036767


In [40]:
tract_counts = ages_2019['tract_id_2010'].value_counts()
tract_multiples = tract_counts[tract_counts > 1].index
tract_multiples

Index(['15003990001', '53033990100', '25025990101', '12087990000',
       '53061990002', '12099990000', '12057990000', '12009990000',
       '53029992201', '25009990100',
       ...
       '48057000100', '55053960400', '55089660304', '48055960600',
       '48055960700', '48057000400', '21175950200', '21133950200',
       '55029100900', '21173920200'],
      dtype='object', length=19686)

In [42]:
a = ages_2019[ages_2019['tract_id_2010'] == '53033990100']
a

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2000,tract_id_2010,area
86018,0,48,20,95,69,202,601,219,135,53033000500,53033990100,239607794
86028,153,65,121,212,182,248,502,420,309,53033001400,53033990100,239607794
86030,0,32,32,29,102,156,247,196,266,53033001500,53033990100,239607794
86032,18,19,62,179,108,226,661,294,244,53033001600,53033990100,239607794
86048,16,17,24,80,92,151,336,602,1357,53033003100,53033990100,239607794
86050,65,113,179,328,489,570,600,376,1206,53033003200,53033990100,239607794
86077,0,11,40,67,111,144,576,884,942,53033005600,53033990100,239607794
86079,15,62,64,185,232,391,478,736,631,53033005700,53033990100,239607794
86082,15,109,170,328,306,580,447,165,400,53033005802,53033990100,239607794
86106,213,217,927,603,113,8,35,35,458,53033008001,53033990100,239607794


In [43]:
l = len(a)
a = a.groupby('tract_id_2010').sum()
a['area'] //= l
a.reset_index()

  a = a.groupby('tract_id_2010').sum()


Unnamed: 0,tract_id_2010,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,area
0,53033990100,1012,3073,4393,8888,9968,10256,11996,8298,13830,239607794


In [49]:
for tid in tqdm(tract_multiples):
    matches = ages_2019[ages_2019['tract_id_2010'] == tid]
    l = len(matches)
    a = matches.groupby('tract_id_2010').sum(numeric_only=True)
    a['area'] //= l

    ages_2019 = ages_2019[ages_2019['tract_id_2010'] != tid] # drop all rows with tid
    ages_2019 = pd.concat((ages_2019, a.reset_index()), axis=0, ignore_index=True) #re-add summed

ages_2019

100%|██████████| 19686/19686 [11:55<00:00, 27.53it/s]


Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2000,tract_id_2010,area
0,74,238,301,502,450,126,46,33,134,10001040100,10001040100,124745852
1,61,114,87,105,299,229,131,87,336,10001040201,10001040201,9767793
2,41,198,211,368,364,242,163,94,339,10001040202,10001040202,32627551
3,16,223,191,207,163,152,145,34,95,10001040203,10001040203,60922673
4,175,527,413,404,786,789,294,89,126,10001040500,10001040501,10466151
...,...,...,...,...,...,...,...,...,...,...,...,...
59363,122,311,302,1225,1067,1295,628,173,138,,48057000400,481520755
59364,51,198,197,303,604,203,149,102,161,,21175950200,229661663
59365,190,591,514,814,1391,611,434,500,976,,21133950200,110871515
59366,50,228,169,299,472,291,290,249,611,,55029100900,160651212


In [50]:
ages_2019.drop('tract_id_2000', axis=1, inplace=True)
ages_2019

Unnamed: 0,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area
0,74,238,301,502,450,126,46,33,134,10001040100,124745852
1,61,114,87,105,299,229,131,87,336,10001040201,9767793
2,41,198,211,368,364,242,163,94,339,10001040202,32627551
3,16,223,191,207,163,152,145,34,95,10001040203,60922673
4,175,527,413,404,786,789,294,89,126,10001040501,10466151
...,...,...,...,...,...,...,...,...,...,...,...
59363,122,311,302,1225,1067,1295,628,173,138,48057000400,481520755
59364,51,198,197,303,604,203,149,102,161,21175950200,229661663
59365,190,591,514,814,1391,611,434,500,976,21133950200,110871515
59366,50,228,169,299,472,291,290,249,611,55029100900,160651212


In [70]:
msa_tracts = pd.merge(left=msa_tracts, right=ages_2019, left_on='tract_id', right_on='tract_id_2010')
msa_tracts.drop(['tract_id'], axis=1, inplace=True)

msa_tracts = msa_tracts[(msa_tracts['population'] >= 1) & (msa_tracts['income'] > 0) & (msa_tracts['area'] > 0)] # drop weird tracts

msa_tracts['pop_density'] = msa_tracts['population'] / msa_tracts['area']
msa_tracts

Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density
0,2523.0,13,053,020100,0,17980,0.765298,106,179,241,536,887,511,361,183,312,13053020100,166017244,0.000015
1,3167.0,13,053,020201,0,17980,1.353766,41,31,55,240,646,328,280,155,277,13053020201,6448825,0.000491
2,1678.0,13,053,020203,0,17980,1.118623,106,179,241,536,887,511,361,183,312,13053020203,164314662,0.000010
3,2455.0,13,053,020205,0,17980,1.046814,41,31,55,240,646,328,280,155,277,13053020205,7885910,0.000311
5,2350.0,13,263,960200,0,17980,0.841710,224,818,770,857,802,454,290,243,273,13263960200,424516492,0.000006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48373,8092.0,56,021,000501,0,16940,1.184038,94,376,78,304,497,911,204,31,17,56021000501,6107369,0.001325
48374,4105.0,56,021,001401,0,16940,1.102899,68,244,155,502,867,425,195,0,18,56021001401,3993394,0.001028
48375,2671.0,56,021,001402,0,16940,0.974324,68,244,155,502,867,425,195,0,18,56021001402,12662870,0.000211
48376,5088.0,56,021,001901,0,16940,1.478837,171,371,364,502,478,155,113,66,172,56021001901,1516337079,0.000003


In [71]:
del ages_2019
gc.collect()

0

#### Add School District

In [72]:
msa_tracts = pd.merge(left=msa_tracts, right=school_districts, left_on='tract_id_2010', right_on='TRACT')
msa_tracts.drop(['NAME_LEA19', 'TRACT', 'COUNT', 'LANDAREA', 'WATERAREA'], axis=1, inplace=True)
msa_tracts

Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,built_1980_1989,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density,LEAID
0,2523.0,13,053,020100,0,17980,0.765298,106,179,241,536,887,511,361,183,312,13053020100,166017244,0.000015,1301050
1,3167.0,13,053,020201,0,17980,1.353766,41,31,55,240,646,328,280,155,277,13053020201,6448825,0.000491,1300002
2,3167.0,13,053,020201,0,17980,1.353766,41,31,55,240,646,328,280,155,277,13053020201,6448825,0.000491,1313053
3,1678.0,13,053,020203,0,17980,1.118623,106,179,241,536,887,511,361,183,312,13053020203,164314662,0.000010,1300002
4,1678.0,13,053,020203,0,17980,1.118623,106,179,241,536,887,511,361,183,312,13053020203,164314662,0.000010,1313053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,4105.0,56,021,001401,0,16940,1.102899,68,244,155,502,867,425,195,0,18,56021001401,3993394,0.001028,5601980
64444,2671.0,56,021,001402,0,16940,0.974324,68,244,155,502,867,425,195,0,18,56021001402,12662870,0.000211,5601980
64445,5088.0,56,021,001901,0,16940,1.478837,171,371,364,502,478,155,113,66,172,56021001901,1516337079,0.000003,5601980
64446,5088.0,56,021,001901,0,16940,1.478837,171,371,364,502,478,155,113,66,172,56021001901,1516337079,0.000003,5604120


In [None]:
del school_districts
gc.collect()

In [77]:
msa_tracts.to_csv('./data/msa_tracts.csv')

### Calculating Distances

#### Finding Central Business District

In [3]:
msa_tracts = pd.read_csv('./data/msa_tracts.csv', index_col=False)
msa_tracts

Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,...,built_1970_79,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density,LEAID,distance
0,2523.0,13,53,20100,0,17980,0.765298,106,179,241,...,887,511,361,183,312,13053020100,166017244,0.000015,1301050,
1,3167.0,13,53,20201,0,17980,1.353766,41,31,55,...,646,328,280,155,277,13053020201,6448825,0.000491,1300002,
2,3167.0,13,53,20201,0,17980,1.353766,41,31,55,...,646,328,280,155,277,13053020201,6448825,0.000491,1313053,
3,1678.0,13,53,20203,0,17980,1.118623,106,179,241,...,887,511,361,183,312,13053020203,164314662,0.000010,1300002,
4,1678.0,13,53,20203,0,17980,1.118623,106,179,241,...,887,511,361,183,312,13053020203,164314662,0.000010,1313053,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,4105.0,56,21,1401,0,16940,1.102899,68,244,155,...,867,425,195,0,18,56021001401,3993394,0.001028,5601980,
64444,2671.0,56,21,1402,0,16940,0.974324,68,244,155,...,867,425,195,0,18,56021001402,12662870,0.000211,5601980,
64445,5088.0,56,21,1901,0,16940,1.478837,171,371,364,...,478,155,113,66,172,56021001901,1516337079,0.000003,5601980,
64446,5088.0,56,21,1901,0,16940,1.478837,171,371,364,...,478,155,113,66,172,56021001901,1516337079,0.000003,5604120,


In [4]:
msa_tracts['msa_code'].value_counts()

35620    5339
16980    4167
19100    1909
19820    1694
37980    1615
         ... 
16220      18
45540      17
15680      17
25980      16
16180      13
Name: msa_code, Length: 321, dtype: int64

In [5]:
cbd_candidates = msa_tracts[['msa_code', 'tract_id_2010', 'pop_density']]
cbds = cbd_candidates.groupby('msa_code').max()
cbds.drop('pop_density', axis=1, inplace=True)
cbds.reset_index(inplace=True)
cbds

Unnamed: 0,msa_code,tract_id_2010
0,10180,48441013600
1,10420,39153534100
2,10500,13321950600
3,10540,41043030904
4,10580,36095740800
...,...,...
316,49180,37197050502
317,49340,25027761400
318,49420,53077940006
319,49620,42133024002


In [6]:
msa_tracts = pd.merge(left=msa_tracts, right=cbds, left_on='msa_code', right_on='msa_code', suffixes=('', 'cbd'))
msa_tracts.rename({'tract_id_2010cbd': 'cbd'}, inplace=True, axis=1)
msa_tracts['cbd'] = msa_tracts['cbd'].astype(str)
msa_tracts['tract_id_2010'] = msa_tracts['tract_id_2010'].astype(str)
msa_tracts

Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,...,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density,LEAID,distance,cbd
0,2523.0,13,53,20100,0,17980,0.765298,106,179,241,...,511,361,183,312,13053020100,166017244,0.000015,1301050,,13263960300
1,3167.0,13,53,20201,0,17980,1.353766,41,31,55,...,328,280,155,277,13053020201,6448825,0.000491,1300002,,13263960300
2,3167.0,13,53,20201,0,17980,1.353766,41,31,55,...,328,280,155,277,13053020201,6448825,0.000491,1313053,,13263960300
3,1678.0,13,53,20203,0,17980,1.118623,106,179,241,...,511,361,183,312,13053020203,164314662,0.000010,1300002,,13263960300
4,1678.0,13,53,20203,0,17980,1.118623,106,179,241,...,511,361,183,312,13053020203,164314662,0.000010,1313053,,13263960300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,4105.0,56,21,1401,0,16940,1.102899,68,244,155,...,425,195,0,18,56021001401,3993394,0.001028,5601980,,56021002000
64444,2671.0,56,21,1402,0,16940,0.974324,68,244,155,...,425,195,0,18,56021001402,12662870,0.000211,5601980,,56021002000
64445,5088.0,56,21,1901,0,16940,1.478837,171,371,364,...,155,113,66,172,56021001901,1516337079,0.000003,5601980,,56021002000
64446,5088.0,56,21,1901,0,16940,1.478837,171,371,364,...,155,113,66,172,56021001901,1516337079,0.000003,5604120,,56021002000


#### Calculating Distances

In [7]:
msa_tracts['distance'] = None
msa_tracts

Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,...,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density,LEAID,distance,cbd
0,2523.0,13,53,20100,0,17980,0.765298,106,179,241,...,511,361,183,312,13053020100,166017244,0.000015,1301050,,13263960300
1,3167.0,13,53,20201,0,17980,1.353766,41,31,55,...,328,280,155,277,13053020201,6448825,0.000491,1300002,,13263960300
2,3167.0,13,53,20201,0,17980,1.353766,41,31,55,...,328,280,155,277,13053020201,6448825,0.000491,1313053,,13263960300
3,1678.0,13,53,20203,0,17980,1.118623,106,179,241,...,511,361,183,312,13053020203,164314662,0.000010,1300002,,13263960300
4,1678.0,13,53,20203,0,17980,1.118623,106,179,241,...,511,361,183,312,13053020203,164314662,0.000010,1313053,,13263960300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64443,4105.0,56,21,1401,0,16940,1.102899,68,244,155,...,425,195,0,18,56021001401,3993394,0.001028,5601980,,56021002000
64444,2671.0,56,21,1402,0,16940,0.974324,68,244,155,...,425,195,0,18,56021001402,12662870,0.000211,5601980,,56021002000
64445,5088.0,56,21,1901,0,16940,1.478837,171,371,364,...,155,113,66,172,56021001901,1516337079,0.000003,5601980,,56021002000
64446,5088.0,56,21,1901,0,16940,1.478837,171,371,364,...,155,113,66,172,56021001901,1516337079,0.000003,5604120,,56021002000


In [8]:
msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]['distance'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]['distance'] = 0


In [9]:
msa_tracts_with_dist = None

In [10]:
tract_distances = pd.read_csv('./data/sf12010tractdistance100miles.csv', dtype={'county1': str,'tract1': str, 'county2': str,'tract2': str}, chunksize=5000)
for chunk in tract_distances:
    chunk['tid1'] = chunk['county1'] + chunk['tract1']
    chunk.drop(['county1', 'tract1'], axis=1, inplace=True)
    chunk['tid2'] = chunk['county2'] + chunk['tract2']
    chunk.drop(['county2', 'tract2'], axis=1, inplace=True)
    chunk.rename({'mi_to_tract': 'distance'}, axis=1, inplace=True)
    m = pd.merge(left=msa_tracts, right=chunk, left_on=['cbd', 'tract_id_2010'], right_on=['tid1', 'tid2'])
    if msa_tracts_with_dist is None:
        msa_tracts_with_dist = m
    else:
        msa_tracts_with_dist = pd.concat((m, msa_tracts_with_dist))

msa_tracts_with_dist

Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,...,built_1939_earlier,tract_id_2010,area,pop_density,LEAID,distance_x,cbd,distance_y,tid1,tid2
0,3345.0,56,25,200,0,16220,0.541786,84,120,81,...,869,56025000200,5066966,0.000660,5604510,,56025001800,31.971663,56025001800,56025000200
1,4170.0,56,25,400,0,16220,0.794956,35,47,11,...,73,56025000400,2740366,0.001522,5604510,,56025001800,33.572169,56025001800,56025000400
2,3807.0,56,25,800,0,16220,0.709026,25,24,6,...,115,56025000800,3998718,0.000952,5604510,,56025001800,30.489648,56025001800,56025000800
3,5730.0,56,25,1401,0,16220,1.041356,187,248,218,...,911,56025001401,2900004318,0.000002,5604510,,56025001800,37.594932,56025001800,56025001401
4,5099.0,56,25,501,0,16220,0.821130,8,23,56,...,0,56025000501,3298941,0.001546,5604510,,56025001800,33.958160,56025001800,56025000501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,16940.0,10,1,40202,0,20100,1.217304,41,198,211,...,339,10001040202,32627551,0.000519,1001620,,10001043400,21.649411,10001043400,10001040202
50,5443.0,10,1,40203,0,20100,1.096651,16,223,191,...,95,10001040203,60922673,0.000089,1001620,,10001043400,22.112634,10001043400,10001040203
51,2646.0,10,1,40900,0,20100,0.485405,34,94,20,...,247,10001040900,1652086,0.001602,1000190,,10001043400,13.923058,10001043400,10001040900
52,2221.0,10,1,41300,0,20100,0.628600,0,13,26,...,266,10001041300,2323562,0.000956,1000190,,10001043400,12.789227,10001043400,10001041300


In [12]:
msa_tracts_with_dist.drop(['distance_x', 'tid1', 'tid2'], axis=1, inplace=True)
msa_tracts_with_dist.rename({'distance_y': 'distance'}, axis=1, inplace=True)
msa_tracts_with_dist

Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,...,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density,LEAID,cbd,distance
0,3345.0,56,25,200,0,16220,0.541786,84,120,81,...,328,324,310,869,56025000200,5066966,0.000660,5604510,56025001800,31.971663
1,4170.0,56,25,400,0,16220,0.794956,35,47,11,...,209,1165,70,73,56025000400,2740366,0.001522,5604510,56025001800,33.572169
2,3807.0,56,25,800,0,16220,0.709026,25,24,6,...,145,1123,240,115,56025000800,3998718,0.000952,5604510,56025001800,30.489648
3,5730.0,56,25,1401,0,16220,1.041356,187,248,218,...,502,440,343,911,56025001401,2900004318,0.000002,5604510,56025001800,37.594932
4,5099.0,56,25,501,0,16220,0.821130,8,23,56,...,264,152,0,0,56025000501,3298941,0.001546,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,16940.0,10,1,40202,0,20100,1.217304,41,198,211,...,242,163,94,339,10001040202,32627551,0.000519,1001620,10001043400,21.649411
50,5443.0,10,1,40203,0,20100,1.096651,16,223,191,...,152,145,34,95,10001040203,60922673,0.000089,1001620,10001043400,22.112634
51,2646.0,10,1,40900,0,20100,0.485405,34,94,20,...,130,142,54,247,10001040900,1652086,0.001602,1000190,10001043400,13.923058
52,2221.0,10,1,41300,0,20100,0.628600,0,13,26,...,127,294,142,266,10001041300,2323562,0.000956,1000190,10001043400,12.789227


In [16]:
cbd_tracts = msa_tracts[msa_tracts['tract_id_2010'] == msa_tracts['cbd']]
cbd_tracts['distance'] = 0
msa_tracts_with_dist = pd.concat((msa_tracts_with_dist, cbd_tracts))
msa_tracts_with_dist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbd_tracts['distance'] = 0


Unnamed: 0,population,state,county,tract,pub_trans_gt_10pct,msa_code,income,built_1999_2000,built_1995_1998,built_1990_1994,...,built_1960_69,built_1950_59,built_1940_49,built_1939_earlier,tract_id_2010,area,pop_density,LEAID,cbd,distance
0,3345.0,56,25,200,0,16220,0.541786,84,120,81,...,328,324,310,869,56025000200,5066966,6.601584e-04,5604510,56025001800,31.971663
1,4170.0,56,25,400,0,16220,0.794956,35,47,11,...,209,1165,70,73,56025000400,2740366,1.521695e-03,5604510,56025001800,33.572169
2,3807.0,56,25,800,0,16220,0.709026,25,24,6,...,145,1123,240,115,56025000800,3998718,9.520551e-04,5604510,56025001800,30.489648
3,5730.0,56,25,1401,0,16220,1.041356,187,248,218,...,502,440,343,911,56025001401,2900004318,1.975859e-06,5604510,56025001800,37.594932
4,5099.0,56,25,501,0,16220,0.821130,8,23,56,...,264,152,0,0,56025000501,3298941,1.545648e-03,5604510,56025001800,33.958160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64317,7090.0,55,73,2300,0,48140,1.039428,53,199,158,...,141,152,158,1000,55073002300,633184236,1.119737e-05,5515900,55073002300,0.000000
64356,2512.0,55,117,11400,0,43100,0.760709,9,31,13,...,104,85,118,896,55117011400,2413711,1.040721e-03,5513650,55117011400,0.000000
64418,4680.0,56,25,1800,0,16220,1.330880,156,419,180,...,503,301,229,380,56025001800,10864029883,4.307794e-07,5604510,56025001800,0.000000
64426,9853.0,56,21,2000,0,16940,1.303751,172,373,175,...,288,276,320,575,56021002000,2730553549,3.608426e-06,5601980,56021002000,0.000000


In [None]:
msa_tracts_with_dist.drop(['population', 'state', 'county', 'tract'], axis=1, inplace=True)
msa_tracts_with_dist.to_csv('./data/msa_tracts_dist.csv')