# Final Project
Logan Cooper

In [1]:
import pandas as pd
import numpy as np

## Data

### Data Import

#### NCDB Building Age Data (2000)

In [None]:
# Explicit dtypes for the NCDB extract: `tract_id` is read as text and every
# `built_*` column is read as an integer.
# NOTE(review): the mixed `1970_79` / `1980_1989` spellings presumably mirror
# the CSV's own headers — confirm against the file before normalising them.
ncdb_count_columns = [
    'built_1999_2000',
    'built_1995_1998',
    'built_1990_1994',
    'built_1980_1989',
    'built_1970_79',
    'built_1960_69',
    'built_1950_59',
    'built_1940_49',
    'built_1939_earlier',
]
ncdb_dtypes = {'tract_id': str, **dict.fromkeys(ncdb_count_columns, int)}

ncdb_data = pd.read_csv('./data/NCDB_2000.csv', dtype=ncdb_dtypes)
ncdb_data

#### Tract Level Data (2019)

In [None]:
# Load the 2019 tract-level table. dtype=False switches off pandas' dtype
# inference, so values keep whatever type they have in the JSON file.
tract_data_2019 = pd.read_json('./data/tract_data_2019.json', dtype=False)

# Column names are assigned positionally, so this order must match the file.
year_ranges = [
    '2014_later', '2010_2013', '2000_2009', '1990_1999', '1980_1989',
    '1970_1979', '1960_1969', '1950_1959', '1940_1949', '1939_earlier',
]
built_ranges = ['built_' + span for span in year_ranges]
leading_cols = ['name', 'median_income', 'num_pub_trans', 'population']
trailing_cols = ['state', 'county', 'tract']
tract_data_2019.columns = leading_cols + built_ranges + trailing_cols

# Full tract id = state + county + tract codes concatenated.
tract_data_2019['tract_id'] = tract_data_2019['state'] + tract_data_2019['county'] + tract_data_2019['tract']

# 1/0 flag: is num_pub_trans at least 10% of population?
pub_trans_share = tract_data_2019['num_pub_trans'].div(tract_data_2019['population'])
tract_data_2019['pub_trans_gt_10pct'] = pub_trans_share.ge(0.1).astype(int)

# The trimmed frame omits the `built_*` columns and the raw num_pub_trans count.
tract_data_2019_trim = tract_data_2019.drop(columns=built_ranges + ['num_pub_trans'])
tract_data_2019_trim

In [None]:
# Sanity check: how many tracts have the flag set (1, i.e. num_pub_trans / population >= 0.1) versus unset (0).
tract_data_2019_trim['pub_trans_gt_10pct'].value_counts()

#### MSA-Level Income Data (2019)

In [None]:
# Load the MSA-level table and name its three columns (assigned positionally).
msa_data = pd.read_json('./data/msa_data.json')
msa_data.columns = ['name', 'median_income', 'msa_code']

# Store the MSA code as text so it can be matched against the string
# 'CBSA Code' column of the crosswalk.
msa_data = msa_data.astype({'msa_code': str})
msa_data

#### MSA-Tract Crosswalk

In [None]:
# Columns that must be read as text rather than numbers (codes and labels).
msa_text_columns = [
    'CBSA Code',
    'CBSA Title',
    'Metropolitan/Micropolitan Statistical Area',
    'FIPS State Code',
    'FIPS County Code',
]
msa_lookup_raw = pd.read_excel(
    './data/msa_codes.xls',
    converters=dict.fromkeys(msa_text_columns, str),
)

# Remove the rows labelled 1916 through 1919.
# NOTE(review): presumably these are trailing footnote rows in the sheet — confirm.
msa_lookup = msa_lookup_raw.drop(index=range(1916, 1920))

msa_lookup

#### 2000-to-2010 Tract Conversion Table

In [None]:
# Load the table used to convert 2000 tract ids to 2010 tract ids.
# NOTE(review): no dtype is given, so pandas infers column types; if the GEOID
# columns are inferred as integers their leading zeros are lost — confirm.
tract_conversion = pd.read_csv('./data/us2010trf.txt')
tract_conversion

In [None]:
# Inspect the available column names before selecting the ones to keep.
tract_conversion.columns

In [None]:
# Keep only the 2000 id, the 2010 id and the 2010 land-area column.
# .copy() makes this an independent frame, so the column assignments below do
# not raise SettingWithCopyWarning.
tract_conversion = tract_conversion[['GEOID00', 'GEOID10', 'AREALAND10']].copy()

# Tract GEOIDs are 11 characters (2 state + 3 county + 6 tract). If read_csv
# parsed them as integers, a plain astype(str) drops the leading zero for
# state codes 01-09 and those tracts would never match the zero-padded string
# ids used elsewhere, so pad back to full width.
for geoid_col in ('GEOID00', 'GEOID10'):
    tract_conversion[geoid_col] = tract_conversion[geoid_col].astype(str).str.zfill(11)
tract_conversion

#### Tract-School District Crosswalk

In [None]:
# Load the school-district (LEA) to tract relationship table. Columns that are
# not needed are dropped later, after the merge onto the tract data.
school_districts = pd.read_excel('./data/grf19_lea_tract.xlsx')

# Store both identifiers as fixed-width text. If Excel/pandas parsed them as
# numbers, astype(str) alone loses the leading zero for state codes 01-09, and
# TRACT would then fail to match the 11-character tract ids it is merged on.
# LEAID is padded to 7 characters (2-digit state + 5-digit district).
# NOTE(review): assumes neither column contains missing values — confirm.
school_districts['LEAID'] = school_districts['LEAID'].astype(str).str.zfill(7)
school_districts['TRACT'] = school_districts['TRACT'].astype(str).str.zfill(11)
school_districts

## Data Combination

#### Trim Tract Level Data to MSAs Only

In [None]:
# Keep only crosswalk rows classified as metropolitan statistical areas.
is_metro = msa_lookup['Metropolitan/Micropolitan Statistical Area'].eq('Metropolitan Statistical Area')
only_metros = msa_lookup.loc[is_metro]

# Default (inner) join on state + county codes: tracts whose county is not in
# a metropolitan area are dropped.
msa_tracts = tract_data_2019_trim.merge(
    only_metros,
    left_on=['state', 'county'],
    right_on=['FIPS State Code', 'FIPS County Code'],
)
msa_tracts

In [None]:
# Inspect the merged column names to decide which ones to drop next.
msa_tracts.columns

In [None]:
# Remove the tract name and the crosswalk columns that were only needed for
# the join/filter; 'CBSA Code' is kept for the MSA-level merge.
redundant_cols = [
    'name',
    'CBSA Title',
    'Metropolitan/Micropolitan Statistical Area',
    'FIPS State Code',
    'FIPS County Code',
]
msa_tracts = msa_tracts.drop(columns=redundant_cols)
msa_tracts

#### Merge MSA-Level Data

In [None]:
# Attach MSA-level figures to every tract, express tract median income as a
# ratio of its MSA's median income, then drop the columns that are no longer
# needed. The suffixes disambiguate the two `median_income` columns.
msa_tracts = (
    msa_tracts
    .merge(msa_data, left_on='CBSA Code', right_on='msa_code', suffixes=('_tract', '_msa'))
    .assign(income=lambda merged: merged['median_income_tract'] / merged['median_income_msa'])
    .drop(columns=['CBSA Code', 'name', 'median_income_tract', 'median_income_msa'])
)
msa_tracts

#### Convert 2000 Tracts to 2010 Tracts and Merge

In [None]:
# Re-key the 2000 NCDB building-age rows onto 2010 tract ids via the
# conversion table, then keep only the 2010 id and land area from that table.
# NOTE(review): if one 2000 tract maps to several 2010 tracts (or vice versa)
# this join repeats the `built_*` counts without apportioning them — confirm
# that is acceptable.
ages_2019 = (
    ncdb_data
    .merge(tract_conversion, left_on='tract_id', right_on='GEOID00')
    .drop(columns=['tract_id', 'GEOID00'])
    .rename(columns={'AREALAND10': 'land_area', 'GEOID10': 'tract_id_2010'})
)

ages_2019

In [None]:
# Attach building ages and land area; after the join `tract_id` duplicates
# `tract_id_2010`, so only the latter is kept.
msa_tracts = (
    pd.merge(left=msa_tracts, right=ages_2019, left_on='tract_id', right_on='tract_id_2010')
    .drop(columns=['tract_id'])
)

# Drop weird tracts: no population, non-positive income ratio, or no land area.
# The positive land-area requirement also keeps the density division below
# from dividing by zero.
is_valid_tract = (
    (msa_tracts['population'] >= 1)
    & (msa_tracts['income'] > 0)
    & (msa_tracts['land_area'] > 0)
)
# .copy() makes the filtered frame independent, so adding a column below does
# not raise SettingWithCopyWarning.
msa_tracts = msa_tracts.loc[is_valid_tract].copy()

# Population per unit of land area (units follow the `land_area` column).
msa_tracts['pop_density'] = msa_tracts['population'] / msa_tracts['land_area']
msa_tracts

#### Add School District

In [None]:
# Attach the school-district rows to each tract (default inner join on the
# 2010 tract id), then discard the district columns that are not used.
unused_district_cols = ['NAME_LEA19', 'TRACT', 'COUNT', 'LANDAREA', 'WATERAREA']
msa_tracts = (
    msa_tracts
    .merge(school_districts, left_on='tract_id_2010', right_on='TRACT')
    .drop(columns=unused_district_cols)
)
msa_tracts

### Calculating Distances

#### Finding Central Business District

In [None]:
# Inspect the columns available on the combined frame.
msa_tracts.columns

In [None]:
# Number of rows in the combined frame for each msa_code.
msa_tracts['msa_code'].value_counts()

In [None]:
# The central business district of each MSA is taken to be its tract with the
# highest population density.
cdb_candidates = msa_tracts[['msa_code', 'tract_id_2010', 'pop_density']]

# groupby(...).max() would take the maximum of each column independently and
# pair the largest tract id with the largest density, even when they come from
# different rows. Instead, order each MSA's rows by descending density and keep
# the first whole row, so the tract id and density always belong together.
# The result is indexed by msa_code with columns tract_id_2010 / pop_density.
cdbs = (
    cdb_candidates
    .sort_values(['msa_code', 'pop_density'], ascending=[True, False])
    .drop_duplicates(subset='msa_code', keep='first')
    .set_index('msa_code')
)
cdbs

#### Calculating Distances

In [2]:
# Add an empty `distance` column as a placeholder to be filled in later.
# This cell needs `msa_tracts` from the cells above, so run the notebook top
# to bottom.
msa_tracts = msa_tracts.assign(distance=None)
msa_tracts

NameError: name 'msa_tracts' is not defined

In [None]:
# Load the tract-to-tract distance table.
# NOTE(review): judging by the file name this covers 2010 tract pairs within 50 miles — confirm.
tract_distances = pd.read_csv('./data/sf12010tractdistance50miles.csv')
tract_distances