## Imports

In [1]:
import pandas as pd
import geopandas as gpd
import folium
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import re
import json
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
%matplotlib inline

In [2]:
#suppress warnings
import warnings
warnings.filterwarnings("ignore")

## Bring in our file

In [3]:
# Load the per-CBSA age-group / home-value table. This is a CSV, not a
# shapefile; the file name indicates it covers 2014 and 2019.
data = pd.read_csv(
    filepath_or_buffer='../output/csv/agegroupshomevalue_2014_2019_cbsa.csv'
)

In [4]:
# Peek at the first three rows to sanity-check the load.
data.head(n=3)

Unnamed: 0,CBSA,GEOID,total,child,taxbase,O65,Pchild,Ptaxbase,PO65,homevalue,...,totalpercchange,homepercchange,O65sharechange,taxbasesharechange,childsharechange,O65realchange,taxbaserealchange,childrealchange,totalrealchange,homerealchange
0,"Big Stone Gap, VA Micro Area",310M500US13720,42456,8403,26596,7457,19.8,62.6,17.6,86200,...,-29.82,4.11,2.2,-1.5,-0.7,-1867,-12204,-3972,-18043,3400
1,"Billings, MT Metro Area",310M500US13740,179071,41435,106634,31002,23.1,59.5,17.3,227900,...,9.93,21.81,2.2,-2.1,-0.2,6360,6358,3461,16179,40800
2,"Binghamton, NY Metro Area",310M500US13780,241874,47899,148264,45711,19.8,61.3,18.9,117600,...,-2.96,8.09,2.0,-1.4,-0.6,3571,-8059,-2899,-7387,8800


In [5]:
# List every column with its dtype and non-null count; verbose=True forces the
# full per-column listing even for a wide frame.
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CBSA                859 non-null    object 
 1   GEOID               859 non-null    object 
 2   total               859 non-null    int64  
 3   child               859 non-null    int64  
 4   taxbase             859 non-null    int64  
 5   O65                 859 non-null    int64  
 6   Pchild              859 non-null    float64
 7   Ptaxbase            859 non-null    float64
 8   PO65                859 non-null    float64
 9   homevalue           859 non-null    int64  
 10  O*total             859 non-null    int64  
 11  O*child             859 non-null    int64  
 12  O*taxbase           859 non-null    int64  
 13  O*O65               859 non-null    int64  
 14  O*Pchild            859 non-null    float64
 15  O*Ptaxbase          859 non-null    float64
 16  O*PO65  

## Bring in shapefile to join here for areas

In [14]:
# Read the 2019 TIGER/Line CBSA boundary archive directly from the Census
# Bureau site into a GeoDataFrame. This hits the network on every run.
url = 'https://www2.census.gov/geo/tiger/TIGER2019/CBSA/tl_2019_us_cbsa.zip'
geo = gpd.read_file(filename=url)

# Show the columns and dtypes that came with the boundaries.
geo.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 938 entries, 0 to 937
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   CSAFP     550 non-null    object  
 1   CBSAFP    938 non-null    object  
 2   GEOID     938 non-null    object  
 3   NAME      938 non-null    object  
 4   NAMELSAD  938 non-null    object  
 5   LSAD      938 non-null    object  
 6   MEMI      938 non-null    object  
 7   MTFCC     938 non-null    object  
 8   ALAND     938 non-null    int64   
 9   AWATER    938 non-null    int64   
 10  INTPTLAT  938 non-null    object  
 11  INTPTLON  938 non-null    object  
 12  geometry  938 non-null    geometry
dtypes: geometry(1), int64(2), object(10)
memory usage: 95.4+ KB


In [15]:
# CBSAFP is loaded as strings (object dtype in geo.info()); cast it to integers
# so it can be matched against the CSV's CBSAFIPS key in the merge below.
# NOTE(review): presumably CBSAFIPS is an integer column in `data` -- confirm.
geo['CBSAFP'] = geo['CBSAFP'].astype(dtype=int)

In [16]:
# Attach the CSV attributes to the CBSA geometries. Inner join (the pandas
# default, spelled out here): only CBSAs present in both frames survive --
# 938 boundaries vs. 859 CSV rows, giving 859 rows in the recorded output.
data_geo = geo.merge(
    right=data,
    how='inner',
    left_on='CBSAFP',
    right_on='CBSAFIPS',
)

In [17]:
# Inspect the merged frame: row count plus the combined geo + CSV column set
# (overlapping GEOID columns show up with _x / _y suffixes).
data_geo.info(verbose=True)

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 859 entries, 0 to 858
Data columns (total 48 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   CSAFP               497 non-null    object  
 1   CBSAFP              859 non-null    int32   
 2   GEOID_x             859 non-null    object  
 3   NAME                859 non-null    object  
 4   NAMELSAD            859 non-null    object  
 5   LSAD                859 non-null    object  
 6   MEMI                859 non-null    object  
 7   MTFCC               859 non-null    object  
 8   ALAND               859 non-null    int64   
 9   AWATER              859 non-null    int64   
 10  INTPTLAT            859 non-null    object  
 11  INTPTLON            859 non-null    object  
 12  geometry            859 non-null    geometry
 13  CBSA                859 non-null    object  
 14  GEOID_y             859 non-null    object  
 15  total               859 non-null

In [18]:
# Trim the merged frame down to the columns used from here on: drop the
# TIGER-side join key, names, classification codes and centroid fields, plus
# the CSV's GEOID_y / Area / Name.
# NOTE: with the default errors='raise', re-running this cell on an already
# trimmed frame raises KeyError.
data_geo = data_geo.drop(
    columns=[
        'CSAFP', 'CBSAFP', 'GEOID_y',
        'NAME', 'Area', 'Name', 'NAMELSAD',
        'MEMI', 'MTFCC', 'INTPTLAT', 'INTPTLON',
    ]
)

In [19]:
# Give the surviving TIGER columns friendlier names: GEOID_x loses its merge
# suffix, and LSAD (M1 / M2 in the preview below) becomes Micro_Metro.
data_geo = data_geo.rename(
    columns={
        'GEOID_x': 'GEOID',
        'LSAD': 'Micro_Metro',
    }
)

In [20]:
# Confirm the drop/rename by looking at the first three rows.
data_geo.head(n=3)

Unnamed: 0,GEOID,Micro_Metro,ALAND,AWATER,geometry,CBSA,total,child,taxbase,O65,...,totalpercchange,homepercchange,O65sharechange,taxbasesharechange,childsharechange,O65realchange,taxbaserealchange,childrealchange,totalrealchange,homerealchange
0,12020,M1,2654601832,26140309,"POLYGON ((-83.53739 33.96591, -83.53184 33.968...","Athens-Clarke County, GA Metro Area",208457,42122,139587,26748,...,6.2,12.67,1.9,-1.7,-0.2,5284,4799,2087,12170,20100
1,12100,M1,1438776649,301268696,"POLYGON ((-74.85675 39.42076, -74.85670 39.420...","Atlantic City-Hammonton, NJ Metro Area",266105,57085,162363,46657,...,-3.35,-5.34,2.6,-1.4,-1.1,5500,-9486,-5234,-9220,-12300
2,12140,M2,939731962,2657419,"POLYGON ((-85.19295 41.38001, -85.19296 41.381...","Auburn, IN Micro Area",42927,10472,25677,6778,...,1.43,14.67,1.8,-0.8,-1.1,871,43,-308,606,15900


In [21]:
# Land and water areas arrive in square meters; scale both to square miles
# (1 m^2 ~= 0.0000003861 mi^2).
# NOTE: this overwrites ALAND / AWATER in place, so executing the cell a second
# time applies the factor twice -- run it exactly once per kernel session.
data_geo['ALAND'] = data_geo['ALAND'].mul(0.0000003861)
data_geo['AWATER'] = data_geo['AWATER'].mul(0.0000003861)

In [22]:
# Head-count densities per unit of land area, for each age group and for both
# years. ALAND is used in whatever unit the frame currently holds (square miles
# once the area-conversion cell has run), so run this after that cell.
#
# Column pairing: the `O*`-prefixed counts feed the `...14` densities and the
# unprefixed counts feed the `...19` densities.
#
# Fix: popdensity19 used to divide `O*total` -- the same input as popdensity14 --
# which made the two columns identical. It now uses `total`, matching how every
# other `...19` density is built.
density_sources = {
    'popdensity14': 'O*total',
    'popdensity19': 'total',
    'elderlypopdensity14': 'O*O65',
    'elderlypopdensity19': 'O65',
    'tbdensity14': 'O*taxbase',
    'tbdensity19': 'taxbase',
    'childdensity14': 'O*child',
    'childdensity19': 'child',
}
# Dict order is preserved, so the new columns are appended in the same order
# as before.
for density_col, count_col in density_sources.items():
    data_geo[density_col] = data_geo[count_col] / data_geo['ALAND']

###### Convert land and water areas to proper units.

`ALAND` and `AWATER` are given in square meters, so we multiply them by the conversion factor 0.0000003861 to express them in square miles.

## Look only at Nashville, for our Linear Regression

In [24]:
# Keep only the Nashville metro row(s) for the regression, matched on the exact
# CBSA label, and restart the index at 0.
nash = (
    data_geo
    .loc[data_geo['CBSA'].eq('Nashville-Davidson--Murfreesboro--Franklin, TN Metro Area')]
    .reset_index(drop=True)
)