## Imports

In [43]:
import pandas as pd
import geopandas as gpd
import folium
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
%matplotlib inline

## Bring in our two files to join for time comparison.  

We want to look at the change in population per age group, as well as share of total population per age group, between the two five year periods represented by the American Community Survey 5-Year estimates. Here that's 2010-2014, and 2015-2019. We're going to bring in the 2014 file and rename the columns so that it's clear which is the older set - but I'm not going to specify the year so that this code can be reused with minimal changes when the next 5 year estimates come out. I'm going to bring in the 2019 csv as well. We can create a geodataframe later if we want.

In [2]:
# 2019 csv: read the newer file (the 2015-2019 ACS 5-year estimates) into a plain
# pandas DataFrame. This is a CSV, not a shapefile, so no geometry is loaded here.
new = pd.read_csv('../output/csv/agegroupsnogender_2019_cbsa.csv')

In [3]:
# 2014 csv: read the older file (the 2010-2014 ACS 5-year estimates) into a plain
# pandas DataFrame; its columns get an 'O*' tag further down before the join.
old = pd.read_csv('../output/csv/agegroupsnogender_2014_cbsa.csv')

In [6]:
# Preview the first three rows of the newer table.
new.head(3)

Unnamed: 0,CBSA,GEOID,total,U5,school,18_20s,30s,40s,50_65,O65,PU5,Pschool,P18_20s,P30s,P40s,P50_65,PO65
0,"Big Stone Gap, VA Micro Area",310M500US13720,42456,2214,6189,6933,5337,5350,8976,7457,5.2,14.6,16.3,12.6,12.6,21.1,17.6
1,"Billings, MT Metro Area",310M500US13740,179071,10811,30624,25857,23260,21387,36130,31002,6.0,17.1,14.4,13.0,11.9,20.2,17.3
2,"Binghamton, NY Metro Area",310M500US13780,241874,12580,35319,44875,26009,25965,51415,45711,5.2,14.6,18.6,10.8,10.7,21.3,18.9


In [15]:
# Column names, dtypes and non-null counts of the newer table.
new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 938 entries, 0 to 937
Data columns (total 17 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CBSA     938 non-null    object 
 1   GEOID    938 non-null    object 
 2   total    938 non-null    int64  
 3   U5       938 non-null    int64  
 4   school   938 non-null    int64  
 5   18_20s   938 non-null    int64  
 6   30s      938 non-null    int64  
 7   40s      938 non-null    int64  
 8   50_65    938 non-null    int64  
 9   O65      938 non-null    int64  
 10  PU5      938 non-null    float64
 11  Pschool  938 non-null    float64
 12  P18_20s  938 non-null    float64
 13  P30s     938 non-null    float64
 14  P40s     938 non-null    float64
 15  P50_65   938 non-null    float64
 16  PO65     938 non-null    float64
dtypes: float64(7), int64(8), object(2)
memory usage: 124.7+ KB


Before joining, look at the older file and add a tag to the column names - the groups are the same so this will allow us to tell them apart when we calculate our time series.

In [7]:
# Preview the first three rows of the older table (before any renaming).
old.head(3)

Unnamed: 0,CBSA,GEOID,total,U5,school,18_20s,30s,40s,50_65,O65,PU5,Pschool,P18_20s,P30s,P40s,P50_65,PO65
0,"Homosassa Springs, FL Metro Area",310M200US26140,139771,5432,16056,13331,10585,14630,32784,46953,3.9,11.5,9.5,7.6,10.5,23.5,33.6
1,"Hickory-Lenoir-Morganton, NC Metro Area",310M200US25860,363936,20102,60758,50605,43272,53505,76741,58953,5.5,16.7,13.9,11.9,14.7,21.1,16.2
2,"Hobbs, NM Micro Area",310M200US26020,66876,5790,14142,11683,9115,7799,11186,7161,8.7,21.1,17.5,13.6,11.7,16.7,10.7


In [16]:
# Column names, dtypes and non-null counts of the older table.
# NOTE(review): the saved output of this cell lists 'O*'-prefixed columns, so it
# was last executed after the renaming cells below (execution count 16 vs 8-10).
# On a fresh Restart & Run All the names shown here would be unprefixed.
old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CBSA       929 non-null    object 
 1   GEOID      929 non-null    object 
 2   O*total    929 non-null    int64  
 3   O*U5       929 non-null    int64  
 4   O*school   929 non-null    int64  
 5   O*18_20s   929 non-null    int64  
 6   O*30s      929 non-null    int64  
 7   O*40s      929 non-null    int64  
 8   O*50_65    929 non-null    int64  
 9   O*O65      929 non-null    int64  
 10  O*PU5      929 non-null    float64
 11  O*Pschool  929 non-null    float64
 12  O*P18_20s  929 non-null    float64
 13  O*P30s     929 non-null    float64
 14  O*P40s     929 non-null    float64
 15  O*P50_65   929 non-null    float64
 16  O*PO65     929 non-null    float64
dtypes: float64(7), int64(8), object(2)
memory usage: 123.5+ KB


In [8]:
# Tag every column of the older table with an 'O*' ("old") prefix so its columns
# can be told apart from the newer table's once the two are joined.
# One rename() call with a callable replaces the per-column in-place loop, and the
# startswith guard makes the cell idempotent: re-running it no longer stacks
# prefixes (e.g. 'O*O*total').
old = old.rename(columns=lambda col: col if col.startswith('O*') else 'O*' + col)

In [9]:
#check that this was effective
old.head(3)

Unnamed: 0,O*CBSA,O*GEOID,O*total,O*U5,O*school,O*18_20s,O*30s,O*40s,O*50_65,O*O65,O*PU5,O*Pschool,O*P18_20s,O*P30s,O*P40s,O*P50_65,O*PO65
0,"Homosassa Springs, FL Metro Area",310M200US26140,139771,5432,16056,13331,10585,14630,32784,46953,3.9,11.5,9.5,7.6,10.5,23.5,33.6
1,"Hickory-Lenoir-Morganton, NC Metro Area",310M200US25860,363936,20102,60758,50605,43272,53505,76741,58953,5.5,16.7,13.9,11.9,14.7,21.1,16.2
2,"Hobbs, NM Micro Area",310M200US26020,66876,5790,14142,11683,9115,7799,11186,7161,8.7,21.1,17.5,13.6,11.7,16.7,10.7


In [10]:
# Strip the 'O*' tag from the two identifier columns again so they carry the
# same names as in the newer table and can serve as join keys.
old = old.rename(columns={'O*' + key: key for key in ('GEOID', 'CBSA')})

Now we can join the old DataFrame onto the new one (both were read from CSV, so neither is a geodataframe yet).

In [11]:
# Make sure the identifier columns are plain strings in both tables so the
# join keys compare as text.
new = new.astype({'GEOID': str, 'CBSA': str})
old = old.astype({'GEOID': str, 'CBSA': str})

In [12]:
# Inner-join the two periods on the CBSA name: only areas whose name is spelled
# identically in both files survive the join. Both tables also carry a GEOID
# column, which pandas suffixes as GEOID_x (newer table) and GEOID_y (older table).
# validate='one_to_one' makes pandas raise a MergeError if a CBSA name is
# duplicated on either side, instead of silently multiplying rows.
cbsa = pd.merge(new, old, on='CBSA', how='inner', validate='one_to_one')

In [13]:
# Preview the joined table: newer-period columns first, then the 'O*' columns.
cbsa.head()

Unnamed: 0,CBSA,GEOID_x,total,U5,school,18_20s,30s,40s,50_65,O65,...,O*40s,O*50_65,O*O65,O*PU5,O*Pschool,O*P18_20s,O*P30s,O*P40s,O*P50_65,O*PO65
0,"Big Stone Gap, VA Micro Area",310M500US13720,42456,2214,6189,6933,5337,5350,8976,7457,...,8379,12926,9324,5.3,15.1,16.1,12.8,13.8,21.4,15.4
1,"Billings, MT Metro Area",310M500US13740,179071,10811,30624,25857,23260,21387,36130,31002,...,20400,34472,24642,6.5,16.8,15.2,12.7,12.5,21.2,15.1
2,"Binghamton, NY Metro Area",310M500US13780,241874,12580,35319,44875,26009,25965,51415,45711,...,30782,53366,42140,5.2,15.2,18.3,10.7,12.3,21.4,16.9
3,"Birmingham-Hoover, AL Metro Area",310M500US13820,1085330,67638,182860,168169,144073,140593,213217,168780,...,155983,227967,155901,6.5,17.1,15.7,13.2,13.7,20.1,13.7
4,"Bismarck, ND Metro Area",310M500US13900,127503,8528,20643,20699,18413,14349,24697,20174,...,14746,24346,16893,6.7,16.5,17.3,13.2,12.2,20.2,14.0


In [14]:
# Column summary of the joined table; the row count shows how many CBSAs
# matched by name in both periods.
cbsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 859 entries, 0 to 858
Data columns (total 33 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CBSA       859 non-null    object 
 1   GEOID_x    859 non-null    object 
 2   total      859 non-null    int64  
 3   U5         859 non-null    int64  
 4   school     859 non-null    int64  
 5   18_20s     859 non-null    int64  
 6   30s        859 non-null    int64  
 7   40s        859 non-null    int64  
 8   50_65      859 non-null    int64  
 9   O65        859 non-null    int64  
 10  PU5        859 non-null    float64
 11  Pschool    859 non-null    float64
 12  P18_20s    859 non-null    float64
 13  P30s       859 non-null    float64
 14  P40s       859 non-null    float64
 15  P50_65     859 non-null    float64
 16  PO65       859 non-null    float64
 17  GEOID_y    859 non-null    object 
 18  O*total    859 non-null    int64  
 19  O*U5       859 non-null    int64  
 20  O*school  

# Now we start thinking about the kinds of change we want to look at.

With these measures, I think the two most sensible ways to look at things are: percent change in the raw numbers, and change in population share.

## Percent change, limited to three groups: 65+, under 18, and everyone in between as one group (aka the "tax base"). I'm dropping the gender aspect here.

First add up the child and tax base groups:

In [19]:
# Collapse the detailed age bands into two broader groups for each period.
# The tag is 'O*' for the older period and '' for the newer one.

# Children: under 5 plus school age.
for tag in ('O*', ''):
    cbsa[tag + 'child'] = cbsa[tag + 'U5'] + cbsa[tag + 'school']

# "Tax base": everyone from 18 up to 65.
for tag in ('O*', ''):
    cbsa[tag + 'taxbase'] = (
        cbsa[tag + '18_20s']
        + cbsa[tag + '30s']
        + cbsa[tag + '40s']
        + cbsa[tag + '50_65']
    )

In [31]:
def _percent_change(current, previous):
    """Return the percent change from ``previous`` to ``current``, rounded to 2 decimals."""
    return ((current - previous) * 100 / previous).round(2)


# (output column, count column); the older-period count is the same name with 'O*'.
for out_col, count_col in [
    ('elderlypercchange', 'O65'),
    ('taxbasepercchange', 'taxbase'),
    ('childpercchange', 'child'),
    ('totalpercchange', 'total'),
]:
    cbsa[out_col] = _percent_change(cbsa[count_col], cbsa['O*' + count_col])

## Change in population share for our same three groups

**TODO:** Eventually move these group calculations into the first (file-preparation) notebooks and export the grouped columns for the entire project. For now they are computed here.

In [46]:
# Population-share columns for the two combined groups, per period
# ('O*' = older period, '' = newer period).
for tag in ('O*', ''):
    cbsa[tag + 'Pchild'] = cbsa[tag + 'PU5'] + cbsa[tag + 'Pschool']
for tag in ('O*', ''):
    cbsa[tag + 'Ptaxbase'] = (
        cbsa[tag + 'P18_20s']
        + cbsa[tag + 'P30s']
        + cbsa[tag + 'P40s']
        + cbsa[tag + 'P50_65']
    )

# (label used in the output column name, stem of the input columns)
change_groups = [('elderly', 'O65'), ('tb', 'taxbase'), ('child', 'child')]

# Change in share: newer share minus older share, in percentage points.
for label, stem in change_groups:
    cbsa[label + 'sharechange'] = cbsa['P' + stem] - cbsa['O*P' + stem]

# Change in head count: newer count minus older count.
for label, stem in change_groups:
    cbsa[label + 'realchange'] = cbsa[stem] - cbsa['O*' + stem]

In [47]:
# Make a smaller dataframe holding only the identifiers and the change measures,
# to check out high and low values, who's around the Nashville MSA, etc.
# .copy() makes this an independent frame rather than a selection of `cbsa`, so
# adding columns to it later does not trigger pandas' SettingWithCopyWarning.
groupperc = cbsa[[
    'CBSA', 'GEOID_x',
    'elderlypercchange', 'taxbasepercchange', 'childpercchange', 'totalpercchange',
    'elderlysharechange', 'tbsharechange', 'childsharechange',
    'elderlyrealchange', 'tbrealchange', 'childrealchange',
]].copy()

In [48]:
# Preview the first rows of the change-measure table.
groupperc.head()

Unnamed: 0,CBSA,GEOID_x,elderlypercchange,taxbasepercchange,childpercchange,totalpercchange,elderlysharechange,tbsharechange,childsharechange,elderlyrealchange,tbrealchange,childrealchange
0,"Big Stone Gap, VA Micro Area",310M500US13720,-20.02,-31.45,-32.1,-29.82,2.2,-1.5,-0.6,-1867,-12204,-3972
1,"Billings, MT Metro Area",310M500US13740,25.81,6.34,9.11,9.93,2.2,-2.1,-0.2,6360,6358,3461
2,"Binghamton, NY Metro Area",310M500US13780,8.47,-5.16,-5.71,-2.96,2.0,-1.3,-0.6,3571,-8059,-2899
3,"Birmingham-Hoover, AL Metro Area",310M500US13820,8.26,-6.46,-6.4,-4.42,1.9,-1.3,-0.6,12879,-45967,-17116
4,"Bismarck, ND Metro Area",310M500US13900,19.42,3.07,4.31,5.65,1.8,-1.6,-0.3,3281,2329,1205


In [49]:
# Pull out the Nashville MSA row(s) by exact CBSA name and renumber the index
# from 0 so its values can be compared against the rest.
is_nashville = groupperc['CBSA'].eq('Nashville-Davidson--Murfreesboro--Franklin, TN Metro Area')
nash = groupperc[is_nashville].reset_index(drop=True)

In [50]:
# Print Nashville's percent change and share change for each of the three groups.
# One loop replaces three copy-pasted stanzas, and the message template fixes the
# "The following number are" grammar slip that two of the originals carried.
message = ("The following numbers are Nashville's percent growth in a 5 year over 5 year "
           "period for total {} population, then the share change between the 2014 and 2019 periods:")
rule = '______________________________________________________________________________'

# (label in the message, percent-change column, share-change column)
for label, perc_col, share_col in [
    ('elderly', 'elderlypercchange', 'elderlysharechange'),
    ('tax base', 'taxbasepercchange', 'tbsharechange'),
    ('child', 'childpercchange', 'childsharechange'),
]:
    print(message.format(label))
    print(rule)
    print(nash[perc_col])
    print(nash[share_col])

The following numbers are Nashville's percent growth in a 5 year over 5 year period for total elderly population, then the share change between the 2014 and 2019 periods:
______________________________________________________________________________
0    21.72
Name: elderlypercchange, dtype: float64
0    1.4
Name: elderlysharechange, dtype: float64
The following number are Nashville's percent growth in a 5 year over 5 year period for total tax base population, then the share change between the 2014 and 2019 periods:
______________________________________________________________________________
0    6.97
Name: taxbasepercchange, dtype: float64
0   -0.7
Name: tbsharechange, dtype: float64
The following number are Nashville's percent growth in a 5 year over 5 year period for total child population, then the share change between the 2014 and 2019 periods:
______________________________________________________________________________
0    4.9
Name: childpercchange, dtype: float64
0   -0.8
N

In [51]:
# plt.scatter(groupperc['elderlypercchange'], groupperc['elderlysharechange'])

In [52]:
# plt.scatter(groupperc['elderlypercchange'], groupperc['taxbasepercchange'])

In [53]:
# plt.scatter(groupperc['elderlypercchange'], groupperc['childpercchange'])

In [54]:
# plt.scatter(groupperc['elderlypercchange'], groupperc['totalpercchange'])

In [55]:
# km=KMeans(n_clusters=3)
# km

In [56]:
# Area_predicted = km.fit_predict(groupperc[['elderlypercchange','totalpercchange']])
# Area_predicted

In [57]:
# groupperc['cluster']= Area_predicted
# groupperc.head()

In [44]:
# Pairwise correlations between the change measures.
# groupperc also holds the text columns CBSA and GEOID_x; pandas 2.0+ no longer
# drops non-numeric columns silently in corr() and raises instead, so select the
# numeric columns explicitly (this works on older pandas versions too).
# NOTE(review): the saved output includes a 'cluster' column, which only exists
# if the commented-out KMeans cells above were run.
groupperc.select_dtypes(include='number').corr()

Unnamed: 0,elderlypercchange,taxbasepercchange,childpercchange,totalpercchange,elderlysharechange,tbsharechange,childsharechange,cluster
elderlypercchange,1.0,0.846605,0.734001,0.87804,0.387301,-0.163541,-0.273503,-0.489713
taxbasepercchange,0.846605,1.0,0.900428,0.987418,-0.056927,0.156218,-0.105607,-0.321459
childpercchange,0.734001,0.900428,1.0,0.930825,-0.166048,-0.103778,0.306397,-0.359766
totalpercchange,0.87804,0.987418,0.930825,1.0,0.033986,0.006756,-0.048008,-0.379866
elderlysharechange,0.387301,-0.056927,-0.166048,0.033986,1.0,-0.606549,-0.499081,-0.291932
tbsharechange,-0.163541,0.156218,-0.103778,0.006756,-0.606549,1.0,-0.37747,0.267981
childsharechange,-0.273503,-0.105607,0.306397,-0.048008,-0.499081,-0.37747,1.0,0.04632
cluster,-0.489713,-0.321459,-0.359766,-0.379866,-0.291932,0.267981,0.04632,1.0


### Check out our Distributions

In [None]:
#Distribution of elderly real change
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')
# Plot the single column named in the comment above; passing the whole frame
# (which also contains the text columns CBSA and GEOID_x) is not a valid input
# for a univariate distribution plot.
# NOTE(review): distplot is deprecated since seaborn 0.11; histplot/displot are
# the replacements if the installed seaborn is recent enough.
sns.distplot(groupperc['elderlyrealchange'])