
# Organize and Cleanse City Demographic Data
  1. import metro area Demographic ("ACS_12_1YR_CP05" and "ACS_17_1YR_CP05") data. Source:
     * https://factfinder.census.gov/faces/nav/jsf/pages/download_center.xhtml
  2. ACS_12: drop unneeded cities; drop unneeded cols; cleanse data 
  3. ACS_17: drop unneeded cities; drop unneeded cols; cleanse data
  4. merge ACS_12 and ACS_17

In [None]:
import pandas as pd

## 1) import metro area Demographic ("ACS_12_1YR_CP05" and "ACS_17_1YR_CP05") data

In [None]:
# load the 2012 metro-area demographic table from the US Census
# "American Community Survey" (ACS) and preview the first rows
strPathD12 = 'raw/ACS_12_1YR_CP05_with_ann.csv'
dfD12 = pd.read_csv(strPathD12)
dfD12.head()

In [None]:
# load the 2017 metro-area demographic table from the US Census
# "American Community Survey" (ACS) and preview the first rows
strPathD17 = 'raw/ACS_17_1YR_CP05_with_ann.csv'
dfD17 = pd.read_csv(strPathD17)
dfD17.head()

In [None]:
# 2012: remove the text " Metro Area" from every geography label,
# then list the distinct labels that remain
strGeoCol12 = 'GEO.display-label'
dfD12[strGeoCol12] = dfD12[strGeoCol12].str.replace(' Metro Area','')
dfD12[strGeoCol12].unique()

In [None]:
# 2017: remove the text " Metro Area" from every geography label,
# then list the distinct labels that remain
strGeoCol17 = 'GEO.display-label'
dfD17[strGeoCol17] = dfD17[strGeoCol17].str.replace(' Metro Area','')
dfD17[strGeoCol17].unique()

In [None]:
# read the city file and collapse it to one row per 2012 Metro Area;
# count() yields the number of non-null entries per remaining column,
# and the '2012Metro' values become the index of the result
grpCities12 = pd.read_csv('cities.csv').groupby('2012Metro')
dfCities12 = grpCities12.count()
dfCities12

In [None]:
# read the city file and collapse it to one row per 2017 Metro Area;
# count() yields the number of non-null entries per remaining column,
# and the '2017Metro' values become the index of the result
grpCities17 = pd.read_csv('cities.csv').groupby('2017Metro')
dfCities17 = grpCities17.count()
dfCities17

## 2) ACS_12: drop unneeded cities; drop unneeded cols; cleanse data 

In [None]:
# keep only the metro areas that appear in the grouped city DataFrame:
# the default (inner) merge drops every dfD12 row without a matching '2012Metro'
dfD12 = dfD12.merge(
    dfCities12,
    left_on='GEO.display-label',
    right_on='2012Metro',
)
dfD12.head()

In [None]:
# load the 2012 ACS metadata file; it is read with header=None, so the two
# columns are named explicitly: column code ('col') and its 'description'
strMetaPath12 = 'raw/ACS_12_1YR_CP05_metadata.csv'
dfD12Meta = pd.read_csv(strMetaPath12, header=None, names=['col','description'])
dfD12Meta.head(10)

In [None]:
# create col to identify multi-year rows in metadata file:
# 'droprow' is True where the description contains a "YYYY - YYYY" year pair
# (raw string: "\d" is not a recognized Python string escape, so a plain
#  literal makes newer interpreters warn about an invalid escape sequence)
dfD12Meta['droprow'] = dfD12Meta['description'].str.contains(r'\d{4} - \d{4}')
dfD12Meta.head(10)

In [None]:
# keep only the metadata rows that were not flagged as multi-year,
# and discard the helper 'droprow' column in the same step
blnKeep12 = dfD12Meta['droprow'].eq(False)
dfD12Meta = dfD12Meta.loc[blnKeep12, ['col','description']]
dfD12Meta.head(20)

In [None]:
# restrict dfD12 to the columns still listed in the filtered metadata
lstKeepCols12 = dfD12Meta['col'].tolist()
dfD12 = dfD12[lstKeepCols12]
dfD12.head()

In [None]:
# eliminate "(X)" values in DataFrame
# work on an explicit copy: dfD12 was produced by a column selection, and
# assigning into such a slice can raise pandas' SettingWithCopyWarning
dfD12 = dfD12.copy()
for strCol in dfD12.columns:
    # match the literal text "(X)"; regex=False is spelled out because the
    # default of `regex` differs between pandas versions, and the previous
    # escaped pattern '\(X\)' only matches when it is treated as a regex
    dfD12[strCol] = dfD12[strCol].str.replace('(X)', '', regex=False)
dfD12.head()

## 3) ACS_17: drop unneeded cities; drop unneeded cols; cleanse data 

In [None]:
# keep only the metro areas that appear in the grouped city DataFrame:
# the default (inner) merge drops every dfD17 row without a matching '2017Metro'
dfD17 = dfD17.merge(
    dfCities17,
    left_on='GEO.display-label',
    right_on='2017Metro',
)
dfD17.head()

In [None]:
# load the 2017 ACS metadata file; it is read with header=None, so the two
# columns are named explicitly: column code ('col') and its 'description'
strMetaPath17 = 'raw/ACS_17_1YR_CP05_metadata.csv'
dfD17Meta = pd.read_csv(strMetaPath17, header=None, names=['col','description'])
dfD17Meta.head(10)

In [21]:
# create col to identify multi-year rows in metadata file:
# 'droprow' is True where the description contains a "YYYY - YYYY" year pair,
# e.g. "2017 - 2016 Statistical Significance; ..."
# (raw string: "\d" is not a recognized Python string escape, so a plain
#  literal makes newer interpreters warn about an invalid escape sequence)
dfD17Meta['droprow'] = dfD17Meta['description'].str.contains(r'\d{4} - \d{4}')
dfD17Meta.head(10)

Unnamed: 0,col,description,droprow
0,GEO.id,Id,False
1,GEO.id2,Id2,False
2,GEO.display-label,Geography,False
3,HC01_VC03,2017 Estimate; SEX AND AGE - Total population,False
4,HC02_VC03,2016 Estimate; SEX AND AGE - Total population,False
5,HC03_VC03,2017 - 2016 Statistical Significance; SEX AND ...,True
6,HC04_VC03,2015 Estimate; SEX AND AGE - Total population,False
7,HC05_VC03,2017 - 2015 Statistical Significance; SEX AND ...,True
8,HC06_VC03,2014 Estimate; SEX AND AGE - Total population,False
9,HC07_VC03,2017 - 2014 Statistical Significance; SEX AND ...,True


In [None]:
# keep only the metadata rows that were not flagged as multi-year,
# and discard the helper 'droprow' column in the same step
blnKeep17 = dfD17Meta['droprow'].eq(False)
dfD17Meta = dfD17Meta.loc[blnKeep17, ['col','description']]
dfD17Meta.head(100)

In [23]:
# restrict dfD17 to the columns still listed in the filtered metadata
lstKeepCols17 = dfD17Meta['col'].tolist()
dfD17 = dfD17[lstKeepCols17]
dfD17.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC04_VC03,HC06_VC03,HC08_VC03,HC01_VC04,HC02_VC04,...,HC01_VC114,HC02_VC114,HC04_VC114,HC06_VC114,HC08_VC114,HC01_VC115,HC02_VC115,HC04_VC115,HC06_VC115,HC08_VC115
0,310M300US12060,12060,"Atlanta-Sandy Springs-Roswell, GA",5882450,5790210,5709731,5611829,5524693,48.4,48.3,...,47.0,47.0,47.1,47.2,47.1,53.0,53.0,52.9,52.8,52.9
1,310M300US12580,12580,"Baltimore-Columbia-Towson, MD",2808175,2798886,2797407,2785874,2770738,48.2,48.2,...,47.3,47.3,47.2,47.3,47.2,52.7,52.7,52.8,52.7,52.8
2,310M300US14460,14460,"Boston-Cambridge-Newton, MA-NH",4836531,4794447,4774321,4732161,4684299,48.7,48.6,...,47.7,47.6,47.6,47.6,47.5,52.3,52.4,52.4,52.4,52.5
3,310M300US15380,15380,"Buffalo-Cheektowaga-Niagara Falls, NY",1136856,1132804,1135230,1136360,1134115,48.5,48.6,...,47.7,47.8,47.7,47.8,47.8,52.3,52.2,52.3,52.2,52.2
4,310M300US16740,16740,"Charlotte-Concord-Gastonia, NC-SC",2525305,2474314,2426363,2380314,2335358,48.4,48.4,...,47.4,47.4,47.3,47.3,47.2,52.6,52.6,52.7,52.7,52.8


In [24]:
# eliminate "(X)" values in DataFrame
# work on an explicit copy: dfD17 was produced by a column selection, and
# assigning into such a slice can raise pandas' SettingWithCopyWarning
dfD17 = dfD17.copy()
for strCol in dfD17.columns:
    # match the literal text "(X)"; regex=False is spelled out because the
    # default of `regex` differs between pandas versions, and the previous
    # escaped pattern '\(X\)' only matches when it is treated as a regex
    dfD17[strCol] = dfD17[strCol].str.replace('(X)', '', regex=False)
dfD17.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC04_VC03,HC06_VC03,HC08_VC03,HC01_VC04,HC02_VC04,...,HC01_VC114,HC02_VC114,HC04_VC114,HC06_VC114,HC08_VC114,HC01_VC115,HC02_VC115,HC04_VC115,HC06_VC115,HC08_VC115
0,310M300US12060,12060,"Atlanta-Sandy Springs-Roswell, GA",5882450,5790210,5709731,5611829,5524693,48.4,48.3,...,47.0,47.0,47.1,47.2,47.1,53.0,53.0,52.9,52.8,52.9
1,310M300US12580,12580,"Baltimore-Columbia-Towson, MD",2808175,2798886,2797407,2785874,2770738,48.2,48.2,...,47.3,47.3,47.2,47.3,47.2,52.7,52.7,52.8,52.7,52.8
2,310M300US14460,14460,"Boston-Cambridge-Newton, MA-NH",4836531,4794447,4774321,4732161,4684299,48.7,48.6,...,47.7,47.6,47.6,47.6,47.5,52.3,52.4,52.4,52.4,52.5
3,310M300US15380,15380,"Buffalo-Cheektowaga-Niagara Falls, NY",1136856,1132804,1135230,1136360,1134115,48.5,48.6,...,47.7,47.8,47.7,47.8,47.8,52.3,52.2,52.3,52.2,52.2
4,310M300US16740,16740,"Charlotte-Concord-Gastonia, NC-SC",2525305,2474314,2426363,2380314,2335358,48.4,48.4,...,47.4,47.4,47.3,47.3,47.2,52.6,52.6,52.7,52.7,52.8


## 4) merge ACS_12 and ACS_17

In [25]:
# the Los Angeles Metro Area's GEO.id2 changed from 31100 (2012) to 31080 (2017);
# rewrite the 2012 value so both data sets share the same merge key,
# then print the Los Angeles row to confirm the change
dfD12['GEO.id2'] = dfD12['GEO.id2'].replace('31100', '31080')
blnLosAngeles = dfD12['GEO.display-label'].eq('Los Angeles-Long Beach-Santa Ana, CA')
print(dfD12[blnLosAngeles])

            GEO.id GEO.id2                     GEO.display-label HC01_VC03  \
18  310M100US31100   31080  Los Angeles-Long Beach-Santa Ana, CA  13052921   

   HC02_VC03 HC04_VC03 HC06_VC03 HC08_VC03 HC01_VC04 HC02_VC04  ... HC01_VC96  \
18  12944801  12849383                          49.3      49.4  ...       2.1   

   HC02_VC96 HC04_VC96 HC06_VC96 HC08_VC96 HC01_VC98 HC02_VC98 HC04_VC98  \
18       1.9       2.0                       4497179   4500396   4493949   

   HC06_VC98 HC08_VC98  
18                      

[1 rows x 408 columns]


In [26]:
# join the 2012 and 2017 tables on the shared metro id (default inner join);
# column names present in both inputs get pandas' default _x / _y suffixes
dfDemo = dfD12.merge(dfD17, on='GEO.id2')
# save the merged table
dfDemo.to_csv(path_or_buf='demodata.csv')

In [27]:
# preview up to the first 50 merged rows
dfDemo.head(n=50)

Unnamed: 0,GEO.id_x,GEO.id2,GEO.display-label_x,HC01_VC03_x,HC02_VC03_x,HC04_VC03_x,HC06_VC03_x,HC08_VC03_x,HC01_VC04_x,HC02_VC04_x,...,HC01_VC114,HC02_VC114,HC04_VC114,HC06_VC114,HC08_VC114,HC01_VC115,HC02_VC115,HC04_VC115,HC06_VC115,HC08_VC115
0,310M100US12060,12060,"Atlanta-Sandy Springs-Marietta, GA",5442113,5365726,5288302,,,48.7,48.9,...,47.0,47.0,47.1,47.2,47.1,53.0,53.0,52.9,52.8,52.9
1,310M100US12580,12580,"Baltimore-Towson, MD",2753149,2729110,2714183,,,48.1,48.1,...,47.3,47.3,47.2,47.3,47.2,52.7,52.7,52.8,52.7,52.8
2,310M100US14460,14460,"Boston-Cambridge-Quincy, MA-NH",4640802,4591112,4560689,,,48.5,48.4,...,47.7,47.6,47.6,47.6,47.5,52.3,52.4,52.4,52.4,52.5
3,310M100US15380,15380,"Buffalo-Niagara Falls, NY",1134210,1134039,1135198,,,48.4,48.4,...,47.7,47.8,47.7,47.8,47.8,52.3,52.2,52.3,52.2,52.2
4,310M100US16740,16740,"Charlotte-Gastonia-Rock Hill, NC-SC",1831084,1795472,1764313,,,48.5,48.4,...,47.4,47.4,47.3,47.3,47.2,52.6,52.6,52.7,52.7,52.8
5,310M100US16980,16980,"Chicago-Joliet-Naperville, IL-IN-WI",9522446,9504024,9474211,,,48.9,48.9,...,48.1,47.8,47.8,47.7,47.8,51.9,52.2,52.2,52.3,52.2
6,310M100US17140,17140,"Cincinnati-Middletown, OH-KY-IN",2146560,2137735,2133203,,,48.8,49.0,...,48.1,48.2,48.1,48.1,48.1,51.9,51.8,51.9,51.9,51.9
7,310M100US17460,17460,"Cleveland-Elyria-Mentor, OH",2063535,2068283,2075758,,,48.1,48.1,...,47.5,47.4,47.5,47.3,47.3,52.5,52.6,52.5,52.7,52.7
8,310M100US18140,18140,"Columbus, OH",1878714,1858464,1840631,,,49.2,49.1,...,48.4,48.5,48.6,48.6,48.4,51.6,51.5,51.4,51.4,51.6
9,310M100US19100,19100,"Dallas-Fort Worth-Arlington, TX",6647496,6526566,6402922,,,49.4,49.3,...,48.0,47.9,47.8,47.9,47.9,52.0,52.1,52.2,52.1,52.1


In [38]:
# replace the "." / "-" in the geography column names with underscores.
# "GEO.id_y" previously mapped to "GEO_idx" as well, which gave two columns
# the same label; it now gets its own name so every label stays unique.
dfDemo = dfDemo.rename(columns={
    "GEO.id_x": "GEO_idx",
    "GEO.id2": "GEO_id2",
    "GEO.display-label_x": "GEO_display_x",
    "GEO.id_y": "GEO_idy",
    "GEO.display-label_y": "GEO_display_y",
})
# drop columns in which every value is missing
# NOTE(review): cells emptied by the earlier "(X)" cleanup are presumably
# empty strings rather than NaN, so they would not count as missing here -- confirm
dfDemo = dfDemo.dropna(axis='columns', how='all')
dfDemo

Unnamed: 0,GEO_id,GEO_id2,GEO_display,HC01_VC03_x,HC02_VC03_x,HC04_VC03_x,HC06_VC03_x,HC08_VC03_x,HC01_VC04_x,HC02_VC04_x,...,HC01_VC114,HC02_VC114,HC04_VC114,HC06_VC114,HC08_VC114,HC01_VC115,HC02_VC115,HC04_VC115,HC06_VC115,HC08_VC115
0,310M100US12060,12060,"Atlanta-Sandy Springs-Marietta, GA",5442113,5365726,5288302,,,48.7,48.9,...,47.0,47.0,47.1,47.2,47.1,53.0,53.0,52.9,52.8,52.9
1,310M100US12580,12580,"Baltimore-Towson, MD",2753149,2729110,2714183,,,48.1,48.1,...,47.3,47.3,47.2,47.3,47.2,52.7,52.7,52.8,52.7,52.8
2,310M100US14460,14460,"Boston-Cambridge-Quincy, MA-NH",4640802,4591112,4560689,,,48.5,48.4,...,47.7,47.6,47.6,47.6,47.5,52.3,52.4,52.4,52.4,52.5
3,310M100US15380,15380,"Buffalo-Niagara Falls, NY",1134210,1134039,1135198,,,48.4,48.4,...,47.7,47.8,47.7,47.8,47.8,52.3,52.2,52.3,52.2,52.2
4,310M100US16740,16740,"Charlotte-Gastonia-Rock Hill, NC-SC",1831084,1795472,1764313,,,48.5,48.4,...,47.4,47.4,47.3,47.3,47.2,52.6,52.6,52.7,52.7,52.8
5,310M100US16980,16980,"Chicago-Joliet-Naperville, IL-IN-WI",9522446,9504024,9474211,,,48.9,48.9,...,48.1,47.8,47.8,47.7,47.8,51.9,52.2,52.2,52.3,52.2
6,310M100US17140,17140,"Cincinnati-Middletown, OH-KY-IN",2146560,2137735,2133203,,,48.8,49.0,...,48.1,48.2,48.1,48.1,48.1,51.9,51.8,51.9,51.9,51.9
7,310M100US17460,17460,"Cleveland-Elyria-Mentor, OH",2063535,2068283,2075758,,,48.1,48.1,...,47.5,47.4,47.5,47.3,47.3,52.5,52.6,52.5,52.7,52.7
8,310M100US18140,18140,"Columbus, OH",1878714,1858464,1840631,,,49.2,49.1,...,48.4,48.5,48.6,48.6,48.4,51.6,51.5,51.4,51.4,51.6
9,310M100US19100,19100,"Dallas-Fort Worth-Arlington, TX",6647496,6526566,6402922,,,49.4,49.3,...,48.0,47.9,47.8,47.9,47.9,52.0,52.1,52.2,52.1,52.1


In [39]:
# write the final merged demographic table to disk
dfDemo.to_csv(path_or_buf='Demo.csv')