
# Organize and Cleanse City Demographic Data
  1. import metro area Demographic ("ACS_17_1YR_CP05") data. Source:
     * https://factfinder.census.gov/faces/nav/jsf/pages/download_center.xhtml
  2. ACS_17: drop unneeded cities; drop unneeded cols; cleanse data

In [1]:
import pandas as pd

## 1) import metro area Demographic ("ACS_17_1YR_CP05") data

In [2]:
# Load the US Census "American Community Survey" (ACS) metro-area demographic
# table for 2017 (file ACS_17_1YR_CP05).
# NOTE: the first data row of this file holds the column descriptions
# ('Id', 'Id2', 'Geography', ...), as the preview shows.
dfD17 = pd.read_csv(filepath_or_buffer='raw/ACS_17_1YR_CP05_with_ann.csv')
dfD17.head(5)

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC05_VC03,HC06_VC03,HC07_VC03,...,HC09_VC114,HC01_VC115,HC02_VC115,HC03_VC115,HC04_VC115,HC05_VC115,HC06_VC115,HC07_VC115,HC08_VC115,HC09_VC115
0,Id,Id2,Geography,2017 Estimate; SEX AND AGE - Total population,2016 Estimate; SEX AND AGE - Total population,2017 - 2016 Statistical Significance; SEX AND ...,2015 Estimate; SEX AND AGE - Total population,2017 - 2015 Statistical Significance; SEX AND ...,2014 Estimate; SEX AND AGE - Total population,2017 - 2014 Statistical Significance; SEX AND ...,...,"2017 - 2013 Statistical Significance; CITIZEN,...","2017 Estimate; CITIZEN, VOTING AGE POPULATION ...","2016 Estimate; CITIZEN, VOTING AGE POPULATION ...","2017 - 2016 Statistical Significance; CITIZEN,...","2015 Estimate; CITIZEN, VOTING AGE POPULATION ...","2017 - 2015 Statistical Significance; CITIZEN,...","2014 Estimate; CITIZEN, VOTING AGE POPULATION ...","2017 - 2014 Statistical Significance; CITIZEN,...","2013 Estimate; CITIZEN, VOTING AGE POPULATION ...","2017 - 2013 Statistical Significance; CITIZEN,..."
1,310M300US10180,10180,"Abilene, TX Metro Area",169747,170860,,168922,,166900,,...,,49.9,48.7,*,49.3,,49.1,*,49.2,
2,310M300US10420,10420,"Akron, OH Metro Area",703505,702221,c,704243,c,703825,c,...,*,52.0,52.0,,52.2,,52.4,*,52.4,*
3,310M300US10500,10500,"Albany, GA Metro Area",151754,152506,,156997,*,152596,,...,,53.9,54.2,,54.6,,54.0,,53.4,
4,310M300US10540,10540,"Albany, OR Metro Area",125047,122849,c,120547,c,119356,c,...,,51.0,51.4,,51.0,,51.1,,51.1,


In [3]:
# Remove the " Metro Area" text from every geography label (2017 data); this
# column is later used as the merge key against the city file's metro names.
dfD17['GEO.display-label'] = dfD17['GEO.display-label'].str.replace(
    pat=' Metro Area',
    repl='',
)
dfD17['GEO.display-label'].unique()

array(['Geography', 'Abilene, TX', 'Akron, OH', 'Albany, GA',
       'Albany, OR', 'Albany-Schenectady-Troy, NY', 'Albuquerque, NM',
       'Alexandria, LA', 'Allentown-Bethlehem-Easton, PA-NJ',
       'Altoona, PA', 'Amarillo, TX', 'Ames, IA', 'Anchorage, AK',
       'Ann Arbor, MI', 'Anniston-Oxford-Jacksonville, AL',
       'Appleton, WI', 'Asheville, NC', 'Athens-Clarke County, GA',
       'Atlanta-Sandy Springs-Roswell, GA', 'Atlantic City-Hammonton, NJ',
       'Auburn-Opelika, AL', 'Augusta-Richmond County, GA-SC',
       'Austin-Round Rock, TX', 'Bakersfield, CA',
       'Baltimore-Columbia-Towson, MD', 'Bangor, ME',
       'Barnstable Town, MA', 'Baton Rouge, LA', 'Battle Creek, MI',
       'Bay City, MI', 'Beaumont-Port Arthur, TX', 'Beckley, WV',
       'Bellingham, WA', 'Bend-Redmond, OR', 'Billings, MT',
       'Binghamton, NY', 'Birmingham-Hoover, AL', 'Bismarck, ND',
       'Blacksburg-Christiansburg-Radford, VA', 'Bloomington, IL',
       'Bloomington, IN', 'Bloomsburg-

In [4]:
# Read the city file and bucket its rows by 2017 metro area, then replace the
# city frame with the per-metro count of non-null values in each column.
grpCities17 = pd.read_csv('cities.csv').groupby('2017Metro')
dfCities17 = grpCities17.count()
dfCities17

Unnamed: 0_level_0,2012Metro,WikipediaMetro,City,State
2017Metro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Atlanta-Sandy Springs-Roswell, GA",1,1,1,1
"Baltimore-Columbia-Towson, MD",1,1,1,1
"Boston-Cambridge-Newton, MA-NH",1,1,1,1
"Buffalo-Cheektowaga-Niagara Falls, NY",1,1,1,1
"Charlotte-Concord-Gastonia, NC-SC",1,1,1,1
"Chicago-Naperville-Elgin, IL-IN-WI",1,1,1,1
"Cincinnati, OH-KY-IN",1,1,1,1
"Cleveland-Elyria, OH",1,1,1,1
"Columbus, OH",1,1,1,1
"Dallas-Fort Worth-Arlington, TX",2,2,2,2


## 2) ACS_17: drop unneeded cities; drop unneeded cols; cleanse data 

In [5]:
# Keep only the metro areas present in the grouped city DataFrame by
# inner-joining dfD17 to it on the metro name ('2017Metro' is the index of
# dfCities17). The count columns of dfCities17 are carried into the result.
dfD17 = dfD17.merge(
    dfCities17,
    how='inner',
    left_on='GEO.display-label',
    right_on='2017Metro',
)
dfD17.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC05_VC03,HC06_VC03,HC07_VC03,...,HC04_VC115,HC05_VC115,HC06_VC115,HC07_VC115,HC08_VC115,HC09_VC115,2012Metro,WikipediaMetro,City,State
0,310M300US12060,12060,"Atlanta-Sandy Springs-Roswell, GA",5882450,5790210,*,5709731,*,5611829,*,...,52.9,,52.8,*,52.9,,1,1,1,1
1,310M300US12580,12580,"Baltimore-Columbia-Towson, MD",2808175,2798886,c,2797407,c,2785874,c,...,52.8,,52.7,,52.8,,1,1,1,1
2,310M300US14460,14460,"Boston-Cambridge-Newton, MA-NH",4836531,4794447,c,4774321,c,4732161,c,...,52.4,,52.4,,52.5,*,1,1,1,1
3,310M300US15380,15380,"Buffalo-Cheektowaga-Niagara Falls, NY",1136856,1132804,c,1135230,c,1136360,c,...,52.3,,52.2,,52.2,,1,1,1,1
4,310M300US16740,16740,"Charlotte-Concord-Gastonia, NC-SC",2525305,2474314,c,2426363,c,2380314,c,...,52.7,,52.7,,52.8,*,1,1,1,1


In [6]:
# Load the ACS metadata file. It is read without a header row, so the two
# columns are named explicitly: the column code and its description.
dfD17Meta = pd.read_csv(
    'raw/ACS_17_1YR_CP05_metadata.csv',
    names=['col', 'description'],
    header=None,
)
dfD17Meta.head(n=10)

Unnamed: 0,col,description
0,GEO.id,Id
1,GEO.id2,Id2
2,GEO.display-label,Geography
3,HC01_VC03,2017 Estimate; SEX AND AGE - Total population
4,HC02_VC03,2016 Estimate; SEX AND AGE - Total population
5,HC03_VC03,2017 - 2016 Statistical Significance; SEX AND ...
6,HC04_VC03,2015 Estimate; SEX AND AGE - Total population
7,HC05_VC03,2017 - 2015 Statistical Significance; SEX AND ...
8,HC06_VC03,2014 Estimate; SEX AND AGE - Total population
9,HC07_VC03,2017 - 2014 Statistical Significance; SEX AND ...


In [7]:
# Flag the metadata rows to discard: a row is kept (droprow == False) only if
# its description starts with '2017 Estimate' or its column code is one of
# the 'GEO.' identifier columns.
dfD17Meta['droprow'] = ~(
    dfD17Meta['description'].str.startswith('2017 Estimate')
    | dfD17Meta['col'].str.startswith('GEO.')
)
dfD17Meta.head(n=10)

Unnamed: 0,col,description,droprow
0,GEO.id,Id,False
1,GEO.id2,Id2,False
2,GEO.display-label,Geography,False
3,HC01_VC03,2017 Estimate; SEX AND AGE - Total population,False
4,HC02_VC03,2016 Estimate; SEX AND AGE - Total population,True
5,HC03_VC03,2017 - 2016 Statistical Significance; SEX AND ...,True
6,HC04_VC03,2015 Estimate; SEX AND AGE - Total population,True
7,HC05_VC03,2017 - 2015 Statistical Significance; SEX AND ...,True
8,HC06_VC03,2014 Estimate; SEX AND AGE - Total population,True
9,HC07_VC03,2017 - 2014 Statistical Significance; SEX AND ...,True


In [8]:
# Keep only the metadata rows that were not flagged for dropping, and discard
# the helper 'droprow' column in the same selection.
dfD17Meta = dfD17Meta.loc[~dfD17Meta['droprow'], ['col', 'description']]
dfD17Meta.head(100)

Unnamed: 0,col,description
0,GEO.id,Id
1,GEO.id2,Id2
2,GEO.display-label,Geography
3,HC01_VC03,2017 Estimate; SEX AND AGE - Total population
12,HC01_VC04,2017 Estimate; SEX AND AGE - Total population ...
21,HC01_VC05,2017 Estimate; SEX AND AGE - Total population ...
30,HC01_VC06,2017 Estimate; SEX AND AGE - Total population ...
39,HC01_VC09,2017 Estimate; SEX AND AGE - Under 5 years
48,HC01_VC10,2017 Estimate; SEX AND AGE - 5 to 9 years
57,HC01_VC11,2017 Estimate; SEX AND AGE - 10 to 14 years


In [9]:
# Restrict the demographic data to the columns still listed in the metadata
# frame, in the metadata's order.
dfD17 = dfD17[dfD17Meta['col'].tolist()]
dfD17.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC01_VC04,HC01_VC05,HC01_VC06,HC01_VC09,HC01_VC10,HC01_VC11,...,HC01_VC102,HC01_VC103,HC01_VC104,HC01_VC105,HC01_VC106,HC01_VC107,HC01_VC109,HC01_VC113,HC01_VC114,HC01_VC115
0,310M300US12060,12060,"Atlanta-Sandy Springs-Roswell, GA",5882450,48.4,51.6,94,6.3,6.8,7.3,...,5.8,0.0,0.4,2.4,0.1,2.3,2299498,4014060,47.0,53.0
1,310M300US12580,12580,"Baltimore-Columbia-Towson, MD",2808175,48.2,51.8,93,6.0,6.0,6.2,...,5.7,0.0,0.3,2.8,0.1,2.6,1160769,2073321,47.3,52.7
2,310M300US14460,14460,"Boston-Cambridge-Newton, MA-NH",4836531,48.7,51.3,95,5.3,5.4,5.7,...,7.9,0.0,0.9,2.0,0.2,1.7,1955479,3462624,47.7,52.3
3,310M300US15380,15380,"Buffalo-Cheektowaga-Niagara Falls, NY",1136856,48.5,51.5,94,5.4,5.6,5.6,...,3.1,0.0,0.1,2.1,0.1,2.1,529009,877979,47.7,52.3
4,310M300US16740,16740,"Charlotte-Concord-Gastonia, NC-SC",2525305,48.4,51.6,94,6.3,6.6,7.0,...,3.6,0.1,0.2,2.2,0.1,2.2,1028021,1776437,47.4,52.6


In [10]:
# Blank out the "(X)" placeholder wherever it appears in the DataFrame.
#
# Work on an explicit copy: dfD17 was produced by a column selection, and
# assigning columns into such a frame triggers pandas' SettingWithCopyWarning.
dfD17 = dfD17.copy()
for strCol in dfD17.columns:
    # Match the literal text "(X)" with regex=False. The previous pattern
    # '\(X\)' depended on str.replace treating its pattern as a regex by
    # default (no longer true in pandas >= 2.0, where it would silently stop
    # matching) and used invalid escape sequences in a non-raw string.
    # NOTE: the .str accessor requires every column to hold strings.
    dfD17[strCol] = dfD17[strCol].str.replace('(X)', '', regex=False)
dfD17.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC01_VC04,HC01_VC05,HC01_VC06,HC01_VC09,HC01_VC10,HC01_VC11,...,HC01_VC102,HC01_VC103,HC01_VC104,HC01_VC105,HC01_VC106,HC01_VC107,HC01_VC109,HC01_VC113,HC01_VC114,HC01_VC115
0,310M300US12060,12060,"Atlanta-Sandy Springs-Roswell, GA",5882450,48.4,51.6,94,6.3,6.8,7.3,...,5.8,0.0,0.4,2.4,0.1,2.3,2299498,4014060,47.0,53.0
1,310M300US12580,12580,"Baltimore-Columbia-Towson, MD",2808175,48.2,51.8,93,6.0,6.0,6.2,...,5.7,0.0,0.3,2.8,0.1,2.6,1160769,2073321,47.3,52.7
2,310M300US14460,14460,"Boston-Cambridge-Newton, MA-NH",4836531,48.7,51.3,95,5.3,5.4,5.7,...,7.9,0.0,0.9,2.0,0.2,1.7,1955479,3462624,47.7,52.3
3,310M300US15380,15380,"Buffalo-Cheektowaga-Niagara Falls, NY",1136856,48.5,51.5,94,5.4,5.6,5.6,...,3.1,0.0,0.1,2.1,0.1,2.1,529009,877979,47.7,52.3
4,310M300US16740,16740,"Charlotte-Concord-Gastonia, NC-SC",2525305,48.4,51.6,94,6.3,6.6,7.0,...,3.6,0.1,0.2,2.2,0.1,2.2,1028021,1776437,47.4,52.6


In [11]:
# Save the cleaned, 2017-only demographic data. The DataFrame index is written
# as the first CSV column (pandas default).
dfD17.to_csv(path_or_buf='demodata2017only.csv')