# Organize and Cleanse City Demographic Data
  1. import metro area Demographic ("ACS_12_1YR_CP05" and "ACS_17_1YR_CP05") data. Source:
     * https://factfinder.census.gov/faces/nav/jsf/pages/download_center.xhtml
  2. drop unneeded cities; drop unneeded cols

In [1]:
import pandas as pd

### 1) import metro area Demographic ("ACS_12_1YR_CP05" and "ACS_17_1YR_CP05") data

In [2]:
# Load the US Census "American Community Survey" (ACS) metro-area demographic
# table (ACS_12_1YR_CP05) into dfD12.
# NOTE: the first data row (index 0) holds the column descriptions rather than
# a metro area, as the preview below shows.
dfD12 = pd.read_csv(filepath_or_buffer='raw/ACS_12_1YR_CP05_with_ann.csv')
dfD12.head(5)

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC05_VC03,HC06_VC03,HC07_VC03,...,HC09_VC96,HC01_VC98,HC02_VC98,HC03_VC98,HC04_VC98,HC05_VC98,HC06_VC98,HC07_VC98,HC08_VC98,HC09_VC98
0,Id,Id2,Geography,2012 Estimate; SEX AND AGE - Total population,2011 Estimate; SEX AND AGE - Total population,2012 - 2011 Statistical Significance; SEX AND ...,2010 Estimate; SEX AND AGE - Total population,2012 - 2010 Statistical Significance; SEX AND ...,2009 Estimate; SEX AND AGE - Total population,2012 - 2009 Statistical Significance; SEX AND ...,...,2012 - 2008 Statistical Significance; HISPANIC...,2012 Estimate; HISPANIC OR LATINO AND RACE - T...,2011 Estimate; HISPANIC OR LATINO AND RACE - T...,2012 - 2011 Statistical Significance; HISPANIC...,2010 Estimate; HISPANIC OR LATINO AND RACE - T...,2012 - 2010 Statistical Significance; HISPANIC...,2009 Estimate; HISPANIC OR LATINO AND RACE - T...,2012 - 2009 Statistical Significance; HISPANIC...,2008 Estimate; HISPANIC OR LATINO AND RACE - T...,2012 - 2008 Statistical Significance; HISPANIC...
1,310M100US10180,10180,"Abilene, TX Metro Area",167800,165858,,164941,*,(X),,...,,70553,69887,,69711,,(X),,(X),
2,310M100US10420,10420,"Akron, OH Metro Area",702262,701456,c,702951,c,(X),,...,,312864,313023,,312600,,(X),,(X),
3,310M100US10500,10500,"Albany, GA Metro Area",155019,161617,*,162659,*,(X),,...,,65861,66150,,67169,,(X),,(X),
4,310M100US10580,10580,"Albany-Schenectady-Troy, NY Metro Area",874646,871478,c,870832,c,(X),,...,,394602,395203,,393443,,(X),,(X),


In [3]:
# strip the " Metro Area" suffix from every geography label, e.g.
# "Abilene, TX Metro Area" -> "Abilene, TX"
# regex=False: the pattern is a plain literal, so match it literally regardless
# of the pandas version's default for `regex`
dfD12['GEO.display-label'] = dfD12['GEO.display-label'].str.replace(' Metro Area', '', regex=False)
dfD12['GEO.display-label'].unique()

array(['Geography', 'Abilene, TX', 'Akron, OH', 'Albany, GA',
       'Albany-Schenectady-Troy, NY', 'Albuquerque, NM', 'Alexandria, LA',
       'Allentown-Bethlehem-Easton, PA-NJ', 'Altoona, PA', 'Amarillo, TX',
       'Ames, IA', 'Anchorage, AK', 'Anderson, IN', 'Anderson, SC',
       'Ann Arbor, MI', 'Anniston-Oxford, AL', 'Appleton, WI',
       'Asheville, NC', 'Athens-Clarke County, GA',
       'Atlanta-Sandy Springs-Marietta, GA',
       'Atlantic City-Hammonton, NJ', 'Auburn-Opelika, AL',
       'Augusta-Richmond County, GA-SC',
       'Austin-Round Rock-San Marcos, TX', 'Bakersfield-Delano, CA',
       'Baltimore-Towson, MD', 'Bangor, ME', 'Barnstable Town, MA',
       'Baton Rouge, LA', 'Battle Creek, MI', 'Bay City, MI',
       'Beaumont-Port Arthur, TX', 'Bellingham, WA', 'Bend, OR',
       'Billings, MT', 'Binghamton, NY', 'Birmingham-Hoover, AL',
       'Bismarck, ND', 'Blacksburg-Christiansburg-Radford, VA',
       'Bloomington, IN', 'Bloomington-Normal, IL',
       'Boise

In [4]:
# Read the city lookup file and group its rows by ACS metro-area name.
grpCities = pd.read_csv('cities.csv').groupby('ACSMetro')
# Replace the raw table with per-metro counts: the result is indexed by
# 'ACSMetro' and holds the number of non-null entries in each other column.
dfCities = grpCities.count()
dfCities

Unnamed: 0_level_0,BEAMetro,WikipediaMetro,City,State
ACSMetro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Atlanta-Sandy Springs-Marietta, GA",1,1,1,1
"Baltimore-Towson, MD",1,1,1,1
"Boston-Cambridge-Quincy, MA-NH",1,1,1,1
"Buffalo-Niagara Falls, NY",1,1,1,1
"Charlotte-Gastonia-Rock Hill, NC-SC",1,1,1,1
"Chicago-Joliet-Naperville, IL-IN-WI",1,1,1,1
"Cincinnati-Middletown, OH-KY-IN",1,1,1,1
"Cleveland-Elyria-Mentor, OH",1,1,1,1
"Columbus, OH",1,1,1,1
"Dallas-Fort Worth-Arlington, TX",2,2,2,2


In [5]:
# use the grouped city DataFrame to drop unneeded rows from dfD12: the inner
# merge keeps only rows whose 'GEO.display-label' matches an 'ACSMetro' entry
# in dfCities
# NOTE: the merge also appends dfCities' count columns (BEAMetro,
# WikipediaMetro, City, State) to dfD12
dfD12 = pd.merge(dfD12, dfCities, how='inner', left_on='GEO.display-label', right_on='ACSMetro')
dfD12.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC05_VC03,HC06_VC03,HC07_VC03,...,HC04_VC98,HC05_VC98,HC06_VC98,HC07_VC98,HC08_VC98,HC09_VC98,BEAMetro,WikipediaMetro,City,State
0,310M100US12060,12060,"Atlanta-Sandy Springs-Marietta, GA",5442113,5365726,*,5288302,*,(X),,...,2168806,*,(X),,(X),,1,1,1,1
1,310M100US12580,12580,"Baltimore-Towson, MD",2753149,2729110,c,2714183,c,(X),,...,1133012,*,(X),,(X),,1,1,1,1
2,310M100US14460,14460,"Boston-Cambridge-Quincy, MA-NH",4640802,4591112,c,4560689,c,(X),,...,1883591,,(X),,(X),,1,1,1,1
3,310M100US15380,15380,"Buffalo-Niagara Falls, NY",1134210,1134039,c,1135198,c,(X),,...,519128,,(X),,(X),,1,1,1,1
4,310M100US16740,16740,"Charlotte-Gastonia-Rock Hill, NC-SC",1831084,1795472,c,1764313,c,(X),,...,741184,*,(X),,(X),,1,1,1,1


In [6]:
# Load the ACS metadata file. It is read without a header row (header=None),
# so the two columns are named explicitly: 'col' and 'description'.
dfD12Meta = pd.read_csv(
    'raw/ACS_12_1YR_CP05_metadata.csv',
    names=['col', 'description'],
    header=None,
)
dfD12Meta.head(10)

Unnamed: 0,col,description
0,GEO.id,Id
1,GEO.id2,Id2
2,GEO.display-label,Geography
3,HC01_VC03,2012 Estimate; SEX AND AGE - Total population
4,HC02_VC03,2011 Estimate; SEX AND AGE - Total population
5,HC03_VC03,2012 - 2011 Statistical Significance; SEX AND ...
6,HC04_VC03,2010 Estimate; SEX AND AGE - Total population
7,HC05_VC03,2012 - 2010 Statistical Significance; SEX AND ...
8,HC06_VC03,2009 Estimate; SEX AND AGE - Total population
9,HC07_VC03,2012 - 2009 Statistical Significance; SEX AND ...


In [7]:
# create a boolean 'droprow' col that flags the multi-year rows in the metadata
# file, i.e. rows whose description contains a "YYYY - YYYY" year pair
# (raw string: '\d' inside a plain string literal is an invalid escape sequence
# and triggers a warning on recent Python versions)
dfD12Meta['droprow'] = dfD12Meta['description'].str.contains(r'\d\d\d\d - \d\d\d\d')
dfD12Meta.head(10)

Unnamed: 0,col,description,droprow
0,GEO.id,Id,False
1,GEO.id2,Id2,False
2,GEO.display-label,Geography,False
3,HC01_VC03,2012 Estimate; SEX AND AGE - Total population,False
4,HC02_VC03,2011 Estimate; SEX AND AGE - Total population,False
5,HC03_VC03,2012 - 2011 Statistical Significance; SEX AND ...,True
6,HC04_VC03,2010 Estimate; SEX AND AGE - Total population,False
7,HC05_VC03,2012 - 2010 Statistical Significance; SEX AND ...,True
8,HC06_VC03,2009 Estimate; SEX AND AGE - Total population,False
9,HC07_VC03,2012 - 2009 Statistical Significance; SEX AND ...,True


In [8]:
# Keep only the metadata rows whose 'droprow' flag is False and, in the same
# step, trim the frame back to its 'col' and 'description' columns.
dfD12Meta = dfD12Meta.loc[dfD12Meta['droprow'].eq(False), ['col', 'description']]
dfD12Meta.head(10)

Unnamed: 0,col,description
0,GEO.id,Id
1,GEO.id2,Id2
2,GEO.display-label,Geography
3,HC01_VC03,2012 Estimate; SEX AND AGE - Total population
4,HC02_VC03,2011 Estimate; SEX AND AGE - Total population
6,HC04_VC03,2010 Estimate; SEX AND AGE - Total population
8,HC06_VC03,2009 Estimate; SEX AND AGE - Total population
10,HC08_VC03,2008 Estimate; SEX AND AGE - Total population
12,HC01_VC04,2012 Estimate; SEX AND AGE - Male
13,HC02_VC04,2011 Estimate; SEX AND AGE - Male


In [9]:
# Restrict dfD12 to the columns still listed in the filtered metadata ('col'),
# which discards every other column.
dfD12 = dfD12[dfD12Meta['col'].tolist()]
dfD12.head(5)

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC04_VC03,HC06_VC03,HC08_VC03,HC01_VC04,HC02_VC04,...,HC01_VC96,HC02_VC96,HC04_VC96,HC06_VC96,HC08_VC96,HC01_VC98,HC02_VC98,HC04_VC98,HC06_VC98,HC08_VC98
0,310M100US12060,12060,"Atlanta-Sandy Springs-Marietta, GA",5442113,5365726,5288302,(X),(X),48.7,48.9,...,1.6,1.7,1.4,(X),(X),2175303,2169873,2168806,(X),(X)
1,310M100US12580,12580,"Baltimore-Towson, MD",2753149,2729110,2714183,(X),(X),48.1,48.1,...,2.2,2.2,2.0,(X),(X),1139559,1138113,1133012,(X),(X)
2,310M100US14460,14460,"Boston-Cambridge-Quincy, MA-NH",4640802,4591112,4560689,(X),(X),48.5,48.4,...,1.5,1.6,1.5,(X),(X),1885961,1891039,1883591,(X),(X)
3,310M100US15380,15380,"Buffalo-Niagara Falls, NY",1134210,1134039,1135198,(X),(X),48.4,48.4,...,1.6,1.7,1.8,(X),(X),518979,518757,519128,(X),(X)
4,310M100US16740,16740,"Charlotte-Gastonia-Rock Hill, NC-SC",1831084,1795472,1764313,(X),(X),48.5,48.4,...,2.1,1.6,1.8,(X),(X),747886,742559,741184,(X),(X)
