# Census Data
Census data was collected from statsamerica.org (http://www.statsamerica.org/USCP/) for 2017. This notebook processes statistics for particular census items of interest. These will be used as the features for modeling. the features are assembled into a dataframe and exported as a csv file.

In [1]:
import pandas as pd

In [2]:
# import county code file

df_counties = pd.read_csv('data/us_county_codes.csv')

In [3]:
# import census data csv files

df_county_size = pd.read_csv('data/county_size.csv', encoding = "ISO-8859-1")

df_college_rate = pd.read_csv('data/statsamerica_data/college_rate.csv')
df_growth = pd.read_csv('data/statsamerica_data/growth.csv')
df_hh_income = pd.read_csv('data/statsamerica_data/hh_income.csv')
df_households = pd.read_csv('data/statsamerica_data/households.csv')
df_hs_rate = pd.read_csv('data/statsamerica_data/hs_rate.csv')
df_labor_force = pd.read_csv('data/statsamerica_data/labor_force.csv')
df_median_age = pd.read_csv('data/statsamerica_data/median_age.csv')
df_per_cap_income = pd.read_csv('data/statsamerica_data/per_cap_income.csv')
df_population = pd.read_csv('data/statsamerica_data/population.csv')
df_poverty_rate = pd.read_csv('data/statsamerica_data/poverty_rate.csv')
df_unempl_rate = pd.read_csv('data/statsamerica_data/unempl_rate.csv')
df_hisp_pop = pd.read_csv('data/statsamerica_data/hisp_pop.csv')
df_white_pop = pd.read_csv('data/statsamerica_data/white_pop.csv')

In [4]:
df_county_size.head()
df_county_size = df_county_size[['GCT_STUB.target-geo-id2', 'SUBHD0303']]

In [5]:
df_county_size.rename(columns={'GCT_STUB.target-geo-id2': 'code', 'SUBHD0303': 'land_area'}, inplace=True)
df_county_size = df_county_size.iloc[2:, :]

In [6]:
# reduce to two columns for each dataframe

df_college_rate = df_college_rate[['code', 'college_rate']]
df_growth = df_growth[['code', 'growth']]
df_hh_income = df_hh_income[['code', 'hh_income']]
df_households = df_households[['code', 'households']]
df_hs_rate = df_hs_rate[['code', 'hs_rate']]
df_labor_force = df_labor_force[['code', 'labor_force']]
df_median_age = df_median_age[['code', 'median_age']]
df_per_cap_income = df_per_cap_income[['code', 'per_cap_income']]
df_population = df_population[['code', 'population']]
df_poverty_rate = df_poverty_rate[['code', 'poverty_rate']]
df_unempl_rate = df_unempl_rate[['code', 'unempl_rate']]

df_hisp_pop = df_hisp_pop[['code', 'hisp_pop']]
df_white_pop = df_white_pop[['code', 'white_pop']]

In [7]:
# setup dataframe table

df_counties.head()

Unnamed: 0,FIPS,Name,State
0,1001,Autauga,AL
1,1003,Baldwin,AL
2,1005,Barbour,AL
3,1007,Bibb,AL
4,1009,Blount,AL


In [8]:
# remove trailing whitespace from column titles
df_counties.columns = df_counties.columns.str.strip()

In [9]:
df_counties[df_counties['FIPS'] == 48201]

Unnamed: 0,FIPS,Name,State
2620,48201,Harris,TX


In [11]:
df_population.head()

Unnamed: 0,code,population
0,6037,10163507
1,17031,5211263
2,48201,4652980
3,4013,4307033
4,6073,3337685


In [12]:
df_county_data = df_counties

In [13]:
# remove US territories
df_county_data = df_counties[(df_counties['State'] != 'PR') & 
            (df_counties['State'] != 'VI') & 
            (df_counties['State'] != 'MP') & 
            (df_counties['State'] != 'GU') &
            (df_counties['State'] != 'AS')]

In [14]:
df_county_data.rename(columns={'FIPS': 'code'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [15]:
df_county_size['code'] = df_county_size['code'].apply(lambda x: int(x))

In [16]:
# merge datasets

df_county_data = pd.merge(df_county_data, df_population, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_county_size, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_growth, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_households, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_median_age, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_hisp_pop, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_white_pop, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_per_cap_income, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_hh_income, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_labor_force, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_unempl_rate, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_poverty_rate, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_hs_rate, on='code', how='left')
df_county_data = pd.merge(df_county_data, df_college_rate, on='code', how='left')

In [17]:
df_county_data.head()

Unnamed: 0,code,Name,State,population,land_area,growth,households,median_age,hisp_pop,white_pop,per_cap_income,hh_income,labor_force,unempl_rate,poverty_rate,hs_rate,college_rate
0,1001,Autauga,AL,55504,594.44,1.70%,20800,37.8,1416,42311,"$39,721","$54,487",25909,3.9,13.5,87.60%,24.60%
1,1003,Baldwin,AL,212628,1589.78,16.70%,75149,42.3,8712,172441,"$41,286","$56,460",91567,4.0,11.7,90.00%,29.50%
2,1005,Barbour,AL,25270,884.88,-8.00%,9122,38.7,1147,12430,"$31,788","$32,884",8236,5.9,29.9,73.80%,12.90%
3,1007,Bibb,AL,22668,622.58,-1.10%,7048,40.2,502,17370,"$29,264","$43,079",8506,4.4,20.1,80.70%,12.00%
4,1009,Blount,AL,58013,644.78,1.20%,20619,40.8,5036,55073,"$31,470","$47,213",24494,4.0,14.1,80.00%,13.00%


In [18]:
# define function to reformat monetary and count values

def clean_num(num_str):
    try:
        return int(str(num_str).replace('$','').replace(',',''))
    except:
        pass

In [19]:
# define function to reformat rate values

def clean_rate(rate_str):
    try:
        return float(str(rate_str).replace('%', '').strip())
    except:
        pass

In [20]:
# reformat numerical data

df_county_data['population'] = df_county_data['population'].apply(clean_num)
df_county_data['households'] = df_county_data['households'].apply(clean_num)
df_county_data['per_cap_income'] = df_county_data['per_cap_income'].apply(clean_num)
df_county_data['hh_income'] = df_county_data['hh_income'].apply(clean_num)
df_county_data['labor_force'] = df_county_data['labor_force'].apply(clean_num)
df_county_data['hisp_pop'] = df_county_data['hisp_pop'].apply(clean_num)
df_county_data['white_pop'] = df_county_data['white_pop'].apply(clean_num)

df_county_data['growth'] = df_county_data['growth'].apply(clean_rate)
df_county_data['hs_rate'] = df_county_data['hs_rate'].apply(clean_rate)
df_county_data['college_rate'] = df_county_data['college_rate'].apply(clean_rate)

In [21]:
df_county_data.head()

Unnamed: 0,code,Name,State,population,land_area,growth,households,median_age,hisp_pop,white_pop,per_cap_income,hh_income,labor_force,unempl_rate,poverty_rate,hs_rate,college_rate
0,1001,Autauga,AL,55504.0,594.44,1.7,20800.0,37.8,1416.0,42311.0,39721.0,54487.0,25909.0,3.9,13.5,87.6,24.6
1,1003,Baldwin,AL,212628.0,1589.78,16.7,75149.0,42.3,8712.0,172441.0,41286.0,56460.0,91567.0,4.0,11.7,90.0,29.5
2,1005,Barbour,AL,25270.0,884.88,-8.0,9122.0,38.7,1147.0,12430.0,31788.0,32884.0,8236.0,5.9,29.9,73.8,12.9
3,1007,Bibb,AL,22668.0,622.58,-1.1,7048.0,40.2,502.0,17370.0,29264.0,43079.0,8506.0,4.4,20.1,80.7,12.0
4,1009,Blount,AL,58013.0,644.78,1.2,20619.0,40.8,5036.0,55073.0,31470.0,47213.0,24494.0,4.0,14.1,80.0,13.0


In [22]:
# output df_county_data file

df_county_data.to_csv('data/county_out.csv')