In [1]:
import sys
import os
import re
import pandas as pd
import numpy as np

In [3]:
county_facts = pd.read_csv('county_facts.csv')

In [29]:
county_facts.set_index(county_facts['fips'])
county_facts

Unnamed: 0,fips,area_name,state_abbreviation,PST045214,PST040210,PST120214,POP010210,AGE135214,AGE295214,AGE775214,...,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030214,LND110210,POP060210
0,0,United States,,318857056,308758105,3.3,308745538,6.2,23.1,14.5,...,8.3,28.8,5319456312,4174286516,3917663456,12990,613795732,1046363,3531905.43,87.4
1,1000,Alabama,,4849377,4780127,1.4,4779736,6.1,22.8,15.3,...,1.2,28.1,112858843,52252752,57344851,12364,6426342,13369,50645.33,94.4
2,1001,Autauga County,AL,55395,54571,1.5,54571,6.0,25.2,13.8,...,0.7,31.7,0,0,598175,12003,88157,131,594.44,91.8
3,1003,Baldwin County,AL,200111,182265,9.8,182265,5.6,22.2,18.7,...,1.3,27.3,1410273,0,2966489,17166,436955,1384,1589.78,114.6
4,1005,Barbour County,AL,26887,27457,-2.1,27457,5.7,21.2,16.5,...,0.0,27.0,0,0,188337,6334,0,8,884.88,31.0
5,1007,Bibb County,AL,22506,22919,-1.8,22915,5.3,21.0,14.8,...,0.0,0.0,0,0,124707,5804,10757,19,622.58,36.8
6,1009,Blount County,AL,57719,57322,0.7,57322,6.1,23.6,17.0,...,0.0,23.2,341544,0,319700,5622,20941,3,644.78,88.9
7,1011,Bullock County,AL,10764,10915,-1.4,10914,6.3,21.4,14.9,...,0.0,38.8,0,0,43810,3995,3670,1,622.81,17.5
8,1013,Butler County,AL,20296,20946,-3.1,20947,6.1,23.6,18.0,...,0.0,0.0,399132,56712,229277,11326,28427,2,776.83,27.0
9,1015,Calhoun County,AL,115916,118586,-2.3,118572,5.7,22.2,16.0,...,0.5,24.7,2679991,0,1542981,13678,186533,114,605.87,195.7


The cell below prints out the definitions matching each county fact abbreviation.

In [26]:
county_facts_dictionary = open('county_facts_dictionary.csv', 'r')
lines = county_facts_dictionary.readlines()
for line in lines:
    print(line)

column_name,description

PST045214,"Population, 2014 estimate"

PST040210,"Population, 2010 (April 1) estimates base"

PST120214,"Population, percent change - April 1, 2010 to July 1, 2014"

POP010210,"Population, 2010"

AGE135214,"Persons under 5 years, percent, 2014"

AGE295214,"Persons under 18 years, percent, 2014"

AGE775214,"Persons 65 years and over, percent, 2014"

SEX255214,"Female persons, percent, 2014"

RHI125214,"White alone, percent, 2014"

RHI225214,"Black or African American alone, percent, 2014"

RHI325214,"American Indian and Alaska Native alone, percent, 2014"

RHI425214,"Asian alone, percent, 2014"

RHI525214,"Native Hawaiian and Other Pacific Islander alone, percent, 2014"

RHI625214,"Two or More Races, percent, 2014"

RHI725214,"Hispanic or Latino, percent, 2014"

RHI825214,"White alone, not Hispanic or Latino, percent, 2014"

POP715213,"Living in same house 1 year & over, percent, 2009-2013"

POP645213,"Foreign born persons, percent, 2009-2013"

POP815213,"Langua

Our country fact data are on different scales and could cause us to reach a poor fit for the models we will train. Since we cannot assume a priori that all the features are normally distributed we will instead scale each feature using: $x\rightarrow\frac{x-\min{x}}{\max{x}-\min{x}}$

In [27]:
cols = county_facts.columns.values
print(cols)

['fips' 'area_name' 'state_abbreviation' 'PST045214' 'PST040210'
 'PST120214' 'POP010210' 'AGE135214' 'AGE295214' 'AGE775214' 'SEX255214'
 'RHI125214' 'RHI225214' 'RHI325214' 'RHI425214' 'RHI525214' 'RHI625214'
 'RHI725214' 'RHI825214' 'POP715213' 'POP645213' 'POP815213' 'EDU635213'
 'EDU685213' 'VET605213' 'LFE305213' 'HSG010214' 'HSG445213' 'HSG096213'
 'HSG495213' 'HSD410213' 'HSD310213' 'INC910213' 'INC110213' 'PVY020213'
 'BZA010213' 'BZA110213' 'BZA115213' 'NES010213' 'SBO001207' 'SBO315207'
 'SBO115207' 'SBO215207' 'SBO515207' 'SBO415207' 'SBO015207' 'MAN450207'
 'WTN220207' 'RTN130207' 'RTN131207' 'AFN120207' 'BPS030214' 'LND110210'
 'POP060210']


We can ignore the first 3 columns, and only look at the first ten features for now.

In [28]:
cols = cols[3:13]
cols

array(['PST045214', 'PST040210', 'PST120214', 'POP010210', 'AGE135214',
       'AGE295214', 'AGE775214', 'SEX255214', 'RHI125214', 'RHI225214'], dtype=object)

In [24]:
cf = county_facts[cols].copy() # A partial copy of county facts for us to modify.
for c in cols:
    c_max = county_facts[c].max()
    c_min = county_facts[c].min()
    #print(c_max)
    #print(c_min)
    cf[c] = (cf[c] - c_min)/(c_max - c_min)
cf

Unnamed: 0,POP010210,AGE135214,AGE295214,AGE775214,SEX255214,RHI125214,RHI225214,RHI325214,RHI425214,RHI525214
0,1.000000,0.452555,0.550000,0.274102,0.894366,0.779456,0.155112,0.013015,0.127358,0.004141
1,0.015481,0.445255,0.542857,0.289225,0.906690,0.701913,0.313749,0.007592,0.030660,0.002070
2,0.000176,0.437956,0.600000,0.260870,0.904930,0.784491,0.219741,0.005423,0.025943,0.002070
3,0.000590,0.408759,0.528571,0.353497,0.901408,0.877140,0.112808,0.007592,0.021226,0.002070
4,0.000089,0.416058,0.504762,0.311909,0.820423,0.505539,0.559342,0.006508,0.011792,0.004141
5,0.000074,0.386861,0.500000,0.279773,0.808099,0.768379,0.259694,0.004338,0.004717,0.002070
6,0.000185,0.445255,0.561905,0.321361,0.889085,0.966767,0.021152,0.006508,0.007075,0.002070
7,0.000035,0.459854,0.509524,0.281664,0.797535,0.270896,0.823737,0.008677,0.007075,0.014493
8,0.000068,0.445255,0.561905,0.340265,0.943662,0.542800,0.517039,0.004338,0.021226,0.000000
9,0.000384,0.416058,0.528571,0.302457,0.911972,0.763343,0.247944,0.005423,0.021226,0.002070
