In [1]:
import pandas as pd

Census bureau definition of fips_class_codes:

    fips_class_codes
    H1:  identifies an active county or statistically equivalent entity that does not qualify under subclass C7 or H6.
    H4:  identifies a legally defined inactive or nonfunctioning county or statistically equivalent entity that does not qualify under subclass H6.
    H5:  identifies census areas in Alaska, a statistical county equivalent entity.
    H6:  identifies a county or statistically equivalent entity that is areally coextensive or governmentally consolidated with an incorporated place, part of an incorporated place, or a consolidated city. 
    C7:  identifies an incorporated place that is an independent city; that is, it also serves as a county equivalent because it is not part of any county, and a minor civil division (MCD) equivalent because it is not part of any MCD.

Wikipedia says:
    
    As of 2013, the United States has 3,007 counties and 137 county
    equivalents for a total of 3,144 counties and county equivalents.

Let's look the different types of county equivalents within the FIPS data

In [2]:
# county_fips data from https://www.census.gov/geo/reference/codes/cou.html
census = pd.read_csv('national_county.txt',sep=',',header=None)
census.columns = ['state_abbr', 'state_fips', 'county_fips', 'county_name', 'fips_class_code']
print(census.shape)
census.head()

(3235, 5)


Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1
3,AL,1,7,Bibb County,H1
4,AL,1,9,Blount County,H1


In [3]:
# look at some states data
census[census['state_abbr'] == 'CT']

Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
308,CT,9,1,Fairfield County,H4
309,CT,9,3,Hartford County,H4
310,CT,9,5,Litchfield County,H4
311,CT,9,7,Middlesex County,H4
312,CT,9,9,New Haven County,H4
313,CT,9,11,New London County,H4
314,CT,9,13,Tolland County,H4
315,CT,9,15,Windham County,H4


In [4]:
# Look at independent cities
# should include Baltimore, Maryland; Carson City, Nevada; St. Louis, Missouri; and all 38 cities in Virginia
indCities = census[census['fips_class_code'] == 'C7']

indCities.head()

Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
1216,MD,24,510,Baltimore city,C7
1597,MO,29,510,St. Louis city,C7
1763,NV,32,510,Carson City,C7
2915,VA,51,510,Alexandria city,C7
2916,VA,51,515,Bedford city,C7


In [5]:
# import Guardian data
# the excel file has been converted from .xls to .xlsx
# get all data within the 'FULL DATA' worksheet within the Excel Table
# and pass it into a DataFrame
guardian = pd.read_excel('US_elect_county.xlsx',
                      sheetname = 'FULL DATA',
                      header = 0,
                      parse_cols = "A, D, E, K, T, AF", 
                      convert_float = True)
 
guardian.columns = ['state_abbr','county_fips_combined', 'county_name','votes_total','votes_Dem','votes_Repub']

In [6]:
# view votes_total cast results by state, county_fips, county_name, and candidate
guardian.head()

Unnamed: 0,state_abbr,county_fips_combined,county_name,votes_total,votes_Dem,votes_Repub
0,AK,0,Alaska,220596,91696,121234
1,AK,2000,Alaska,220596,91696,121234
2,AL,0,Alabama,2064699,793620,1252453
3,AL,1001,Autauga,23909,6354,17366
4,AL,1003,Baldwin,84988,18329,65772


Guardian data summarizes county-level data to the state level within the same table. We need to create two dataframes that segregate county- and state-level data

In [7]:
# state-level data
state_level_guardian = guardian[guardian['county_fips_combined'] == 0]
state_level_guardian.drop(state_level_guardian.columns[[1,2]], axis=1, inplace=True)
print(state_level_guardian.shape)
state_level_guardian.head(n=10)

(51, 4)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,state_abbr,votes_total,votes_Dem,votes_Repub
0,AK,220596,91696,121234
2,AL,2064699,793620,1252453
70,AR,1062831,391953,643717
146,AZ,2041519,900081,1107130
162,CA,10538656,6241648,4046524
221,CO,2419698,5807,1238490
286,CT,1562187,631432,912531
456,DC,243348,222332,17337
458,DE,413844,242547,165476
462,FL,8470205,4162081,4235270


In [8]:
# county-level data
county_level_guardian = guardian[guardian['county_fips_combined'] != 0]
print(county_level_guardian.shape)
county_level_guardian.head(n=10)

(4588, 6)


Unnamed: 0,state_abbr,county_fips_combined,county_name,votes_total,votes_Dem,votes_Repub
1,AK,2000,Alaska,220596,91696,121234
3,AL,1001,Autauga,23909,6354,17366
4,AL,1003,Baldwin,84988,18329,65772
5,AL,1005,Barbour,11459,5873,5539
6,AL,1007,Bibb,8391,2200,6131
7,AL,1009,Blount,23980,2961,20741
8,AL,1011,Bullock,5318,4058,1250
9,AL,1013,Butler,9483,4367,5081
10,AL,1015,Calhoun,46240,15500,30272
11,AL,1017,Chambers,14562,6853,7596


Guardian 'county_fips_combined' column seems to be a combination of Census 'state_fips' + numerical placeholder '##' + 'county_fips'. We'll need to match the state_fips from census data to county level Guardian data

In [9]:
# create DataFrame with just county_name and county_fips
county_FIPS_census = census[[1,2,3]].drop_duplicates()
print(county_FIPS_census.shape)
county_FIPS_census.head()

(3235, 3)


Unnamed: 0,state_fips,county_fips,county_name
0,1,1,Autauga County
1,1,3,Baldwin County
2,1,5,Barbour County
3,1,7,Bibb County
4,1,9,Blount County


In [10]:
# create a combined FIPS column to emulate Guardian data
#county_FIPS_census.loc[:,'county_fips_combined'] = pd.Series('', index=county_FIPS_census.index)

combined_fips = []

for index, row in county_FIPS_census.iterrows():
    fips_county_len = len(str(row['county_fips']))
    if (fips_county_len < 2):
        combined_fips.append(int(str(row['state_fips']) + '00' + str(row['county_fips'])))
    elif (fips_county_len < 3):
        combined_fips.append(int(str(row['state_fips']) + '0' + str(row['county_fips'])))
    elif (fips_county_len < 4):
        combined_fips.append(int(str(row['state_fips']) + str(row['county_fips'])))

county_FIPS_census['county_fips_combined'] = combined_fips
county_FIPS_census.head()

Unnamed: 0,state_fips,county_fips,county_name,county_fips_combined
0,1,1,Autauga County,1001
1,1,3,Baldwin County,1003
2,1,5,Barbour County,1005
3,1,7,Bibb County,1007
4,1,9,Blount County,1009


In [11]:
combined = pd.merge(county_FIPS_census, county_level_guardian, on='county_fips_combined')
print(combined.shape)
combined.head()

(4587, 9)


Unnamed: 0,state_fips,county_fips,county_name_x,county_fips_combined,state_abbr,county_name_y,votes_total,votes_Dem,votes_Repub
0,1,1,Autauga County,1001,AL,Autauga,23909,6354,17366
1,1,3,Baldwin County,1003,AL,Baldwin,84988,18329,65772
2,1,5,Barbour County,1005,AL,Barbour,11459,5873,5539
3,1,7,Bibb County,1007,AL,Bibb,8391,2200,6131
4,1,9,Blount County,1009,AL,Blount,23980,2961,20741


In [12]:
# look at some states
combined[combined['state_abbr'] == 'DC']

Unnamed: 0,state_fips,county_fips,county_name_x,county_fips_combined,state_abbr,county_name_y,votes_total,votes_Dem,votes_Repub
451,11,1,District of Columbia,11001,DC,District of Columbia,243348,222332,17337


In [13]:
# calculate percentage of total vote per major candidates
combined['per_Dem'] = combined['votes_Dem'] / combined['votes_total']
combined['per_Repub'] = combined['votes_Repub'] / combined['votes_total']
combined.head()

Unnamed: 0,state_fips,county_fips,county_name_x,county_fips_combined,state_abbr,county_name_y,votes_total,votes_Dem,votes_Repub,per_Dem,per_Repub
0,1,1,Autauga County,1001,AL,Autauga,23909,6354,17366,0.265758,0.726337
1,1,3,Baldwin County,1003,AL,Baldwin,84988,18329,65772,0.215666,0.773897
2,1,5,Barbour County,1005,AL,Barbour,11459,5873,5539,0.512523,0.483376
3,1,7,Bibb County,1007,AL,Bibb,8391,2200,6131,0.262186,0.730664
4,1,9,Blount County,1009,AL,Blount,23980,2961,20741,0.123478,0.864929


In [14]:
combined.to_csv('US presidential election results by county.csv',sep=',')