In [71]:
import pandas as pd
import numpy as np

In [72]:
# Load the county-level presidential returns (2000-2016) into a DataFrame.
df = pd.read_csv(
    '../data_sources/countypres_2000-2016.csv',
    # Run every FIPS value through str() so the column stays text (object dtype)
    # instead of being parsed as a number.
    converters={'FIPS': str},
)
df

Unnamed: 0,year,state,state_po,county,FIPS,office,candidate,party,candidatevotes,totalvotes,version
0,2000,Alabama,AL,Autauga,1001,President,Al Gore,democrat,4942.0,17208,20191203
1,2000,Alabama,AL,Autauga,1001,President,George W. Bush,republican,11993.0,17208,20191203
2,2000,Alabama,AL,Autauga,1001,President,Ralph Nader,green,160.0,17208,20191203
3,2000,Alabama,AL,Autauga,1001,President,Other,,113.0,17208,20191203
4,2000,Alabama,AL,Baldwin,1003,President,Al Gore,democrat,13997.0,56480,20191203
...,...,...,...,...,...,...,...,...,...,...,...
50519,2016,Alaska,AK,District 40,2040,President,Donald Trump,republican,1377.0,4610,20191203
50520,2016,Alaska,AK,District 40,2040,President,Other,,895.0,4610,20191203
50521,2016,Alaska,,District 99,2099,President,Hillary Clinton,democrat,274.0,5056,20191203
50522,2016,Alaska,,District 99,2099,President,Donald Trump,republican,40.0,5056,20191203


In [73]:
# Inspect the dtype pandas assigned to each column of the loaded frame.
df.dtypes

year                int64
state              object
state_po           object
county             object
FIPS               object
office             object
candidate          object
party              object
candidatevotes    float64
totalvotes          int64
version             int64
dtype: object

In [74]:
# Goal: end up with one row per county per election year.
# First step: list the distinct party labels present in the data.
df.party.unique()
# Three named parties plus NaN are returned, so the rows can be split into
# four party buckets (democrat, republican, green, and "other" for NaN).

array(['democrat', 'republican', 'green', nan], dtype=object)

In [75]:
# Spread the long-format rows into wide columns: every party bucket gets a
# "<bucket>_candidate" and a "<bucket>_votes" column that is filled only on
# the rows belonging to that bucket (all other rows stay NaN).

# Row selector for each bucket; rows with a missing party count as "other".
party_masks = {
    'democrat': df['party'] == 'democrat',
    'republican': df['party'] == 'republican',
    'green': df['party'] == 'green',
    'other': df['party'].isnull(),
}

# Dict order is insertion order, so columns are created in the sequence
# democrat, republican, green, other (candidate first, then votes).
for bucket, in_bucket in party_masks.items():
    df.loc[in_bucket, bucket + '_candidate'] = df['candidate']
    df.loc[in_bucket, bucket + '_votes'] = df['candidatevotes']

df

Unnamed: 0,year,state,state_po,county,FIPS,office,candidate,party,candidatevotes,totalvotes,version,democrat_candidate,democrat_votes,republican_candidate,republican_votes,green_candidate,green_votes,other_candidate,other_votes
0,2000,Alabama,AL,Autauga,1001,President,Al Gore,democrat,4942.0,17208,20191203,Al Gore,4942.0,,,,,,
1,2000,Alabama,AL,Autauga,1001,President,George W. Bush,republican,11993.0,17208,20191203,,,George W. Bush,11993.0,,,,
2,2000,Alabama,AL,Autauga,1001,President,Ralph Nader,green,160.0,17208,20191203,,,,,Ralph Nader,160.0,,
3,2000,Alabama,AL,Autauga,1001,President,Other,,113.0,17208,20191203,,,,,,,Other,113.0
4,2000,Alabama,AL,Baldwin,1003,President,Al Gore,democrat,13997.0,56480,20191203,Al Gore,13997.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50519,2016,Alaska,AK,District 40,2040,President,Donald Trump,republican,1377.0,4610,20191203,,,Donald Trump,1377.0,,,,
50520,2016,Alaska,AK,District 40,2040,President,Other,,895.0,4610,20191203,,,,,,,Other,895.0
50521,2016,Alaska,,District 99,2099,President,Hillary Clinton,democrat,274.0,5056,20191203,Hillary Clinton,274.0,,,,,,
50522,2016,Alaska,,District 99,2099,President,Donald Trump,republican,40.0,5056,20191203,,,Donald Trump,40.0,,,,


In [76]:
# Collapse the per-candidate rows into a single row per (year, FIPS).
#
# GroupBy.first() keeps, column by column, the first non-null value found in
# each group; that is what pulls the democrat/republican/green/other columns
# (each filled on a different source row) onto one combined row.
# NOTE(review): if a (year, FIPS) group ever holds more than one row for the
# same party bucket, only the first value survives -- confirm against the data.
# The original long-format columns (candidate, party, candidatevotes) are kept
# and simply carry the first non-null value of the group.
by_year_and_fips = df.groupby(['year', 'FIPS'], as_index=False)
df = by_year_and_fips.first()
df

Unnamed: 0,year,FIPS,state,state_po,county,office,candidate,party,candidatevotes,totalvotes,version,democrat_candidate,democrat_votes,republican_candidate,republican_votes,green_candidate,green_votes,other_candidate,other_votes
0,2000,10001,Delaware,DE,Kent,President,Al Gore,democrat,22790.0,48247,20191203,Al Gore,22790.0,George W. Bush,24081.0,Ralph Nader,1082.0,Other,294.0
1,2000,10003,Delaware,DE,New Castle,President,Al Gore,democrat,127539.0,212995,20191203,Al Gore,127539.0,George W. Bush,78587.0,Ralph Nader,5767.0,Other,1102.0
2,2000,10005,Delaware,DE,Sussex,President,Al Gore,democrat,29739.0,66287,20191203,Al Gore,29739.0,George W. Bush,34620.0,Ralph Nader,1458.0,Other,470.0
3,2000,1001,Alabama,AL,Autauga,President,Al Gore,democrat,4942.0,17208,20191203,Al Gore,4942.0,George W. Bush,11993.0,Ralph Nader,160.0,Other,113.0
4,2000,1003,Alabama,AL,Baldwin,President,Al Gore,democrat,13997.0,56480,20191203,Al Gore,13997.0,George W. Bush,40872.0,Ralph Nader,1033.0,Other,578.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15774,2016,9009,Connecticut,CT,New Haven,President,Hillary Clinton,democrat,205609.0,379006,20191203,Hillary Clinton,205609.0,Donald Trump,159048.0,,,Other,14349.0
15775,2016,9011,Connecticut,CT,New London,President,Hillary Clinton,democrat,62278.0,123528,20191203,Hillary Clinton,62278.0,Donald Trump,54058.0,,,Other,7192.0
15776,2016,9013,Connecticut,CT,Tolland,President,Hillary Clinton,democrat,38506.0,77424,20191203,Hillary Clinton,38506.0,Donald Trump,34194.0,,,Other,4724.0
15777,2016,9015,Connecticut,CT,Windham,President,Hillary Clinton,democrat,21792.0,50823,20191203,Hillary Clinton,21792.0,Donald Trump,25747.0,,,Other,3284.0


In [84]:
# Sanity check: do the per-party vote columns add up to totalvotes?
# Missing vote counts are treated as 0 before summing.
vote_columns = ['democrat_votes', 'republican_votes', 'green_votes', 'other_votes']
recorded_votes = sum(df[column].fillna(0) for column in vote_columns)
diff = recorded_votes - df.totalvotes

# A minimum far below zero means at least one row records far fewer
# candidate votes than its totalvotes figure.
print(diff.min())

# Positional indices of every row whose parts do not sum to the total.
# np.where with a single argument returns a tuple holding one index array.
mismatch = diff != 0
diff_error = np.where(mismatch)
print(diff_error)


-74045.0
(array([10250, 12622, 13406, 14119, 14584, 14595, 14606, 15778]),)


In [85]:
# Show the rows whose per-party votes do not sum to totalvotes.
# These areas do show some discrepancies, but the differences appear minor.
# The first row is a case where data was not collected (its vote columns are empty).
# Also notice the "Statewide writein" rows for Connecticut; these may have to be eliminated.
df.iloc[diff_error]

Unnamed: 0,year,FIPS,state,state_po,county,office,candidate,party,candidatevotes,totalvotes,version,democrat_candidate,democrat_votes,republican_candidate,republican_votes,green_candidate,green_votes,other_candidate,other_votes
10250,2012,2099.0,Alaska,,District 99,President,Barack Obama,democrat,,74045,20191203,Barack Obama,,Mitt Romney,,,,Other,
12622,2012,,Connecticut,,Statewide writein,President,Barack Obama,democrat,39366.0,74045,20191203,Barack Obama,39366.0,Mitt Romney,30450.0,,,Other,1175.0
13406,2016,2099.0,Alaska,,District 99,President,Hillary Clinton,democrat,274.0,5056,20191203,Hillary Clinton,274.0,Donald Trump,40.0,,,Other,28.0
14119,2016,31103.0,Nebraska,NE,Keya Paha,President,Hillary Clinton,democrat,40.0,479,20191203,Hillary Clinton,40.0,Donald Trump,460.0,,,Other,19.0
14584,2016,4007.0,Arizona,AZ,Gila,President,Hillary Clinton,democrat,7003.0,36697,20191203,Hillary Clinton,7003.0,Donald Trump,14182.0,,,Other,1127.0
14595,2016,4009.0,Arizona,AZ,Graham,President,Hillary Clinton,democrat,3301.0,20306,20191203,Hillary Clinton,3301.0,Donald Trump,8025.0,,,Other,808.0
14606,2016,4011.0,Arizona,AZ,Greenlee,President,Hillary Clinton,democrat,1092.0,5192,20191203,Hillary Clinton,1092.0,Donald Trump,1892.0,,,Other,286.0
15778,2016,,Connecticut,,Statewide writein,President,Hillary Clinton,democrat,3017.0,5056,20191203,Hillary Clinton,3017.0,Donald Trump,648.0,,,Other,321.0


In [None]:
# TODO: How were the values for the statewide write-in rows calculated? Why were major-party candidates lumped in with the write-ins?

In [79]:
# calculate the mix of dem/rep votes