# Tasks

- Read in campaign outcome datasets for senate, house √
- Assess & clean senate and house data √
- Merge senate and house √

In [436]:
import pandas as pd
import numpy as np

# Gather Data

In [437]:
# Load the raw Senate and House results tables from their CSV exports.
senate = pd.read_csv(filepath_or_buffer='senate_results.csv')
house = pd.read_csv(filepath_or_buffer='house_results.csv')

In [340]:
# Preview the first five raw senate rows.
senate.head(n=5)

Unnamed: 0,1,STATE ABBREVIATION,STATE,D,FEC ID#,(I),CANDIDATE NAME (First),CANDIDATE NAME (Last),CANDIDATE NAME,TOTAL VOTES,...,RUNOFF VOTES,RUNOFF %,GENERAL VOTES,GENERAL %,GE RUNOFF ELECTION VOTES (LA),GE RUNOFF ELECTION % (LA),"COMBINED GE PARTY TOTALS (CT, NY, SC)","COMBINED % (CT, NY, SC)",GE WINNER INDICATOR,FOOTNOTES
0,2,AL,Alabama,S,S6AL00195,(I),Jeff,Sessions,"Sessions, Jeff",,...,,,795606.0,97.25%,,,,,W,
1,3,AL,Alabama,S,,,,Scattered,Scattered,,...,,,22484.0,17.37%,,,,,,
2,4,AL,Alabama,S,,,,,,Total State Votes:,...,,,818090.0,,,,,,,
3,5,,,,,,,,,,...,,,,,,,,,,
4,6,AK,Alaska,S,S4AK00214,,Dan,Sullivan,"Sullivan, Dan",,...,,,135445.0,47.96%,,,,,W,


In [341]:
# Preview the first five raw house rows.
house.head(n=5)

Unnamed: 0,1,STATE ABBREVIATION,STATE,D,FEC ID#,(I),CANDIDATE NAME (First),CANDIDATE NAME (Last),CANDIDATE NAME,TOTAL VOTES,...,RUNOFF VOTES,RUNOFF %,GENERAL VOTES,GENERAL %,GE RUNOFF ELECTION VOTES (LA),GE RUNOFF ELECTION % (LA),"COMBINED GE PARTY TOTALS (CT, NY, SC)","COMBINED % (CT, NY, SC)",GE WINNER INDICATOR,FOOTNOTES
0,2,AL,Alabama,1.0,H4AL01123,(I),Bradley,Byrne,"Byrne, Bradley",,...,,,103758.0,68.16%,,,,,W,
1,3,AL,Alabama,1.0,H4AL01156,,Burton R.,LeFlore,"LeFlore, Burton R.",,...,,,48278.0,31.71%,,,,,,
2,4,AL,Alabama,1.0,,,,,Scattered,,...,,,198.0,0.13%,,,,,,
3,5,AL,Alabama,1.0,,,,,,District Votes:,...,,,152234.0,,,,,,,
4,6,AL,Alabama,,,,,,,,...,,,,,,,,,,


# Assess Senate

In [342]:
# Exploratory checks on the raw senate data. Nothing here feeds the cleaning
# step; it works on a throwaway copy.
# NOTE: a notebook cell only renders its LAST bare expression. The intermediate
# bare expressions below are kept for running one at a time while exploring.

# make a copy we can manipulate
senate_assess = senate.copy()

# voteshare is a string (e.g. '97.25%'); convert to numeric so we can check that
# there are no values > 100 or suspicious values < 1.
# - expand=False makes str.extract return a Series; without it current pandas
#   returns a DataFrame, which pd.to_numeric rejects.
# - the raw-string pattern also matches single-digit values such as '5%', which
#   the previous two-digit-minimum pattern turned into NaN.
senate_assess['GENERAL %'] = pd.to_numeric(
    senate_assess['GENERAL %'].str.extract(r'(\d*\.?\d+)', expand=False)
)

# are candidates with < 1% inaccurate data (different percent formats?)
senate_assess[senate_assess['GENERAL %'] < 1]

# nope, looks like these are just people with very few votes

# row masks reused by the counts below
# (NaN != 'n/a' evaluates to True, so rows with a missing ID pass this mask)
has_fec_id = senate_assess['FEC ID#'] != 'n/a'
has_general_votes = senate_assess['GENERAL VOTES '].notna()

# how many candidates do we have in here?
senate_assess.loc[has_fec_id, 'FEC ID#'].nunique(dropna=False)

# do we have a lot of rows with duplicate candidate data?
# (the second figure is a ROW count; compare it with the number of rows that
# survive cleaning to see how many candidates appear more than once)
print('unique candidate ids: ', senate_assess.loc[has_fec_id, 'FEC ID#'].nunique(dropna=False))
print('unique candidates with non-null voteshare data: ', int((has_fec_id & has_general_votes).sum()))

# no, only 1

# how many of these races were runoffs?
# (counts the distinct non-null values in the runoff-votes column)
senate_assess['RUNOFF VOTES'].nunique()

unique candidate ids:  320
unique candidates with non-null voteshare data:  166


  """


12

## Clean Senate

### Define

- Include only senate candidates who have an FEC ID and a non-null voteshare in the general election
- Keep only FEC ID#, General Votes, General %, and Win Indicator
- Convert win indicator to 0/1 
- Determine which record to keep for Brad Hutto, FEC ID# 'S4SC00364', who has two rows

### Code

In [381]:
# Build `senate_clean`: one row per senate candidate with columns
# CAND_ID, VOTECOUNT, VOTESHARE and WIN_LOSS (1 = won, 0 = otherwise).

# make clean copy
senate_clean = senate.copy()

# only candidates with an FEC ID. The 'n/a' comparison alone is not enough:
# NaN != 'n/a' is True, so rows with a missing ID would slip through.
has_fec_id = senate_clean['FEC ID#'].notna() & (senate_clean['FEC ID#'] != 'n/a')

# rows whose vote count is a footnote marker / placeholder carry no usable count
has_vote_count = ~senate_clean['GENERAL VOTES '].isin(['nan', '#', '##'])

# .copy() so the column assignments below write to an independent frame
# instead of a slice of `senate` (avoids SettingWithCopyWarning)
senate_clean = senate_clean[has_fec_id & has_vote_count].copy()

# voteshare (%) -> numeric. expand=False returns a Series (a DataFrame would be
# rejected by pd.to_numeric); the pattern also accepts single-digit values.
senate_clean['GENERAL %'] = pd.to_numeric(
    senate_clean['GENERAL %'].str.extract(r'(\d*\.?\d+)', expand=False)
)

# vote count -> numeric. astype(str) keeps this working whether read_csv parsed
# the column as text ('795,606') or as floats; missing values become NaN.
senate_clean['GENERAL VOTES '] = pd.to_numeric(
    senate_clean['GENERAL VOTES '].astype(str).str.replace(',', '', regex=False)
)

# only candidates with non-null voteshare (count)
senate_clean = senate_clean[senate_clean['GENERAL VOTES '].notna()].copy()

# keep only relevant columns and rename them
cols = {'FEC ID#':'CAND_ID','GENERAL VOTES ':'VOTECOUNT','GENERAL %':'VOTESHARE','GE WINNER INDICATOR':'WIN_LOSS'}
senate_clean = senate_clean[list(cols)].rename(columns=cols)

# convert win/loss column: 'W' -> 1, anything else (including missing) -> 0
senate_clean['WIN_LOSS'] = senate_clean['WIN_LOSS'].eq('W').astype('int64')

# Brad Hutto (FEC ID 'S4SC00364') has two rows: drop the one with the smaller
# voteshare. Selecting by value rather than by position keeps this correct if
# the row order changes, and the guard avoids deleting his only record.
hutto_rows = senate_clean[senate_clean['CAND_ID'] == 'S4SC00364']
if len(hutto_rows) > 1:
    # stable sort: ties keep file order, NaN voteshares sort last
    smallest_share_label = hutto_rows.sort_values('VOTESHARE', kind='mergesort').index[0]
    senate_clean = senate_clean.drop(smallest_share_label)

  


### Test

In [382]:
# Eyeball the cleaned senate data, then assert the invariants we rely on.
print(senate_clean.tail())
print(senate_clean.describe())
print(senate_clean['WIN_LOSS'].value_counts())

# No losing candidate may hold a majority of the vote.
# Series.max() skips NaN; the builtin max() compares element by element and
# returns an order-dependent result when a NaN is present.
losing_voteshare = senate_clean.loc[senate_clean['WIN_LOSS'] == 0, 'VOTESHARE']
assert losing_voteshare.max() < 50, "Candidates with greater than 50% of voteshare marked as loss"

# Every candidate id must appear exactly once (missing ids are ignored here).
candidate_ids = senate_clean['CAND_ID'].dropna()
duplicate_ids = candidate_ids[candidate_ids.duplicated(keep=False)].unique()
assert len(duplicate_ids) == 0, "Duplicate candidate ids: " + ", ".join(map(str, duplicate_ids))

        CAND_ID  VOTECOUNT  VOTESHARE  WIN_LOSS
492   S4WV00241        5.0       0.00         0
496  S6WY00126    121554.0      72.19         1
503  S4WY00097     29377.0      17.45         0
509   S4WY00105    13311.0       7.90         0
510   S4WY00154     3677.0       2.18         0
          VOTECOUNT   VOTESHARE    WIN_LOSS
count  1.620000e+02  162.000000  162.000000
mean   2.848466e+05   22.176358    0.216049
std    4.628402e+05   25.091052    0.412824
min    4.000000e+00    0.000000    0.000000
25%    5.969750e+03    0.780000    0.000000
50%    4.755900e+04    3.075000    0.000000
75%    3.759538e+05   45.017500    0.000000
max    2.861531e+06   97.250000    1.000000
0    127
1     35
Name: WIN_LOSS, dtype: int64


## Assess House

In [494]:
# Exploratory checks on the raw house data. Nothing here feeds the cleaning
# step; it works on a throwaway copy.
# NOTE: a notebook cell only renders its LAST bare expression. The intermediate
# bare expressions below are kept for running one at a time while exploring.

# make a copy we can manipulate
house_assess = house.copy()

# voteshare is a string (e.g. '68.16%'); convert to numeric so we can check that
# there are no values > 100 or suspicious values < 1.
# - expand=False makes str.extract return a Series; without it current pandas
#   returns a DataFrame, which pd.to_numeric rejects.
# - the raw-string pattern also matches single-digit values such as '5%'.
house_assess['GENERAL %'] = pd.to_numeric(
    house_assess['GENERAL %'].str.extract(r'(\d*\.?\d+)', expand=False)
)

# are candidates with < 1% inaccurate data (different percent formats?)
# (the pattern never captures a minus sign, so the < 0 check cannot match)
house_assess[house_assess['GENERAL %'] < 0]
house_assess[house_assess['GENERAL %'] > 100]
house_assess[house_assess['GENERAL %'] < 1]

# nope, looks like these are just people with very few votes

# how about votecount?
house_assess['GENERAL VOTES '].value_counts()
house_assess[house_assess['GENERAL VOTES '] == 'Unopposed']

# we'll want to drop these unchallenged races

# row masks reused below
# (NaN != 'n/a' evaluates to True, so rows with a missing ID pass this mask)
has_fec_id = house_assess['FEC ID#'] != 'n/a'
has_general_votes = house_assess['GENERAL VOTES '].notna()

# do we have a lot of rows with duplicate candidate data?
# (the printed figure is a ROW count, not a distinct-candidate count)
print('unique candidates with non-null voteshare data: ', int((has_fec_id & has_general_votes).sum()))
house_assess[has_fec_id & has_general_votes & house_assess['FEC ID#'].duplicated(keep=False)]

# ok so this has to do with parties
# e.g., candidate with ID 'H8CT01046' ran with both the Working Families party and the Democrats

# In cases of duplicates, let's keep the record with the higher votecount

# let's take a look at win/loss
house_assess['GE WINNER INDICATOR'].value_counts()
house_assess[(house_assess['GE WINNER INDICATOR'] == 'W') & house_assess['GENERAL %'].isnull()]

# we can expect 6 to get lost because GE voteshare is NaN;
# we will drop to at most 476 wins in our new dataset

# issues with Mark Takai and David Brat discovered during cleaning:
# candidates holding a majority of the vote but with no winner indicator
house_assess[house_assess['GE WINNER INDICATOR'].isnull() & (house_assess['GENERAL %'] > 50)]

# Mark Takai should be a win--data entry error
# Drop everything in the unexpired term Virginia race -- we don't want to double-count that race


unique candidates with non-null voteshare data:  1263


  """


Unnamed: 0,1,STATE ABBREVIATION,STATE,D,FEC ID#,(I),CANDIDATE NAME (First),CANDIDATE NAME (Last),CANDIDATE NAME,TOTAL VOTES,...,RUNOFF VOTES,RUNOFF %,GENERAL VOTES,GENERAL %,GE RUNOFF ELECTION VOTES (LA),GE RUNOFF ELECTION % (LA),"COMBINED GE PARTY TOTALS (CT, NY, SC)","COMBINED % (CT, NY, SC)",GE WINNER INDICATOR,FOOTNOTES
837,839,HI,Hawaii,01,H4HI01134,,Mark,Takai,"Takai, Mark",,...,,,93390,51.93,,,,,,
3429,3431,VA,Virginia,07 - UNEXPIRED TERM,H4VA07143,,Dave A.,Brat,"Brat, Dave A.",,...,,,148841,61.68,,,,,,# Dave Brat and Jack Trammell were nominated ...


## Clean House

### Define

- Include only house candidates who have an FEC ID and a non-null voteshare in the general election
- Keep only FEC ID#, General Votes, General %, and Win Indicator
- Convert win indicator to 0/1 
- Delete records in Unopposed or Unexpired term elections
- In cases with duplicate records for one candidate (e.g. a candidate was endorsed by multiple parties), keep the highest vote share
- Mark Takai should have a Win in win/loss

### Code

In [515]:
# Build `house_clean`: one row per house candidate with columns
# CAND_ID, VOTECOUNT, VOTESHARE and WIN_LOSS (1 = won, 0 = otherwise).

# make clean copy
house_clean = house.copy()

# only candidates with an FEC ID. The 'n/a' comparison alone is not enough:
# NaN != 'n/a' is True, so rows with a missing ID would slip through.
has_fec_id = house_clean['FEC ID#'].notna() & (house_clean['FEC ID#'] != 'n/a')

# drop unexpired-term races so a seat is not counted twice. Matching on the
# label covers '07 - UNEXPIRED TERM' and any other district flagged the same
# way; astype(str) tolerates a district column parsed as numbers.
is_unexpired_term = house_clean['D'].astype(str).str.contains('UNEXPIRED TERM', regex=False)

# unopposed races and footnote markers / placeholders carry no usable vote count
has_vote_count = ~house_clean['GENERAL VOTES '].isin(['nan', '#', '##', 'Unopposed'])

# .copy() so the column assignments below write to an independent frame
# instead of a slice of `house` (avoids SettingWithCopyWarning)
house_clean = house_clean[has_fec_id & ~is_unexpired_term & has_vote_count].copy()

# voteshare (%) -> numeric. expand=False returns a Series (a DataFrame would be
# rejected by pd.to_numeric); the pattern also accepts single-digit values.
house_clean['GENERAL %'] = pd.to_numeric(
    house_clean['GENERAL %'].str.extract(r'(\d*\.?\d+)', expand=False)
)

# vote count -> numeric. astype(str) keeps this working whether read_csv parsed
# the column as text ('103,758') or as floats; missing values become NaN.
house_clean['GENERAL VOTES '] = pd.to_numeric(
    house_clean['GENERAL VOTES '].astype(str).str.replace(',', '', regex=False)
)

# only candidates with non-null voteshare (count)
house_clean = house_clean[house_clean['GENERAL VOTES '].notna()].copy()

# keep only relevant columns and rename them
cols = {'FEC ID#':'CAND_ID','GENERAL VOTES ':'VOTECOUNT','GENERAL %':'VOTESHARE','GE WINNER INDICATOR':'WIN_LOSS'}
house_clean = house_clean[list(cols)].rename(columns=cols)

# convert win/loss column: 'W' -> 1, anything else (including missing) -> 0
house_clean['WIN_LOSS'] = house_clean['WIN_LOSS'].eq('W').astype('int64')

# change Mark Takai to a win (data entry error)
house_clean.loc[house_clean['CAND_ID'] == 'H4HI01134', 'WIN_LOSS'] = 1

# now, for cases with duplicated candidate data, keep the row with the higher
# voteshare (vote count breaks ties). na_position='first' matters: by default
# NaN sorts LAST, so keep='last' would prefer a row with a missing voteshare
# over a row with a real one.
house_clean = (
    house_clean
    .sort_values(by=['CAND_ID', 'VOTESHARE', 'VOTECOUNT'], na_position='first')
    .drop_duplicates(subset=['CAND_ID'], keep='last')
)

  # This is added back by InteractiveShellApp.init_path()


### Test

In [518]:
# Eyeball the cleaned house data, then assert the invariants we rely on.
print(house_clean.tail())
print(house_clean.describe())
print(house_clean['WIN_LOSS'].value_counts())

# No losing candidate may hold a majority of the vote.
# Series.max() skips NaN; the builtin max() compares element by element and
# returns an order-dependent result when a NaN is present.
losing_voteshare = house_clean.loc[house_clean['WIN_LOSS'] == 0, 'VOTESHARE']
assert losing_voteshare.max() < 50, "Candidates with greater than 50% of voteshare marked as loss"

# Every candidate id must appear exactly once (missing ids are ignored here).
# The message is built as one plain string listing the offending ids.
candidate_ids = house_clean['CAND_ID'].dropna()
duplicate_ids = candidate_ids[candidate_ids.duplicated(keep=False)].unique()
assert len(duplicate_ids) == 0, "Duplicate candidate ids: " + ", ".join(map(str, duplicate_ids))

         CAND_ID  VOTECOUNT  VOTESHARE  WIN_LOSS
3531   H8WA07132   203954.0      80.97         1
3590   H8WI01024   182316.0      63.27         1
3607  H8WI02121    103619.0      31.51         0
3642  H8WI09050    231160.0      69.45         1
3697   H8WY00148   113038.0      68.47         1
           VOTECOUNT    VOTESHARE     WIN_LOSS
count    1181.000000  1180.000000  1181.000000
mean    65661.647756    36.503000     0.366638
std     53222.985128    27.551308     0.482091
min         1.000000     0.000000     0.000000
25%      8593.000000     4.845000     0.000000
50%     63219.000000    36.745000     0.000000
75%    105716.000000    60.027500     1.000000
max    231160.000000   100.000000     1.000000
0    748
1    433
Name: WIN_LOSS, dtype: int64


## Merge House and Senate

In [540]:
# Stack the cleaned house rows on top of the cleaned senate rows and number
# the combined rows 0..n-1.
election_outcomes = pd.concat([house_clean, senate_clean], ignore_index=True)

In [542]:
# Look at the rows around the point where house records end and senate records begin.
election_outcomes.iloc[1175:1190]

Unnamed: 0,CAND_ID,VOTECOUNT,VOTESHARE,WIN_LOSS
1175,H8VT00141,2750.0,1.44,0
1176,H8WA07132,203954.0,80.97,1
1177,H8WI01024,182316.0,63.27,1
1178,H8WI02121,103619.0,31.51,0
1179,H8WI09050,231160.0,69.45,1
1180,H8WY00148,113038.0,68.47,1
1181,S6AL00195,795606.0,97.25,1
1182,S4AK00214,135445.0,47.96,1
1183,S8AK00090,129431.0,45.83,0
1184,S4AK00230,10512.0,3.72,0


In [550]:
# Persist the merged outcomes as a comma-separated file for use in RStudio.
# The row index is written as the first, unnamed column.
file_name = 'election_outcomes_clean_data.csv'
election_outcomes.to_csv(file_name)