# Formatting and cleaning data 

In [None]:
import pandas as pd
import numpy as np
import warnings, os

## Load and clean data

In [None]:
# Read the raw statement-of-votes export. The first two lines of the file are
# skipped, and both empty strings and '****' cells are parsed as missing values.
precincts = pd.read_csv(
    'raw-data/statement_of_votes_alameda_senator_050324.csv',
    skiprows=2,
    na_values=['', '****'],
)

How do we look at the first few rows?

Drop the na columns

What is in this data?

Rename precinct column

Drop the columns we don't need 

In [None]:
# Start the "precinct" column as a copy of "type"; later cells overwrite
# some of its values and forward-fill the gaps.
precincts_cleaned["precinct"] = precincts_cleaned.loc[:, "type"]

Check the columns

Drop the rows after 'Electionwide - Total'

In [None]:
# Index label of the first row whose "type" is 'Electionwide - Total'.
is_electionwide_total = precincts_cleaned["type"] == "Electionwide - Total"
end_index = precincts_cleaned.index[is_electionwide_total][0]

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc

In [None]:
# Keep the first `end_index` rows by position (the row at that position is excluded).
# NOTE(review): end_index is an index *label* but is used here as a *position*;
# the two only agree for a default 0..n-1 index - confirm no rows were dropped earlier.
precincts_cleaned_cut = precincts_cleaned.head(end_index)

more cleaning

In [None]:
# Replace the listed summary / vote-type labels in "precinct" with NaN; the
# gaps are forward-filled in a later cell.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: the in-place form acts on a temporary Series and, under
# pandas Copy-on-Write, leaves precincts_cleaned unchanged.
non_precinct_labels = ['Countywide', 'Electionwide', 'Election Day',
       'Vote by Mail', 'Total','Electionwide - Total', 'Cumulative', 'Cumulative - Total',
       'Countywide - Total']
precincts_cleaned["precinct"] = precincts_cleaned["precinct"].replace(non_precinct_labels, np.nan)

Are the precincts unique? How many are unique?

https://pandas.pydata.org/docs/reference/api/pandas.Series.ffill.html

In [None]:
# Forward-fill: each missing "precinct" takes the last non-missing value above it.
precinct_filled = precincts_cleaned["precinct"].ffill()
precincts_cleaned["precinct"] = precinct_filled

In [None]:
# Preview the first ten rows.
precincts_cleaned.iloc[:10]

In [None]:
# Number of distinct non-missing precinct values.
len(precincts_cleaned["precinct"].dropna().unique())

Now, filter to just the total vote rows

In [None]:
# NOTE(review): `total` is not defined in any cell visible here - the filtering
# step described above must create it before this cell can run.
total.head(n=5)

Check the counts of the precincts again

Move the total votes to the first position

In [None]:
# Remove 'Total Votes' from the frame and re-insert it as the first column
# under the name 'total_votes'.
total.insert(loc=0, column='total_votes', value=total.pop('Total Votes'))

Shift column 'precinct' to first position

In [None]:
# Remove 'precinct' from the frame and re-insert it as the first column.
total.insert(loc=0, column='precinct', value=total.pop('precinct'))

Split the column names

In [None]:
# Keep only the part of each column name that precedes the first newline.
split_names = total.columns.str.split('\n')
total.columns = split_names.str.get(0)

In [None]:
# Preview the first five rows after renaming the columns.
total.head(n=5)

## Reshape dataframe

Note: Null values are suppressed data

Make dataframe long for all columns after ballots cast - put candidates in a candidate column

In [None]:
# Unpivot every column from position 3 onward into (variable, value) pairs,
# repeating precinct and total_votes on each resulting row.
candidate_columns = total.columns[3:]
long = total.melt(id_vars=['precinct', 'total_votes'], value_vars=candidate_columns)

Grab just a sample of the data 

In [None]:
# Show one randomly chosen row.
long.sample(n=1)

What is in this data?

In [None]:
# Rename the four columns of the melted frame by position.
long_column_names = ['precinct', 'total_votes', 'candidate_name', 'candidate_votes']
long.columns = long_column_names

Make names title case

In [None]:
# Title-case every candidate name.
title_cased_names = long['candidate_name'].str.title()
long['candidate_name'] = title_cased_names

In [None]:
# First two rows when ordered by precinct.
long.sort_values(by="precinct").iloc[:2]

Clean up the data

In [None]:
# Convert both vote-count columns to strings and strip formatting so they can
# be parsed as numbers in the next cell.
# regex=False makes every pattern a literal. In pandas versions where
# str.replace defaults to regex=True, "0.0" is a regular expression whose "."
# matches any character, so a count such as "1000" would silently become "10".
for vote_column in ["candidate_votes", "total_votes"]:
    cleaned = long[vote_column].astype(str)
    cleaned = cleaned.str.replace("0.0", "0", regex=False)
    cleaned = cleaned.str.replace(",", "", regex=False)  # drop thousands separators
    long[vote_column] = cleaned

In [None]:
# Parse both vote columns as numbers; values that cannot be parsed become NaN.
for vote_column in ("candidate_votes", "total_votes"):
    long[vote_column] = pd.to_numeric(long[vote_column], errors='coerce')

Which ones are na?

## Get candidate order by precinct and create variables

In [None]:
from scipy.stats import rankdata 

Make a copy

In [None]:
# Work on an independent (deep) copy so `long` itself is left untouched.
rank = long.copy(deep=True)

Group by and rank by candidate votes

In [None]:
# Rank candidates within each precinct by vote count, highest first.
# The following cells read rank["rank"], so the column has to be created here.
# method="first" gives tied candidates distinct consecutive ranks (in row
# order), which keeps one row per (precinct, rank) for the pivot further down.
# Rows whose candidate_votes is NaN keep a NaN rank.
rank["rank"] = rank.groupby("precinct")["candidate_votes"].rank(method="first", ascending=False)
rank.head()

Fill in the missing (NA) ranks

In [None]:
# Give rows without a rank the sentinel value 99.
rank["rank"] = rank["rank"].mask(rank["rank"].isna(), 99)

Find the unique candidate names

In [None]:
# Count distinct candidate names (a missing name, if any, counts as one value).
rank["candidate_name"].nunique(dropna=False)

Get the unique ranks

In [None]:
# Distinct rank values, in order of first appearance.
pd.unique(rank["rank"])

Set rank as an int

In [None]:
# Cast only the "rank" column to int; the other columns keep their dtypes.
rank = rank.astype({"rank": int})

For candidates in 5th place or lower, set the rank to "5+", relabel the candidate as "Other", and group their votes together.
Note: if there is a tie at the 5th-place cutoff, one of the tied candidates drops into the "Other" group. For what we are doing this is OK, but it is something to be aware of!

In [None]:
# Cap every rank at 5, then store the ranks as strings.
rank["rank"] = rank["rank"].clip(upper=5).astype(str)

In [None]:
# Rows capped at rank "5" form the catch-all bucket: label the rank "5+" and
# replace the candidate name with "Other".
in_other_bucket = rank["rank"] == "5"
rank.loc[in_other_bucket, "rank"] = "5+"
rank.loc[in_other_bucket, "candidate_name"] = "Other"

Group by and sum the candidate votes

In [None]:
# Sum candidate_votes per (precinct, total_votes, candidate_name, rank); this
# collapses all "Other" rows of a precinct into a single row.
group_keys = ["precinct", "total_votes", "candidate_name", "rank"]
rank_grouped = rank.groupby(group_keys, as_index=False)["candidate_votes"].sum()

Create a candidate percentage of votes

In [None]:
# Share of the precinct's total votes won by each candidate (a fraction, not multiplied by 100).
rank_grouped["candidate_pct"] = rank_grouped["candidate_votes"].div(rank_grouped["total_votes"])

Sort on rank and precinct

In [None]:
# First ten rows ordered by precinct, then by rank (rank is a string column here).
rank_grouped.sort_values(by=["precinct", "rank"]).iloc[:10]

## Reshape the data from long to wide

In [None]:
# One row per (precinct, total_votes); for each rank, spread the candidate's
# name, vote share and vote count into their own columns.
wide = rank_grouped.pivot(
    index=["precinct", "total_votes"],
    columns="rank",
    values=["candidate_name", "candidate_pct", "candidate_votes"],
)

In [None]:
# Turn the (precinct, total_votes) index back into ordinary columns.
wide = wide.reset_index(drop=False)

In [None]:
# Flatten the two-level column labels into "<value>_<rank>" strings and drop
# the stray underscore left where one of the levels is empty.
wide.columns = ['_'.join(levels).strip('_') for levels in wide.columns]

In [None]:
# Show one randomly chosen row of the wide frame.
wide.sample(n=1)

In [None]:
# The winner is the rank-1 candidate, unless the rank-1 and rank-2 vote shares
# are exactly equal, in which case the precinct is marked as a "tie".
top_two_tied = wide["candidate_pct_1"].eq(wide["candidate_pct_2"])
wide["winner"] = wide["candidate_name_1"].mask(top_two_tied, "tie")

In [None]:
# Margin of victory: rank-1 vote share minus rank-2 vote share.
wide["spread"] = wide["candidate_pct_1"].sub(wide["candidate_pct_2"])

In [None]:
# Plain Python list of every spread value (NaNs included).
spread_values = wide["spread"].tolist()

In [None]:
# Drop NaN spreads. Use logical `not` rather than bitwise `~`: `~` only acts as
# negation on NumPy booleans, while on a plain Python bool it yields -1 / -2,
# which are both truthy and would keep every value.
spread_values_clean = [value for value in spread_values if not np.isnan(value)]

In [None]:
# Tag every row with the county this file covers.
wide = wide.assign(county='Alameda County')

In [None]:
# Display the final wide frame for a last visual check before saving.
wide

## Save

In [None]:
# Make sure the output directory exists before exporting the CSV; nothing
# happens if it is already there.
os.makedirs(name='processed', exist_ok=True)

In [None]:
# Write the cleaned table to disk without the row index.
wide.to_csv(path_or_buf='processed/alameda_cleaned_election.csv', index=False)