In [None]:
import pandas as pd
import numpy as np
import re
import sys

from functools import reduce

sys.path.append('./src')
import helpers

%load_ext autoreload
%autoreload

In [None]:
# 💎😎
def p(v, fn=None, **kwargs):
  """Print ``v``, report each keyword argument, and return ``v`` unchanged.

  Intended as a pass-through debugging helper that can be wrapped around an
  expression without altering its value.

  Parameters
  ----------
  v : object
      Value to print and hand back.
  fn : callable, optional
      Called as ``fn(key, value)`` for every keyword argument. When omitted,
      each pair is printed as ``"key: value"``.
  **kwargs
      Labelled extra values to report alongside ``v``.

  Returns
  -------
  object
      ``v`` itself.
  """
  print(v)
  if fn is None:
    # Default reporter: one "key: value" line per keyword argument.
    fn = lambda key, value: print(f"{key}: {value}")
  for key, value in kwargs.items():
    fn(key, value)
  return v

## Load Datasets

In [None]:
%%time
# The workbooks are published in numbered parts: five under the "govd" prefix
# and four under the "congressd2" prefix.
governor_urls = [
    f'https://www.maine.gov/sos/cec/elec/results/2018/govd-{x}.xlsx'
    for x in range(1, 6)
]
congress_urls = [
    f'https://www.maine.gov/sos/cec/elec/results/2018/congressd2-{x}.xlsx'
    for x in range(1, 5)
]


def _read_workbooks(urls):
    """Pass each URL through ``helpers.cached_file`` and read the result with
    ``pd.read_excel``, using the first spreadsheet column as the index."""
    return [pd.read_excel(helpers.cached_file(url=url), index_col=0) for url in urls]


raw_governor_datasets = _read_workbooks(governor_urls)
raw_congress_datasets = _read_workbooks(congress_urls)

#### What possible votes are we working with?

In [None]:
# Row count of every workbook: governor parts first, then congress parts.
display(list(map(len, raw_governor_datasets)))
display(list(map(len, raw_congress_datasets)))

In [None]:
# Inspect the first governor workbook: list its column labels, then count the
# non-null entries of the first column for each distinct value of the fourth.
first_governor = raw_governor_datasets[0]
gcols = first_governor.columns.tolist()
display(gcols)
first_governor.groupby(gcols[3])[gcols[0]].agg('count')

In [None]:
# Inspect the first congress workbook: list its column labels, then count the
# non-null entries of the first column for each distinct value of the fourth.
first_congress = raw_congress_datasets[0]
ccols = first_congress.columns.tolist()
display(ccols)
first_congress.groupby(ccols[3])[ccols[0]].agg('count')

## Clean Votes
Remove the four-digit numbers in parentheses, e.g. ` (1234)`, that follow candidate names so that each candidate's name matches across votes.

In [None]:
def _strip_candidate_ids(frames):
    """Return copies of ``frames`` with every ' (dddd)' four-digit suffix removed from string cells."""
    return [
        frame.replace(to_replace=r' \([0-9]{4}\)', value='', regex=True)
        for frame in frames
    ]


governor_datasets = _strip_candidate_ids(raw_governor_datasets)
congress_datasets = _strip_candidate_ids(raw_congress_datasets)

## Scanning the ballots for possible Write-in errors

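When tallying the 2018 election results, write-ins were treated as undervotes. The cells below flag ballots on which two consecutive rankings are both write-ins, or a write-in next to an undervote, and the ranking that follows them is not an undervote.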

In [None]:
def writein_errors(df):
    """Flag ballots with a suspicious write-in / undervote pair before a real ranking.

    For every window of three consecutive choice columns, a ballot is flagged
    when the first two entries are (Write-in, Write-in), (Write-in, undervote)
    or (undervote, Write-in) and the third entry is not 'undervote'.

    Parameters
    ----------
    df : pandas.DataFrame
        Ballot table; the choice columns are taken to be ``df.columns[3:]``.
        NOTE(review): ``summarize`` reads its choice columns from
        ``df.columns[2:]`` — confirm which offset is intended.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``df.index``; True marks a flagged ballot.
        All False when there are fewer than three choice columns.
    """
    CHOICES = df.columns[3:].tolist()

    undervotes = [df[c] == 'undervote' for c in CHOICES]
    writeins = [df[c] == 'Write-in' for c in CHOICES]

    # Start index of every run of three consecutive choice columns.
    windows = range(len(CHOICES) - 2)
    writein_writeins = [writeins[i] & writeins[i+1] & ~undervotes[i+2] for i in windows]
    writein_undervotes = [writeins[i] & undervotes[i+1] & ~undervotes[i+2] for i in windows]
    undervote_writeins = [undervotes[i] & writeins[i+1] & ~undervotes[i+2] for i in windows]

    # Built-in any() cannot combine Series (their truth value is ambiguous),
    # so OR the masks element-wise. The all-False seed keeps the result
    # well-defined when there are no windows at all.
    no_errors = pd.Series(False, index=df.index, dtype=bool)
    return reduce(
        lambda combined, mask: combined | mask,
        writein_writeins + writein_undervotes + undervote_writeins,
        no_errors,
    )

def writein_errors2(df):
    """Flag ballots with a suspicious write-in / undervote pair before a real ranking.

    For every three consecutive choice columns (c1, c2, c3), a ballot is
    flagged when (c1, c2) is (Write-in, Write-in), (Write-in, undervote) or
    (undervote, Write-in) and c3 is not 'undervote'.

    Parameters
    ----------
    df : pandas.DataFrame
        Ballot table; the choice columns are taken to be ``df.columns[3:]``.
        NOTE(review): ``summarize`` reads its choice columns from
        ``df.columns[2:]`` — confirm which offset is intended.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``df.index``, suitable for ``df[mask]``.
        All False when there are fewer than three choice columns.
    """
    CHOICES = df.columns[3:].tolist()

    choices = df[CHOICES]
    undervotes = choices == 'undervote'
    writeins = choices == 'Write-in'

    # Accumulate into a plain mask rather than concatenating per-window
    # columns: there are two fewer windows than choice columns, so labelling
    # the windows with CHOICES mismatches, and concatenating an empty list
    # raises when no window exists.
    flagged = pd.Series(False, index=df.index, dtype=bool)
    for c1, c2, c3 in zip(CHOICES, CHOICES[1:], CHOICES[2:]):
        suspicious_pair = (
            (writeins[c1] & writeins[c2])
            | (writeins[c1] & undervotes[c2])
            | (undervotes[c1] & writeins[c2])
        )
        flagged = flagged | (suspicious_pair & ~undervotes[c3])
    return flagged


In [None]:
# Export the ballots flagged by writein_errors2: one sheet per workbook,
# governor sheets first and congress sheets after them.
with pd.ExcelWriter('./output/write-in-errors.xlsx') as writer:
    for i, df in enumerate(governor_datasets):
        flagged_rows = df.loc[writein_errors2(df)]
        flagged_rows.to_excel(excel_writer=writer, sheet_name=f'Governor ({i+1})')
    for i, df in enumerate(congress_datasets):
        flagged_rows = df.loc[writein_errors2(df)]
        flagged_rows.to_excel(excel_writer=writer, sheet_name=f'Rep to Congress - D2 ({i+1})')

In [None]:
def summarize(df):
    """Count how often each vote value appears, overall and per choice column.

    Parameters
    ----------
    df : pandas.DataFrame
        Ballot table with one row per ballot. The choice columns are taken to
        be ``df.columns[2:]``; the index identifies the ballot and may be
        unnamed.

    Returns
    -------
    pandas.DataFrame
        One row per distinct vote value. Column ``total`` counts the ballots
        on which the value appears in at least one choice column (a value
        repeated on the same ballot is counted once); the remaining columns
        give the count of the value within each individual choice column.
    """
    CHOICES = df.columns[2:].tolist()

    # Top-level pd.value_counts is deprecated; Series.value_counts is the
    # supported equivalent.
    vote_counts = df[CHOICES].apply(pd.Series.value_counts).fillna(0).astype(int)

    # reset_index puts the index level(s) first, so take the id column(s) by
    # position; this also works when the index has no name.
    ballots = df.reset_index()
    id_cols = ballots.columns[:df.index.nlevels].tolist()

    melted_choices = ballots.melt(id_vars=id_cols, value_vars=CHOICES)
    # Keep one row per (ballot, value) so repeats on one ballot count once.
    once_per_ballot = melted_choices.drop_duplicates(id_cols + ['value'])
    combined_vote_counts = once_per_ballot['value'].value_counts().rename("total")

    return combined_vote_counts.to_frame().join(vote_counts)

def write_combined_summaries(writer, prefix, datasets):
    """Write one summary sheet per dataset, plus a sheet with their combined totals.

    Parameters
    ----------
    writer : pandas.ExcelWriter
        Open writer that receives the sheets.
    prefix : str
        Sheet-name prefix. Per-dataset sheets are named ``'<prefix> (1)'``,
        ``'<prefix> (2)'``, ...; the combined sheet is ``'<prefix> (total)'``.
    datasets : sequence of pandas.DataFrame
        Ballot tables accepted by ``summarize``. Nothing is written when the
        sequence is empty.
    """
    # Summarize each dataset once and reuse the result for both outputs.
    summaries = [summarize(df) for df in datasets]
    if not summaries:
        return

    agg = summaries[0].copy()
    for summary in summaries[1:]:
        # Relabel columns positionally on a copy so the per-dataset sheets
        # keep their own column names.
        aligned = summary.copy()
        aligned.columns = agg.columns
        # add(..., fill_value=0) aligns on the union of both indexes, so a
        # vote value missing from the first dataset is still counted.
        agg = agg.add(aligned, fill_value=0)
    # Index alignment may have produced floats; counts are whole numbers.
    agg = agg.fillna(0).astype(int)

    for i, summary in enumerate(summaries):
        summary.to_excel(writer, sheet_name=f'{prefix} ({i+1})')
    agg.to_excel(writer, sheet_name=f'{prefix} (total)')

# One summary sheet per congress workbook; each table is also shown inline so
# it can be checked without opening the spreadsheet.
with pd.ExcelWriter('./output/raw-vote-summaries.xlsx') as workbook:
    for i, ballots in enumerate(congress_datasets):
        summary = ballots.pipe(summarize)
        summary.to_excel(excel_writer=workbook, sheet_name=f'Rep to Congress - D2 ({i+1})')
        display(summary)

In [None]:
# Combine the per-workbook governor summaries into a single table of counts.
agg = summarize(governor_datasets[0])
for df in governor_datasets[1:]:
    summary = summarize(df)
    # Relabel positionally so columns line up even if workbooks name them differently.
    summary.columns = agg.columns
    # add(..., fill_value=0) aligns on the union of both indexes; reindexing to
    # agg.index would silently drop any vote value absent from the first workbook.
    agg = agg.add(summary, fill_value=0)
# Alignment may have produced floats and a re-sorted index: restore integer
# counts and order rows by overall total.
agg = agg.fillna(0).astype(int).sort_values('total', ascending=False)
agg