This notebook lets you choose a race with a hotword, and then:
- finds all files related to that race, from all counties,
- compiles all that data into a single spreadsheet,
- also creates a groupby version of that spreadsheet, aggregated by county.

#### imports & utility funcs

In [51]:
import re
import os
import pandas as pd
from tqdm.notebook import tqdm

def ld_nh(path):
    """Lazily yield every entry of `path` whose name does not begin with '.'."""
    yield from (entry for entry in os.listdir(path) if not entry.startswith('.'))
def listdir_nohidden(path):
    """Return the non-hidden entries of `path` as a list (materialises `ld_nh`)."""
    visible = [name for name in ld_nh(path)]
    return visible

# Input locations, relative to the notebook's working directory.
data_dir, clean_dir = 'raw', 'clean_by_county'

# Visible entries of the raw-data directory.
files = listdir_nohidden(data_dir)
# Visible entries of the cleaned directory; each one is treated as a county
# sub-directory by the cells below.
counties = listdir_nohidden(clean_dir)

### data source: https://results.enr.clarityelections.com/GA/105369/web.264614/

Work with the single incoming .txt file by manually splitting it into lines (delimiter `'\n'`) and cells (delimiter `' '*2`, i.e. two spaces).

# load & join parsed data

parsed in notebook `3.`

#### id all files for the target race (for all counties)


In [52]:
def id_target_files(counties=counties, hotword=None):
    """Load and stack every county's file for one race.

    Parameters
    ----------
    counties : list of str
        Sub-directory names under `clean_dir` to search. Defaults to the
        notebook-level `counties` list as it was when this cell was run.
    hotword : str, optional
        Text that a file name must contain to be picked up. When omitted, the
        notebook-level `race_hotword` is used (it is set by the run-all loop).
        NOTE: this is a substring match, so a hotword that is a prefix of
        another race's name also picks up that other race's files.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated, with any stored 'index' column removed
        and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file in any county matches the hotword.
    """
    if hotword is None:
        hotword = race_hotword

    race_files = []
    for county in counties:
        fpath = os.path.join(clean_dir, county)
        race_files.extend(
            os.path.join(fpath, f) for f in listdir_nohidden(fpath) if hotword in f
        )

    # pd.concat would raise a generic "No objects to concatenate" ValueError;
    # raise the same exception type with a message that names the race.
    if not race_files:
        raise ValueError(f'no files matching {hotword!r} found under {clean_dir!r}')

    # concat data from all counties
    merged = pd.concat([pd.read_csv(f) for f in race_files])

    # Not every file necessarily carries a saved 'index' column, so do not
    # fail when it is missing.
    merged = merged.drop(columns='index', errors='ignore').reset_index(drop=True)

    return merged

### fix Lieberman typo
combine votes for lieberman/lie**r**berman

In [53]:
# ADD VOTES FOR MATT lie*R*berman to MATT LIEBERMAN:
def fix_lieberman(merged):
    """Fold votes recorded under the misspelled "Lierberman" into "Lieberman".

    For every correctly spelled Lieberman column other than the choice total,
    missing values are filled from the matching misspelled column. The two
    choice-total columns are then NaN-filled with 0 and the misspelled total is
    added onto the correct one. A short summary is printed.

    `merged` is modified in place and also returned. The misspelled columns are
    left in the frame (with their total NaN-filled).

    Raises KeyError if either '..._Choice Total' column is absent.
    """
    true = 'Matt Lieberman (Dem)_Choice Total'
    typo = 'Matt Lierberman (Dem)_Choice Total'

    # Fill missing good-spelling values from the bad-spelling columns.
    # This must be done column by column: DataFrame.fillna(DataFrame) aligns on
    # column labels, and the labels of the two spellings never match, so a
    # frame-level fill would silently do nothing.
    # The choice total is skipped here because it is combined by addition
    # below; filling it first would count the misspelled votes twice.
    for good_col in [c for c in merged.columns if 'Lieber' in c]:
        bad_col = good_col.replace('Lieber', 'Lierber')
        if good_col == true or bad_col not in merged.columns:
            continue
        merged[good_col] = merged[good_col].fillna(merged[bad_col])

    def fillna_info(c):
        # Report how many NaNs are being zeroed; the name is printed with the
        # stray 'r' of the typo capitalised so the two spellings are easy to
        # tell apart in the output.
        n = c.split('(')[0].strip()
        n = n[:10].replace('r', 'R') + n[10:]
        print('Filling', merged[c].isna().sum(), f'nan values for {n}...')
        merged[c] = merged[c].fillna(0)

    fillna_info(true)
    fillna_info(typo)
    merged[true] += merged[typo]

    total_votes = merged[true].sum()
    typo_votes = merged[typo].sum()
    print(f'+ {int(total_votes - typo_votes)} correct votes'.upper())
    print(f'     {int(typo_votes)} misspelled'.upper())
    print('—'*24)
    print(f'  {int(total_votes)} total votes'.upper())
    return merged

## Fix horizontal value shift (errors in .txt delimiter) 

**by index**...
 In a few rows, all the values are shifted over to the left because the registered-voter number was swallowed by the preceding "precinct" column.
 
 So extract the registered-voter number from the precinct text with regex, then shift all the columns over to the right. 
 
 These rows are detectable because the "Total" column is empty, and the values are visibly shifted.

In [54]:
def fix_shift(merged):
    """Repair rows whose values were shifted one column to the left.

    A row is treated as shifted when its 'Total' is NaN. For such a row the
    last run of digits in its 'Precinct' text is taken as the registered-voter
    number, every value from the third column onward is moved one column to the
    right, and the recovered number is written into the third column.

    Rows with a NaN 'Total' but no digits in 'Precinct' cannot be repaired this
    way; they are reported and left untouched.

    `merged` is modified in place and also returned. The swallowed number is
    not removed from the 'Precinct' text.
    """
    # Assumes the first two columns are the county/precinct labels and the
    # registered-voter number belongs in the third — TODO confirm against the
    # cleaned files.
    first_value_col = 2
    precinct_col = merged.columns.get_loc('Precinct')

    # Work with row positions throughout so the function does not depend on
    # the frame having a default 0..n-1 index.
    for pos, total_missing in enumerate(merged['Total'].isna()):
        if not total_missing:
            continue

        precinct = merged.iloc[pos, precinct_col]
        digit_runs = re.findall(r'\d+', str(precinct))
        if not digit_runs:
            print('fix_shift: no number found in precinct', repr(precinct),
                  '- row left unchanged')
            continue
        # Stored as a number so the numeric column is not polluted by a string.
        n_registered = int(digit_runs[-1])

        # shift all values over by one.
        row = slice(pos, pos + 1)
        merged.iloc[row, first_value_col:] = merged.iloc[row, first_value_col:].shift(1, axis=1)

        # add re-parsed registered number
        merged.iloc[row, first_value_col] = n_registered

    return merged

## identify failed rows (by vote total)

(check if all candidates' votes add up correctly in each precinct)

#### the failed rows found here are also used later (in `get_vote_stats`) to fix Lieberman's values in a few areas

In [55]:
def verify_totals(merged, verb=False):
    """Return the index labels of rows whose vote totals do not add up.

    Every column whose name contains 'total' (case-insensitive) is summed per
    row. That sum includes the row's own 'Total', so a consistent row satisfies
    sum / 2 == Total. Rows whose 'Total' is NaN are skipped, not reported.

    Parameters
    ----------
    merged : pandas.DataFrame
        Must contain a 'Total' column.
    verb : bool
        When True, print a line for every skipped (NaN) and every failing row.

    Returns
    -------
    list
        Index labels of the failing rows, in frame order.
    """
    totals = merged[[c for c in merged.columns if 'total' in c.lower()]]

    bad_math = []
    # Rows are fetched by position and reported by label, so the check works
    # for any index, not only the default 0..n-1 one.
    for pos, ridx in enumerate(totals.index):
        row = totals.iloc[pos]
        reported = row['Total']
        if pd.isna(reported):
            if verb: print('NaN val.:', ridx)
            continue
        half_sum = row.sum() / 2
        if half_sum != reported:
            if verb: print('Bad Math:', ridx, '|', half_sum, '!=', reported)
            bad_math.append(ridx)
    return bad_math

# compile vote stats counts for each voter group 

In [56]:
def get_vote_stats(merged, bad_math):
    """Cross-check each candidate's per-precinct total, then tidy the frame.

    Steps:
      1. For every row listed in `bad_math`, overwrite the correctly spelled
         Lieberman columns with the values of the misspelled ("Lierberman")
         columns, when such columns exist.
      2. For every row and candidate, compare the candidate's total column with
         the sum of that candidate's other columns and print a 'BAD MATH!'
         notice on a mismatch (rows whose stored total is NaN are not reported).
      3. Drop the misspelled columns and any 'Unnamed' columns, re-run
         `verify_totals`, and rename 'Total' to 'Total Votes'.

    Parameters
    ----------
    merged : pandas.DataFrame
        Joined precinct data; modified in place by steps 1 and 3.
    bad_math : list
        Index labels returned by `verify_totals` for this frame.

    Returns
    -------
    pandas.DataFrame
        The tidied frame (a new object, because of the final rename).

    Notes
    -----
    A candidate's columns are found by substring match and the LAST one is
    taken as the total — this relies on the column order of the cleaned files.
    """
    # Candidate labels are the part of a vote column's name before the first
    # underscore. Sorted so the diagnostics print in a stable order.
    cands = sorted({c.split('_')[0] for c in merged.columns if ')_' in c})
    # The misspelled Lieberman label is folded into the correct spelling and
    # dropped below, so it is never checked as a candidate of its own.
    cands = [cand for cand in cands if 'Lierber' not in cand]

    # define proper and typo version of MATT LIE(r)BERMAN
    # -- to replace values in the correct spelling column when necessary
    good_spell = [c for c in merged.columns if 'Lieber' in c]
    bad_spell = [c for c in merged.columns if 'Lierber' in c]

    # Column lookups do not depend on the row, so do them once up front.
    cols_by_cand = {cand: [c for c in merged.columns if cand in c] for cand in cands}
    bad_rows = set(bad_math)

    for i in tqdm(merged.index): # iterate precincts
        # For rows that failed the totals check, use the misspelled values
        # directly (instead of the calculated, added version). Done once per
        # row, before any total is read, so the check below always sees the
        # same data regardless of candidate order.
        # NOTE(review): this also overwrites rows that failed for reasons
        # unrelated to the typo — confirm that is intended.
        if bad_spell and i in bad_rows:
            merged.loc[i, good_spell] = merged.loc[i, bad_spell].values

        for cand in cands: # iterate all candidates in this race
            cand_cols = cols_by_cand[cand]
            disag_cols = cand_cols[:-1] # names of cols of vote types
            cand_t_col = cand_cols[-1] # name of col for total votes

            total_init = merged.loc[i, cand_t_col]
            total_calc = merged.loc[[i], disag_cols].values.sum()

            # Compared via their float reprs; a NaN stored total is skipped.
            init_repr = str(float(total_init))
            calc_repr = str(float(total_calc))
            if init_repr != 'nan' and init_repr != calc_repr:

                print()
                print('index:', i, '- BAD MATH!')
                print('   ', cand, '———', total_init, '!=', total_calc)

    # drop the misspelled MATT LIERB columns (no longer needed) and any
    # leftover 'Unnamed' columns
    stale = [c for c in merged.columns if 'Lierb' in c or 'Unnamed' in c]
    merged.drop(stale, axis=1, inplace=True)

    # Re-check after the clean-up and surface anything that still fails rather
    # than discarding the result.
    remaining = verify_totals(merged)
    if remaining:
        print('rows whose totals still do not add up:', remaining)

    merged = merged.rename(columns={'Total':'Total Votes'})

    return merged

#### if no output above, all candidate totals add up in every precinct.

# save


In [57]:
def save(merged, race=None, out_root='..'):
    """Write the precinct-level frame and its per-county sums to CSV.

    Files are written to
        <out_root>/all_precincts_joined[/other_races]/<race>.csv
        <out_root>/all_counties_joined[/other_races]/<race>.csv
    where the 'other_races' level is used for every race whose name does not
    contain 'US Senate'. Missing directories are created.

    Parameters
    ----------
    merged : pandas.DataFrame
        Precinct-level data with a 'County' column. Columns that can be cast to
        float are converted in place.
    race : str, optional
        Race name used for the file name. Defaults to the notebook-level
        `race_hotword`.
    out_root : str
        Directory that contains the two output trees. Defaults to '..'.

    Raises
    ------
    AssertionError
        If the number of numeric columns is not 2 + a multiple of 5
        (presumably 2 general columns plus 5 per candidate — TODO confirm).
        The precinct file has already been written when this fires.
    """
    if race is None:
        race = race_hotword

    # convert to numeric where possible
    for c in merged.columns:
        try:
            merged[c] = merged[c].astype(float)
        except (ValueError, TypeError):
            pass  # not a numeric column (e.g. names); leave it as it is

    def out_path(top_dir):
        # Build the target path for one output tree, creating the folder.
        parts = [out_root, top_dir]
        if 'US Senate' not in race:
            parts.append('other_races')
        folder = os.path.join(*parts)
        os.makedirs(folder, exist_ok=True)
        return os.path.join(folder, race + '.csv')

    # PRECINCT DATA
    merged.to_csv(out_path('all_precincts_joined'), index=False)

    # GROUPBY FOR COUNTY DATA
    num_cols = merged.select_dtypes(include='number').columns
    assert (len(num_cols)-2)%5==0, f'unexpected number of numeric columns: {len(num_cols)}'
    # numeric_only=True keeps text columns (e.g. precinct names) from being
    # concatenated into the county rows on pandas >= 2.0.
    counties = merged.groupby('County').sum(numeric_only=True)
    counties.to_csv(out_path('all_counties_joined'))

# run all

In [66]:
# Collect the distinct file names found in any county folder. Hidden files
# (e.g. '.DS_Store') are skipped, as everywhere else in this notebook.
all_races = set()
for county in counties:
    all_races.update(listdir_nohidden(os.path.join(clean_dir, county)))

# A race name is the file name up to '.csv'; sorted for a reproducible run order.
all_races = sorted(r.split('.csv')[0].strip() for r in all_races)

# Restrict the run to one race. The names above no longer contain '.csv', so
# the filter must not contain it either (with it, nothing could ever match and
# the run-all loop would silently process zero races).
all_races = [c for c in all_races if 'PARTIAL_nov2020_president_precincts' in c]

In [67]:
# `race_hotword` is deliberately a notebook-level name: id_target_files,
# get_vote_stats and save read it as a global.
for race_hotword in all_races:
    print('>', race_hotword)
    try:
        merged = id_target_files()

        if 'Loeffler' in race_hotword:
            merged = fix_lieberman(merged)

        merged = fix_shift(merged)

        bad_math = verify_totals(merged)

        merged = get_vote_stats(merged, bad_math)

        save(merged)
    except Exception as err:
        # Keep going with the next race, but say which one failed and why
        # instead of hiding it. KeyboardInterrupt is no longer swallowed.
        print(f'  ! skipped {race_hotword!r}: {type(err).__name__}: {err}')

In [65]:
# Distinct counties in `merged`, i.e. in the last race the loop above assigned.
merged['County'].nunique()

124

# verify

In [49]:
# Precinct-level output: sum one candidate's total over every precinct row.
fpath = '../all_precincts_joined/other_races/District Attorney - Western - Special.csv'
pre = pd.read_csv(fpath)
pre.loc[:, 'Brian Patterson (Dem)_Choice Total'].sum()

11922.0

In [50]:
# County-level output: the same candidate's total summed over the county rows;
# it should equal the precinct-level sum.
fpath = '../all_counties_joined/other_races/District Attorney - Western - Special.csv'
co = pd.read_csv(fpath)
co.loc[:, 'Brian Patterson (Dem)_Choice Total'].sum()

11922.0