In [48]:
import os
import re
import folium
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm.notebook import tqdm

def listdir_nohidden(path):
    def ld_nh(path):
        for f in os.listdir(path):
            if not f.startswith('.'):
                yield f
    return list (ld_nh(path))

### run the notebook once for each election
#### for primaries, run once again for each party

In [None]:
e_idx = 2 # CYCLE THIS
party_select = 'REP' # CYCLE THIS

In [49]:
data_dir = '.'
elections = [e for e in listdir_nohidden(data_dir) if '.' not in e ]

election = elections[e_idx]

target_dir = f'../../{election}/participation_demography'

if not os.path.exists(target_dir):
    os.makedirs(target_dir)
print('will save cleaned data to:', target_dir)

['2018_november', '2016_may', '2018_may']

### identify data for both races (`init` + `runoff`)

In [120]:
e_races = listdir_nohidden(os.path.join(data_dir, election )) 

# if parties are separated
if 'DEM' in os.listdir(os.path.join(data_dir, election, e_races[0])):
    party = party_select # CYCLE THIS
else:
    party = 'ALL'
    
print('party:', party)

party: REP


In [121]:
if party == 'ALL':
    init_race = [c for c in e_races if 'Runoff' not in c][0]
    runoff_race = [c for c in e_races if 'Runoff' in c][0]
else:
    init_race = [c+'/'+party for c in e_races if 'Runoff' not in c][0]
    runoff_race = [c+'/'+party for c in e_races if 'Runoff' in c][0]

init_race_files = listdir_nohidden(os.path.join(data_dir, election, init_race )) 
runoff_race_files = listdir_nohidden(os.path.join(data_dir, election, runoff_race )) 

try: assert init_race_files == runoff_race_files
except:
    print('Initial election data does not match final elections data')
    init_only = [i for i in init_race_files if i not in runoff_race_files]
    runoff_only = [i for i in runoff_race_files if i not in init_race_files]
    print('Missing from initial data:', runoff_only)
    print('Missing from runoffs data:', init_only)

e_races = {'INIT':
              {'name': init_race,
               'files': init_race_files,},
          
          'RUNOFF': 
              {'name': runoff_race,
               'files': runoff_race_files,},}

[race['name'] for race in e_races.values()]

['May 2018/REP', 'June 2018 (Runoff)/REP']

### parse data for one county in this election

In [122]:
def parse_pair_data(county):
    out = []
    for race_type, race in e_races.items(): # iterate county data for both INIT & RUNOFF

        county_file = os.path.join(data_dir, election, race['name'], county )

        data = pd.read_excel(county_file)

        #election_date = data.columns[0].split('\n')[-1].split(':')[1]

        # quick clean and specify columns
        data.loc[0] = data.T[0].fillna(method='ffill')
        data.loc[1] = data.loc[1]

        data.columns = (data.loc[1] + '_' + data.loc[0]
                       ).apply(
                            lambda x: 
                               str(x).replace('Reg #', 'N_registered'
                                    ).replace('Voted', 'N_voted'
                                    ).replace('%', 'PCT_voted'
                                    ).replace(' ', '_')
                                if str(x)!='nan' else ''                             
                            )
            
        data = data.rename(columns={k: race_type+'_'+k for k in data.columns})

        
        
        # grab index names from subsequent row & consolidate
        data.columns = list(data.loc[0].values[:2]) + list(data.columns)[2:]
        data = data[2:].reset_index(drop=True)

        out.append(data)
    
    out = pd.merge(out[0], out[1], on='PRECINCT ID', how='outer')
    
    out['County'] = county.split('.')[0]
    
    # calc participation rate change
    out = out[['County']+[c for c in out.columns if c!='County']]
    return out

out = parse_pair_data(sorted(init_race_files)[0])

### combine data for one election (init + runoff)

In [123]:
e_data = pd.DataFrame()
for county in tqdm(init_race_files):
    out = parse_pair_data(county)
    #if 'STEP' in county: break
    e_data = pd.concat([e_data, out])
e_data.reset_index(drop=True, inplace=True)
e_data.drop('PRECINCT DESCRIPTION_y', axis=1, inplace=True)
e_data.rename(columns={'PRECINCT DESCRIPTION_x': 'PRECINCT DESCRIPTION'}, inplace=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=159.0), HTML(value='')))




In [124]:
# drop duplicate columns
e_data = e_data.loc[:,~e_data.columns.duplicated()]

e_data = e_data.reset_index(drop=True)
e_data = e_data.sort_values('PRECINCT ID').sort_values('County')

# engineer sums for specific groups

In [125]:
for c in e_data.columns:
    e_data = e_data.rename(columns={c:c.replace('TOTAL_VOTERS', 'ALL_ALL')})
e_data.head()

Unnamed: 0,County,PRECINCT ID,PRECINCT DESCRIPTION,INIT_N_registered_BLACK_MALE,INIT_N_voted_BLACK_MALE,INIT_PCT_voted_BLACK_MALE,INIT_N_registered_BLACK_FEMALE,INIT_N_voted_BLACK_FEMALE,INIT_PCT_voted_BLACK_FEMALE,INIT_N_registered_BLACK_UNKNOWN,...,RUNOFF_PCT_voted_UNKNOWN_MALE,RUNOFF_N_registered_UNKNOWN_FEMALE,RUNOFF_N_voted_UNKNOWN_FEMALE,RUNOFF_PCT_voted_UNKNOWN_FEMALE,RUNOFF_N_registered_UNKNOWN,RUNOFF_N_voted_UNKNOWN,RUNOFF_PCT_voted_UNKNOWN,RUNOFF_N_registered_ALL_ALL,RUNOFF_N_voted_ALL_ALL,RUNOFF_PCT_voted_ALL_ALL
1659,APPLING,4D,4D,28,0,0.0,30,0,0.0,0,...,6.38,45,3,6.67,0,0,0,1420,282,19.86
1660,APPLING,5A,5A,34,0,0.0,55,0,0.0,0,...,0.0,44,1,2.27,0,0,0,1091,115,10.54
1661,APPLING,5B,5B,49,0,0.0,62,1,1.61,0,...,0.0,27,1,3.7,0,0,0,1078,134,12.43
1657,APPLING,3C,3C,141,1,0.71,180,1,0.56,0,...,2.08,52,2,3.85,0,0,0,1569,234,14.91
1654,APPLING,1C,1C,8,0,0.0,18,0,0.0,0,...,0.0,29,1,3.45,0,0,0,812,128,15.76


### create dataframe of group identity column keys 

In [126]:
ppl_stats = [c for c in e_data.columns if 'MALE' in c] # captures all sub-groups (interesting)

ppl_stat_dict = {} # compile list of columns relevant to each parent group type
for colname in ppl_stats:
    cparts = colname.split('_')
    ppl_stat_dict[colname] = {
        'sex': 'FEMALE' if 'FEMALE' in cparts[-1] else 'MALE',
        'race': cparts[-2]   } # get dummy descriptions for both categories in each column
    
ppl_cols_df = pd.DataFrame(ppl_stat_dict
    ).T.reset_index().rename(columns={'index':'col_name'})

print('Sexes:', ppl_cols_df.sex.unique())
print('Races:', ppl_cols_df.race.unique())

# create column groups for each parent person group

group_agg_map = {}

for p_idx in ppl_cols_df.index:    
    p_row = ppl_cols_df.loc[p_idx]
    
    if p_row.sex not in group_agg_map.keys():
        group_agg_map[p_row.sex] = []
    group_agg_map[p_row.sex].append(p_row.col_name)
    
    if p_row.race not in group_agg_map.keys():
        group_agg_map[p_row.race] = []
    group_agg_map[p_row.race].append(p_row.col_name)

print(len(group_agg_map), 'total identity groups defined')

# create custom super all-all category
group_agg_map['ALL'] = ppl_stats

Sexes: ['MALE' 'FEMALE']
Races: ['BLACK' 'WHITE' 'ASIA-PI' 'HISP-LT' 'NATIVE-AM' 'OTHER' 'UNKNOWN']
9 total identity groups defined


In [127]:
# function to get the sum total for each statistic in a parent identity group
def calc_group_sum(group):
    
    # id all cols for this group, drop PCT (all just calced.)
    target_cols = [c for c in group_agg_map[group] if 'pct' not in c.lower()]
    
    # group cols by stat type
    stat_cats = list(sorted(set(
        ['_'.join(c.split('_')[:-2]) for c in target_cols])))
    # the above are be universal ( [INIT/R.O.] + [REG/VOTED] )    
    
    # for the given group....
    stat_cols = {}
    for stat in stat_cats: # iterate to get sum of each statistic type
        
        stat_cols[stat] = [] # to compile relevant columns for this statistic
        for col in target_cols: 
            if stat in col: # identify target cols for THIS STAT
                stat_cols[stat].append(col) 
        
        if 'MALE' in group:
            sum_col_name = stat+'_ALL_'+group
        else:
            sum_col_name = stat+'_'+group+'_ALL' 
        
        
        e_data[sum_col_name] = e_data[ # this is where the addition occurs
                                stat_cols[stat]  # columns to add
                                     ].astype(float).sum(axis=1) # ADD ACROSS AXIS 1  
        
for group in group_agg_map.keys():
    calc_group_sum(group)

## create groupby'd version

In [128]:
for c in e_data.columns:
    try: e_data[c] = e_data[c].astype(float)
    except: pass
    
g_data = e_data.groupby('County').sum()

## calc delta & pct change stats for all groups

In [129]:
elecs = ['INIT', 'RUNOFF']
stats = ['N_voted', 'N_registered']

get_group = lambda x: '_'.join(x.split('_')[3:])

# ALL groups (original and engineered sums)
all_group_names = set([get_group(c) for c in e_data.columns[3:]])

for group in sorted(all_group_names):
    
    target_cols = [c for c in e_data.columns if group == get_group(c)]
        
    for stat in stats:
        stat_cols = [c for c in target_cols if stat in c]        
        
        delta_colname = stat_cols[0].replace('INIT', 'Delta')
        e_data[delta_colname] = e_data[stat_cols[1]].astype(float) - e_data[stat_cols[0]].astype(float)
        g_data[delta_colname] = g_data[stat_cols[1]].astype(float) - g_data[stat_cols[0]].astype(float)

        
        pct_colname = stat_cols[0].replace('INIT', 'pctChange')
        e_data[pct_colname] = e_data[delta_colname] / e_data[stat_cols[0]].astype(float)
        g_data[pct_colname] = g_data[delta_colname] / g_data[stat_cols[0]].astype(float)
        

### clean up feature names a bit more 

In [130]:
cols = list(e_data.columns)
re_map = {}
for col in cols[3:]:
    c = col.split('_')
    elec = c[0].capitalize()
    if 'Pct' in elec:
        elec = 'pctChange'
    stat = c[1].lower()+c[2].capitalize()
    race = c[3].capitalize()
    if len(c)>=5: sex = c[4].capitalize()
    else: sex = 'Unknown'        
    re_map[col] = f"{elec}_{stat}_{race}_{sex}"

e_data = e_data.rename(columns=re_map)
g_data = g_data.rename(columns=re_map)

In [131]:
# FUNCTION FOR DISPLAYING / RETURN DATA FOR A TARGET GROUP

def get_group_data(df, race, sex, elec=None):
    target_cols = []
    o_cols = []
    for col in df:
        if col.count('_')<3:
            o_cols.append(col)
            continue
        c = col.split('_')
        if not race or c[2] == race:
            if not elec or c[0] == elec:
                if not sex or c[3] == sex:
                    target_cols.append(col)
                        
    return df[ o_cols + target_cols ]

### re-calc group participation rates

In [132]:
elecs = ['Init', 'Runoff']
stats = ['nVoted', 'nRegistered']
races = sorted(set([c.split('_')[2] for c  in g_data.columns[3:]]))
sexes = sorted(set([c.split('_')[3] for c  in g_data.columns[3:]]))

for sex in sexes:
    for race in races:
        if race=='All' and sex=='Unknown': continue # doesnt exist
        
        for elec in elecs:
            group_data = get_group_data(g_data, race, sex, elec)

            for stat in stats:
                group_data[[c for c in group_data.columns if any(s in c for s in stats) ]]

            cols = group_data.columns
            if not len(cols):
                print(race, sex, elec)
                out = group_data
                

            target_fname = f"{elec}_pctVoted_{race}_{sex}"
            g_data[target_fname] = round(100*(g_data[cols[1]] / g_data[cols[0]]), 2)
            e_data[target_fname] = round(100*(e_data[cols[1]] / e_data[cols[0]]), 2)

## create state totals dataframe

In [133]:
target_dir

'../../2018_may/participation_demography'

In [134]:
vote_dict = {
    'Init': {},
    'Runoff': {},  }
regi_dict = {
    'Init': {},
    'Runoff': {},  }

for sex in sexes:
    for race in races:
        if race=='All' and sex=='Unknown': continue # doesnt exist
        for elec in elecs:
            group_data = get_group_data(g_data, race, sex, elec)
            
            category = '_'.join(group_data.columns[1].split('_')[2:])
            
            n_votes = group_data[group_data.columns[1]].sum()
            n_reg = group_data[group_data.columns[0]].sum()

            vote_dict[str(elec)][category] = n_votes
            regi_dict[str(elec)][category] = n_reg
            
state_totals = pd.DataFrame(vote_dict)

# quick features
state_totals = state_totals.rename(columns=
                {c:c.lower()+'Votes' for c in state_totals.columns})

# add n_registered back on
r_data = pd.DataFrame(regi_dict)
r_data = r_data.rename(columns=
                {c:c.lower()+'Registered' for c in r_data.columns})
state_totals = pd.merge(state_totals, r_data, left_index=True, right_index=True)

state_totals['initParticipation'] = round(100*(state_totals['initVotes'] / state_totals['initRegistered']), 2)
state_totals['runoffParticipation'] = round(100*(state_totals['runoffVotes'] / state_totals['runoffRegistered']), 2)

# sort columns for init then runoff
state_totals = state_totals[sorted(state_totals.columns)]



# CALCULATED FEATURES
state_totals['nVotesChange'] = state_totals['runoffVotes'] - state_totals['initVotes']


state_totals['ParticipationChange'] = round(100*(state_totals['nVotesChange'] / state_totals['initVotes']), 2)

# sort by group participation dropoff
#state_totals = state_totals.sort_values('ParticipationChange')

# OPTIONAL: ADD COMMA SEPARATORS FOR THOUSANDS (no longer numeric after this obviously)
for c in state_totals.columns:
    if 'Partic' not in c:
        state_totals[c] = state_totals.apply(lambda x:
                        "{:,}".format(int(x[c])) if str(x[c])!='nan' else '-', axis=1)
    else :
        state_totals[c] = state_totals.apply(lambda x:
                        "{}%".format(int(x[c]))  if str(x[c])!='nan' else '-', axis=1)

# CONVERT INDEX TO TWO SEPARATE COLUMNS
parse_race = lambda x: x.split('_')[0]
parse_sex = lambda x: x.split('_')[1]
state_totals['race'] = [parse_race(v) for v in state_totals.index]
state_totals['sex'] = [parse_sex(v) for v in state_totals.index]
cols = list(state_totals.columns)
state_totals = state_totals[cols[-2:]+cols[:-2]]

state_totals = state_totals.sort_values('sex')
state_totals = state_totals.sort_values('race')
state_totals = state_totals.replace('All', '(all)')

fname = target_dir+f'/{party}_statewide.csv'
state_totals.drop('ParticipationChange', axis=1, inplace=True)
state_totals.to_csv(fname, index=False) # index is now redundant
print(fname, 'saved:')
state_totals

../../2018_may/participation_demography/REP_statewide.csv saved:


Unnamed: 0,race,sex,initParticipation,initRegistered,initVotes,runoffParticipation,runoffRegistered,runoffVotes,nVotesChange
All_All,(all),(all),9%,6733692,619450,8%,6684291,589741,-29709
All_Male,(all),Male,10%,3108872,317873,9%,3077890,304566,-13307
All_Female,(all),Female,8%,3624820,301577,7%,3606401,285175,-16402
Asia-pi_All,Asia-pi,(all),2%,137678,3599,2%,137130,2882,-717
Asia-pi_Unknown,Asia-pi,Unknown,1%,365,4,2%,365,8,4
Asia-pi_Male,Asia-pi,Male,2%,65475,1821,2%,65162,1362,-459
Asia-pi_Female,Asia-pi,Female,2%,72203,1778,2%,71968,1520,-258
Black_All,Black,(all),0%,2037709,8503,0%,2018655,6018,-2485
Black_Unknown,Black,Unknown,0%,2427,5,0%,2383,3,-2
Black_Male,Black,Male,0%,867355,4547,0%,854943,3301,-1246


## save

In [135]:
  
elec_fpath = target_dir+f'/{party}_by_precinct.csv'

    
print('Saving precinct data...')
print('>', elec_fpath, '\n')
e_data.to_csv(elec_fpath, index=False)


elec_fpath = target_dir+f'/{party}_by_county.csv'   
print('Saving county data...')
print('>', elec_fpath)
g_data.to_csv(elec_fpath)

Saving precinct data...
> ../../2018_may/participation_demography/REP_by_precinct.csv 

Saving county data...
> ../../2018_may/participation_demography/REP_by_county.csv
