In [113]:
import os
import re
import jaro
import folium
import numpy as np
import pandas as pd
import geopandas as gp
from fuzzywuzzy import fuzz
from tqdm.notebook import tqdm

def listdir_nohidden(path):
    """Return the entries of `path` whose names do not start with a dot.

    The result is a list, in the order `os.listdir` yields the entries.
    """
    return [entry for entry in os.listdir(path) if not entry.startswith('.')]

def show_county_data(county, data):
    """Return the rows of `data` for one county, without geometry-like columns.

    Columns whose name contains 'geo' are dropped from a copy of `data`; the
    remaining rows where `county` equals the given value are returned sorted
    by the second remaining column.
    """
    keep = [name for name in data.columns if 'geo' not in name]
    trimmed = data[keep].copy()
    sort_key = trimmed.columns[1]
    in_county = trimmed['county'] == county
    return trimmed[in_county].sort_values(sort_key)

### import data

Rows are mapped between datasets by their native indexes, so don't reset or drop them!

In [114]:
# SHAPES DATA
# Load the precinct shapefile, reconcile its name columns, give distinct names
# to same-named precincts that have different geometry, and lowercase the
# three name columns.

shapes = gp.read_file('electoral_precincts/2018Precincts.shp')
# keep only the first four columns plus the geometry
shapes = shapes[list(shapes.columns[:4])+['geometry']]
# positions of rows where "<locality>,<prec_shp>" differs (case-insensitively) from loc_prec
mismatch = np.where(((shapes['locality']+','+shapes['prec_shp']
                  ).str.lower()!=shapes['loc_prec'].str.lower()))
print('>', len(mismatch[0]), 'rows found with unique loc_prec values. Overwriting prec_shp...')
# Take the precinct part of loc_prec (the text after its first comma, up to the next one).
# NOTE(review): the message above says prec_shp is overwritten, but this line
# writes prec_elec, and it does so for every row rather than only the
# mismatched ones — confirm which column was intended.
shapes.prec_elec = shapes.loc_prec.apply(lambda x: x.split(',')[1])
shapes.drop('loc_prec', axis=1, inplace=True)

# dna = Duplicate Named Areas: rows that repeat an earlier (locality, prec_shp) pair.
# NOTE(review): `duplicated` keeps the first occurrence out of `dna` by default,
# so the first row of each pair is never renamed below — confirm that is intended.
dna = shapes[shapes.duplicated(['locality', 'prec_shp'])].copy()
# among those, rows whose geometry repeats an earlier `dna` row
dup_idx = dna[dna.duplicated(['geometry'])].index
dna.drop(dup_idx, inplace=True) # drop straight-up duplicates
shapes.drop(dup_idx, inplace=True) # drop straight-up duplicates
print('>', len(dna), 'with unique geometry rows share names! Adding indexes to names...')
idxr = 1 # running counter used in the "(i/n)" suffix
for d_idx in dna.index: # remaining rows have unique geometry
    o_val = dna.loc[d_idx, 'prec_shp']
    n_val = f'{o_val} ({idxr}/{len(dna)})'
    shapes.loc[d_idx, 'prec_shp'] = n_val # assign back into shapes directly.
    idxr += 1
    print('  >', n_val)
    
# clean vals (force lowercase) in the first three columns, then rename locality -> county
for c in shapes.columns[:3]:
    shapes[c] = shapes[c].str.lower()
shapes.rename(columns={'locality':'county'}, inplace=True)

# Preview only: this is not the last expression of its cell, so the notebook does not display it.
shapes[shapes.county=='appling'].sort_values('prec_shp').head(3)


# OLD RUNOFF PARTICIPATION DATA
# Keep the first three columns, give them short snake_case names and lowercase
# every value. The frame is built in one chain with an explicit copy so the
# column assignments below never write into a slice of the raw frame, and no
# `inplace` mutation is needed.

part = (
    pd.read_csv('../recent_runoffs/2018_november_cleaned/all_precincts_participation.csv')
    .iloc[:, :3]
    .copy()
    .rename(columns={
        'County':'county',
        'PRECINCT ID': 'prec_id',
        'PRECINCT DESCRIPTION': 'prec_desc'
    })
)
for c in part.columns:
    part[c] = part[c].str.lower()

# Preview only: this is not the last expression of its cell, so the notebook does not display it.
part[part.county=='appling'].sort_values('prec_desc').head(3)


# 2020 NOVEMBER RESULTS DATA
# (the file lives under ../2020_november/; the previous "2018" header was stale)
# Keep the first two columns, lowercase their names and every value. Built in
# one chain with an explicit copy so the column assignments below never write
# into a slice of the raw frame.

res = (
    pd.read_csv('../2020_november/all_precincts_joined/US Senate (Loeffler).csv')
    .iloc[:, :2]
    .copy()
    .rename(columns=str.lower)
)
for c in res.columns:
    res[c] = res[c].str.lower()

# Last expression of the cell: this is the table the notebook displays.
res[res.county=='appling'].sort_values('precinct').head(3)

> 6 rows found with unique loc_prec values. Overwriting prec_shp...
> 2 with unique geometry rows share names! Adding indexes to names...
  > PATRIOTS PARK (1/2)
  > PATRIOTS PARK (2/2)


Unnamed: 0,county,precinct
1450,appling,1b
1451,appling,1c
1452,appling,2


## manual fixes - exact string replacements

In [115]:
# REPLACE EXACT STRINGS in SHAPES DATA
# Each key is searched for literally anywhere inside a value (substring
# replacement, not whole-value matching). `regex=False` is passed explicitly
# because the default of `Series.str.replace` differs between pandas versions.
# Caution: re-running this cell without reloading `shapes` applies the
# replacements again ('south milledgeville' -> 'south milledgevilleedgeville',
# '#3 cjc' -> '#3 #3 cjc').

replace_strs = {
    'hoggard mill': 'hoggards mill',
    'south mill': 'south milledgeville',
    'north mill': 'north milledgeville',
    'bethlehem church - 211': 'bethlehem church',
    'chattahoochee acvitity center': 'activity center' ,
    'cjc': '#3 cjc',
}
for o, n in replace_strs.items():
    for c in shapes.columns[:3]:
        shapes[c] = shapes[c].str.replace(o, n, regex=False)
        
# REPLACE EXACT STRINGS in PART DATA
# Keys are plain text matched literally (`regex=False`). The previous keys
# escaped the parentheses as '\(' / '\)', which only works when
# `Series.str.replace` treats the pattern as a regex; where the pattern is
# treated literally they never match, and '\(' is an invalid escape in a
# non-raw string literal.

replace_strs = {
    'austin (dun)': 'austin',
    'avondale (avo)': 'avondale',
    'lithonia (lit)': 'lithonia',
    'woodward (bhavn)': 'woodward',
    'fbc - flc': 'family life center',
}
for o, n in replace_strs.items():
    for c in part.columns[:3]:
        part[c] = part[c].str.replace(o, n, regex=False)
        
# REPLACE EXACT STRINGS in RES DATA
# NOTE(review): the key and the value below look identical here; presumably the
# key was meant to be a different whitespace character (e.g. a non-breaking
# space) — confirm. With two identical strings the replacement changes nothing.
replace_strs = {
    ' ':' '
}
# apply each replacement to (up to) the first three columns of `res`
for o, n in replace_strs.items():
    for c in res.columns[:3]:
        res[c] = res[c].str.replace(o, n)

## direct edits by index label (`.loc`) - warning

In [116]:
# RISKY — THESE MAY CHANGE!!
# Hand corrections keyed by row label. The labels come from the source files'
# native indexes, so they go stale if the inputs are regenerated.

def _apply_label_fixes(frame, column, fixes):
    """Write `fixes` ({row label: new value}) into `frame[column]` in place.

    Raises KeyError if the column or any label is missing: `.loc` assignment
    would otherwise silently create a new column or append a new row instead
    of correcting an existing cell.
    """
    if column not in frame.columns:
        raise KeyError(f'column {column!r} not found; have {list(frame.columns)}')
    missing = [label for label in fixes if label not in frame.index]
    if missing:
        raise KeyError(f'row labels {missing} not found for column {column!r}')
    for label, value in fixes.items():
        frame.loc[label, column] = value


_apply_label_fixes(part, 'prec_desc', {
    133: 'fairground',
    358: 'eli whitney',
    291: 'wilmington island presbyterian',
    347: 'wilmington island united',
})

# `res` only has the columns 'county' and 'precinct'; the previous target
# 'precinct_id' does not exist there, so the assignment created a new, mostly
# empty column and left the precinct names uncorrected.
_apply_label_fixes(res, 'precinct', {
    1136: 'bramlett elementary',
    1137: 'westside middle',
})


## specific county cleaning

### strip numbers from barrow precincts

In [117]:
# STRIP NUMBERS FROM BARROW PRECINCT NAMES
# Keep only the alphabetic words of each name and join them with single
# spaces. The pattern uses '+' (one or more letters): with '*' the regex also
# matches empty strings at every digit and space, and joining those produced
# leading, trailing and doubled spaces that break exact name matching.
county='barrow'
barrow_idx = show_county_data(county, res).index
barrow_ids = res.loc[barrow_idx, 'precinct'].apply(lambda x: ' '.join(re.findall('[A-Za-z]+', x)))
res.loc[barrow_idx, 'precinct'] = barrow_ids

### de-code rockdale precincts

In [118]:
# Two-letter Rockdale precinct codes and the names they stand for.
rockdale_p_map = {
    'BA': 'Barkside',
    'BT': 'Bethel',
    'CO': 'Conyers',
    'FI': 'Fieldstone',
    'FS': 'Flat Shoals',
    'HC': 'Honey Creek',
    'HI': 'High Tower',
    'LA': 'The Lakes',
    'LO': 'Lorraine',
    'MA': 'Magnet',
    'MI': 'Milestead',
    'OT': 'Olde Town',
    'RO': 'Rockdale',
    'SM': 'Smyrna',
    'SP': 'St. Pius',
    'ST': 'Stanton',
}


def convert_rockdale(ab):
    """Translate a Rockdale precinct code into its lowercase name.

    The lookup is case-insensitive. A code that is not in `rockdale_p_map`
    is returned as-is, upper-cased and then lower-cased.
    """
    code = ab.upper()
    return rockdale_p_map.get(code, code).lower()
    
# Decode the precinct ids of the Rockdale rows only.
rock_idx = part.loc[part['county'] == 'rockdale'].index

rockdale_ids = part.loc[rock_idx, 'prec_id']
part.loc[rock_idx, 'prec_id'] = rockdale_ids.apply(convert_rockdale)

### fix spalding leading zero numbers

In [119]:
# Row labels of the Spalding precincts in `shapes`.
spald = shapes.loc[shapes['county'] == 'spalding'].index

def convert_spald(n):
    """Prefix a single '0' to labels shorter than two characters.

    Labels of two or more characters are returned unchanged.
    """
    return n if len(n) >= 2 else '0' + n

# Zero-pad the Spalding precinct labels in place.
spald_labels = shapes.loc[spald, 'prec_shp']
shapes.loc[spald, 'prec_shp'] = spald_labels.apply(convert_spald)

---

# search

## iterate shape index objects, looking for data from `part` and `res`

In [120]:
# THIS LOOP searches for matches across datasets using multiple columns
# (precinct id, precinct, prec_elec, prec_id, precinct description)
# ideally, a perfect match is found. always search within a matching county.
# if no perfect match for a given search, try fuzzy or custom searching... 

def _is_result_col(col):
    """Return True for bookkeeping columns (name contains '_found' or '_fuzz')."""
    return '_found' in col or '_fuzz' in col


def _first_exact_match(l_row, r_group, l_comp_cols, r_comp_cols):
    """Return (r_idx, method) for the first exact value match, or None.

    Search order: left comparison column, then right row, then right
    comparison column. The first equal pair wins.
    """
    for l_compare in l_comp_cols:
        if _is_result_col(l_compare):
            continue
        l_val = l_row[l_compare]
        for r_idx in r_group.index:
            r_row = r_group.loc[r_idx]
            for r_compare in r_comp_cols:
                if _is_result_col(r_compare):
                    continue
                if l_val == r_row[r_compare]:
                    return r_idx, f'{l_compare} == {r_compare}'
    return None


def find_matches(l_df, r_df, l_comp_cols, r_comp_cols, p_groupby, min_score,
                 testing=False, group_floor_scores=None):
    """Match every row of `l_df` to a row of `r_df` within the same group.

    For each left row an exact match is tried first; if none exists, the
    right row with the highest Jaro similarity above a floor is taken. The
    result is written into `l_df` IN PLACE, in the columns `r_idx` (label of
    the matched right row), `r_mthd` (how it matched) and, for fuzzy matches,
    `r_fuzz` (the right-hand value that matched).

    Parameters
    ----------
    l_df, r_df : DataFrame
        Left and right data; both must contain the `p_groupby` column.
    l_comp_cols, r_comp_cols : list of str
        Columns whose values are compared. Names containing '_found' or
        '_fuzz' are skipped.
    p_groupby : str
        Column that defines the groups; rows are only compared within the
        same group value.
    min_score : float
        A fuzzy score must be greater than `min_score - .01` to count.
    testing : list or False
        When truthy, the group values to process instead of all of them.
    group_floor_scores : dict, optional
        Per-group floor that replaces `min_score - .01` (the score must be
        strictly greater). Defaults to ``{'fulton': .94}``, the value that
        was previously hard-coded.

    Returns
    -------
    (l_df, r_df) : the same objects that were passed in.
    """
    if group_floor_scores is None:
        group_floor_scores = {'fulton': .94}

    match_hist = {}  # l_idx -> 0 (no match) or {'match_idx', 'method'}
    if not testing:
        iter_over = sorted(list(l_df[p_groupby].unique()))
    else:
        iter_over = testing

    for p_item in tqdm(iter_over):
        # isolate this group's rows in both frames
        l_group = l_df[l_df[p_groupby] == p_item]
        r_group = r_df[r_df[p_groupby] == p_item]

        for l_idx in l_group.index:
            l_row = l_group.loc[l_idx]
            match_hist[l_idx] = 0

            # PERFECT match search: stops at the first hit, so it assumes
            # there are no duplicate names within a group.
            exact = _first_exact_match(l_row, r_group, l_comp_cols, r_comp_cols)
            if exact is not None:
                r_idx, meth = exact
                match_hist[l_idx] = {'match_idx': r_idx, 'method': meth}
                l_df.loc[l_idx, 'r_idx'] = r_idx
                l_df.loc[l_idx, 'r_mthd'] = meth
                continue

            # FUZZY match search: no early exit, every candidate is scored.
            best_score = group_floor_scores.get(p_item, min_score - .01)

            for l_compare in l_comp_cols:
                if _is_result_col(l_compare):
                    continue
                l_val = l_row[l_compare]

                for r_idx in r_group.index:
                    # Skip right rows already assigned to a left row. The
                    # 'r_idx' column only exists once a first match has been
                    # written, so it must not be read before that.
                    if 'r_idx' in l_df.columns and r_idx in l_df['r_idx'].values:
                        continue

                    r_row = r_group.loc[r_idx]
                    # Compare against the same right-hand columns as the
                    # exact search (previously every column but the first,
                    # regardless of `r_comp_cols`).
                    for r_compare in r_comp_cols:
                        if _is_result_col(r_compare):
                            continue
                        if best_score == 1:
                            break  # not going to find anything better

                        r_val = r_row[r_compare]
                        f_score = jaro.jaro_metric(str(l_val), str(r_val))

                        if f_score > best_score:
                            best_score = f_score
                            # digits after the decimal point of the rounded
                            # score, e.g. 0.87 -> '87' (0.9 -> '9', 1.0 -> '0')
                            f_str = str(round(f_score, 2)).split('.')[1]
                            meth = f"(F.{f_str}) {l_compare} = {r_compare}"
                            match_hist[l_idx] = {
                                'match_idx': r_idx,
                                'method': meth}

                            l_df.loc[l_idx, 'r_idx'] = r_idx
                            l_df.loc[l_idx, 'r_mthd'] = meth
                            l_df.loc[l_idx, 'r_fuzz'] = r_val

    # Summary: every left row with a match of either kind counts as found.
    n_found = sum(1 for hit in match_hist.values() if hit)
    n_total = len(l_df)
    pct_found = round(100 * n_found / n_total, 2) if n_total else 0.0
    print(f"{n_found}/{n_total} precincts matched (exact or fuzzy): {pct_found}%")
    return l_df, r_df

In [121]:
# Match the precinct shapes (left) against the participation data (right),
# county by county.
p_groupby = 'county'
min_score = .7 # init to beat
testing = False#['fulton']

l_df = shapes
r_df = part
l_comp_cols = list(shapes.columns[1:3])
r_comp_cols = list(part.columns[1:])

l_df, r_df = find_matches(
    l_df=l_df,
    r_df=r_df,
    l_comp_cols=l_comp_cols,
    r_comp_cols=r_comp_cols,
    p_groupby=p_groupby,
    min_score=min_score,
    testing=testing,
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=159.0), HTML(value='')))


2628/2655 precincts found exact match in PART data: 98.98%


## join all parsed counties

In [122]:
# Outer-join, county by county, the right-hand rows (prefixed 'R_') to the
# left-hand rows (prefixed 'L_') on the match label stored in L_r_idx.
# The per-county frames are collected in a list and concatenated once instead
# of growing a DataFrame inside the loop.
counties = sorted(list(l_df.county.unique()))
county_frames = []
for county in counties:
    lc_data = show_county_data(county, l_df).add_prefix('L_')
    rc_data = show_county_data(county, r_df).add_prefix('R_')
    county_frames.append(
        pd.merge(rc_data, lc_data, left_index=True, right_on='L_r_idx', how='outer')
    )

data = pd.concat(county_frames)

# one county column: the right-hand value, falling back to the left-hand one
data['county'] = data['R_county'].fillna(data['L_county'])
# Sort by county, then precinct id, in a single call. Two chained single-key
# sorts do not give this order because the default sort is not stable.
data = data.sort_values(['R_county', 'R_prec_id'])
data = data.reset_index(drop=True).drop(columns=['R_county', 'L_county'])

In [123]:
# Rows of the joined table with no L_r_idx value.
data.loc[data['L_r_idx'].isna()]

Unnamed: 0,R_prec_id,R_prec_desc,L_prec_shp,L_prec_elec,L_r_idx,L_r_mthd,L_r_fuzz,county
2855,,,fort stewart military reserve (no voters),fort stewart military reserve (no voters),,,,bryan
2856,,,fort pulaski national monument (no voters,fort pulaski national monument (no voters,,,,chatham
2857,,,fort benning military reservation (no vot,fort benning military reservation (no vot,,,,chattahoochee
2858,,,03p1b,03p1b,,,,fulton
2859,,,08f2,08f2,,,,fulton
2860,,,1200,12e2,,,,fulton
2861,,,121sc14b,121sc14b,,,,fulton
2862,,,ap01e,ap01e,,,,fulton
2863,,,ap12d,ap12d,,,,fulton
2864,,,ch04b,ch04b,,,,fulton


In [None]:
# NOTE(review): `merged` is not defined in any cell shown above, and the column
# names used here differ from those of `data` — confirm where it comes from.

# Candidate name columns, in tie-break order (earlier wins on equal length).
NAME_PRIORITY = ['prec_shp', 'prec_elec', 'PRECINCT ID', 'PRECINCT DESCRIPTION']


def _best_name(row_data):
    """Return the longest available name of a row, with each word capitalised.

    Missing values are skipped: stringifying them would give 'nan', which is
    longer than short real names such as '1b' and used to win the comparison.
    Returns None when every candidate is missing.
    """
    candidates = [str(row_data[c]) for c in NAME_PRIORITY if pd.notna(row_data[c])]
    if not candidates:
        return None
    # max() returns the first of several equally long candidates, which
    # reproduces the manual tie-break order.
    longest = max(candidates, key=len)
    return ' '.join(w.capitalize() for w in longest.split())


merged['best_name'] = [_best_name(row_data) for _, row_data in merged.iterrows()]

# mapping

In [None]:
# Replace every geometry with a simplified version (tolerance .001).
# NOTE(review): presumably done to shrink the GeoJSON embedded in the map, and
# the tolerance is presumably in the layer's CRS units — confirm both.
merged.geometry = merged.simplify(.001)

In [None]:
from branca.colormap import linear

# Scale the colour map to the observed range of the change in participation.
# Series.min()/max() skip missing values; the builtin min()/max() compare
# element by element and give order-dependent results when NaN is present.
colormap = linear.GnBu_06.scale(merged['CALC_CHANGE_PARTICIPATION'].min(),
                                merged['CALC_CHANGE_PARTICIPATION'].max())


In [None]:
from branca.colormap import linear


# Pick the caption label (`fname`) and a fixed colour scale for the election
# being mapped.
if 'Election - 2018' in election:
    fname = 'Nov-Dec 2018'
    colormap = linear.GnBu_06.scale(-50, -25)
elif 'Primary - 2018' in election:
    fname = 'May-Jul 2018'
    colormap = linear.GnBu_06.scale(-25, 0)
elif 'Primary - 2016' in election:
    fname = 'May-Jun 2016'
    colormap = linear.GnBu_06.scale(-25, 0)
else:
    # Without this branch an unknown election leaves `fname` undefined and the
    # map cell further down fails with a NameError far from the cause.
    raise ValueError(f'no colour scale / label configured for election {election!r}')

def get_color(amt):
    """Return the `colormap` colour for `amt`, or grey when it cannot be mapped.

    Any ordinary exception raised by `colormap(amt)` yields '#808080'.
    `except Exception` is used instead of a bare `except` so that
    KeyboardInterrupt and SystemExit still propagate.
    """
    try:
        return colormap(amt)
    except Exception:
        return '#808080'
    
def get_opacity(amt):
    """Return the fill opacity for a value: .7 if it is comparable, else 0.

    The comparison below is only a probe: values that cannot be compared with
    a number (e.g. None) raise and get opacity 0, everything else gets .7.
    NOTE(review): the result of `amt > 5` is not used, so the value is not
    actually thresholded — confirm whether `if amt > 5` was intended.
    `except Exception` replaces a bare `except` so KeyboardInterrupt and
    SystemExit still propagate.
    """
    try:
        amt > 5  # result deliberately unused; only checks comparability
    except Exception:
        return 0
    return .7
                                  
# Base map: initial centre as [latitude, longitude], zoom level and tile set.
m = folium.Map(
    location=[32.719440, -83.453088],
    zoom_start=7,
    tiles='cartodbpositron',
)

In [None]:
# Inspect the initial-election participation column.
merged['INIT_PCT_voted_TOTAL_VOTERS']

In [None]:
def style_function(x):
    """Return the folium style dict for one GeoJSON feature.

    Fill colour and fill opacity both derive from the feature's
    CALC_CHANGE_PARTICIPATION property.
    """
    change = x['properties']['CALC_CHANGE_PARTICIPATION']
    return {
        'fillColor': get_color(change),
        'color': 'black',
        'weight': .1,
        'opacity': 0.5,
        'fillOpacity': get_opacity(change),
    }


# A `merged.CALC_CHANGE_PARTICIPATION.fillna('Error / no data for this precinct')`
# call used to sit here. Its result was never assigned, so it changed nothing
# and has been removed.

folium.GeoJson(
    merged,
    style_function=style_function,
    tooltip=folium.GeoJsonTooltip(
        fields=['best_name', 'locality', 'INIT_PCT_voted_TOTAL_VOTERS', 'RUNOFF_PCT_voted_TOTAL_VOTERS'],
        aliases=['Precinct:', 'County:', 'Initial participation:', 'Run-off participation:'],
        #localize=True,
    )
).add_to(m)

# Set the legend caption, then attach the legend once (it was added twice).
colormap.caption = f'Voting rates PCT CHANGE from initial to run-off elections ({fname})'
colormap.add_to(m);


m
#m.save(f'{fname}_Precinct_RunoffDeltas.html')