# parse each .txt results file

from `raw` to `clean_by_county`

#### imports & utility funcs

In [None]:
import re
import os
import pandas as pd
from tqdm.notebook import tqdm

def ld_nh(path):
    """Lazily yield the entry names in `path` that do not start with a dot."""
    yield from (name for name in os.listdir(path) if not name.startswith('.'))
def listdir_nohidden(path):
    """Return the non-hidden entry names of `path` collected into a list."""
    return [*ld_nh(path)]

# directory the raw results files are read from
data_dir = 'raw'
# names of every non-hidden entry found in data_dir
files = listdir_nohidden(data_dir)

### data source: https://results.enr.clarityelections.com/GA/105369/web.264614/

Work with each incoming .txt file by manually splitting it into lines (on `'\n'`) and cells (on runs of two or more whitespace characters, e.g. `' '*2`).

In [None]:
def read_data(file, verbose=False):
    """Split one raw results file into a list of per-race records.

    The file is read from `data_dir`, broken into rows on line breaks and into
    cells on runs of two or more whitespace characters. A row consisting of a
    single empty cell (i.e. a blank line) marks the boundary between races.

    Args:
        file: Name of the results file inside `data_dir`.
        verbose: When True, print how many races are returned.

    Returns:
        A list of dicts, one per race, with the keys 'race', 'race_cat',
        'candidates', 'data_starts', 'data_ends' and 'data' (the slice of
        rows belonging to that race). The first race collected is dropped
        from the result.
    """
    with open(os.path.join(data_dir, file)) as fh:
        text = fh.read()

    # delimit cells by 2 or more whitespace characters, rows by line breaks
    split_cells = re.compile(r'(?:\s){2,}').split
    rows = [split_cells(line) for line in text.split('\n')]

    def parse_cat(race):
        # category of the race ( for organizing the repository ): the first
        # three words when the name mentions 'Service', else the first two
        words = race.split()
        if 'Service' in race:
            return ' '.join(words[:3])
        return ' '.join(words[:2]).replace('President of', 'US President')

    # find the consecutive batch of rows associated with each race
    data = []  # to compile info on all races
    last_row_data = None  # race whose data rows are still being collected

    # NOTE(review): `i` enumerates rows[1:-2], so the blank row itself is
    # rows[i+1]; the offsets below index `rows` directly and are kept exactly
    # as originally written -- confirm against the file layout.
    for i, row in enumerate(rows[1:-2]):
        if row != ['']:
            continue

        # a blank row closes the previous race (if one has been opened)
        if i > 6 and last_row_data is not None:
            last_row_data['data_ends'] = i - 1
            last_row_data['data'] = rows[last_row_data['data_starts']:i]
            data.append(last_row_data)

        # ... and opens the next one, unless we are too close to the end
        if i < (len(rows) - 10):
            race = ''.join(rows[i + 2])
            last_row_data = {
                'race': race,
                'race_cat': parse_cat(race),
                'candidates': rows[i + 3],
                'data_starts': i + 4,
            }

    races = data[1:]
    if verbose:
        print(len(races), 'total races found.')
    return races

#### define desired column suffixes:

In [None]:
# desired column suffixes, one per vote type plus the total
stats_should_be = [
    'Election Day Votes',
    'Advanced Voting Votes',
    'Absentee by Mail Votes',
    'Provisional Votes',
    'TOTAL VOTES',
]

Prefix each vote-type count column with the name of the candidate it belongs to.

In [None]:
def rename_cols(cols, candidates):
    """Prefix each vote-count column with the candidate it belongs to.

    Columns are expected to come in per-candidate groups, each group ending
    with a column whose name contains 'Total'. That closing column is
    relabelled 'Choice Total' and the next group is attributed to the next
    candidate. The last column named exactly 'Total' (the grand total) and any
    column containing 'County', 'Precinct' or 'Voters', or shorter than three
    characters, are passed through unchanged.

    Args:
        cols: Column names, in order. Not modified.
        candidates: Candidate names, in the order their column groups appear.
            Not modified.

    Returns:
        A new list of column names, the same length as `cols`.

    Raises:
        IndexError: If a column needs a candidate prefix but every candidate
            has already been used up.
    """
    # position of the grand-total column: the last one named exactly 'Total'
    final_total_col = None
    for i, col in enumerate(cols):
        if col == 'Total':
            final_total_col = i

    # relabel every other total column; built as a new list so the caller's
    # `cols` is left untouched
    labels = [
        'Choice Total' if 'Total' in col and i != final_total_col else col
        for i, col in enumerate(cols)
    ]

    new_cols = []
    candidate_idx = 0  # candidate owning the column group being walked

    for i, col in enumerate(labels):
        passthrough = (
            i == final_total_col
            or 'County' in col
            or 'Precinct' in col
            or 'Voters' in col
            or len(col) < 3
        )
        if passthrough:
            new_cols.append(col)
            continue

        new_cols.append(candidates[candidate_idx] + '_' + col)
        if 'Total' in col:
            # a per-candidate total closes the group; move to the next candidate
            candidate_idx += 1

    return new_cols

#### load data for one race, parse with above functions.

In [None]:
def parse_data(data, race_idx, county_name, target_dir='clean_by_county'):
    """Clean the rows of one race and write them to a per-county CSV file.

    Args:
        data: List of race records (dicts with at least the keys 'data',
            'candidates' and 'race'), e.g. as returned by `read_data`.
        race_idx: Position in `data` of the race to process.
        county_name: Used as the output sub-directory name and, with
            underscores replaced by spaces, as the value of the 'County' column.
        target_dir: Directory under which `<county_name>/<race>.csv` is written;
            created (including parents) when missing.

    Raises:
        KeyError: If the header row has neither a 'County' nor a 'Precinct'
            column.
    """
    race = data[race_idx]

    # the first row of the batch is the header; the remaining rows are data
    race_data = pd.DataFrame(race['data'])
    race_data.columns = race_data.loc[0]
    race_data = race_data.drop(0)

    # most counties used "county" instead of "precinct" (all are actually precincts)
    race_data = race_data.rename(columns={'County': 'Precinct'})
    race_data = race_data.set_index('Precinct')
    race_data['County'] = county_name.replace('_', ' ')

    # add candidate names to specific columns (single-character cells are
    # not treated as candidate names)
    candidates = [c for c in race['candidates'] if len(c) > 1]
    race_data.columns = rename_cols(list(race_data.columns), candidates)

    # create directories & file details; only the part of the race name
    # before the first '/' is used for the file name
    race_name = race['race'].split('/')[0]
    out_dir = os.path.join(target_dir, county_name)
    os.makedirs(out_dir, exist_ok=True)

    # save csv: drop 'Unnamed' columns, restore 'Precinct' as a column and
    # move 'County' to the front
    race_data = race_data[[c for c in race_data.columns if 'Unnamed' not in c]]
    race_data = race_data.reset_index()
    race_data = race_data[['County'] + [c for c in race_data.columns if c != 'County']]
    # NOTE(review): this second reset adds a running 'index' column to the
    # CSV; kept so the output layout is unchanged -- confirm it is wanted.
    race_data = race_data.reset_index()
    race_data.to_csv(os.path.join(out_dir, f'{race_name}.csv'), index=False)

---

---

---


## parse & save data

In [None]:
# one results file per county; every race found in it becomes its own csv
for file in tqdm(sorted(files)):
    county_name = file.split('.')[0]  # file name up to the first dot
    data = read_data(file)

    for race_idx in range(len(data)):
        parse_data(data.copy(), race_idx, county_name)