In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

# Lift the column cap so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
# Path prefixes for the processed inputs. The reader functions append
# "<year>.geojson" (shapes) or "<year>.csv" (census, registration, results)
# to these prefixes to build the file path for a given year.
shapefile_root = "../data/processed/precincts/precincts_"
census_root = "../data/processed/census/precinct_vals_"
reg_root = "../data/processed/registration/reg_voter_"
results_root = "../data/processed/results/results_"

In [3]:
def unpack_and_lower(val):
    """Lower-case a column label.

    Parameters
    ----------
    val : str or iterable of str
        A single label, or a multi-part label (e.g. a tuple of strings).

    Returns
    -------
    str or tuple of str
        ``val.lower()`` when ``val`` has a ``lower`` method; otherwise a tuple
        holding the lower-cased form of each element of ``val``.

    Raises
    ------
    TypeError
        If ``val`` has no ``lower`` method and is not iterable.
    AttributeError
        If an element of an iterable ``val`` has no ``lower`` method.
    """
    try:
        return val.lower()
    except AttributeError:
        # Materialise as a tuple rather than returning a lazy generator:
        # a generator is single-use and compares by identity, so it cannot
        # serve as a column label, whereas a tuple is hashable and reusable.
        return tuple(x.lower() for x in val)

def read_shapes(year):
    """Load one year's precinct shapes, indexed by ``vtdst5``.

    Reads ``shapefile_root + year + ".geojson"`` with ``gpd.read_file``,
    lower-cases every column name, and moves the ``vtdst5`` column into the
    index.

    Parameters
    ----------
    year : str
        Year suffix of the file name, e.g. ``"2018"``. It is concatenated onto
        a string path, so it has to be a ``str`` already.

    Returns
    -------
    The frame produced by ``gpd.read_file``, with lower-cased column names and
    ``vtdst5`` as its index.
    """
    path = shapefile_root + year + ".geojson"
    shapes_raw = gpd.read_file(path)
    shapes_lower = shapes_raw.rename(columns=str.lower)
    return shapes_lower.set_index('vtdst5')

def read_census(year):
    """Load one year's per-precinct census values, indexed by ``vtdst5``.

    Reads ``census_root + year + ".csv"``, drops the ``"Unnamed: 0"`` column,
    lower-cases the column names, and indexes the rows by ``vtdst5``: the last
    five characters of each ``precinct`` value's string form.

    Parameters
    ----------
    year : str
        Year suffix of the file name, e.g. ``"2018"``.

    Returns
    -------
    pandas.DataFrame
        Census columns (``precinct`` is kept) indexed by ``vtdst5``. Unlike the
        registration and results readers, the rows are not sorted.
    """
    census_raw = pd.read_csv(census_root + year + ".csv")
    census_lower = (census_raw.drop(columns=["Unnamed: 0"])
                              .rename(columns=unpack_and_lower))
    # Same key shape as the shapes frame's vtdst5 column, so frames can be aligned.
    keys = [str(precinct_id)[-5:] for precinct_id in census_lower.precinct]
    return census_lower.assign(vtdst5=keys).set_index('vtdst5')

def read_registration(year):
    """Load one year's voter-registration counts, indexed by ``vtdst5``.

    Reads ``reg_root + year + ".csv"``, lower-cases the column names, indexes
    the rows by ``vtdst5`` (the last five characters of each ``precinct``
    value's string form), and sorts by that index.

    Parameters
    ----------
    year : str
        Year suffix of the file name, e.g. ``"2018"``.

    Returns
    -------
    pandas.DataFrame
        Registration columns (``precinct`` is kept), sorted by ``vtdst5``.
    """
    reg_raw = pd.read_csv(reg_root + year + ".csv")
    reg_lower = reg_raw.rename(columns=unpack_and_lower)
    keys = [str(precinct_id)[-5:] for precinct_id in reg_lower.precinct]
    reg_indexed = reg_lower.assign(vtdst5=keys).set_index('vtdst5')
    return reg_indexed.sort_index()

def read_results(year):
    """Load one year's election results, indexed by ``vtdst5``.

    Reads ``results_root + year + ".csv"``, lower-cases the column names,
    indexes the rows by ``vtdst5`` (the last five characters of each
    ``precinct`` value's string form), and sorts by that index.

    Non-numeric ``precinct`` values are sliced the same way, so a value such
    as ``'provisional'`` ends up with the key ``'ional'``.

    Parameters
    ----------
    year : str
        Year suffix of the file name, e.g. ``"2018"``.

    Returns
    -------
    pandas.DataFrame
        Result columns (``precinct`` is kept), sorted by ``vtdst5``.
    """
    results_raw = pd.read_csv(results_root + year + ".csv")
    results_lower = results_raw.rename(columns=unpack_and_lower)
    keys = [str(precinct_id)[-5:] for precinct_id in results_lower.precinct]
    results_indexed = results_lower.assign(vtdst5=keys).set_index('vtdst5')
    return results_indexed.sort_index()

def read_precinct_dfs(year):
    """Load all four per-precinct frames for one year.

    Parameters
    ----------
    year : str
        Year suffix passed unchanged to every reader, e.g. ``"2018"``.

    Returns
    -------
    list
        ``[shapes, census, registration, results]`` in that order, each
        indexed by ``vtdst5``.
    """
    # Order matters: callers unpack the list positionally.
    readers = (read_shapes, read_census, read_registration, read_results)
    return [reader(year) for reader in readers]

In [59]:
# Load the four 2018 frames, then bind each one to its own name for the
# previews below. Unpacking does not copy: these names refer to the same
# objects held in data_2018, so later in-place edits are visible through both.
data_2018 = read_precinct_dfs("2018")
shapes, census, reg, results = data_2018

In [60]:
# Load the same four frames for 2016 (shapes, census, registration, results).
data_2016 = read_precinct_dfs("2016")

In [61]:
# Preview the first three rows of the 2018 precinct shapes.
shapes.head(3)

Unnamed: 0_level_0,countyfp,countyname,countynum,vtdst3,vtdstlong,cd115fp,sldust,sldlst,geometry
vtdst5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1001,1,Adams,1,1,7213201001,7,21,32,"POLYGON ((-105.05325 39.79106, -105.05322 39.7..."
1002,1,Adams,1,2,7213201002,7,21,32,"POLYGON ((-105.02521 39.80072, -105.02452 39.8..."
1003,1,Adams,1,3,7213201003,7,21,32,"POLYGON ((-105.02500 39.82249, -105.02499 39.8..."


In [62]:
# Preview the first three rows of the 2018 census values.
census.head(3)

Unnamed: 0_level_0,population,male,female,male under 18,male 18-29,male 30-49,male 50-64,male 65 plus,female under 18,female 18-29,female 30-49,female 50-64,female 65 plus,white,aa,aian,asian,nhopi,other,two+,male no hs,male some hs,male hs diploma,male some college,male bachelors,male masters,male doctorate,female no hs,female some hs,female hs diploma,female some college,female bachelors,female masters,female doctorate,family,married,other family,nonfamily,houses,occupied,vacant,owner-occupied,renter-occupied,less than 15k,15k-30k,30k-60k,60k-100k,100k-150k,more than 150k,less than 150k,150k-200k,200k-250k,250k-300k,300k-400k,more than 400k,precinct
vtdst5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1
1001,3783.101702,1763.770946,2019.330756,362.722729,529.760774,402.477198,285.002832,183.807413,317.100822,833.84542,413.392599,317.889508,137.102406,3408.449746,130.130701,0.0,69.582348,19.853181,91.484348,63.601378,308.020646,74.449429,220.535798,450.107514,221.732431,33.763878,7.956667,147.95623,155.126136,167.872913,333.883661,187.772276,112.314794,17.9025,672.795076,382.492803,290.302273,635.681061,1384.410606,1308.476136,75.93447,796.556818,511.919318,159.370833,261.567803,156.707197,217.274242,328.757955,116.599621,313.232576,201.636742,75.917045,114.789773,50.385227,40.595455,7213201001
1002,3469.141562,1816.98166,1652.159902,461.306746,471.415131,346.066083,272.556548,265.637152,460.215032,440.267095,371.101721,180.209341,200.366712,2841.187864,117.314856,16.097869,111.213329,0.0,183.021976,200.305668,143.047421,149.015422,353.025413,327.009224,103.298382,49.106911,6.083298,66.341404,97.000607,302.053015,298.168879,211.974654,54.251254,0.03125,729.791497,441.873919,287.917578,305.502763,1095.22387,1035.29426,59.929611,660.739032,374.555227,124.082549,63.284368,157.09675,88.260613,398.119938,139.247835,81.44531,239.136384,63.892401,172.860797,92.137442,11.266697,7213201002
1003,1106.924166,657.008278,449.915888,151.98114,163.670523,158.603201,132.363207,50.390206,79.095383,79.97385,105.823681,110.231777,74.791197,916.834312,30.330438,14.165064,10.040894,9.885661,16.199457,109.46834,131.898606,48.919887,96.33305,72.515534,67.475286,13.305754,11.956249,91.168628,26.825678,84.391395,85.808096,33.760561,23.247653,0.104167,241.608311,170.16646,71.441851,112.183386,354.287272,353.791697,0.495575,180.633348,173.158349,26.733247,62.721701,55.366461,42.350356,114.098429,31.02633,34.505877,59.283594,60.100204,16.83084,5.833236,4.079597,7213201003


In [63]:
# Preview the first three rows of the 2018 voter-registration counts.
reg.head(3)

Unnamed: 0_level_0,precinct,female rv_active,male rv_active,unknown rv_active,female rv_inactive,male rv_inactive,unknown rv_inactive,female rv_18-29,male rv_18-29,unknown rv_18-29,female rv_30-49,male rv_30-49,unknown rv_30-49,female rv_50-64,male rv_50-64,unknown rv_50-64,female rv_over 65,male rv_over 65,unknown rv_over 65,female rv_mean_reg_time,male rv_mean_reg_time,unknown rv_mean_reg_time,female rv_acn,male rv_acn,unknown rv_acn,female rv_apv,male rv_apv,unknown rv_apv,female rv_dem,male rv_dem,unknown rv_dem,female rv_grn,male rv_grn,unknown rv_grn,female rv_lbr,male rv_lbr,unknown rv_lbr,female rv_rep,male rv_rep,unknown rv_rep,female rv_uaf,male rv_uaf,unknown rv_uaf,female rv_uni,male rv_uni,unknown rv_uni
vtdst5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
1001,7213201001,698,706,55,133,173,13,234,248,50,259,303,12,159,162,4,140,122,2,3120 days 01:37:02.382671488,2723 days 12:07:22.320819104,440 days 19:03:31.764705880,4,4,0,0,0,0,414,312,26,1,3,0,13,18,1,95,106,3,304,436,38,0,0,0
1002,7213201002,785,753,38,91,115,7,189,183,28,322,312,8,172,184,3,145,136,5,3817 days 01:12:19.726027392,3421 days 17:23:30.138248832,622 days 06:56:00.000000000,3,7,0,0,0,0,469,349,15,4,4,1,2,13,0,101,136,3,296,358,26,1,1,0
1003,7213201003,791,766,32,48,51,1,129,107,8,438,464,16,135,137,5,74,75,3,2335 days 17:32:06.579261024,2076 days 01:22:50.379436960,547 days 12:21:49.090909088,2,2,0,0,0,0,404,284,12,2,0,0,10,12,0,112,139,2,309,380,19,0,0,0


In [64]:
# Preview the first three rows of the 2018 election results.
results.head(3)

Unnamed: 0_level_0,precinct,state_rep dem,state_rep lib,state_rep rep,state_senate dem,state_senate lib,state_senate rep,us_rep dem,us_rep lib,us_rep rep,us_rep grn,state_rep unf,us_rep unf,state_senate unf
vtdst5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1001,7213201001,726,0,0,0,0,0,675,39,158,0,0,0,0
1002,7213201002,860,0,0,0,0,0,781,53,230,0,0,0,0
1003,7213201003,446,0,0,0,0,0,391,25,176,0,0,0,0


In [65]:
def compare_completeness_by_precinct(data, df_names=None):
    """Report which index labels are missing from at least one dataframe.

    Prints a one-line summary of how many labels appear in N, N-1, ..., 1 of
    the frames (N being the number of frames), followed by one line per
    incomplete label listing the names of the frames that do contain it.

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        Frames whose indexes are compared.
    df_names : sequence of str, optional
        Display name for each frame, in the same order as ``data``. Defaults
        to ``['shapes', 'census', 'reg', 'results']`` (truncated, or extended
        with ``'df4'``, ``'df5'``, ..., to match the number of frames).

    Returns
    -------
    list
        Sorted labels that are absent from at least one frame.

    Raises
    ------
    ValueError
        If ``df_names`` is given but its length differs from ``data``.
    """
    data = list(data)
    n_frames = len(data)

    if df_names is None:
        default_names = ['shapes', 'census', 'reg', 'results']
        df_names = default_names[:n_frames] + [
            f"df{i}" for i in range(len(default_names), n_frames)
        ]
    else:
        df_names = list(df_names)
        if len(df_names) != n_frames:
            raise ValueError(
                f"expected {n_frames} names in df_names, got {len(df_names)}"
            )

    # Sets make each membership check a hash lookup instead of a full scan
    # of the index array.
    label_sets = [set(df.index.values) for df in data]
    all_precs = set().union(*label_sets)

    # Number of frames each label appears in.
    overlaps = {
        label: sum(label in labels for labels in label_sets)
        for label in all_precs
    }

    # tallies[k] == number of labels present in exactly k frames.
    tallies = [0] * (n_frames + 1)
    for count in overlaps.values():
        tallies[count] += 1

    summary = [f"all: {len(all_precs)}"]
    summary += [f"{k}: {tallies[k]}" for k in range(n_frames, 0, -1)]
    print(", ".join(summary))

    # A label is complete only when every frame has it; comparing against
    # the actual frame count keeps this correct for any number of frames.
    pidxs = sorted(label for label, count in overlaps.items()
                   if count != n_frames)

    for idx in pidxs:
        matches = [name for name, labels in zip(df_names, label_sets)
                   if idx in labels]
        print(idx, matches)

    return pidxs

## 2018

In [66]:
# Labels of the 2018 precincts that are missing from at least one frame.
pidxs = compare_completeness_by_precinct(data_2018)

all: 3136, 4: 3123, 3: 12, 2: 1, 1: 0
01179 ['shapes', 'census', 'results']
01220 ['shapes', 'census', 'results']
01223 ['shapes', 'census', 'results']
03230 ['shapes', 'census', 'results']
03256 ['shapes', 'census', 'results']
03348 ['shapes', 'census', 'results']
03349 ['shapes', 'census', 'results']
03359 ['shapes', 'census', 'results']
03402 ['shapes', 'census', 'results']
03424 ['shapes', 'census', 'results']
03458 ['shapes', 'census', 'results']
03561 ['shapes', 'census', 'results']
20003 ['reg', 'results']


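There are 13 precincts that do not have data in all four dataframes. Which dataframes does each of those 13 appear in? The listing above answers this: twelve are missing only from `reg`, and 20003 appears only in `reg` and `results`.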

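We'll just drop 20003 for now. (Note: `pidxs` contains all 13 precincts listed above, so the `drop_rows(data_2018, pidxs)` call below actually removes every one of them from the dataframes that contain them, not just 20003.)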

In [67]:
def drop_rows(data, idxs):
    """Remove the given index labels from every dataframe, in place.

    Labels that a frame does not contain are ignored for that frame.

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        Frames to modify. Each one is mutated in place.
    idxs : iterable
        Index labels to remove.

    Returns
    -------
    None
    """
    # Materialise once so a one-shot iterable can be reused for every frame.
    labels = list(idxs)
    for df in data:
        # Select the labels this frame actually has, then drop them in a
        # single call instead of one scan + one drop per label.
        present = df.index[df.index.isin(labels)]
        if len(present):
            df.drop(index=present, inplace=True)

In [68]:
# Drops every label in pidxs (all incomplete 2018 precincts) from the 2018
# frames, in place. pidxs must still hold the 2018 result when this runs.
drop_rows(data_2018, pidxs)

## 2016

In [69]:
# Labels of the 2016 precincts that are missing from at least one frame.
# This rebinds pidxs, replacing the 2018 list.
pidxs = compare_completeness_by_precinct(data_2016)

all: 3011, 4: 2994, 3: 15, 2: 1, 1: 1
01177 ['shapes', 'census', 'results']
01179 ['shapes', 'census', 'results']
01220 ['shapes', 'census', 'results']
01222 ['shapes', 'census', 'results']
01223 ['shapes', 'census', 'results']
03230 ['shapes', 'census', 'results']
03256 ['shapes', 'census', 'results']
03348 ['shapes', 'census', 'results']
03349 ['shapes', 'census', 'results']
03359 ['shapes', 'census', 'results']
03402 ['shapes', 'census', 'results']
03415 ['shapes', 'census', 'results']
03424 ['shapes', 'census', 'results']
03458 ['shapes', 'census', 'results']
03561 ['shapes', 'census', 'results']
36015 ['reg', 'results']
ional ['results']


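We'll drop 36015 and `ional` (the last five characters of 'provisional', which is what the `vtdst5` slicing leaves of that value). Note that `pidxs` contains all 17 entries listed above, so the `drop_rows(data_2016, pidxs)` call below removes every one of them, not only these two.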

In [70]:
# Drops every label in pidxs (all incomplete 2016 entries) from the 2016
# frames, in place. pidxs must hold the 2016 result when this runs.
drop_rows(data_2016, pidxs)

Now we merge them all together! This is very exciting!

In [71]:
# Join each year's four frames column-wise on their shared vtdst5 index.
# census, reg and results each carry a `precinct` column, so the drop removes
# all of those same-named columns from the combined frame.
precinct_data_2018 = pd.concat(data_2018, axis=1).drop(columns=['precinct'])
precinct_data_2016 = pd.concat(data_2016, axis=1).drop(columns=['precinct'])

In [72]:
# Write each combined frame to GeoJSON under ../data/processed/tidy/.
# NOTE(review): .to_file is not a plain pandas DataFrame method, so this
# presumably relies on pd.concat returning a GeoDataFrame because the shapes
# frame is in the list — confirm.
precinct_data_2018.to_file("../data/processed/tidy/precinct_data_2018.geojson", driver='GeoJSON')
precinct_data_2016.to_file("../data/processed/tidy/precinct_data_2016.geojson", driver='GeoJSON')