# Measuring Voter Turnout among NYCHA Residents
#### Last edited Tuesday, July 19, 2022 by Kyle Slugg

##### Mission:

Following the passage and enactment on June 16, 2022 of The New York City Housing Authority (NYCHA) Public Housing Preservation Trust Act (NY State Bill A7805D/S9409A, referred to here as the "Trust Act"), NYCHA is working quickly to bring the Trust -- an entirely new legal entity -- into being. In order to ensure that current public housing residents are comfortable with the tenancy protections they will recieve after their developments of residents are transferred to the Trust (protections which are identical to those enjoyed under current NYCHA leases), residents must authorize such a transfer through a voting process.

NYCHA is directed to finalize and distribute the rules by which such a vote will be held and processed within 120 days of passage. At present, the largest outstanding issue is that of representativeness: in order for a vote to be valid, how many residents at a given development must participate?




Looking at turnout, cut by election district and by development (development first). 

Keep addresses static, and compute turnout at each address

Last piece is bringing in total number of NYCHA resident adults at time of voting (use this as denom)




In [1]:
import numpy as np
import pandas as pd
import zipfile_deflate64 as zipfile
import csv
from io import TextIOWrapper, BytesIO
import regex as re
from tqdm import tqdm
import glob
import geopandas as gpd
import fuzzymatcher
import recordlinkage
import seaborn as sns
import gc
from numba import njit

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',30)
tqdm.pandas()

### Filter and load voter records in potential NYCHA districts

In [None]:
voters.extract('FOIL_VOTER_LIST_LAYOUT.pdf')

In [2]:
voters = zipfile.ZipFile('AllNYSVoters_20220711.zip')
test_df = pd.DataFrame()

def read_in_chunks(file_object, chunk_size=1024):
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    while True:
        data = file_object.readlines(chunk_size)
        if not data:
            break
        yield data
        

def process_csv_chunk(chunk, column_names, filter_col = None, filter_list = None):
    chunk_data = [row.decode('latin') for row in chunk]
    chunk_reader = csv.reader(chunk_data)
    
    interim_df = pd.DataFrame(data=[line for line in chunk_reader], columns=column_names)
    
    #Create NYC Electoral District number by appending Assembly Dist. and Electoral Dist. fields
    interim_df['FULL_ED'] = interim_df['AD']+interim_df['ED'].apply(lambda x: str(x).zfill(3))
    
    if filter_col is not None:
        interim_df = interim_df[interim_df[filter_col].apply(lambda x: str(x).replace('"','') in filter_list)]
    
    return interim_df

In [None]:
##### DELETE THIS BEFORE PUBLISHING ####

'''
#Columns entered manually from data dictionary
cols = ['LASTNAME','FIRSTNAME','MIDDLENAME','NAMESUFFIX',
        'RADDNUMBER','RHALFCODE','RPREDIRECTION','RSTREETNAME',
        'RPOSTDIRECTION', 'RAPARTMENTTYPE', 'RAPARTMENT', 'RADDRNONSTD','RCITY','RZIP5','RZIP4',
        'MAILADD1','MAILADD2','MAILADD3','MAILADD4',
        'DOB','GENDER','ENROLLMENT','OTHERPARTY',
        'COUNTYCODE','ED','LD','TOWNCITY','WARD','CD','SD','AD',
        'LASTVOTEDDATE','PREVYEARVOTED','PREVCOUNTY','PREVADDRESS','PREVNAME',
        'COUNTYVRNUMBER','REGDATE','VRSOURCE','IDREQUIRED','IDMET',
        'STATUS','REASONCODE','INACT_DATE','PURGE_DATE','SBOEID','VoterHistory']

with voters.open('AllNYSVoters_20220711.txt','r') as file_handle:
    #cols_line = file_handle.readline().decode('utf-8')
    #cols = [re.sub(r'[\s\'\"]+','',col).strip() for col in cols_line.split(',')]
    #print(cols)
    base_df = pd.DataFrame(columns=cols)
    n=0
    
    for chunk in tqdm(read_in_chunks(file_handle, chunk_size=10000000)):
        n+=1
        if n%20 == 0:
            if base_df.shape[0] > 0:
                base_df.reset_index().drop(columns='index').to_json(f'clean_segments/ivan_iter_{str(n).zfill(3)}.json')
                base_df = pd.DataFrame(columns=cols)
                
        base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='LASTNAME', filter_list = ['VALLADERES','Valladeres', 'VALLADARES']))
        
  '''      

In [3]:
#Columns entered manually from data dictionary
cols = ['LASTNAME','FIRSTNAME','MIDDLENAME','NAMESUFFIX',
        'RADDNUMBER','RHALFCODE','RPREDIRECTION','RSTREETNAME',
        'RPOSTDIRECTION', 'RAPARTMENTTYPE', 'RAPARTMENT', 'RADDRNONSTD','RCITY','RZIP5','RZIP4',
        'MAILADD1','MAILADD2','MAILADD3','MAILADD4',
        'DOB','GENDER','ENROLLMENT','OTHERPARTY',
        'COUNTYCODE','ED','LD','TOWNCITY','WARD','CD','SD','AD',
        'LASTVOTEDDATE','PREVYEARVOTED','PREVCOUNTY','PREVADDRESS','PREVNAME',
        'COUNTYVRNUMBER','REGDATE','VRSOURCE','IDREQUIRED','IDMET',
        'STATUS','REASONCODE','INACT_DATE','PURGE_DATE','SBOEID','VoterHistory']

#List of NYCHA electoral districts obtained using QGIS, by intersection of NYCHA developments layer
    #with ED layer. Converted to string for use in filtering.
nycha_distlist_2018 = [23042, 27035, 27041, 27061, 27084, 27085, 29017, 29020, 29022, 29025, 29026, 29027, 29029, 29031, 29034, 29040, 29050, 29051, 29053, 29056, 29057, 29074, 29075, 29080, 30057, 30059, 31013, 31014, 31015, 31016, 31034, 31035, 31050, 31055, 31063, 31064, 31078, 32001, 32015, 32016, 32021, 32026, 32028, 32030, 32032, 32033, 32034, 32035, 32037, 32038, 32039, 32040, 32042, 32043, 32044, 32050, 32051, 32052, 32054, 32055, 32060, 32061, 32072, 32074, 32077, 32079, 32083, 33029, 33030, 33031, 33033, 33035, 33036, 33037, 33038, 33039, 33040, 33041, 33042, 33043, 33044, 33045, 33046, 33047, 33048, 33053, 33055, 33058, 33059, 33060, 33061, 33062, 33063, 33070, 33073, 33077, 33078, 33079, 37002, 37003, 37004, 37006, 37007, 37008, 37009, 37010, 37012, 37013, 37014, 37015, 37016, 37017, 40050, 40051, 41066, 41067, 41068, 41069, 43068, 43069, 43081, 45045, 45046, 46005, 46008, 46010, 46011, 46013, 46014, 46015, 46033, 46080, 47053, 47054, 50016, 50017, 50057, 50058, 50059, 50060, 50062, 50063, 50089, 50091, 50103, 51054, 51069, 51070, 51071, 51072, 52051, 52052, 52053, 52054, 52056, 52057, 53021, 53022, 53027, 53031, 53032, 53040, 53041, 53042, 53043, 53051, 53081, 54007, 54008, 54010, 54011, 54076, 55006, 55016, 55021, 55022, 55032, 55033, 55035, 55048, 55049, 55055, 55057, 55058, 55059, 55060, 55062, 55065, 55066, 55067, 55068, 55069, 55070, 55072, 55079, 55080, 55081, 55083, 55090, 55092, 55093, 55094, 55095, 55100, 55101, 55103, 55107, 55110, 56001, 56002, 56004, 56006, 56007, 56008, 56009, 56010, 56015, 56017, 56024, 56025, 56026, 56027, 56030, 56059, 56063, 56064, 56069, 56070, 56071, 56083, 56084, 57002, 57008, 57009, 57011, 57012, 57032, 57033, 57046, 57097, 57098, 57099, 58002, 58011, 58073, 59002, 59003, 59004, 59041, 59063, 60001, 60009, 60012, 60014, 60015, 60031, 60032, 60033, 60039, 60040, 60058, 60059, 60060, 60063, 60064, 60065, 60066, 60067, 60068, 60069, 60071, 60072, 60073, 60074, 60080, 60081, 60087, 61007, 61010, 61053, 61070, 63041, 63042, 63071, 63075, 65035, 65036, 65038, 65040, 65043, 65044, 65045, 65050, 65056, 65057, 65061, 65073, 65074, 65081, 65083, 67007, 67025, 67026, 67027, 67086, 68001, 68002, 68016, 68025, 68026, 68028, 68029, 68030, 68032, 68036, 68044, 68045, 68047, 68048, 68049, 68054, 68056, 68058, 68059, 68060, 68061, 68062, 68064, 68065, 68066, 68067, 68068, 68069, 68070, 68076, 68082, 68084, 68086, 68090, 68091, 68092, 68093, 68097, 68110, 68113, 68114, 68115, 69003, 69012, 69016, 69017, 69018, 69019, 69028, 69052, 69056, 69057, 69058, 69068, 70003, 70005, 70017, 70018, 70025, 70030, 70031, 70032, 70058, 70060, 70061, 70062, 70073, 70076, 70077, 70078, 70097, 70099, 70102, 71016, 71017, 71021, 71022, 71042, 71043, 71058, 71068, 71108, 71109, 72011, 72050, 72052, 72071, 72072, 72086, 72093, 74001, 74003, 74004, 74005, 74006, 74013, 74014, 74018, 74028, 74029, 74038, 74069, 74077, 75012, 75013, 75014, 75111, 76017, 77002, 77003, 77022, 77024, 77031, 77051, 77053, 77054, 77055, 77062, 77063, 77077, 77081, 79003, 79015, 79016, 79018, 79025, 79026, 79027, 79030, 79031, 79032, 79033, 79035, 79036, 79038, 79039, 79041, 79046, 79048, 79049, 79050, 79051, 79052, 79053, 79058, 79060, 79061, 79063, 79064, 79065, 79066, 79070, 79071, 79079, 79082, 79085, 79086, 80014, 80040, 80041, 80042, 80043, 80045, 80074, 81038, 82012, 82013, 82014, 82015, 82037, 83007, 83014, 83022, 83023, 83028, 83058, 83059, 83060, 83061, 83062, 83063, 83064, 83088, 83089, 84019, 84023, 84024, 84025, 84026, 84027, 84028, 84029, 84030, 84031, 84032, 84033, 84037, 84039, 84042, 84044, 84045, 84060, 84061, 84062, 84063, 84066, 84072, 85005, 85010, 85011, 85012, 85016, 85017, 85022, 85023, 85026, 85027, 85029, 85030, 85031, 85034, 85036, 85049, 85053, 85056, 85060, 85068, 85081, 86003, 86004, 86006, 86020, 86072, 87009, 87066, 87070, 87071, 87082, 87088, 87089, 87102, 69053, 70038, 70037, 70039, 65033, 65034, 65020, 60018, 60019, 27040, 31086, 55071, 50011, 53028, 53029, 74026, 74027, 69100, 69101, 69108, 69089, 69102, 57003, 27039, 31057, 31032, 31039, 79045, 31085, 69114, 31018, 31017, 77023, 29030, 67019, 30058, 69015, 74022, 70007, 70010, 70011, 68039, 75041, 69111, 65019, 64040, 60070, 68017, 72051, 64049, 74002, 74011, 71048, 68019, 68021, 71038, 46012, 46007, 57001, 55064, 55105, 83067, 79037, 85052, 85050, 85051, 46006, 31019, 56003, 85069, 85070, 57039, 81009, 81040, 31058, 65021, 85015, 85038, 57007, 57006, 57013, 51073, 51074, 85040, 65018, 59049, 53035, 74025, 68027, 69029, 69030, 74012, 69054, 69060, 70100, 65077, 71087, 75040, 75045, 71088, 71089, 74016, 74017, 71090, 71091, 72002, 87009, 55093, 79041, 70011, 85060, 69003, 54008, 68054, 74077, 65061, 52056, 69052, 84044, 84045, 56071, 56084, 56069, 56070, 67025, 67027, 67026, 56015, 56025, 56026, 57099, 56024, 56027, 37002, 37003, 37004, 57046, 71038, 86020, 32037, 32038, 32044, 74001, 74003, 74004, 74005, 74002, 59041, 59063, 59049, 31085, 31058, 54007, 60009, 63075, 50059, 50060, 40051, 53027, 53028, 53029, 53040, 53081, 80045, 83014, 83088, 83067, 60063, 60064, 60065, 85034, 74016, 58073, 60069, 60071, 60072, 60073, 55021, 55022, 85056, 85069, 85070, 85068, 55035, 55068, 55069, 55070, 55110, 79016, 53041, 53042, 53043, 53051, 79031, 79033, 79082, 79037, 74038, 46014, 46080, 31039, 68026, 68028, 68030, 68027, 61010, 87070, 87071, 87088, 87089, 87102, 75041, 75040, 75045, 79030, 79085, 77051, 77053, 77054, 77062, 77055, 85011, 85012, 68029, 68036, 68115, 77063, 46010, 46015, 46007, 46013, 46012, 29017, 50016, 50017, 53035, 68076, 55081, 55103, 56083, 60031, 60018, 60019, 79025, 69028, 69058, 69056, 69057, 69068, 69053, 69114, 69054, 70060, 70061, 70062, 72050, 72052, 72051, 79052, 84039, 85049, 85052, 85050, 85051, 79018, 86072, 60015, 60032, 68047, 68048, 83022, 83023, 83058, 83059, 83060, 83061, 83062, 83063, 83064, 57002, 57003, 57001, 43069, 43081, 29022, 29034, 29040, 29057, 31014, 31016, 31050, 32021, 32030, 32034, 32043, 32054, 32055, 32074, 33030, 33033, 33037, 33040, 33041, 33058, 33060, 33063, 33073, 33078, 29025, 29029, 29074, 31013, 32033, 33044, 33055, 33077, 87082, 31086, 31017, 29053, 31015, 31034, 32035, 33042, 33046, 33047, 33059, 83007, 29026, 29050, 32032, 32051, 33031, 33039, 33045, 33070, 85005, 29030, 32060, 50089, 50091, 52057, 58002, 60012, 60014, 60033, 57039, 29020, 29051, 29075, 29080, 32028, 32040, 32042, 32050, 32052, 32079, 33029, 33035, 33036, 33038, 33079, 31018, 60070, 31019, 32072, 32061, 29031, 31035, 31078, 33061, 29027, 29056, 23042, 31064, 32039, 32083, 33043, 33048, 33053, 33062, 55048, 55049, 74018, 79048, 79049, 79046, 81038, 71048, 75012, 75013, 75014, 75111, 55079, 55080, 87066, 55055, 59002, 59003, 59004, 65050, 74006, 52051, 52052, 70018, 70097, 69100, 69101, 69108, 69089, 69102, 46005, 46006, 83028, 83089, 46011, 31032, 67007, 67019, 71021, 71022, 71068, 86006, 86004, 65074, 77003, 77077, 77002, 79015, 68001, 68002, 55057, 55058, 55092, 55107, 55067, 85038, 85040, 50062, 50011, 50103, 57008, 57009, 57011, 57098, 57007, 79063, 79064, 79065, 68067, 68068, 68069, 68070, 68064, 68065, 68066, 68058, 68059, 68060, 70005, 70007, 55032, 56063, 56064, 65035, 65040, 65036, 57032, 57033, 57097, 40050, 68032, 68114, 55095, 58011, 68025, 70038, 70037, 70039, 60067, 60068, 60080, 60066, 60081, 60001, 60039, 60087, 55059, 55060, 65073, 74017, 74025, 74022, 70073, 70076, 70077, 70099, 70078, 72071, 72072, 72086, 81009, 81040, 56001, 56002, 56004, 56003, 63041, 63042, 45045, 45046, 47053, 47054, 71043, 71108, 79050, 79051, 79066, 84037, 65077, 68045, 82037, 84062, 84063, 84060, 84061, 84019, 84023, 84024, 84025, 85022, 85023, 85026, 85027, 84072, 84066, 79035, 79036, 79032, 79086, 68082, 79039, 79053, 79070, 79071, 79079, 79045, 84028, 84027, 64049, 41066, 41068, 46008, 31063, 31057, 55033, 55016, 68113, 55100, 55101, 80014, 80074, 84026, 84029, 84030, 84031, 84033, 84032, 80040, 80041, 80042, 80043, 60058, 60059, 60060, 71087, 71088, 71089, 71090, 27035, 27041, 27084, 27085, 27040, 27039, 70100, 37012, 37014, 37016, 37006, 37013, 37015, 37017, 55090, 82015, 71091, 37007, 37008, 37009, 37010, 51069, 51070, 51071, 51072, 51054, 51073, 51074, 31055, 27061, 69111, 69060, 70003, 70017, 70010, 69030, 43068, 61007, 74028, 74029, 74027, 74026, 76017, 68084, 54010, 54076, 56017, 54011, 65033, 65034, 85010, 79060, 79061, 84042, 70025, 70030, 70031, 70032, 70102, 70058, 71016, 71017, 71109, 55006, 77024, 77081, 77023, 65057, 65056, 41067, 41069, 32001, 65020, 65019, 65021, 65018, 85029, 85030, 85031, 85016, 85017, 85015, 64040, 79058, 32015, 32016, 32026, 32077, 65081, 61053, 85081, 55083, 74069, 56030, 56008, 56009, 56010, 46033, 68056, 68061, 68062, 68110, 55094, 50063, 69017, 82012, 82013, 82014, 55072, 55071, 55105, 63071, 56006, 56007, 79003, 65038, 79027, 79026, 60040, 86003, 68097, 55065, 55066, 55064, 60074, 65043, 65083, 65044, 65045, 68086, 68090, 68091, 68092, 68093, 74013, 74014, 74011, 74012, 68016, 68044, 68017, 68019, 68021, 72093, 71058, 72002, 71042, 72011, 79038, 56059, 61070, 85053, 85036, 77022, 77031, 68039, 57012, 57006, 57013, 50057, 50058, 53021, 53022, 53031, 53032, 68049, 67086, 69016, 30057, 30059, 30058, 55062, 69018, 69015, 69029, 69019, 69012, 52053, 52054]
nycha_distlist_2018 = set([str(dist) for dist in nycha_distlist_2018])

#In dataset, field 'FULL_ED' is obtained by concatenating assembly district and local ED (zero-padded to three characters)

#FOR 2022 DATASET, read in NYCHA election districts from various versions of the district shapefile (21D through 22A2)
    #to account for shifting district lines post-redistricting and lag in SBOE data updates
    
filelist = glob.glob('2022_EDs/*.xlsx')

distlist = pd.DataFrame()

for file in filelist:
    newlist = pd.read_excel(file)
    distlist = distlist.append(newlist)
    
nycha_distlist_2022 = set(distlist['ElectDist'].astype(str).unique())


  distlist = distlist.append(newlist)
  distlist = distlist.append(newlist)
  distlist = distlist.append(newlist)
  distlist = distlist.append(newlist)


In [4]:
with voters.open('AllNYSVoters_20220711.txt','r') as file_handle:
    #cols_line = file_handle.readline().decode('utf-8')
    #cols = [re.sub(r'[\s\'\"]+','',col).strip() for col in cols_line.split(',')]
    #print(cols)
    base_df = pd.DataFrame(columns=cols)
    n=0
    
    for chunk in tqdm(read_in_chunks(file_handle, chunk_size=10000000)):
        n+=1
        if n%20 == 0:
            if base_df.shape[0] > 0:
                base_df.reset_index().drop(columns='index').to_json(f'clean_segments/iter_{str(n).zfill(3)}.json')
                base_df = pd.DataFrame(columns=cols)
                
        base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))

  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022))
  base_df = base_df.append(process_csv_chunk(chunk, cols, filter_col='FU

In [5]:
caches = glob.iglob('clean_segments/*')

full_df = pd.DataFrame(columns=cols)

for file in tqdm(caches):
    segment_df = pd.read_json(file)
    full_df = full_df.append(segment_df)


  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.append(segment_df)
  full_df = full_df.appen

In [9]:
full_df.shape

(1565825, 48)

In [10]:
county_to_borough_dict = {31: 'MANHATTAN',
                          24: 'BROOKLYN',
                          41: 'QUEENS',
                          3:  'BRONX',
                          43: 'STATEN ISLAND'}
full_df['BOROUGH'] = full_df['COUNTYCODE'].apply(lambda x: county_to_borough_dict[x])

for col in ['RADDNUMBER','RSTREETNAME','BOROUGH']:
    full_df[col] = full_df[col].apply(lambda x: str(x).strip())

def get_mod_address_from_row(row):
    address_string = ''
    for col in ['RADDNUMBER','RPREDIRECTION','RSTREETNAME','BOROUGH']:
        if row[col] is not None:
            if col == 'RPREDIRECTION':
                if str(row[col]).strip() != 'nan':
                    address_string = address_string+str(row[col]).strip()
                else:
                    pass
                
            else:
                address_string = address_string+str(row[col]).strip()
                
            if col in ['RSTREETNAME']:
                address_string = address_string+', '
            else:
                address_string = address_string+' '
    
    return(address_string.strip()+', NY')
            
full_df['MOD_ADDRESS'] = full_df.progress_apply(lambda row: get_mod_address_from_row(row), axis=1)

100%|██████████████████████████████| 1565825/1565825 [00:54<00:00, 28656.70it/s]


In [11]:
full_df.reset_index().drop(columns='index').to_csv('NYCHA_DISTRICT_RECORDS_2022.csv')

In [12]:
gc.collect()

0

### Locate full NYCHA address range from BBL

In [13]:
#The following files and associated documentation are found at:
#https://www1.nyc.gov/site/planning/data-maps/open-data.page#pad
#If direct link fails, search "Property Address Directory" on linked page

bbl = pd.read_csv('pad22a/bobabbl.txt')
addr = pd.read_csv('pad22a/bobaadr.txt')
snd = pd.read_fwf('pad22a/snd22Acow.txt', widths=[1,1,32,1,1,1,5,2,3,2,1,1,2,32,2,20,1,92])
snd.columns = ['rectype','boro','stname','primary_flag','principal_flag','boro1','sc5','lgc','spv','filler','numeric_ind','GFT','len_full_name','full_stname','min_SNL','stn20','ht_name_type_code','filler']


  addr = pd.read_csv('pad22a/bobaadr.txt')


In [14]:
bl_guide = pd.read_csv('stairhalls.csv')
bl_guide.columns = [col.upper() for col in bl_guide.columns]

In [15]:
boro_num_dict = {'MANHATTAN': 1, 'BRONX': 2, 'BROOKLYN': 3, 'QUEENS': 4, 'STATEN ISLAND': 5}
bl_guide['BORO'] = bl_guide['BOROUGH'].apply(lambda x: boro_num_dict[x])
bl_guide['TDS_NUM'] = bl_guide['LOCATION'].apply(lambda x: int(x[:3]))

In [None]:
### DEPRECATED
#Buildings data obtained from Data Warehouse: select * from nicedw.nice_buildings
bldgs = pd.read_csv('bldgs.csv')

#Remove bldgs with null BBLs or those beginning in 0 (i.e., buildings removed from portfolio)
bldgs = bldgs[bldgs['BBL'].apply(lambda x: not pd.isna(x))]
bldgs['BORO'] = bldgs['BBL'].apply(lambda x: int(str(x)[0]))

bldgs['BLOCK'] = bldgs['BLOCK'].fillna(0).astype(int)
bldgs['LOT'] = bldgs['LOT'].fillna(0).astype(int)
bldgs['BBL'] = bldgs['BBL'].fillna(0).astype(int).astype(str).apply(lambda x: str.ljust(x, 10, '0'))

#Identify "derived" block and lot in cases where BBL does not reflect block, lot fields
bldgs['der_block'] = bldgs['BBL'].apply(lambda x: int(x[1:6]))
bldgs['der_lot'] = bldgs['BBL'].apply(lambda x: int(x[6:]))
bldgs = bldgs[['TDS_NUM','BORO','BLOCK','LOT','BBL', 'der_block','der_lot']].drop_duplicates()

In [None]:
### DEPRECATED
#Create new rows when derived blocks and lots do not match given fields
alt_bbls = bldgs[bldgs['BLOCK'] != bldgs['der_block']]
alt_bbls = alt_bbls[['TDS_NUM','BORO','BBL','der_block','der_lot']].drop_duplicates()

addtl_rows = pd.DataFrame(columns = bldgs.columns)


for row in alt_bbls.iterrows():
    cols = bldgs.columns
    new_row = pd.DataFrame({'TDS_NUM':[row[1]['TDS_NUM']],
                            'BORO':[row[1]['BORO']], 
                            'BLOCK':[row[1]['der_block']],
                            'LOT':[row[1]['der_lot']],
                            'BBL':[row[1]['BBL']],
                            'der_block':[row[1]['der_block']],
                            'der_lot':[row[1]['der_lot']]})
    
    addtl_rows = addtl_rows.append(new_row)

In [16]:
#Match building BBLs to Prop. Address Directory BBL table
nycha_bbl = bbl.merge(bl_guide, how='outer', left_on=['boro','block','lot'], right_on=['BORO','BLOCK','LOT'], indicator=True)
nycha_bbl = nycha_bbl[nycha_bbl['_merge'] == 'both']
nycha_bbl = nycha_bbl[['TDS_NUM','boro','block','lot','numaddr', 'numbf']].drop_duplicates()

In [17]:
#Merge NYCHA BBLs one-to-many to PAD address table
nycha_addr = nycha_bbl.merge(addr, how='outer', on=['boro','block','lot'])
nycha_addr = nycha_addr[nycha_addr['TDS_NUM'].apply(lambda x: not pd.isna(x))]
nycha_addr = nycha_addr.merge(snd, on=['boro','sc5'], how='inner')

In [18]:
#Screen out invalid address numbers/address ranges (those with GARAGE, etc.)

def is_valid_address_number(value):
    test_value = None
    try:
        test_value = int(str(value))
    except:
        return False
    
    if isinstance(test_value, int):
        return True

nycha_addr_ranges = nycha_addr[nycha_addr['lhnd'].apply(lambda x: is_valid_address_number(x))]
nycha_addr_ranges = nycha_addr_ranges[['TDS_NUM','boro','block','lot',
                                       'lhnd','lsos','hhnd','hsos','parity', 'stname_x', 'stname_y','addrtype','GFT']]

In [19]:
#Create unique row for each address number in matching address ranges

def expand_address_range(row):
    start = int(row['lhnd'])
    end = int(row['hhnd'])+1
    parity = row['parity']
    
    if parity == 0:
        return row
    else:
        address_range = list(range(start, end, 2))
    
    expanded_df = pd.DataFrame([row]*len(address_range), index=pd.RangeIndex(len(address_range)))
    expanded_df['HOUSE_NUM'] = None
    
    for i in range(len(address_range)):
        expanded_df.at[i,'HOUSE_NUM'] = address_range[i]
        
    return(expanded_df)


In [20]:
nycha_addresses_expanded = pd.DataFrame()

for row in tqdm(nycha_addr_ranges.drop_duplicates().iterrows()):
    nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))

  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))
  nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))

In [21]:
#Add borough names and standardize addresses
boro_num_to_name_dict = {1: 'MANHATTAN',
                         2: 'BRONX',
                         3: 'BROOKLYN',
                         4: 'QUEENS',
                         5: 'STATEN ISLAND'}

nycha_addresses_expanded['BOROUGH'] = nycha_addresses_expanded['boro'].progress_apply(lambda x: boro_num_to_name_dict[x])
nycha_addresses_expanded['STREET_NAME_STD'] = nycha_addresses_expanded['stname_y'].progress_apply(lambda x: re.sub(r'(?<=[\d]+)((TH)|(RD)|(ST)|(ND))', '', x.strip())).apply(lambda x: re.sub(r'[\s]+', ' ', x))
nycha_addresses_expanded['FULL_ADD'] = nycha_addresses_expanded['HOUSE_NUM'].progress_apply(lambda x: str(x).strip())+' '+nycha_addresses_expanded['STREET_NAME_STD']+', '+nycha_addresses_expanded['BOROUGH']+', NY'

100%|█████████████████████████████████| 17831/17831 [00:00<00:00, 519282.58it/s]
100%|█████████████████████████████████| 17831/17831 [00:00<00:00, 126529.64it/s]
100%|█████████████████████████████████| 17831/17831 [00:00<00:00, 418472.87it/s]


In [22]:
nycha_addr_full = nycha_addresses_expanded[['TDS_NUM', 'HOUSE_NUM', 'STREET_NAME_STD', 'BOROUGH', 'FULL_ADD']]

##### Adding official NYCHA addresses to list

In [23]:
#Supplement with NYCHA addresses from PTAD shapefile (which lacks many intermediate address nums)
nycha_addr_official = gpd.read_file('STD_NYCHA/NYCHA_Address_Points.geojson')
nycha_addr_official['STREET_NAME_STD'] = nycha_addr_official['STREET'].progress_apply(lambda x: re.sub(r'(?<=[\d]+)((TH)|(RD)|(ST)|(ND))', '', x))
nycha_addr_official['FULL_ADD'] = nycha_addr_official['HOUSE_NUM']+' '+nycha_addr_official['STREET_NAME_STD']+', '+nycha_addr_official['BOROUGH']+', NY'

100%|███████████████████████████████████| 5285/5285 [00:00<00:00, 115667.66it/s]


In [24]:
nycha_addr_full = nycha_addr_full.append(nycha_addr_official[['TDS_NUM', 'HOUSE_NUM', 'STREET_NAME_STD', 'BOROUGH', 'FULL_ADD']])

#nycha_addr_full = nycha_addr_full.drop_duplicates(['HOUSE_NUM', 'STREET_NAME_STD', 'BOROUGH'])
nycha_addr_full = nycha_addr_full.reset_index().drop(columns='index')



  nycha_addr_full = nycha_addr_full.append(nycha_addr_official[['TDS_NUM', 'HOUSE_NUM', 'STREET_NAME_STD', 'BOROUGH', 'FULL_ADD']])


##### Manually adjusting select addresses

In [25]:
#Add St. Nicholas Ave addresses
stn_ave_addr = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'SAINT NICHOLAS' in x)].copy()
stn_ave_addr['STREET_NAME_STD'] = 'ST NICHOLAS AVENUE'

#Add 7th Ave addresses
seventh_ave_addr = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'POWELL BOULEVARD' in x)].copy()
acp_addr = seventh_ave_addr.copy()

seventh_ave_addr['STREET_NAME_STD'] = '7 AVENUE'
acp_addr['STREET_NAME_STD'] = 'ADAM CLAYTON POWELL BOULEVARD'

#Add 8th Ave addresses
eighth_ave_addr = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'FREDERICK DOUGLASS' in x)].copy()
eighth_ave_addr['STREET_NAME_STD'] = '8 AVENUE'

#Misc...
w27 = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'WEST 27 DRIVE' in x)].copy()
w27['STREET_NAME_STD'] = 'WEST 27 STREET DRIVE'

ua1 = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'UNIVERSITY AVENUE' in x)].copy()
ua2 = ua1.copy()
ua1['STREET_NAME_STD'] = 'DR MARTIN L KING JR BOULEVARD'
ua2['STREET_NAME_STD'] ='DR M L KING JR BOULEVARD'

gb = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'GUY BREWER' in x)].copy()
gb['STREET_NAME_STD'] = 'GUY R BREWER BOULEVARD'

ftw = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'FORT WASHINGTON' in x)].copy()
ftw['STREET_NAME_STD'] = 'FT WASHINGTON AVENUE'

#Put it all together...
new_addr = pd.DataFrame()

for df in [stn_ave_addr, seventh_ave_addr, acp_addr, eighth_ave_addr, w27, ua1, ua2, gb, ftw]:
    new_addr = new_addr.append(df)
    
nycha_addr_full = nycha_addr_full.append(new_addr).reset_index().drop(columns='index')

  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  new_addr = new_addr.append(df)
  nycha_addr_full = nycha_addr_full.append(new_addr).reset_index().drop(columns='index')


In [26]:
nycha_addr_full.to_csv('NYCHA_addr_full_OFFICIAL_PLUS_BBL.csv')

### Match voter file addresses and NYCHA addresses to isolate NYCHA voters

In [27]:
#Load voter records from NYCHA electoral districts isolated above, alongside addresses
nycha_addr_full = pd.read_csv('NYCHA_addr_full_OFFICIAL_PLUS_BBL.csv', index_col=0)
for col in nycha_addr_full.columns:
    nycha_addr_full[col] = nycha_addr_full[col].progress_apply(lambda x: re.sub(r'[\s]+', ' ', str(x)).strip())

nycha_addr_full['TDS_NUM'] = nycha_addr_full['TDS_NUM'].astype(float).astype(int)

full_df = pd.read_csv('NYCHA_DISTRICT_RECORDS_2022.csv', index_col = 0)
for col in ['RADDNUMBER', 'RSTREETNAME', 'BOROUGH']:
    full_df[col] = full_df[col].progress_apply(lambda x: re.sub(r'[\s]+', ' ', str(x)).strip())


100%|█████████████████████████████████| 23306/23306 [00:00<00:00, 126417.81it/s]
100%|█████████████████████████████████| 23306/23306 [00:00<00:00, 139997.55it/s]
100%|█████████████████████████████████| 23306/23306 [00:00<00:00, 107653.44it/s]
100%|█████████████████████████████████| 23306/23306 [00:00<00:00, 142015.99it/s]
100%|█████████████████████████████████| 23306/23306 [00:00<00:00, 101253.49it/s]
  full_df = pd.read_csv('NYCHA_DISTRICT_RECORDS_2022.csv', index_col = 0)
100%|█████████████████████████████| 1565825/1565825 [00:11<00:00, 140073.05it/s]
100%|█████████████████████████████| 1565825/1565825 [00:11<00:00, 130522.16it/s]
100%|█████████████████████████████| 1565825/1565825 [00:10<00:00, 148600.53it/s]


In [28]:
#Set up index using Record Linkage Toolkit. Here, match exactly on Borough and Address number,
    #while allowing minor variations in street spellings
    
indexer = recordlinkage.Index()
indexer.block(left_on=['BOROUGH','RADDNUMBER'], right_on=['BOROUGH','HOUSE_NUM'])


#full_df = full_df[full_df['BOROUGH'] == 'STATEN ISLAND']
#nycha_addresses = nycha_addresses[nycha_addresses['BOROUGH'] == 'STATEN ISLAND']

candidates = indexer.index(full_df, nycha_addr_full)

In [29]:
compare = recordlinkage.Compare()
#compare.exact('BOROUGH','BOROUGH', label='BOROUGH')
#compare.numeric('RADDNUMBER','ADDR_HN', label='ADDRESS_NUMBER')
compare.string('RSTREETNAME','STREET_NAME_STD', threshold=1, label = 'STREET_NAME')

<Compare>

In [30]:
#Obtain matching index pairs
features = compare.compute(candidates, full_df, nycha_addr_full)
print(features.shape)

(4975970, 1)


In [34]:
features[features['STREET_NAME']==1.0].shape

(479319, 1)

In [35]:
#Fetch information on matches for validation
match_candidates = {'FULL_DF_MOD_ADDRESS':[], 'NYCHA_ADDRESSES_FULL_ADD':[], 'FULL_DF_IND':[], 'NYCHA_ADDRESS_IND':[]}

for row in tqdm(features[features['STREET_NAME']>0].reset_index().iterrows()):
    row_values = row[1]
    full_add_ind = int(row_values['level_0'])
    nycha_add_ind = int(row_values['level_1'])
    
    match_candidates['FULL_DF_MOD_ADDRESS'].append(full_df.iloc[full_add_ind]['MOD_ADDRESS'])
    match_candidates['FULL_DF_IND'].append(full_add_ind)
    match_candidates['NYCHA_ADDRESSES_FULL_ADD'].append(nycha_addr_full.iloc[nycha_add_ind]['FULL_ADD'])
    match_candidates['NYCHA_ADDRESS_IND'].append(nycha_add_ind)

479319it [01:58, 4059.42it/s]


In [36]:
match_df = pd.DataFrame(match_candidates)
match_df.to_csv('_match_candidates_2022.csv')

In [49]:
match_df.shape

(479319, 4)

In [None]:
match_df = pd.read_csv('_match_candidates_2022.csv')

In [38]:
#Identify matched rows in voter file
matched_voter_indices = set(match_df['FULL_DF_IND'])

full_df_ind = full_df.copy()
full_df_ind['_match'] = full_df.reset_index()['index'].progress_apply(lambda x: x in matched_voter_indices)

100%|████████████████████████████| 1565825/1565825 [00:01<00:00, 1140374.78it/s]


In [46]:
full_df_ind[full_df_ind['_match']].drop_duplicates(subset=['FIRSTNAME','LASTNAME','DOB']).shape

(409040, 51)

In [45]:
match_mapper = {}

for row in tqdm(match_df.iterrows()):
    row_data = row[1]
    match_mapper[row_data['FULL_DF_IND']] = row_data['NYCHA_ADDRESS_IND']
    

479319it [00:15, 31544.48it/s]


In [50]:
match_df

Unnamed: 0,FULL_DF_MOD_ADDRESS,NYCHA_ADDRESSES_FULL_ADD,FULL_DF_IND,NYCHA_ADDRESS_IND
0,"1460 WASHINGTON AVENUE, BRONX, NY","1460 WASHINGTON AVENUE, BRONX, NY",1,20777
1,"1460 BRONX RIVER AVENUE, BRONX, NY","1460 BRONX RIVER AVENUE, BRONX, NY",1741,18334
2,"1460 WASHINGTON AVENUE, BRONX, NY","1460 WASHINGTON AVENUE, BRONX, NY",1924,20777
3,"1460 WASHINGTON AVENUE, BRONX, NY","1460 WASHINGTON AVENUE, BRONX, NY",11612,20777
4,"1460 WASHINGTON AVENUE, BRONX, NY","1460 WASHINGTON AVENUE, BRONX, NY",11915,20777
...,...,...,...,...
479314,"1762 LEXINGTON AVENUE, MANHATTAN, NY","1762 LEXINGTON AVENUE, MANHATTAN, NY",1397711,18780
479315,"1762 LEXINGTON AVENUE, MANHATTAN, NY","1762 LEXINGTON AVENUE, MANHATTAN, NY",1424458,18780
479316,"1740 LEXINGTON AVENUE, MANHATTAN, NY","1740 LEXINGTON AVENUE, MANHATTAN, NY",1399260,18782
479317,"1794B STERLING PLACE, BROOKLYN, NY","1794B STERLING PLACE, BROOKLYN, NY",1481158,19999


In [51]:
def get_matched_dev_info(base_df, mapper, supp_df):                 
    supp_dict = {'VOTER_IND':[], 'NYCHA_IND':[], 'TDS':[], 'NYCHA_ADD':[]}
    
    matched_df = base_df.reset_index()
    matched_df = matched_df[matched_df['_match']]
    
    matched_set = set(matched_df['index'])
    unmatched_set = set(base_df.reset_index()['index'])-matched_set
    
    for row in tqdm(matched_df.iterrows()):
        try:
            index = row[1]['index']
            supp_ind = mapper[index]
            matching_row = supp_df.iloc[supp_ind]

            supp_dict['VOTER_IND'].append(index)
            supp_dict['NYCHA_IND'].append(supp_ind)
            supp_dict['TDS'].append(matching_row['TDS_NUM'])
            supp_dict['NYCHA_ADD'].append(matching_row['FULL_ADD'])
        except:
            pass

    for index in unmatched_set:
        supp_dict['VOTER_IND'].append(index)
        for col in ['NYCHA_IND','TDS','NYCHA_ADD']:
            supp_dict[col].append(None)
            
    return supp_dict

supp_nycha_info = get_matched_dev_info(full_df_ind, match_mapper, nycha_addr_full)
supp_nycha_df = pd.DataFrame(supp_nycha_info)

416525it [00:28, 14445.97it/s]


In [54]:
supp_nycha_df['NYCHA_IND'].value_counts()

22428.0    781
18430.0    705
17849.0    699
17957.0    697
18435.0    695
          ... 
20024.0      1
20340.0      1
19886.0      1
19864.0      1
19327.0      1
Name: NYCHA_IND, Length: 4324, dtype: int64

In [55]:
full_df_supp = full_df_ind.merge(supp_nycha_df, left_index=True, right_on = 'VOTER_IND')

In [60]:
nycha_voter_set = full_df_supp[full_df_supp['_match']]
nycha_voter_set = nycha_voter_set.drop(columns=[col for col in nycha_voter_set.columns if 'Unnamed' in col])
nycha_voter_set.to_csv('NYCHA_voter_set_strict.csv')

### Expand elections field to arrive at turnout rates by election

In [62]:
nycha_voter_set = pd.read_csv('NYCHA_voter_set_strict.csv', index_col=0)

nycha_voter_set = nycha_voter_set[nycha_voter_set['STATUS'] != 'P']

  nycha_voter_set = pd.read_csv('NYCHA_voter_set_strict.csv', index_col=0)


In [65]:
nycha_voter_set['ElecList'] = nycha_voter_set['VoterHistory'].progress_apply(lambda x: set([re.sub(r'\(\w+\)', '', name).strip() for name in str(x).split(';')]))
#nycha_voter_set['ElecList'] = nycha_voter_set['VoterHistory'].apply(lambda x: set(name.strip() for name in str(x).split(';')))

nycha_voter_set = nycha_voter_set[['LASTNAME','FIRSTNAME','FULL_ED','LD','CD','SD','AD','BOROUGH','TDS','ElecList']]
nycha_voter_set['_counter'] = 1

100%|████████████████████████████████| 293236/293236 [00:07<00:00, 41885.97it/s]


In [66]:
nycha_voter_set.shape

(293236, 11)

In [13]:
#Add city council district info by ED
ed_to_council = pd.read_csv('ED22A1_to_Council_District.csv').set_index('ElectDist')
error_eds = []

#@njit
def get_cd(ed_num, df):
    try:
        cd_num = df.loc[ed_num]['coun_dist']
        return cd_num
    
    except:
        error_eds.append(ed_num)
        
    
nycha_voter_set['Council_Dist'] = nycha_voter_set['FULL_ED'].progress_apply(lambda x: get_cd(int(x), ed_to_council))

100%|██████████████████████████████| 4407587/4407587 [06:32<00:00, 11231.71it/s]


In [20]:
#Import development info for filtering and display
dev_info = pd.read_csv('overview_table_data.csv')
dev_info = dev_info[['TDS','DEV_NAME','CONS_TDS','CONS_NAME','GEO_BORO']]

#Pull in population data
def get_latest_records(df, group_col, date_col):
    new_df = pd.DataFrame(columns=df.columns)
    
    for group in df.groupby(group_col):
        new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
        
    return new_df

pop_facts = pd.read_excel('Dev_Population_Facts.xlsx')
recent_pop = get_latest_records(pop_facts, 'TDS_NUMBER', 'MONTH_ID')
recent_pop['18_PLUS'] = recent_pop['POPULATION']-recent_pop['MINORS_UNDER_18']

dev_info = dev_info.merge(recent_pop[['TDS_NUMBER','18_PLUS','MONTH_ID']], left_on='TDS', right_on='TDS_NUMBER', how='left')

  warn("Workbook contains no default style, apply openpyxl's default")
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(group[1].sort_values(date_col, ascending=False).iloc[0])
  new_df = new_df.append(gr

In [4]:
#Group election identifiers and define election types
ID_dict = {'2007GE':set(['20071106 GE', '20071105 GE', '20071106 ??', 'GENERAL 2007', 'GENERAL ELECTION 2007', '2007 GENERAL  ELECTION', '2007 General Election', '2007 GENERAL ELECTION', 'General Election, 2007', '07 GENERAL ELECTION', '20071006 GE', 'General Election 2007', '2007 GENERAL']),
           '2007PR':set(['20070918 PR','PRIMARY ELECTION 2007', '2007 Primary Election']),
           '2007SP':['Special Election 2007'],
           '2008GE':set(['20081104 GE', '2008 GENERAL ELECTION', 'GENERAL 2008', '2008 General Election', 'GENERAL ELECTION 2008','General Election, 2008', '20081104 PR', '20081114 PR','20081111 GE', '2008 General', 'General Election 2008', '20081028 GE','20080909 GE', '20081110 GE', '20081104', '20081120']),
           '2008PP':set(['200802`0 PP','08 PRESIDENTIAL PRIMARY','20080205 PP', '20080205 PR', 'PRESIDENTIAL PRIMARY 2008', '2008 PRESIDENTIAL PRIMARY', '20080205 ??', 'Presidential Primary, 2008', '2008 Presidential Primary', 'Presidential Primary 2008', '2008 Presidential Primary Election', '20080205 SP']),
           '2008PR':set(['20080909 PR', '20080909 PP', '20080911 PR','20080903 PR', '20080902 PR', '20080916 PR', '20080914 PR', '20080913 PR', '20080914', '20080909 SP', '20080912 PR']),
           '2009GE':set(['20091103 GE', 'GE 20091103', '2009 GENERAL ELECTION', 'General Election, 2009', 'GENERAL ELECTION 2009', '2009 General Election', 'GENERAL 2009', '20091103 RO', '20091103 SP']) ,
           '2009RO':set(['20090929 RO', '20090930 RO', '20090929 SP', '20090929 PR', '20090929 GE', '20090929 Run Off', '20090930 PR', '20090922 RO', '20090928 SP']),
           '2009PR':set(['2009 PRIMARY ELECTION','09 PRIMARY ELECTION', '20090915 PR', 'Primary Election, 2009', 'PRIMARY 2009', '20090915 GE', '20091015 PR','20090915 RO', '20090908 PR']),
           '2009SP_0915':set(['20090915 SP']),
           '2009SP_0606':set(['20090602 SP']),
           '2009SP_0421':set(['20090421 SP']),
           '2009SP_0224':set(['20090224 SP']),
           '2010GE':set(['20101103 GE', '10 GENERAL ELECTION', '20101102 GE', 'GE 20101102', '2010 GENERAL ELECTION', 'General Election, 2010', '2010 General Election', 'GENERAL ELECTION 2010', 'GENERAL 2010', 'General Election 2010']),
           '2010PR':set(['Primary Election, 2010', '10 PRIMARY ELECTION', '20100914 PR', '2010 Primary Election', '2010 PRIMARY ELECTION', 'PRIMARY 2010']),
           '2010SP_0323':['20100323 SP'],
           '2010SP_0316':['20100316 SP'],
           '2010SP_0209':['20100209 SP'],
           '2011GE':set(['20111105 GE', '20111108 GE', '2011 General Election', '2011 GENERAL ELECTION', 'GENERAL 2011', 'General Election, 2011', 'GENERAL ELECTION 2011', '20111106 GE', '11 GENERAL  ELECTION']),
           '2011PR':set(['PR 20110913', '2011 PRIMARY ELECTION', 'PRIMARY ELECTION 2011', '20110913 PR', 'PRIMARY 2011', '2011 Primary Election', '11 PRIMARY ELECTION']),
           '2011SP_0913':['20110913 SP', '2011 Special Election 26CD'],
           '2012GE':set(['20121106 GE','2012 General Election', 'GENERAL 2012', '2012 GENERAL ELECTION', 'GENERAL ELECTION 2012', 'General Election, 2012', '12 GENERAL  ELECTION', 'General Election 2012', 'GE 20121106']),
           '2012PR_0913':set(['20120913 PR']),
           '2012PR_0620':set(['20120626 PR', '2012 PRIMARY ELECTION FEDERAL']), 
           '2012PP':set(['20120424 PP','2012 Presidential Primary']),
           '2012SP_0320':['20120320 SP'],
           '2013GE':set(['GE 20131105', '2013 GENERAL ELECTION', 'GENERAL 2013', '2013 General Election', 'GENERAL ELECTION 2013', 'General Election, 2013', '13 GENERAL  ELECTION', 'General Election 2013', '20131105 GE']),
           '2013PR':set(['13 PRIMARY ELECTION', 'PR 20130910', '20131001 PR', 'PRIMARY 2013', '2013 PRIMARY ELECTION', 'Primary Election, 2013', '2013 Primary Election', 'PRIMARY ELECTION 2013', '20130910 PR', '20130910 RO','20130901 PR']),
           '2013RO':set(['20131001 RO', 'RO 20131001', '20131001 PP','20130110']),
           '2013SP_0219':set(['20130219 SP', '20130219 GE']),
           '2014GE':set(['2014 GENERAL ELECTION', 'General Election, 2014', '20141104 GE', 'GE 20141104', '2014  GENERAL ELECTION', '2014 General Election', 'GENERAL ELECTION 2014', 'GENERAL 2014', '14 GENERAL  ELECTION', 'General Election 2014']), 
           '2014PR_0909':set(['20140909 PR', 'PR 20140909', '2014 PRIMARY STATE and LOCAL', '2014 Primary Election (State & Local)']),
           '2014PR_0624':set(['20140624 PR', 'PR 20140624', '2014CONGRESSIONAL PRIMARY']),
           '2015GE':set(['GE 20151103', '2015 General Election', '2015  GENERAL ELECTION', 'GENERAL 2015', '20151108 GE', '2015 GENERAL ELECTION', 'General Election, 2015', '15 GENERAL ELECTION', '20151103 GE']),
           '2015PR':set(['2015 Primary Election', '20150910 PR', 'PRIMARY 2015']),
           '2015SP_0505':set(['20150505 SP', 'SP 20150505']),
           '2016GE':set(['20161108 GE', '20161108 PR', '20161108 SP', 'GE 20161108', 'GENERAL ELECTION 2016', 'GENERAL 2016', '2016 General Election', 'General Election, 2016', '2016 GENERAL ELECTION', 'General Election 2016', '16 GENERAL ELECTION', '2008 General', 'General Election 2008', '20161107 GE', '20081104', '20161103 GE']),
           '2016PR_0913':set(['PR 20160913', '20160913 PR', 'PR 20160913', ]),
           '2016PR_0628':set(['20160628 PP', '20160628 PR', 'PR 20160628', '20160628 SP', 'Federal Primary, 2016']), 
           '2016PP':set(['Presidential Primary 2016', 'Presidential Primary Election 2016', '16 PRESIDENTIAL PRIMARY', '20160419 PP', 'PP 20160419', '2016 Presidential Primary', 'PRESIDENTIAL PRIMARY ELECTION 2016',  '20160412 PR', '20160419 PR', 'PRESIDENTIAL PRIMARY 2016', 'Presidential Primary, 2016', '2016 PRESIDENTIAL PRIMARY', '2016 Presidential Primary Election', ]), 
           '2016SP_0419':set(['20160419 SP', 'SP 20160419', '2016 SD 9 SPECIAL']),
           '2016SP_0213':set(['20160223 SP']),
           '2017GE':set(['20171107 GE', 'GE 20171107','2017 General Election', 'GENERAL ELECTION 2017', 'General Election, 2017', 'GE 20171107', '17 GENERAL ELECTION', 'GENERAL 2017', '2017 GENERAL ELECTION', '20171107 GE']),
           '2017PR':set(['PR 20170912','20170912 RO', 'PR 20170912', '20170926 PR', 'PRIMARY ELECTION 2017', 'PRIMARY 2017', '17 PRIMARY ELECTION', '20170912 SP', '20170912 PR']),
           '2017SP_0524':['20170523 SP'],
           '2017SP_0214':['20170214 SP'],
           '2018GE':set(['2018 General Election', 'General Election, 2018', 'GE 20181204', 'GENERAL 2018', 'GENERAL ELECTION 2018', '18 GENERAL ELECTION', 'General Election 2018', '2018 GENERAL ELECTION', '20181106 GE', 'GE 20181106']),
           '2018PR_0913':set(['20180913 PR','PR 20180913','Primary Election, 2018', '18 PRIMARY ELECTION', 'PRIMARY 2018', 'PRIMARY ELECTION 2018', '2018 Primary Election']),
           '2018PR_0626':set(['20180626 PR','PR 20180626']),
           '2018SP_0424':set(['20180424 SP', 'SP 20180424']),
           '2019GE':set(['GENERAL 2019','GE 20191105','2019 General Election', '20191105 GE', '2019 GENERAL ELECTION', 'GENERAL ELECTION 2019']),
           '2019PR':set(['PR 20190625', '20190625 PR']),
           '2019SP_0514':set(['20190514 SP', 'SP 2019-05-']),
           '2019SP_0226':set(['20190226 SP', 'SP 20190226', '20190226 GE']),
           '2020GE':set(['GE 20201103','General Election, 2020', '2020 GENERAL ELECTION', '20201103 GE', 'GENERAL ELECTION 2020', 'GENERAL 2020', 'GENERAL ELECTION 2020', 'General Election 2020', '2020 General Election']),
           '2020PR':set(['PRIMARY ELECTION 2020', '2020 JUNE PRIMARY', '2020 Presidential Primary Election','20200623 PR', 'PP 20200623', '20200623 PP']),
           '2020SP_1222':set(['SP 20201222', 'SP 20200202', '20201222 SP']),
           '2021GE':set(['GE 20211102','20211102 GE', ]),
           '2021PR':set(['PR 20210622', 'PRIMARY 2021', '20210622 PR']),
           '2021SP_1102':set(['SP 20211102','20211102 GE']),
           '2021SP_0323':set(['20210323 SP','SP 20210323']),
           '2021SP_0223':set(['20210223 SP','SP 20210223']),
           '2021SP_0202':set(['SP 20210202','20210202 SP']),
           '2022PR':set(['20220628 PR','PR 20220628']),
           '2022SP_0524':set(['20220524 SP','SP 20220524']),
           '2022SP_0322':set(['20220322 SP']),
           '2022SP_0215':set(['20220215 SP','SP 20220215']),
           '2022SP_0118':set(['GE 20220118','20220118 SP']),
            '_AMBIG':set(['08 PRIMARY ELECTION', '14 PRIMARY ELECTION', 'PRIMARY 2014', '12 PRIMARY ELECTION', 'Primary Election, 2012', '2012 PRIMARY ELECTION', '2012 Primary Election','Primary Election, 2008', '2008 PRIMARY ELECTION', 'Special Election, 2009', 'SPECIAL ELECTION 2009', '20110502 GE'])}

elec_type_dict = {'GENERAL': {'2007GE',
                              '2008GE',
                              '2009GE',
                              '2010GE',
                              '2011GE',
                              '2012GE',
                              '2013GE',
                              '2014GE',
                              '2015GE',
                              '2016GE',
                              '2017GE',
                              '2018GE',
                              '2019GE',
                              '2020GE',
                              '2021GE'},
                 'PRIMARY': {'2007PR',
                              '2008PP',
                              '2008PR',
                              '2009PR',
                              '2010PR',
                              '2011PR',
                              '2012PP',
                              '2012PR_0620',
                              '2012PR_0913',
                              '2013PR',
                              '2014PR_0624',
                              '2014PR_0909',
                              '2015PR',
                              '2016PP',
                              '2016PR_0628',
                              '2016PR_0913',
                              '2017PR',
                              '2018PR_0626',
                              '2018PR_0913',
                              '2019PR',
                              '2020PR',
                              '2021PR',
                              '2022PR'},
                 'RUNOFF': {'2009RO', '2013RO'},
                 'SPECIAL': {'2007SP',
                              '2009SP_0224',
                              '2009SP_0421',
                              '2009SP_0606',
                              '2009SP_0915',
                              '2010SP_0209',
                              '2010SP_0316',
                              '2010SP_0323',
                              '2011SP_0913',
                              '2012SP_0320',
                              '2013SP_0219',
                              '2015SP_0505',
                              '2016SP_0213',
                              '2016SP_0419',
                              '2017SP_0214',
                              '2017SP_0524',
                              '2018SP_0424',
                              '2019SP_0514',
                              '2019SP_0226',
                              '2020SP_1222',
                              '2021SP_1102',
                              '2021SP_0323',
                              '2021SP_0223',
                              '2021SP_0202',
                              '2022SP_0524',
                              '2022SP_0322',
                              '2022SP_0215',
                              '2022SP_0118'},
                 None: {'_AMBIG'}}


In [5]:
#Standardize election identifiers
def get_common_id(x, id_dict):
    for key, value in id_dict.items():
        if x in value:
            return key
    
    return None

nycha_voter_set['Clean_Elec_List'] = nycha_voter_set['ElecList'].progress_apply(lambda lst: set([get_common_id(item, ID_dict) for item in lst]))


#Create dummy columns where 1 = Voted
for col in tqdm(ID_dict.keys()):
    nycha_voter_set[col] = nycha_voter_set['Clean_Elec_List'].apply(lambda x: 1 if col in x else 0)

100%|██████████████████████████████| 4407587/4407587 [00:56<00:00, 78380.44it/s]
100%|███████████████████████████████████████████| 69/69 [01:28<00:00,  1.28s/it]


In [8]:
t = nycha_voter_set.copy()
t['FN'] = t['LASTNAME']+t['FIRSTNAME']+t['FULL_ED'].astype(str)


UDDINMOHAMMED32007.0       76
POLANCOJOSE72013.0         76
GARCIAJUAN71017.0          76
HOSSAINMD29053.0           76
GONZALEZJUAN85029.0        75
                           ..
CAPALDICARRIE74027.0        1
BENSUSENCHARLES71017.0      1
TORRESANNA74029.0           1
MALDONADOBENIGNA65059.0     1
DIAMONDPATRICK75012.0       1
Name: FN, Length: 189386, dtype: int64

In [11]:
pd.DataFrame(t['FN'].value_counts())['FN'].value_counts()

25    71184
26    37247
27    27542
23    13975
19     6234
      ...  
36        3
29        2
75        1
38        1
67        1
Name: FN, Length: 57, dtype: int64

In [66]:
t = pd.DataFrame(nycha_voter_set[nycha_voter_set.columns[13:]].mean()).reset_index()

In [67]:
t = t.loc[:67]
t['year'] = t['index'].apply(lambda x: x[:4]).astype(int)
t['type'] = t['index'].apply(lambda x: x[4:])


In [75]:
t1 = t[(t['year'] >= 2012) & (t['type'].apply(lambda x: ('SP' not in x) and ('RO' not in x) and ('PP' not in x) and ('PR_06' not in x)))]
t1.mean()

  t1.mean()


0          0.152266
year    2016.761905
dtype: float64

In [35]:
rates_by_ad = nycha_voter_set.groupby('AD').sum()
rates_by_ad.drop(columns=rates_by_ad.columns[:5], inplace=True)

In [36]:
for col in rates_by_ad.columns[2:]:
    try:
        rates_by_ad[col] = rates_by_ad[col]/rates_by_ad['_counter']
    except:
        pass

rates_by_ad.to_excel('NYCHA_rates_by_AD.xlsx')

In [39]:
rates_by_cd = nycha_voter_set.groupby('CD').sum()
rates_by_cd.drop(columns=rates_by_cd.columns[:5], inplace=True)

for col in rates_by_cd.columns[2:]:
    try:
        rates_by_cd[col] = rates_by_cd[col]/rates_by_cd['_counter']
    except:
        pass

rates_by_cd.to_excel('NYCHA_rates_by_CD.xlsx')

In [44]:
rates_by_council = nycha_voter_set.groupby('Council_Dist').sum()
rates_by_council.drop(columns=rates_by_council.columns[:6], inplace=True)

for col in rates_by_council.columns[1:]:
    try:
        rates_by_council[col] = rates_by_council[col]/rates_by_council['_counter']
    except:
        pass

rates_by_council.to_excel('NYCHA_rates_by_council.xlsx')

In [None]:
#Clean resulting dataframe (dropping all non-development identifiers) and group by development
nycha_voter_set = nycha_voter_set.drop(columns=nycha_voter_set.columns[:8])#.drop(columns=['ElecList','Clean_Elec_List'])

rates_by_dev = nycha_voter_set.groupby('TDS').sum()

for col in rates_by_dev.columns[1:]:
    try:
        rates_by_dev[col] = rates_by_dev[col]/rates_by_dev['_counter']
    except:
        pass

#Merge dev data with voter records
rates_by_dev_enriched = dev_info.merge(rates_by_dev, how='right', left_on='TDS', right_index=True).reset_index(drop=True)
rates_by_dev_enriched = rates_by_dev_enriched.sort_values(['GEO_BORO','CONS_TDS','TDS']).set_index(['GEO_BORO','CONS_TDS','TDS'])



In [None]:
rates_by_dev_enriched['_excess_registrants'] = (rates_by_dev_enriched['_counter']-rates_by_dev_enriched['18_PLUS'])

In [None]:
summ_table = rates_by_dev_enriched[['2016GE','2016PP','2017GE','2017PR','2018GE','2018PR_0913','2020GE','2020PR','2021GE','2021PR']].describe().loc['mean'].reset_index()
summ_table['YEAR'] = summ_table['index'].apply(lambda x: str(x)[:4])

def get_type(value, dict_name):
    for k, v in dict_name.items():
        if value in v:
            return k
        
summ_table['TYPE'] = summ_table['index'].apply(lambda x: get_type(x, elec_type_dict))
summ_table = summ_table.drop(columns='index')
summ_table.pivot_table(values='mean', index='YEAR', columns='TYPE').reset_index().sort_values('YEAR', ascending=False)

In [None]:
summ_table = rates_by_dev_enriched[['2016GE','2016PP','2017GE','2017PR','2018GE','2018PR_0913','2020GE','2020PR','2021GE','2021PR']].describe()#.loc['mean'].reset_index()
summ_table

In [None]:
excess_reg = rates_by_dev_enriched[rates_by_dev_enriched.columns[:6]]

In [None]:
excess_reg['excess_registrants'] = excess_reg['_counter']-excess_reg['18_PLUS']
excess_reg['excess_pct_of_adult_pop'] = excess_reg['excess_registrants']/excess_reg['18_PLUS']

In [None]:
excess_reg.to_excel('excess_registrants.xlsx')

In [None]:
#Summarize turnout by election type
rates_by_dev_summary = rates_by_dev_enriched.copy().drop(columns=rates_by_dev_enriched.columns[6:])

for col, values in elec_type_dict.items():
    rates_by_dev_summary[f'{col}_MEDIAN'] = rates_by_dev_enriched[values].median(axis=1)

In [None]:
rates_by_dev_enriched[rates_by_dev_enriched['DEV_NAME'].apply(lambda x: x is not np.nan)].to_excel('Rates_by_Development_2022.xlsx')
rates_by_dev_summary[rates_by_dev_summary['DEV_NAME'].apply(lambda x: x is not np.nan)].drop(columns='None_MEDIAN').to_excel('Rates_by_Development_Summary_2022.xlsx')

In [None]:
excess_reg.describe()

### Developments of Voting Rule WG Members

In [None]:
#Jackson [120, cons 267], Red Hook West [79], Polo Grounds [149], Soundview [71], Howard [72], Bland [54, cons 186], Nostrand [43, cons 36], Riis[18/19], Queensbridge [5/505], 
#Staten Island TBD

member_dev_list = [120, 79, 149, 71, 72, 54, 43, 18, 19, 5, 505, 114, 38]
member_data = rates_by_dev_enriched.reset_index()[rates_by_dev_enriched.reset_index()['TDS'].apply(lambda x: x in member_dev_list)]

In [None]:
member_data_summary = member_data.copy().drop(columns = member_data.columns[9:])
sum_col_list = []
for col, values in elec_type_dict.items():
    member_data_summary[f'{col}_MEDIAN'] = member_data[values].median(axis=1)
    member_data_summary[f'{col}_MAX'] = member_data[values].max(axis=1)
    member_data_summary[f'{col}_MIN'] = member_data[values].min(axis=1)
    
    sum_col_list = sum_col_list + [f'{col}_MEDIAN', f'{col}_MAX', f'{col}_MIN']

In [None]:
riis = member_data_summary.iloc[[5,6]]
riis = riis.append(pd.DataFrame(data = {key:None for key in riis.columns}, columns=riis.columns, index=[3]))

qns = member_data_summary.iloc[[8,10]]
qns = qns.append(pd.DataFrame(data = {key:None for key in qns.columns}, columns=qns.columns, index=[3]))


riis_full = member_data.iloc[[5,6]]
riis_full = riis_full.append(pd.DataFrame(data = {key:None for key in riis_full.columns}, columns=riis_full.columns, index=[3]))

qns_full = member_data.iloc[[8,10]]
qns_full = qns_full.append(pd.DataFrame(data = {key:None for key in qns_full.columns}, columns=qns_full.columns, index=[3]))



In [None]:
def combine_proportions(df, column):
    votes_list = []
    for row in df.iterrows():
        votes_list.append(row[1][column]*row[1]['_counter'])
        
    return sum(votes_list)/sum(df['_counter'])
                               
        
for col in sum_col_list:
    riis.loc[3, col] = combine_proportions(riis[:2], col)
    qns.loc[3, col] = combine_proportions(qns[:2], col)
    
riis.loc[3, 'DEV_NAME'] = 'RIIS_COMBINED'
qns.loc[3, 'DEV_NAME'] = 'QUEENSBRIDGE_COMBINED'

#Same for full data

for col in member_data.columns[9:]:
    riis_full.loc[3, col] = combine_proportions(riis_full[:2], col)
    qns_full.loc[3, col] = combine_proportions(qns_full[:2], col)
    
riis_full.loc[3, 'DEV_NAME'] = 'RIIS_COMBINED'
qns_full.loc[3, 'DEV_NAME'] = 'QUEENSBRIDGE_COMBINED'
    




In [None]:
member_data_summary = member_data_summary.append(riis.loc[3]).append(qns.loc[3])
member_data = member_data.append(riis_full.loc[3]).append(qns_full.loc[3])

In [None]:
member_data_tables = member_data_summary[['DEV_NAME']+sum_col_list]

In [None]:
t = member_data_tables.set_index('DEV_NAME').stack().reset_index()
t['TYPE'] = t['level_1'].apply(lambda x: x.split('_')[0])
t['MEASURE'] = t['level_1'].apply(lambda x: x.split('_')[1])
t.columns = ['DEV_NAME','_del','TURNOUT','TYPE', 'MEASURE']
t = t.pivot_table(index=['DEV_NAME','TYPE'], values='TURNOUT', columns='MEASURE')

In [None]:
writer = pd.ExcelWriter('_tables_for_members/TABLES.xlsx')

for name in t.index.levels[0]:
    t_sub = t.loc[name].transpose()
    t_sub['OTHER'] = t_sub[['None','RUNOFF','SPECIAL']].mean(axis=1)
    t_sub.drop(columns=['None','RUNOFF','SPECIAL']).to_excel(writer, sheet_name = f'{name}')

writer.save()

In [None]:
t_sub

In [None]:
elec_by_development = member_data[['DEV_NAME']+list(member_data.columns[9:])].set_index('DEV_NAME').transpose()

In [None]:
elec_by_development.to_excel('_TEMP_wg_elections.xlsx')

In [None]:
elec_by_development.transpose()[['2016GE','2016PP','2017GE','2017PR','2018GE','2018PR_0913','2020GE','2020PR','2021GE','2021PR']].describe()

In [None]:
elec_by_development