In [1]:
##%load schools.md
# NOTE(review): the commented-out magic above names schools.md, but this cell
# displays people.md -- confirm which file is intended.
from IPython.display import Markdown, display

# Show the overview document for this notebook. Presumably IPython loads the
# file when "people.md" exists in the working directory and otherwise renders
# the literal text -- TODO confirm.
display(Markdown("people.md"))

## [MA Secretary of State - Elections Division](https://www.sec.state.ma.us/divisions/elections/elections-and-voting.htm)

The Commonwealth of Massachusetts maintains a legacy database (1995)  that cities and towns use to upload details about the residents and request extracts for elections and town records purposes.

Below are the three types of extracts we turned into datasets.  From 2004 to 2023, there were about 110 different extracts providing snapshots of some 3.5M records.  As is apparent, the legacy system is suboptimal and a replacement is being planned.

Histograms are pre-calculated; see the ```histograms.ipynb``` notebook.

A. People schema contains 6 tables

    1. attributes ~90K rows
    ```CREATE TABLE IF NOT EXISTS people.attributes
            (
                people_id character(12) COLLATE pg_catalog."default" NOT NULL,
                name character varying(50)[] COLLATE pg_catalog."default",
                date_name date[],
                address_id smallint[],
                date_address_id date[],
                party smallint[],
                date_party date[],
                precinct smallint[],
                date_precinct date[],
                date_dob date,
                dob date,
                sex smallint,
                CONSTRAINT attributes_pkey PRIMARY KEY (people_id)
            )```
    2. addresses  ~30K rows
        * address_id - unique, incremental integer key
        * streetName
        * streetNum
        * unit
        * streetSuffix
        * pid - parcel id; from assessor parcels
    3. elections ~70K rows
    4. registered ~80K rows
    5. residents ~90K rows
        all use
        * people_id - unique, incremental integer key
        * date - array/list of dates
    6. histograms ~300K rows; 210K are from the 21 precinct histograms
        * date - from elections, registered, residents tables
        * type - one of ``` [activity, precinct, party, sex] ```
        * age  - missing default to 1900-01-01
        * count
    

B. MA Database columns

    1. True Lists of Residents  {TOWN_ID}RES_123456.txt
        Census information about residents over the age of 18.
        * Record Sequence Number
        * Resident Id Number
        * Last Name
        * First Name
        * Middle Name
        * Title
        * Date of Birth
        * Residential Address - Street Number
        * Residential Address - Street Suffix
        * Residential Address - Street Name
        * Residential Address - Apartment Number
        * Residential Address - Zip Code
        * Mailing Address - Street Name and Number
        * Mailing Address - Apartment Number
        * Mailing Address - City or Town
        * Mailing Address - State
        * Mailing Address - Zip Code
        * Occupation
        * Party Affiliation
        * Nationality
        * Ward Number
        * Voter Status
        * Mail to Code

    2. Registered Voters  {TOWN_ID}VOT_123456.txt
        All residents who are registered to vote
        * Record Sequence Number 
        * Voter ID Number 
        * Last Name 
        * First Name 
        * Middle Name 
        * Title 
        * Residential Address Street Number 
        * Residential Address Street Suffix 
        * Residential Address Street Name 
        * Residential Address Apartment Number 
        * Residential Address Zip Code 
        * Mailing Address - Street Number and Name
        * Mailing Address - Apartment Number 
        * Mailing Address - City or Town 
        * Mailing Address - State
        * Mailing Address - Zip Code 
        * Party Affiliation 
        * Date of Birth 
        * Date of Registration 
        * Ward Number 
        * Precinct Number 
        * Congressional District Number 
        * Senatorial District Number 
        * State Representative District 
        * Voter Status
        
    3. Election Voting Activity   {TOWN_ID}ANP_123456.txt
        Who voted in an election.
        a. ANP for local/state elections
        b. ACP for Federal elections
        c. Column headers
        * Party Affiliation 
        * Voter ID Number 
        * Last Name 
        * First Name 
        * Middle Name 
        * Residential Address - Street Number
        * Residential Address - Street Suffix 
        * Residential Address - Street Name 
        * Residential Address - Apartment Number 
        * Residential Address - Zip Code 
        * Type of Election 
        * Election Date 
        * City/ Town Name 
        * City/ Town Indicator 
        * City/ Town Code Assigned Number
        * Voter Title 
        * Ward Number 
        * Precinct Number 
        * Voter Status
        * Mailing Address - Street Number/Name 
        * Mailing Address - Apartment Number 
        * Mailing Address - City/Town 
        * Mailing Address - State 
        * Mailing Address - Zip Code
        

## Set-up

In [1]:
from os import environ
from urllib.parse import quote

from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine

# Read the local .env file (searched from the current working directory) and
# let its values override variables that are already set in the environment.
_ = load_dotenv ( find_dotenv ( usecwd = True ) , override = True )

# Connection settings; every value falls back to a local-development default.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "ArlingtonMA")

#establish database connection for Transform queries and Loads
# The credentials are percent-encoded because characters such as '@', ':', '/'
# or '%' in a raw password would otherwise be parsed as URL delimiters.
# quote(..., safe='') is used so that spaces become %20 and '+' becomes %2B.
cnx= create_engine(
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{ipaddress}:{port}/{dbname}"
)


In [None]:
from RES_VOT_ANP  import *

import pandas as pd
import numpy as np
from glob import glob

## Extract

In [None]:
def get_raw_data ( stub = ''):
    """Read every raw extract file and build the file -> as-of-date cross reference.

    Parameters
    ----------
    stub : str
        Prefix prepended verbatim to the 'people/...' directories
        (e.g. '../'); the default reads relative to the working directory.

    Returns
    -------
    tuple
        (elections, residents, registered, xref). The first three are dicts
        keyed by bare file name whose values are whatever the matching
        get_people_* reader returns; xref is the result of
        create_file_asof_date_xref(registered, residents, elections).
    """
    from os.path import basename

    def load_extracts ( sub_dir , pattern , reader ) :
        # Key by base name so the key does not depend on the path separator
        # glob returns; sort so the processing order is reproducible.
        data_dir = stub + sub_dir
        return {
            basename ( path ) : reader ( path )
            for path in sorted ( glob ( data_dir + pattern ) )
        }

    elections   =  load_extracts ( 'people/elections/'  , '*A*P*txt' , get_people_elections  )
    residents   =  load_extracts ( 'people/residents/'  , '*RES*txt' , get_people_residents  )
    registered  =  load_extracts ( 'people/registered/' , '*VOT*txt' , get_people_registered )

    xref        =  create_file_asof_date_xref ( registered , residents , elections )

    return elections, residents, registered, xref

def _repair_date_mangled_addresses ( df ) :
    """Undo date-like corruption in the residential address parts, in place.

    Values such as '2-Jan' / '<n>-Jan' / '<n>-Feb' are rewritten to '1/2',
    '1-<n>' and '2-<n>' (presumably a spreadsheet turned them into dates;
    the original note singles out 10RES_147065.txt).
    """
    suffix  =  'Residential Address - Street Suffix'
    unit    =  'Residential Address - Apartment Number'

    df.loc[df[suffix] == '2-Jan', suffix] = '1/2'

    # Order matters: 'Jan' is repaired before 'Feb', as in the original code.
    for month, prefix in ( ( 'Jan' , '1-' ) , ( 'Feb' , '2-' ) ) :
        # na=False: a missing apartment number must yield False, not NaN,
        # otherwise .loc rejects the mask.
        mask = df[unit].str.contains(month, na=False)
        df.loc[mask, unit] = prefix + df.loc[mask, unit].str.split('-').str[0]


def _stack_extracts ( extracts , people_type , id_column , asof_dates , address_cols , repair = None ) :
    """Tag every extract frame with type / id / precinct / date and stack them.

    The frames in `extracts` are modified in place (as the original code did)
    and concatenated once at the end. Returns an empty DataFrame when there
    are no extracts.
    """
    frames = []
    for key, df in extracts.items():
        if repair is not None:
            repair(df)
        df['people_type'] = people_type
        df['id'] = df[id_column]
        df['precinct'] = df['Precinct Number'].astype(int).astype(str)
        df['date'] = asof_dates[key]
        for col in address_cols:
            df[col] = df[col].astype(str)
        frames.append(df)

    return pd.concat(frames) if frames else pd.DataFrame()


def consolidate_people (  ) :
    """Load all extracts and combine them into one long `people` frame.

    Returns
    -------
    tuple
        (all_elections, all_residents, all_registered, xref, people) where the
        first three are the stacked, tagged extracts per type, xref is the
        file -> as-of-date cross reference from get_raw_data, and people holds
        one row per person per extract with the columns people_id,
        people_type, date, name, party, precinct, streetNum, streetSuffix,
        streetName and unit.
    """
    elections, residents, registered , xref = get_raw_data ( )

    address_cols = [
        'Residential Address - Street Number',
        'Residential Address - Street Suffix',
        'Residential Address - Street Name',
        'Residential Address - Apartment Number',
        'Residential Address - Zip Code'
    ]

    all_registered = _stack_extracts(
        registered, 'registered', 'Voter Id Number', xref['registered'], address_cols)

    all_residents = _stack_extracts(
        residents, 'residents', 'Resident Id Number', xref['residents'], address_cols,
        repair=_repair_date_mangled_addresses)

    all_elections = _stack_extracts(
        elections, 'elections', 'Voter Id Number', xref['elections'], address_cols)

    cols = [
        'id','people_type','date','name','party','precinct',
        'Residential Address - Street Number',
        'Residential Address - Street Suffix',
        'Residential Address - Street Name',
        'Residential Address - Apartment Number'
    ]

    people = pd.concat([all_elections[cols],all_residents[cols],all_registered[cols]])

    people = people.rename(columns = {
        'id'                                     : 'people_id',
        'Residential Address - Street Number'    : 'streetNum',
        'Residential Address - Street Suffix'    : 'streetSuffix',
        'Residential Address - Street Name'      : 'streetName',
        'Residential Address - Apartment Number' : 'unit',
    })
    # Missing values become empty strings so address parts compare equal on merge.
    people = people.replace({np.nan:''})

    # Street-name spellings in the extracts -> spellings used by the assessor data.
    # NOTE(review): applied as regexes to every column of `people`, and two
    # patterns are unanchored -- confirm that is intended.
    people_2_assessor = {'ANDREWS WAY':'ANDREWS WY',
     '^APACHE TRL$':'APACHE TR',
     '^FALMOUTH ROAD WEST$':'FALMOUTH RD W',
     "FARMER'S CIR":'FARMER`S CIR',
     '^GROVE STREET PL$':'GROVE ST PL',
     '^LAKEVIEW ST$':'LAKEVIEW',
     '^SKY LINE DR$':'SKYLINE DR',
     '^SPRING VALLEY ST$':'SPRING VALLEY',
     '^SUMMER STREET PL$':'SUMMER ST PL',
     '^TEEL STREET PL$':'TEEL ST PL',
     '^ARIZONA TER$':'ARIZONA TERR',
     '^NORTH UNION$':'NORTH UNION ST',
     '^UPLAND ROAD WEST$':'UPLAND RD WEST'}
    people=people.replace(people_2_assessor,regex=True)

    print('\nobservations of people',len(people))

    return all_elections, all_residents, all_registered , xref, people

def get_dob_sex ( all_residents, all_registered ) :
    """Return one date-of-birth / sex row per person.

    Residents and registered-voter rows are stacked (residents first) and,
    for each id, the last stacked row is kept. The result is sorted by
    people_id and has the columns people_id, date_dob, dob and sex.
    A missing sex becomes ''; a missing dob trips the assertion.
    """
    wanted   =  [ 'id' , 'date' , 'Date of Birth' , 'Gender' ]
    renamed  =  {
        'id'            : 'people_id' ,
        'date'          : 'date_dob'  ,
        'Gender'        : 'sex'       ,
        'Date of Birth' : 'dob'       ,
    }

    stacked  =  pd . concat ( [ all_residents [ wanted ] , all_registered [ wanted ] ] )
    stacked  =  stacked . rename ( columns = renamed )

    # Last occurrence wins, then order by person.
    latest   =  stacked . drop_duplicates ( 'people_id' , keep = 'last' )
    df       =  latest . sort_values ( [ 'people_id' ] ) . reset_index ( drop = True )

    assert not pd . isnull ( df . dob ) . any ( )

    sex_missing  =  pd . isnull ( df . sex )
    df . loc [ sex_missing , 'sex' ] = ''

    return df


def get_addresses():
    """Load the address list and attach one parcel id (pid) to each address.

    Reads people/all_addresses.tsv and people/pid_address_parts_xref.tsv as
    strings, drops addresses that repeat every column after the first, and
    joins the pid on streetName / streetNum / unit / streetSuffix. When an
    address matches several pids only the first match is kept.
    """
    # The pd.isnull key is a function object, so presumably it never matches a cell.
    blanks  =  {np.nan:'',pd.isnull:''}
    parts   =  ['streetName','streetNum','unit','streetSuffix']

    all_addresses = (
        pd.read_csv('people/all_addresses.tsv', sep='\t', dtype=str)
          .replace(blanks)
          .rename(columns={'id':'address_id'})
    )

    # Everything except the first column (the id) identifies an address.
    identifying = list(all_addresses.columns[1:])
    all_addresses = all_addresses.drop_duplicates(subset=identifying, keep='first')
    assert not all_addresses.duplicated(identifying).any()

    pids = pd.read_csv('people/pid_address_parts_xref.tsv', sep='\t', dtype=str)
    pids = pids.replace(blanks)

    address_id_2_pid = all_addresses.merge(pids, how='outer', on=parts)
    address_id_2_pid = address_id_2_pid[['address_id','pid']]

    ##badness!!!! -- an address can match several pids; only the first survives
    address_id_2_pid = address_id_2_pid.drop_duplicates(subset='address_id', keep='first')

    return all_addresses.merge(address_id_2_pid, on='address_id', how='left')

def aggregate ( combo , column ) :
    """Collapse each person's observations of `column` into its change points.

    Parameters
    ----------
    combo : DataFrame
        Must contain 'people_id', 'date' and `column`.
    column : str
        Attribute to track (e.g. 'name', 'address_id', 'party', 'precinct').

    Returns
    -------
    DataFrame
        One row per people_id with two list-valued columns: `column` (the
        successive distinct values) and 'date_<column>' (the date each value
        was first observed in its run). A person's first observation is
        always kept.
    """
    # Sort here rather than trusting the caller: change detection compares
    # each row with the one before it.
    df = combo [ [ 'people_id' , column , 'date' ] ] \
            . sort_values ( [ 'people_id' , 'date' ] ) \
            . reset_index ( drop = True )

    current    =  df [ column ]
    previous   =  current . shift ( )

    # NaN != NaN, so a run of missing values must be treated as unchanged
    # explicitly; otherwise every missing observation looks like a change.
    unchanged  =  ( previous == current ) | ( previous . isna ( ) & current . isna ( ) )
    first_obs  =  ~ df . duplicated ( 'people_id' )

    df  =  df [ first_obs | ~ unchanged ]

    df  =  df . groupby ( 'people_id' ) . agg ( {
        column : list ,
        'date' : list
    } ) . reset_index ( )

    return df . rename ( columns = { 'date' : 'date_' + column } )


In [None]:
# Read every extract and build the stacked per-type frames, the
# file -> as-of-date cross reference and the combined `people` frame.
all_elections, all_residents, all_registered , dates_xref, people = consolidate_people (  )

# Address lookup table (address_id, address parts, pid) used to key people to addresses.
all_addresses = get_addresses()


## Transform

In [None]:

def _encode_as_int ( values , item ) :
    """Give each distinct value an integer code, in order of first appearance.

    Returns (distinct values, {value: code}, lookup frame with the columns
    key / value / item).
    """
    distinct  =  list ( values . unique ( ) )
    codes     =  dict ( zip ( distinct , range ( len ( distinct ) ) ) )
    pairs     =  pd . DataFrame ( { 'key' : list ( range ( len ( distinct ) ) ) , 'value' : distinct } )
    pairs [ 'item' ]  =  item
    return distinct , codes , pairs


# Attach address_id / pid to every observation; `matched` reports how many
# observations found their address in the lookup table.
combo = people.merge(all_addresses,on=['streetName','streetNum','unit','streetSuffix'],how='left',indicator='matched').sort_values(['date'])
print(len(combo))
print(combo[['people_id','matched']].groupby('matched').count().to_markdown())
combo = combo . sort_values ( [ 'people_id' , 'date' ] ).drop('matched',axis=1).reset_index(drop=True)

# Party names -> small integers (the attributes table stores SMALLINT arrays).
##s/b sorted by value before key assignment
party , party_codes , int_value_pairs  =  _encode_as_int ( combo [ 'party' ] , 'party' )
combo [ 'party' ]  =  combo [ 'party' ] . map ( party_codes )

# Per-person change histories and the one-row-per-person dob / sex frame.
dob_sex     =  get_dob_sex ( all_residents, all_registered )
names       =  aggregate   ( combo , 'name' )
addresses   =  aggregate   ( combo , 'address_id' )
parties     =  aggregate   ( combo , 'party' )
precincts   =  aggregate   ( combo , 'precinct' )

attributes = names \
    . merge ( addresses , on = 'people_id' , how = 'outer' ) \
    . merge ( parties   , on = 'people_id' , how = 'outer' ) \
    . merge ( precincts , on = 'people_id' , how = 'outer' ) \
    . merge ( dob_sex   , on = 'people_id' , how = 'outer' ) \
    . replace ( { np . nan : '' } )

# Sex values -> small integers, appended to the same lookup frame.
sex , sex_codes , sex_pairs  =  _encode_as_int ( attributes [ 'sex' ] , 'sex' )
attributes [ 'sex' ]  =  attributes [ 'sex' ] . map ( sex_codes )

int_value_pairs = pd.concat([int_value_pairs,sex_pairs])

# For each extract type and person: how many extracts they appear in and on which dates.
peeps = combo[['people_type','people_id','date']] \
    .sort_values(['people_type','date']) \
    .groupby(['people_type','people_id']) \
    .agg(obs=('date', len), date=('date', list)) \
    .reset_index()

## Load

In [None]:
# DDL that drops and recreates the five tables loaded by this notebook:
# people.elections / residents / registered (people_id + DATE array),
# people.addresses and people.attributes.
# The statements are schema-qualified, so they assume the "people" schema
# already exists.
# NOTE(review): each CREATE INDEX covers the same column as the table's
# PRIMARY KEY, which PostgreSQL already indexes -- they look redundant; confirm.
table_create_query = \
    """
        DROP TABLE IF EXISTS people.elections;
        CREATE TABLE people.elections (
            "people_id"    CHAR(12),
            "date"         DATE ARRAY,
            PRIMARY KEY ("people_id")
        );
        CREATE INDEX elections_idx 
            ON people.elections("people_id");

        DROP TABLE IF EXISTS people.residents;
        CREATE TABLE people.residents (
            "people_id"    CHAR(12),
            "date"         DATE ARRAY,
            PRIMARY KEY ("people_id")
        );
        CREATE INDEX residents_idx 
            ON people.residents("people_id");

        DROP TABLE IF EXISTS people.registered;
        CREATE TABLE people.registered (
            "people_id"    CHAR(12),
            "date"         DATE ARRAY,
            PRIMARY KEY ("people_id")
        );
        CREATE INDEX registered_idx 
            ON people.registered("people_id");


        DROP TABLE IF EXISTS people.addresses;
        CREATE TABLE people.addresses (
            "address_id" SMALLINT,
            "streetName" VARCHAR(20),
            "streetNum" VARCHAR(5),
            "unit" VARCHAR(10),
            "streetSuffix" VARCHAR(5),
            "pid" VARCHAR(17),
            PRIMARY KEY ("address_id")
        );
        CREATE INDEX addresses_idx 
            ON people.addresses(address_id);

        DROP TABLE IF EXISTS people.attributes;
        CREATE TABLE people.attributes (
            "people_id"    CHAR(12),
            "name" VARCHAR(50) ARRAY,
            "date_name" DATE ARRAY,
            "address_id" SMALLINT ARRAY,
            "date_address_id" DATE ARRAY,
            "party" SMALLINT ARRAY,
            "date_party" DATE ARRAY,
            "precinct" SMALLINT ARRAY,
            "date_precinct" DATE ARRAY,
            "date_dob" DATE,
            "dob" DATE,
            "sex" SMALLINT,
            PRIMARY KEY ("people_id")
        );
        CREATE INDEX attributes_idx 
            ON people.attributes(people_id);

    """

In [None]:
from sqlalchemy.dialects.postgresql import ARRAY, DATE
import sqlalchemy


def date_str (date):
    """Return the dates as a list, with '' entries replaced by 1900-01-01.

    An empty string passed as `date` (a person with no history for the
    attribute) yields an empty list.
    """
    return [x if x!='' else pd.to_datetime('1900-01-01') for x in date ]

def int_str (strings):
    """Convert values such as '12', 12.0 or '12.0' to ints, skipping '' and NaN.

    NOTE(review): not called in this cell; kept in case another cell uses it.
    """
    return [int(float(x)) for x in strings if x !='' and x == x]


# (Re)create the target tables. Engine.execute() is not available in
# SQLAlchemy 2.0; a transaction from engine.begin() plus text() also works
# on the 1.x series.
with cnx.begin() as connection:
    connection.execute(sqlalchemy.text(table_create_query))


# int_value_pairs.to_sql('int_value_pairs',schema='common',
#                     con=cnx,if_exists='append',index=False)


all_addresses.to_sql('addresses',schema='people',con=cnx,if_exists='append',index=False)


# One table per extract type: people_id plus the list of dates the person appears on.
for ptype in ['elections','residents','registered']:
    peeps[peeps.people_type==ptype] \
        .drop(['people_type','obs'],axis=1) \
        .to_sql(ptype,schema='people',con=cnx,if_exists='append',index=False,
                dtype={'date': ARRAY(sqlalchemy.types.DATE())})


# Column types for the list-valued columns of people.attributes.
dtypesdict = {
    'name'            : ARRAY(sqlalchemy.types.VARCHAR(50)),
    'date_name'       : ARRAY(sqlalchemy.types.DATE()),
    'address_id'      : ARRAY(sqlalchemy.types.SMALLINT()),
    'date_address_id' : ARRAY(sqlalchemy.types.DATE()),
    'party'           : ARRAY(sqlalchemy.types.SMALLINT()),
    'date_party'      : ARRAY(sqlalchemy.types.DATE()),
    'precinct'        : ARRAY(sqlalchemy.types.SMALLINT()),
    'date_precinct'   : ARRAY(sqlalchemy.types.DATE()),
}

for col in ['date_name','date_address_id','date_party','date_precinct']:
    attributes[col] = attributes[col].apply(date_str)

# People without a known date of birth get the 1900-01-01 placeholder.
for col in ['dob','date_dob']:
    attributes.loc[attributes[col]=='', col] = '1900-01-01'

attributes.to_sql('attributes',schema='people',con=cnx,if_exists='append',index=False,dtype=dtypesdict)
