## Set-Up

In [None]:
from dotenv import load_dotenv, find_dotenv
from os import environ
from urllib.parse import quote

from sqlalchemy import create_engine

# Locate a .env file (search starts from the current working directory) and
# load it, letting its values override variables already set in the environment.
_ = load_dotenv(find_dotenv(usecwd=True), override=True)

# Connection settings; each one falls back to a local default when unset.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "MA")

# Establish the database connection used by the Transform queries and the Load.
# The credentials are percent-encoded so that characters such as '@', ':', '/'
# or '#' in a username/password are not mistaken for URL delimiters.
# NOTE(review): if a password is stored already percent-encoded in .env,
# store it in plain form instead -- it would otherwise be encoded twice.
cnx = create_engine(
    f'postgresql://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{ipaddress}:{port}/{dbname}'
)

In [None]:
import pandas as pd
import tabula
from numpy import nan

## Extract

In [None]:
# Source PDF; the `year` value assigned below labels these rows as 2023.
url = 'https://www.arlingtonma.gov/home/showpublisheddocument/64717/638163762463230000'

# With pages='all' the result is iterated below as a sequence of DataFrames.
# header=None keeps every row as data, including the header row on page one.
df = tabula.io.read_pdf(url, pages='all',pandas_options={'header': None})

# Stack all tables with a single concat call. Concatenating inside a loop
# re-copies the accumulated frame on every pass, and seeding it with an empty
# DataFrame is unnecessary. ignore_index gives the stacked rows unique labels.
tmm = pd.concat(df, ignore_index=True)

## check box in column 3; only first page has user supplied headers;
## set NaN to blank; tabula ignores dtype=str option
checkbox_column = tmm.columns[3]
tmm = (
    tmm.iloc[1:, :]                      # drop the header row from page one
       .drop(columns=[checkbox_column])  # drop the check-box column
       .replace({nan: ''})
)
tmm.columns = ['precinct','name','address','term','phone','email']

tmm['precinct'] = tmm['precinct'].astype(int)
tmm['year'] = 2023


In [None]:
data_dir = 'town_meeting/'


def _read_members(filename, sheet, year):
    """Load one roster workbook sheet and tag it with its year.

    Reads `sheet` from `data_dir + filename`, replaces NaN with empty
    strings, casts `precinct` to int and adds a constant `year` column.
    """
    members = pd.read_excel(data_dir + filename, sheet_name=sheet)
    members = members.replace({nan: ''})
    members.precinct = members.precinct.astype(int)
    members['year'] = year
    return members


atm2022 = _read_members('members_20220401.xlsx', 'Sheet3', 2022)
atm2019 = _read_members('members_20190315_normed.xlsx', 'Sheet1', 2019)

# Outer join of the 2023 roster against the 2022 one; the `matched` column
# records whether each row came from the left frame, the right frame, or both.
combo = pd.merge(
    tmm,
    atm2022,
    how='outer',
    on=['precinct', 'name', 'address'],
    indicator='matched',
)

## Transform

In [None]:
# Street name, street number, unit and parcel id for the 2023 assessment rows.
query = """
    select "streetname", "streetnum",unit,pid
    from property.assessments 
    where year=2023
    ;
"""

parcels = pd.read_sql_query(query, cnx)
parcels = parcels.replace({nan: ''})

# Join key in "<number> <street> <unit>" form; strip() removes the trailing
# blank left behind when the unit is empty.
address_parts = parcels['streetnum'] + ' ' + parcels['streetname'] + ' ' + parcels['unit']
parcels['match_address'] = address_parts.str.strip()


In [None]:
# Normalise roster addresses into the join key used against the parcel data:
# abbreviate street types, strip punctuation and the "Apt" marker, upper-case.
# The last pattern is a raw string so the backslashes reach the regex engine
# untouched. The period after "Apt" is escaped and optional: an unescaped '.'
# matches any character, so "Apt 5" used to lose its separating space
# ("... ST5") instead of keeping it ("... ST 5").
address_rewrites = {
    'Avenue': 'Ave',
    'Street': 'St',
    'Road': 'Rd',
    r',|\.|#| Apt\b\.?': '',
}
tmm['match_address'] = tmm['address'].replace(address_rewrites, regex=True).str.upper()

In [None]:
# Attach the parcel id to each member via the normalised address key.
# A left join keeps members whose address has no parcel match.
parcel_lookup = parcels[['match_address', 'pid']]
tmm = pd.merge(tmm, parcel_lookup, on='match_address', how='left')

In [None]:
# Rebind `tmm` to the contents of the master-list TSV: NaN becomes None,
# rows are ordered by year / precinct / term / name, and two columns are
# renamed (gender -> sex, resident_id -> people_id).
tmm = (
    pd.read_csv(data_dir + 'master_list.tsv', sep='\t')
    .replace({nan: None})
    .sort_values(['year', 'precinct', 'term', 'name'])
    .rename(columns={'gender': 'sex', 'resident_id': 'people_id'})
)
tmm

## Load

In [None]:
from sqlalchemy import text

# DDL that rebuilds governance.tmm from scratch, plus its two lookup indexes.
table_create_query = \
    """
        DROP TABLE IF EXISTS governance.tmm;
        CREATE TABLE governance.tmm (
            "precinct" SMALLINT,
            "name" VARCHAR(255),
            "address" VARCHAR(255),
            "term" SMALLINT,
            "phone" VARCHAR(20),
            "email" VARCHAR(255),
            "year" SMALLINT,
            "people_id" VARCHAR(12),
            sex VARCHAR(2),
            pid VARCHAR(17)
        );
        CREATE INDEX tmm_year_idx 
            ON governance.tmm("year");
        CREATE INDEX tmm_precinct_idx 
            ON governance.tmm("precinct");
    """

# Engine.execute() with a raw string is deprecated in SQLAlchemy 1.4 and
# removed in 2.0. Run the DDL as a text() construct on a connection instead.
# cnx.begin() commits when the block exits normally and rolls back on error,
# so the table is never left dropped without being recreated.
with cnx.begin() as connection:
    connection.execute(text(table_create_query))


In [None]:
# Append the roster rows to governance.tmm; the DataFrame index is not
# written as a column.
tmm.to_sql(
    name='tmm',
    con=cnx,
    schema='governance',
    if_exists='append',
    index=False,
)