In [1]:
from IPython.display import Markdown, display

# Render the permits overview text at the top of the notebook.
# NOTE(review): "permits.md" is passed as a plain string; the rendered output
# below suggests it is resolved as a file in the working directory rather than
# displayed literally — confirm the file ships alongside this notebook.
display(Markdown("permits.md"))

#### [ArlingtonMA Permits](https://www.arlingtonma.gov/departments/inspectional-services/view-building-permits)

Extracted from an in-house database through a Perl UI.  Not applicable to other municipalities.

#### Issues

* About 30% of permit addresses do not match assessor records.



## Issues

1. Replace selenium chrome driver with firefox driver

## Set-up

In [None]:
from dotenv import load_dotenv, find_dotenv
from urllib.parse import quote

# Read the local .env file and override any existing environment variables.
_ = load_dotenv(
    find_dotenv(usecwd=True),
    override=True,
)

from sqlalchemy import create_engine
from os import environ

# Connection settings come from the environment; each one falls back to a
# local-development default when the variable is not set.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "ArlingtonMA")

# Establish the database connection used by the Transform queries and the Load.
# The credentials are percent-encoded before being placed in the URL: a raw
# '@', ':' or '/' inside the username or password would otherwise be read as a
# URL delimiter and the connection string would be parsed incorrectly.
cnx = create_engine(
    f'postgresql://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{ipaddress}:{port}/{dbname}'
)


In [None]:
import pandas as pd

## Extract

In [None]:
def start_up(url, headless=False, browser="chrome"):
    """Start a SeleniumBase browser session and load ``url``.

    Parameters
    ----------
    url : str
        Page to open once the browser has started.
    headless : bool, default False
        Forwarded to the driver; ``True`` runs the browser without a window.
    browser : str, default "chrome"
        Browser name forwarded to ``seleniumbase.Driver``.

    Returns
    -------
    The driver object, already navigated to ``url``.  The caller owns it and
    is responsible for shutting it down.
    """
    # Imported here so the notebook's other cells can run without
    # seleniumbase being importable.
    from seleniumbase import Driver

    # Honour the caller's choice instead of always forcing a visible window.
    driver = Driver(browser=browser, headless=headless)

    try:
        driver.get(url)
    except Exception:
        # Do not leave an orphaned browser behind when the first load fails.
        driver.quit()
        raise

    return driver

def extract_permits(YEAR = 2023):
    """Scrape one calendar year of building permits into a DataFrame.

    Opens the town's permit search page with the issue-date range set to
    1 January through 31 December of ``YEAR``, clicks "Search", follows the
    "Show all" link and parses the first HTML table of the resulting page.

    Relies on the notebook-level ``start_up`` helper and on ``pd`` (pandas)
    being defined in earlier cells.

    Parameters
    ----------
    YEAR : int, default 2023
        Calendar year whose permits are requested.

    Returns
    -------
    pandas.DataFrame
        The first table on the page with its first three and last three rows
        removed (presumably page header/footer rows — verify if the page
        layout changes).
    """
    from io import StringIO
    import time

    from selenium.webdriver.common.by import By

    STARTDATE = '1/1/' + str(YEAR)
    ENDDATE = '12/31/' + str(YEAR)

    # Search parameters are passed in the URL; an empty permittype defaults
    # to 'Any permit'.
    url = ('https://secure.town.arlington.ma.us/BuildingPermits/'
           'Select.pl?permittype=&issue='
           + STARTDATE + '&issue_thru=' + ENDDATE)

    browser = start_up(url)
    try:
        browser.find_element(By.XPATH,'//input[@value = "Search"]').click()
        time.sleep(5)   # fixed pause to let the result page load

        browser.find_element(By.PARTIAL_LINK_TEXT,'Show all').click()
        time.sleep(5)   # fixed pause to let the full listing load

        html = browser.page_source
    finally:
        # Always end the whole driver session, even when a step above fails,
        # so no browser or driver process is left running.
        browser.quit()

    # Wrap the markup in StringIO: newer pandas deprecates passing literal
    # HTML strings to read_html.
    return pd.read_html(StringIO(html))[0].iloc[3:-3]


In [None]:
# Scrape the raw permit table for calendar year 2023.
permits_raw = extract_permits(YEAR=2023)

## Transform

In [None]:
def update_property_permits ( df ) :
    """Normalise a raw scraped permit table into the load-ready layout.

    The input is expected to have exactly eleven columns, in the order listed
    in the column assignment below; they are renamed positionally.  The
    function works on a copy, so the caller's frame is left untouched.

    Steps:
      * forward-fill ``streetNum`` / ``streetName`` down the rows;
      * parse ``date`` once, store it as a ``YYYY-MM-DD`` string and derive
        the integer ``year`` from the same parse;
      * upper-case ``streetName``;
      * convert ``permit``, ``permit_value`` and ``permit_fee`` to integers,
        treating empty strings and missing values as 0;
      * build ``address`` as "<streetNum> <streetName>".

    Parameters
    ----------
    df : pandas.DataFrame
        Raw permit rows with eleven columns.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by year, date and permit with a fresh 0..n-1 index.
    """
    # Work on a private copy: renaming and adding columns on the caller's
    # frame would silently change it.
    df = df.copy()

    df.columns = [
        'streetNum','streetName','date',
        'permit_type','permit_class','permit',
        'description','owner','contractor',
        'permit_value','permit_fee'
    ]
    df[['streetNum','streetName']] = df[['streetNum','streetName']].ffill()

    # Parse once and reuse the result for both the text date and the year.
    issued = pd.to_datetime(df['date'])
    df['date'] = issued.dt.strftime('%Y-%m-%d').astype(str)
    df['year'] = issued.dt.year.astype(int)
    df['streetName'] = df['streetName'].str.upper()

    for col in ['permit','permit_value','permit_fee']:
        # Empty strings and missing values both count as zero; going through
        # float first lets values such as '12.0' convert cleanly.
        df[col] = (
            df[col]
            .replace('', '0')
            .fillna('0')
            .astype(float)
            .astype(int)
        )

    df['address'] = (
        df['streetNum'].map(str, na_action='ignore')
        + ' '
        + df['streetName'].map(str, na_action='ignore')
    )

    cols = ['year','date','permit','permit_type','permit_class',
            'description','owner','contractor','permit_value','permit_fee',
            'streetName','streetNum','address']

    return df[cols].sort_values(['year','date','permit']).reset_index(drop=True)

def match_streetNameNum_2_pid(df, cnx, year=2023):
    """Attach a ``pid`` to every permit row by matching on its address.

    Each distinct (``streetName``, ``streetNum``) pair in ``df`` is split into
    a street number and an optional unit (the text after "Unit" or "#"), then
    looked up in two passes:

      1. ``property.assessments`` rows for ``year``;
      2. for addresses still unmatched, ``people.addresses`` rows with a
         non-empty ``pid``.

    Both lookups join on street name, street number and unit.  A one-line
    summary of the match counts is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Permit rows containing string columns ``streetName`` and ``streetNum``.
    cnx :
        Database connection accepted by ``pandas.read_sql_query``.
    year : int, default 2023
        Assessment year used for the first-pass lookup.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with the matched ``pid`` column; permits whose
        address matched in neither pass have a missing ``pid``.
    """
    permit_key = ['streetName', 'streetNum']
    join_key = ['streetName', 'streetnum', 'unit']

    # One row per distinct permit address.
    addresses = (
        df.drop_duplicates(permit_key)[permit_key]
          .sort_values(permit_key)
          .reset_index(drop=True)
    )

    # Split "5 Unit 2" / "5 #2" into number and unit.  The separator is an
    # alternation of whole tokens: a character class such as [Unit|#|unit]
    # would split on any single one of those letters and leave whitespace
    # attached to the number, which then never equals the stored value.
    parts = addresses['streetNum'].str.split(
        r'(?i)\s*(?:unit|#)\s*', n=1, regex=True
    )
    addresses['unit'] = parts.str[1].astype(object).fillna('').str.strip()
    addresses['streetnum'] = parts.str[0].str.strip()

    # ---- pass 1: assessor records for the requested year ----
    # int() guarantees only a number is ever interpolated into the SQL text.
    query = f"""
            select "streetname","streetnum","unit","pid" 
            from property.assessments 
            where year={int(year)};
        """
    assessor = pd.read_sql_query(query, cnx)
    assessor.columns = ['streetName', 'streetnum', 'unit', 'pid']
    assessor['unit'] = assessor['unit'].astype(object).fillna('').astype(str)
    assessor = assessor.drop_duplicates(join_key)

    combo = addresses.merge(assessor, how='left', on=join_key, indicator='matched')
    matched = combo[combo.matched == 'both'].drop('matched', axis=1)
    # Keep the original streetNum so that a later match can still be joined
    # back onto the permits, which are keyed by streetNum.
    unmatched = combo.loc[combo.matched == 'left_only', ['streetNum'] + join_key]

    # Every address is either matched or unmatched, never both or neither.
    assert len(matched) + len(unmatched) == len(addresses)

    # ---- pass 2: fall back to the people.addresses table ----
    query = "select * from people.addresses where pid!='';"
    people = pd.read_sql_query(query, cnx)
    people = people.rename(columns={'streetNum': 'streetnum'})
    people['unit'] = people['unit'].astype(object).fillna('').astype(str)
    people = people.drop_duplicates(join_key)[join_key + ['pid']]

    combo = unmatched.merge(people, how='left', on=join_key, indicator='matched')
    second_pass = combo.loc[
        combo.matched == 'both', ['streetNum'] + join_key + ['pid']
    ]
    matched = pd.concat([matched, second_pass])

    unmatched = combo.loc[combo.matched != 'both', join_key]
    print(
        f' Matching and joining '
        f'  permits={len(df)} '
        f'  addresses={len(addresses)} '
        f'    matched={len(matched)} '
        f'not matched={len(unmatched)}'
         )

    return (
        df.merge(matched, how='left', on=permit_key)
          .drop(['streetnum', 'unit'], axis=1)
    )
    


In [None]:
# Normalise a copy of the raw scrape, then attach parcel ids by address.
permits = match_streetNameNum_2_pid(
    update_property_permits(permits_raw.copy()),
    cnx,
)

# Exception: blank out cells that hold only this Perl warning text.
permits = permits.replace(
    {'Wide character in print at D:\\Web\\Permits\\Web\\List.pl line 710. 163': ''}
)

# Encode the single-letter permit class and permit type codes as integers.
permit_classes = {'C': 0, 'I': 1, 'R': 2}
permit_types = {'B': 0, 'C': 1, 'E': 2, 'G': 3, 'P': 4}
permits['permit_class'] = permits['permit_class'].replace(permit_classes)
permits['permit_type'] = permits['permit_type'].replace(permit_types)

## Load

In [None]:
# DDL for the permits table in the "property" schema, plus indexes on pid and
# year.  The statement begins with DROP TABLE IF EXISTS, so executing it
# discards any rows already loaded.
table_create_query = \
    """
        DROP TABLE IF EXISTS property.permits;
        CREATE TABLE property.permits (
            "year" SMALLINT NOT NULL ,
            "date" DATE ,
            "permit"  INTEGER NOT NULL,
            "permit_type"  SMALLINT,
            "permit_class"  SMALLINT,
            "description"  text,
            "owner"  varchar(255),
            "contractor"  varchar(255),
            "permit_value"  INTEGER,
            "permit_fee"  INTEGER,
            "streetName"  varchar(50),
            "streetNum"  varchar(20),
            "address"  varchar(255),
            "pid"  varchar(17),
            PRIMARY KEY ("date","permit_type","permit")
        );
        CREATE INDEX permits_idx 
            ON property.permits(pid);
        CREATE INDEX permits_year_idx 
            ON property.permits(year);
    """

# Left disabled on purpose: uncomment only to (re)create the table from
# scratch, since the DDL above drops the existing table first.
#cnx.execute(table_create_query)


In [None]:
from sqlalchemy import text

# Year being (re)loaded; rows already stored for it are replaced.
load_year = 2023

# The year is sent as a bound parameter rather than formatted into the SQL.
query = text("delete from property.permits where year = :year")

# Run the delete and the append in ONE transaction: if the append fails the
# delete is rolled back, so the table never loses a year's rows without
# getting the replacements.  Executing through a connection (instead of
# calling execute on the engine itself) also works on SQLAlchemy 2.x, where
# Engine.execute no longer exists.
with cnx.begin() as connection:
    connection.execute(query, {"year": load_year})

    permits.to_sql(
        'permits', schema='property', con=connection,
        if_exists='append', index=False)
