In [1]:
from IPython.display import Markdown, display

# Show the sales notes as the notebook's introduction.
# NOTE(review): the argument is the bare string "sales.md"; presumably it is
# meant to be read as a file next to the notebook — confirm the file is present
# in the working directory when this cell runs.
display(Markdown("sales.md"))

### [LA3 Sales](https://dlsgateway.dor.state.ma.us/gateway/DLSPublic/ParcelSearch)

* [DOR Land Use codes and Arms Length sales (NAL) codes](https://www.mass.gov/doc/property-type-classification-codes-non-arms-length-codes-and-sales-report-spreadsheet/download)


MA Department of Local Services (DLS) Gateway offers local officials
an immediate way to enter data and verify submission status across all
the regulatory review programs administered by Division of Local
Services. The LA3 parcel search details all real estate property sales
used in assessment valuations.

The data is for every city and town in Massachusetts from about 2001
through the current period and is generally updated each year in the
fall.  Fiscal year generally ends on June 30.  The same real estate
sales are sometimes used in different fiscal years; duplicates are
removed. Recreating assessment valuations using the LA3 Sales process
would require the possible inclusion of duplicate sales in different
fiscal years.

The data is incomplete for calendar years 2001 and 2007.  The
*Process*(I, C), *Prior Assessed Value*, *Current Assessed Value* and
*A/S Ratio* (Assessed to Sales) columns are dropped.  Columns *St
Name*, *St Alpha* and *St Num* are combined into the address column.

The PID (assessor's property identifier) changes to the standard full
format in 2005. The function *fix_pid_property_sales* attempts to
normalize all PIDs to the current long-form standard; about 700 of
13,000 fail to match assessor records, mostly in 2002 and 2004.

The LA3 sales report is downloaded, transformed and loaded to postgres
using the [sales ETL notebook]().

#### Output

*  address TEXT
*  land_use INTEGER   [cross reference](https://www.mass.gov/doc/property-type-classification-codes-non-arms-length-codes-and-sales-report-spreadsheet/download) with description in common.int_value_pairs sql table
*  date TEXT
*  price INTEGER
*  buyer TEXT
*  seller TEXT
*  sale_type INTEGER [cross reference](https://www.mass.gov/doc/property-type-classification-codes-non-arms-length-codes-and-sales-report-spreadsheet/download) with description in common.int_value_pairs sql table
*  year INTEGER
*  loc_id TEXT
*  pid TEXT



## Issues

1. None

## Set-up

In [None]:
from os import environ
from urllib.parse import quote_plus

from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine

# Read the local .env file (located starting from the current working
# directory) and let its values override any variables that already exist.
_ = load_dotenv(find_dotenv(usecwd=True), override=True)

# Connection settings come from the environment; the second argument of each
# lookup is the fallback used when the variable is not set.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "ArlingtonMA")

# Establish the database connection used by the Transform queries and the Load.
# The credentials are percent-encoded so that characters such as '@', ':' or
# '/' inside a username/password cannot be mistaken for URL delimiters.
cnx = create_engine(
    f'postgresql://{quote_plus(username)}:{quote_plus(password)}'
    f'@{ipaddress}:{port}/{dbname}'
)


In [None]:
import pandas as pd

import time
import shutil

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Extract

In [None]:
def start_up(data_dir = 'sales', headless=True):
    """Start a Firefox WebDriver session and open the DLS parcel-search page.

    Parameters
    ----------
    data_dir : str
        Directory that Firefox should save downloads into. It is resolved to
        an absolute path and created if it does not exist.
    headless : bool
        When truthy, Firefox is started with ``--headless``.

    Returns
    -------
    selenium.webdriver.Firefox
        The driver, already navigated to the parcel-search URL. The caller is
        responsible for shutting it down.
    """
    from os import makedirs
    from os.path import abspath
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    url = 'https://dlsgateway.dor.state.ma.us/gateway/DLSPublic/ParcelSearch/'

    # Firefox expects an absolute directory for its download preference.
    download_dir = abspath(data_dir)
    makedirs(download_dir, exist_ok=True)

    options = Options()
    # NOTE(review): "start-maximized" looks like a Chrome-style switch; confirm
    # it has the intended effect with Firefox.
    options.add_argument("start-maximized")

    if headless:
        options.add_argument("--headless")

    # Firefox is configured through set_preference(); add_experimental_option()
    # and the "download.*" keys are Chromium-specific and are not available on
    # the Firefox Options class.
    firefox_prefs = {
        "browser.download.folderList"      :  2,             # 2 = use browser.download.dir
        "browser.download.dir"             :  download_dir,
        "browser.download.useDownloadDir"  :  True,          # do not prompt for a location
        "browser.helperApps.neverAsk.saveToDisk" : (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
            "application/vnd.ms-excel,"
            "application/octet-stream"
        ),
    }
    for pref_name, pref_value in firefox_prefs.items():
        options.set_preference(pref_name, pref_value)

    driver = webdriver.Firefox(
        options=options
    )

    driver.get(url)

    return driver

def select_town(driver,
                town='ARLINGTON',
                id='SearchFormEx1_ACSDropDownList_Towns'):
    """Choose ``town`` in the dropdown whose element id is ``id``.

    Waits up to 10 seconds for the element to be present, then selects the
    option whose visible text equals ``town``. This is best-effort: a failure
    is reported with ``print`` (including the underlying error) and the
    function returns ``None`` either way.
    """
    from selenium.webdriver.support.ui import Select
    try:
        # until() hands back the located element, so no second lookup is needed.
        dropdown = WebDriverWait(driver,10)\
                    .until(EC.presence_of_element_located((By.ID, id)))
        Select(dropdown).select_by_visible_text(town)

    # Exception (not a bare except) so Ctrl-C / kernel interrupt still works.
    except Exception as err:
        print (f"Town dropdown select failed. ({err!r})")

def select_all(driver,id = "SelectedPropertyTypes"):
    """Click every option of the select element whose id is ``id``.

    Waits up to 10 seconds for the element to be present, then clicks each of
    its options in order. This is best-effort: a failure is reported with
    ``print`` (including the underlying error) and the function returns
    ``None`` either way.
    """
    from selenium.webdriver.support.ui import Select

    try:
        # until() hands back the located element, so no second lookup is needed.
        listbox = WebDriverWait(driver,10)\
                    .until(EC.presence_of_element_located((By.ID, id)))
        for option in Select(listbox).options:
            option.click()

    # Exception (not a bare except) so Ctrl-C / kernel interrupt still works.
    except Exception as err:
        print (f"Select all failed. check is_multiple ({err!r})")

        
def update_input_range(driver,
                       input_ids, 
                       input_range):
    """Fill a series of input fields with the matching values.

    ``input_ids`` are element ids; ``input_range`` is indexed in the same
    order. For each id the function waits up to 10 seconds for the element,
    clears it and types ``str(value)``. Always returns ``True``; any selenium
    or indexing error propagates to the caller.
    """
    for position, field_id in enumerate(input_ids):
        # Block until the field exists in the DOM before touching it.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, field_id))
        )
        field = driver.find_element(By.ID, field_id)
        field.clear()
        field.send_keys(str(input_range[position]))

    return True


def Btn ( driver, button ) :
    """Click the element whose id is ``button``.

    Waits up to 10 seconds for the element to be present and clicks it.

    Returns
    -------
    bool
        ``True`` when the click was issued, ``False`` when waiting for or
        clicking the element failed (the error is printed).
    """
    try:
        # until() hands back the located element, so no second lookup is needed.
        target = WebDriverWait(driver,10)\
                    .until(EC.presence_of_element_located((By.ID, button)))
        target.click()

    # Exception (not a bare except) so Ctrl-C / kernel interrupt still works.
    except Exception as err:
        print (f"Waited 10s for button. ({err!r})")
        return False

    return True


* Use selenium to:

* Open [webpage](https://dlsgateway.dor.state.ma.us/gateway/DLSPublic/ParcelSearch/)
* Input search criteria
    * data_dir - file system location for downloaded file

    * TOWN - must be proper name (default Arlington) 
    * DATEFROM - mm/dd/yyyy format
    * DATETO   - mm/dd/yyyy format
* Export to excel to data_dir
* rename excel file to avoid overwriting

In [None]:
from pathlib import Path

data_dir = "sales"

TOWN = 'Boston'
DATEFROM = '1/1/2000'
DATETO = '12/31/2023'

browser = start_up(data_dir, headless=False)
try:
    # Fixed pause; presumably gives the search page time to finish loading.
    time.sleep(5)

    select_town ( browser , town=TOWN, id='SelectedJurisdictions' )

    select_all ( browser , id = "SelectedPropertyTypes" )

    update_input_range(
        browser,
        ["SaleDateFrom","SaleDateTo"], 
        (DATEFROM,DATETO)
    )

    Btn (browser, 'btnExport')

    # Fixed pause; presumably gives the export time to finish downloading.
    time.sleep(5)
finally:
    # Always end the WebDriver session, even when one of the steps above raised.
    browser.quit()

# Rename the export so a later download does not overwrite it. The paths are
# joined with a separator: plain string concatenation would look for
# "salesLA3ParcelSearch.xlsx" instead of a file inside data_dir.
shutil.move(
    str(Path(data_dir) / 'LA3ParcelSearch.xlsx'),
    str(Path(data_dir) / (TOWN + 'MA.xlsx')),
)

## Transform

In [None]:
def fix_pid_property_sales ( df ) :
    """Rewrite the ``pid`` column toward one dash-separated, zero-padded layout.

    The steps below run in a fixed order and each one depends on the text
    produced by the previous one:

    1. spaces (and "dot + space") become '-';
    2. pids written with '_' separators are split into three parts, padded and
       re-joined with '-' separators;
    3. letter suffixes are moved behind a '.';
    4. pids selected by the trailing "letter + any char + 0" pattern are
       re-padded;
    5. a table of known exceptions is applied.

    ``df['pid']`` is modified in place through ``.loc`` assignments. The
    return value is a new frame sorted by year, date and pid with a fresh
    index.
    """
    
    # Replace "<dot><space>" or a single space with '-'.
    mask = df.pid.str.contains('\. | ')
    df.loc[mask,'pid']=df.loc[mask,'pid'].str.replace('\. | ','-',regex=True)

    # Rows whose pid uses '_' as the separator.
    mask = df.pid.str.contains('_')
    # Checks only the first distinct part-count. NOTE(review): when no pid
    # contains '_' the unique() result is empty and [0] raises IndexError —
    # confirm whether that case can occur.
    assert(df[mask].pid.str.split('_').apply(len).unique()[0]==3)

    ##place A at end, maybe wrong for example 036.A-0006-0022.0
    # mask2 is computed on the '_' subset only: True where the first part
    # contains an 'A'.
    mask2 = df[mask].pid.str.split('_').str[0].str.contains('A|\.A')
    # The replace acts on the whole pid string (not just the first part), then
    # '.A' is appended; every other '_' pid gets '.0' appended.
    df.loc[mask&mask2,'pid']    = df.loc[mask&mask2,'pid'].str.replace('A|\.A','',regex=True)+'.A'
    df.loc[mask&(~mask2),'pid'] = df.loc[mask&(~mask2),'pid']+'.0'

    ##pad with '0' and add proper separator, note '.0-' in second place, see above, could be '.A-' or '.B-' 
    # foo holds the three '_'-separated parts as columns 0, 1, 2.
    foo=pd.DataFrame()

    for idx in range(3):
        foo.loc[:,idx] = df.loc[mask,'pid'].str.split('_').str[idx]

    # Re-join as <part0 padded to 3>.0-<part1 padded to 4>-<part2 padded to 6>.
    df.loc[mask,'pid'] = foo[0].str.pad(width=3,fillchar='0')  +\
                            '.0-'                              +\
                         foo[1].str.pad(width=4,fillchar='0')  +\
                            '-'                                +\
                         foo[2].str.pad(width=6,fillchar='0')

    
    # A letter left directly before the inserted '.0-' replaces the '0':
    # e.g. 'B.0-' becomes '.B-'.
    df.loc[mask,'pid'] = df.loc[mask,'pid'] . str . replace ( 'A\.0-' , '.A-' , regex = True ) \
                                            . str . replace ( 'B\.0-' , '.B-' , regex = True ) \
                                            . str . replace ( 'D\.0-' , '.D-' , regex = True ) \
                                            . str . replace ( 'E\.0-' , '.E-' , regex = True ) \
                                            . str . replace ( 'H\.0-' , '.H-' , regex = True ) \
                                            . str . replace ( 'I\.0-' , '.I-' , regex = True ) \
                                            . str . replace ( 'C\.0-' , '.C-' , regex = True )
    

    # Collapse any doubled dot produced above (all rows).
    df.loc[:,'pid'] = df.loc[:,'pid'] . str . replace ( '\.\.' , '.' , regex = True )

    # Rows ending in a letter A-F, one further character and '0'.
    # NOTE(review): the '.' in these patterns is unescaped, so it matches any
    # character, not only a literal dot — confirm that is intended.
    mask = df.pid.str.contains('A.0$|B.0$|C.0$|D.0$|E.0$|F.0$',regex=True)

    # Move the trailing letter behind a dot, e.g. '...B.0' becomes '....B'
    # (the doubled dot is collapsed further down).
    df.loc[mask,'pid'] = df.loc[mask,'pid']\
                                    .str.replace('A.0$','.A',regex=True)\
                                    .str.replace('B.0$','.B',regex=True)\
                                    .str.replace('C.0$','.C',regex=True)\
                                    .str.replace('D.0$','.D',regex=True)\
                                    .str.replace('E.0$','.E',regex=True)\
                                    .str.replace('F.0$','.F',regex=True)

    # Split the same rows on '-' and re-pad the three parts to widths 5, 4, 6.
    foo=pd.DataFrame() #property_sales.loc[mask,'pid'].str.split('-').str[2]
    for idx in range(3):
        foo.loc[:,idx] = df.loc[mask,'pid'].str.split('-').str[idx]

    df.loc[mask,'pid'] = foo[0].str.pad(width=5,fillchar='0')  +\
                                        '-'                                +\
                                     foo[1].str.pad(width=4,fillchar='0')  +\
                                        '-'                                +\
                                     foo[2].str.pad(width=6,fillchar='0')

    # Collapse doubled dots again (all rows).
    df.loc[:,'pid'] = df.loc[:,'pid'] . str . replace ( '\.\.' , '.' , regex = True )

    ##known exceptions for Arlington
    # Whole-value replacements: key = pid as produced above, value = corrected pid.
    exceptions = {
        '044.0-0001-0173B.A': '044.A-0001-0173.B',
        '046.F.0-0001-0001.0': '046.F-0001-0001.0',
        '160.A-0005-00118.0': '160.A-0005-0118.0',
        '160.A-0005-00120.0': '160.A-0005-0120.0',
        '001.A-0003-0018': '001.A-0003-0018.0',
        '001.A-0003-0016': '001.A-0003-0016.0',
        '055.A-0002-003.0': '055.A-0002-0003.0',
        '002.A-0001-0002.0':'002.A-0001-002.0',
        '002.A-0001-0001.0':'002.A-0001-001.0',
        '127.A-0003-0000.B':'127.A-0003-000B.0',
    }
    
    # NOTE(review): the exceptions are applied only to rows selected by the
    # last `mask` (the trailing-letter rows), not to the whole column —
    # presumably they were meant for every row; confirm.
    df . loc [ mask , 'pid' ]  =  df . loc [ mask , 'pid' ] . replace ( exceptions )

    # sort_values returns a new frame; the index is renumbered from 0.
    return df . sort_values ( [ 'year' , 'date', 'pid' ] ) . reset_index ( drop = True )


def get_property_sales ( df ):
    """Build the cleaned sales frame from a raw LA3 parcel-search export.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export. The columns read are 'Sale Date', 'Sale Price',
        'Use Code', 'Parcel Id', 'Location Id', 'Buyer', 'Seller', 'St Name',
        'St Num', 'St Alpha' and 'NAL Code'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns streetName, streetNum, unit, land_use, date, price, buyer,
        seller, sale_type, year, loc_id, pid. Rows are de-duplicated on
        (pid, date, price), sorted by year, land_use, date and pid, and
        re-indexed from 0.

    Raises
    ------
    Whatever pandas raises when 'Sale Date' cannot be parsed or 'Sale Price'
    cannot be cast to int.
    """
    from numpy import nan

    # Work on a copy: the column assignments below would otherwise be written
    # into the caller's raw frame, so re-running the cell would not start from
    # the same input.
    df = df.copy()

    df['date']        =  pd.to_datetime(df['Sale Date'])
    df['price']       =  df['Sale Price'].astype(int)
    df['land_use']    =  df['Use Code'].astype(str)
    df['pid']         =  df['Parcel Id'].str.strip()
    df['loc_id']      =  df['Location Id'].str.strip()
    # year must be derived before 'date' is turned into text on the next line.
    df['year']        =  df['date'].dt.strftime('%Y').astype(int)
    df['date']        =  df['date'].dt.strftime('%Y-%m-%d')
    df['buyer']       =  df['Buyer']
    df['seller']      =  df['Seller']
    df['streetName']  =  df['St Name']
    df['streetNum']   =  df['St Num']
    df['unit']        =  df['St Alpha']

    # Blank out missing values first so an absent NAL code is seen as ''.
    df  =  df.replace({nan: ''})
    # '' becomes '0'; question marks are removed. Raw string: '\?' is not a
    # valid escape sequence in a normal string literal.
    df['sale_type']   =  df['NAL Code'].replace({'': '0', r'\?': ''}, regex=True)

    ## replace with apply and join
    # NOTE(review): 'address' is built here but is not listed in `cols`, so it
    # never reaches the returned frame.
    df['address']  =  df['St Num'].map(str, na_action='ignore')   +\
                      df['St Alpha'].map(str, na_action='ignore') +\
                ' ' + df['St Name'].map(str, na_action='ignore')

    cols   =  [ 'streetName', 'streetNum', 'unit' ,'land_use' , 'date' , 'price' , 'buyer' , 'seller' , 'sale_type' , 'year', 'loc_id' , 'pid' ]
    dcols  =  [ 'pid' , 'date' ,'price' ]

    # Explicit copy: fix_pid_property_sales assigns into the frame it receives,
    # which should not be a view of a filtered selection.
    df  =  df.loc[~df.duplicated(dcols), cols].copy()

    df  =  fix_pid_property_sales(df)

    df  =  df.replace({nan: ''})

    # De-duplicate again: pid normalisation can make distinct raw pids equal.
    return df[~df.duplicated(dcols)][cols] \
        .sort_values(['year', 'land_use', 'date', 'pid']) \
        .reset_index(drop=True)

def _code_to_int_map(item, cnx):
    """Return ``{code_2_desc key: int_value_pairs key}`` for one ``item`` group.

    ``item`` is interpolated into the SQL text, so it must be one of the fixed
    category names used by this notebook ('NAL', 'use'), never user input.
    """
    query = f"""
                select ivp.key as ivp,c2d.key as c2d
                from common.int_value_pairs ivp
                left join common.code_2_desc c2d on c2d.description=ivp.value
                where  ivp.item='{item}'
                and c2d.item='{item}';
            """
    pairs = pd.read_sql(query, cnx)
    # Pair each code with the integer key selected on the same row (not with
    # the row's position in the result set).
    return dict(zip(pairs['c2d'], pairs['ivp']))


def convert_codes_2_ints(df, cnx):
    """Replace the ``sale_type`` and ``land_use`` codes with integer keys.

    The code-to-integer pairs are looked up by joining
    ``common.int_value_pairs`` with ``common.code_2_desc`` on the description
    text, for item 'NAL' (sale_type) and item 'use' (land_use). Values with no
    matching code are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``sale_type`` and ``land_use`` columns; both columns are
        overwritten on this frame.
    cnx
        Connection or engine accepted by ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    ## convert code for land_use and sale_type into int_value_pairs
    # Operate on the frame that was passed in rather than on a notebook global.
    df['sale_type'] = df['sale_type'].replace(_code_to_int_map('NAL', cnx))
    df['land_use']  = df['land_use'].replace(_code_to_int_map('use', cnx))

    return df

In [None]:
# Run the transform: clean the raw export, then map the sale_type / land_use
# codes to integers using the lookup tables reachable through `cnx`.
# NOTE(review): `df` is not assigned in any cell visible in this notebook —
# presumably the downloaded spreadsheet has to be read into `df` first; confirm
# and add that step so the notebook runs on a fresh kernel.
sales = convert_codes_2_ints ( get_property_sales ( df ), cnx )

## Load

* drop/create postgres property.sales table
* use pandas to_sql to append to newly created table
    * modify for updates

In [None]:
# DDL for the load target: the property.sales table plus an index on pid.
# The DROP statement inside the SQL is commented out with '--'.
table_create_query = """
        --DROP TABLE IF EXISTS property.sales;
        CREATE TABLE property.sales (
            "streetName" varchar(23),
            "streetNum" varchar(4),
            "unit" varchar(5),
            "land_use" SMALLINT,
            "date"  DATE,
            "price"  INTEGER,
            "buyer"  varchar(36),
            "seller"  varchar(44),
            "sale_type"  SMALLINT,
            "year"  SMALLINT NOT NULL,
            "loc_id"  varchar(16),
            "pid"  varchar(19)
        );
        CREATE INDEX sales_idx 
            ON property.sales(pid);
    """

# Left disabled: the statement is defined above but not executed by this cell.
##cnx.execute(table_create_query)

In [None]:
# Append the transformed rows to property.sales; the frame's index is not written.
sales.to_sql(
    name='sales',
    schema='property',
    con=cnx,
    if_exists='append',
    index=False,
)