In [1]:
from IPython.display import Markdown, display

# Show the overview text for this notebook in the cell output.
# NOTE(review): the argument is passed positionally, so this presumably relies
# on IPython loading the string as a file when such a path exists — confirm;
# Markdown(filename="deeds.md") would state that intent explicitly.
display(Markdown("deeds.md"))

### [Mass Registry of Deeds](https://www.masslandrecords.com/)

Massachusetts is divided into 21 registry districts with an elected Register of Deeds responsible for each office. Documents related to the ownership of real estate within the district are recorded at the Registry of Deeds. Recorded documents are assigned a sequential identifying number (known as the book and page number) and are then scanned into the registry's computer system. The resulting images are available for viewing on and printing from public access terminals at the registry and at your home or office over the Internet.

Common documents (Arlington in 2021)

* DISCHARGE 	2681
* MORTGAGE 	2281
* DEED 	1024
* CERTIFICATE OF MUNICIPAL LIEN 	653
* DECLARATION OF HOMESTEAD 	651
* CERTIFICATE 	282
* AFFIDAVIT 	175
* Blank 	155
* ASSIGNMENT 	88
* CERTIFICATE OF DEATH 	74


## Set-up

In [None]:
from os import environ

from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# read local .env file and override any existing
_ = load_dotenv(
    find_dotenv(usecwd=True),
    override=True,
)

# Connection settings come from the environment, with local-development defaults.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "ArlingtonMA")

# establish database connection for Transform queries and Loads
# The URL is assembled from its components instead of an f-string so that a
# password (or user name) containing characters such as '@', '/' or ':' is not
# mis-parsed as part of the host portion of the connection string.
cnx = create_engine(
    URL.create(
        "postgresql",
        username=username,
        password=password,
        host=ipaddress,
        port=int(port),
        database=dbname,
    )
)


In [None]:
import pandas as pd
from masslandrecords import *

## Extract

In [None]:
def start_up_firefox(county = 'MiddlesexSouth', headless=True):
    """Start a Firefox WebDriver session on a masslandrecords.com county page.

    Parameters
    ----------
    county : str
        Path segment inserted into the site URL
        (``http://www.masslandrecords.com/<county>/Default.aspx``).
    headless : bool
        When truthy, Firefox is started with ``--headless``.

    Returns
    -------
    selenium.webdriver.Firefox
        A driver that has loaded the county page and carries an
        ``AllowPopupTips=False`` cookie expiring one week from now.
        The caller is responsible for shutting the driver down.
    """
    import time
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    url = 'http://www.masslandrecords.com/' + county +'/Default.aspx'

    options = Options()
    options.add_argument("start-maximized")
    if headless:
        options.add_argument("--headless")

    driver = webdriver.Firefox(options=options)

    try:
        driver.get(url)

        # time.time() gives epoch seconds on every platform; strftime("%s") is
        # a C-library extension that is not available everywhere.
        one_week = 3600*24*7
        added_cookies = [{
            'domain': 'www.masslandrecords.com',
            'expiry': int(time.time()) + one_week,
            'name': 'AllowPopupTips',
            'path': '/',
            'secure': False,
            'value': 'False',
        }]

        for cookie in added_cookies:
            driver.add_cookie(cookie)
    except Exception:
        # Do not leave a stray browser process behind when set-up fails.
        driver.quit()
        raise

    return driver


In [None]:
from datetime import datetime

def get_latest_most_recent_docnos ( 
    cnx, 
    last_business_date = None
):
    """Find the document-number range that still has to be downloaded.

    Parameters
    ----------
    cnx
        Database connection/engine handed to ``pd.read_sql_query``.
    last_business_date : str, optional
        Date typed into both date fields of the registry search form.
        Defaults to today's date formatted as ``%m/%d/%Y``; the default is
        computed on every call rather than once when the cell is run.

    Returns
    -------
    tuple
        ``(latest_docno, more_recent_docno)``: the highest ``docno`` below 1e6
        stored in ``property.deeds_summary`` for the current year (0 when
        there is none yet), and the ``Doc. #`` in the first row of the search
        results after sorting them in descending order.
    """
    if last_business_date is None:
        last_business_date = datetime.now().strftime('%m/%d/%Y')

    query = """
                select max(docno) as docno
                    from property.deeds_summary s
                    where extract(year from s.date)::int={year}
                    and docno<1e6
            """
    latest_docno = pd.read_sql_query(
        query.format(year=datetime.now().year),
        cnx).docno.values[0]
    # max() over zero rows is NULL, e.g. before the first load of a new year.
    latest_docno = 0 if pd.isnull(latest_docno) else int(latest_docno)

    browser=start_up_firefox(headless=False)
    try:
        Btn(browser,"Navigator1_SearchCriteria1_menuLabel")
        Btn(browser,'Navigator1_SearchCriteria1_LinkButton02') #recorded land
        #Btn(browser,'Navigator1_SearchCriteria1_LinkButton13')#registered land
        Btn(browser,'SearchFormEx1_BtnAdvanced')
        time.sleep(2)
        data = [latest_docno, 1000000,last_business_date,last_business_date]

        fill_in_form(browser,data)

        Btn(browser,'SearchFormEx1_btnSearch')
        Btn(browser,'DocList1_PageView100Btn')

        ##sort descending (two clicks on the same column header)
        xpath = '//*[@id="DocList1_ContentContainer1"]/table/tbody/tr[1]/td/div/div[1]/table/thead/tr/th[6]/a'
        browser.find_element(By.XPATH,xpath).click()
        browser.find_element(By.XPATH,xpath).click()

        df = get_table_contents(browser)
        more_recent_docno = int(df['Doc. #'].iloc[0])
    finally:
        # Always end the WebDriver session, even when a step above fails.
        browser.quit()

    return latest_docno , more_recent_docno


In [None]:
def get_deeds_summary(latest_docno,more_recent_docno,cnx,batch_size=800):
    """Download the search-result summaries for a range of document numbers.

    The advanced search form is filled once with the whole range and the
    current calendar year, then the document-number range is narrowed to
    consecutive windows of ``batch_size`` numbers and the result pages of each
    window are collected.

    Parameters
    ----------
    latest_docno, more_recent_docno : int
        First and last document number of the range to download.
    cnx
        Database connection/engine, passed through to ``deeds_summary_norm``.
    batch_size : int
        Width of each document-number window (default 800).

    Returns
    -------
    Whatever ``deeds_summary_norm`` returns for the combined result pages.
    """
    browser=start_up_firefox(headless=False)
    try:
        time.sleep(5)

        Btn(browser,"Navigator1_SearchCriteria1_menuLabel")
        Btn(browser,'Navigator1_SearchCriteria1_LinkButton02') #recorded land
        #Btn(browser,'Navigator1_SearchCriteria1_LinkButton13')#registered land
        Btn(browser,'SearchFormEx1_BtnAdvanced')

        year = datetime.now().year
        data = [str(latest_docno),str(more_recent_docno),
                f'1/1/{year}',f'12/31/{year}']

        fill_in_form(browser,data)

        Btn(browser,'SearchFormEx1_btnSearch')
        Btn(browser,'DocList1_PageView100Btn')

        # Collect the per-window frames and concatenate once at the end.
        pages = []
        start_docno = latest_docno
        while start_docno <= more_recent_docno:
            end_docno = start_docno + batch_size
            # Narrow the search right before reading it, so no range update is
            # issued after the final window.
            update_docno_range(
                browser,
                (start_docno,end_docno)
            )
            time.sleep(5)
            print('working',start_docno,end_docno)
            pages.append(combine_pages(browser))
            start_docno = end_docno
    finally:
        # Always end the WebDriver session, even when a step above fails.
        browser.quit()

    update_deeds = pd.concat(pages) if pages else pd.DataFrame()

    return deeds_summary_norm(update_deeds, cnx)


def get_deeds_details(deeds_summary,cnx,town=10,batch_size=100):
    """Scrape the detail view of each summarised document into the database.

    For every row of ``deeds_summary`` whose ``town`` equals ``town``, the
    document is looked up by book and page, its detail view is read with
    ``get_detail`` and the rows are appended to ``property.deeds_details_raw``
    in batches.

    Parameters
    ----------
    deeds_summary : pandas.DataFrame
        Must provide the columns ``town``, ``book`` and ``page``.
    cnx
        Database connection/engine used by ``DataFrame.to_sql``.
    town : int
        Town code to keep (default 10, the value previously hard-coded).
    batch_size : int
        Number of scraped documents written per database append (default 100).

    Returns
    -------
    bool
        Always ``True`` once every document has been processed.
    """
    def _flush(frames):
        # Append the accumulated detail rows; nothing to do for an empty batch.
        if frames:
            pd.concat(frames).to_sql(
                'deeds_details_raw',schema='property',con=cnx,
                if_exists='append',index=False
            )

    pending = []
    browser=start_up_firefox(headless=False)
    try:
        time.sleep(4)

        Btn(browser,"Navigator1_SearchCriteria1_menuLabel")
        Btn(browser,'Navigator1_SearchCriteria1_LinkButton01')

        time.sleep(2)

        town_bookpage=deeds_summary[deeds_summary.town==town][['book','page']].reset_index(drop=True)
        for idx in range(len(town_bookpage)):
            book = town_bookpage.loc[idx,'book']
            page = town_bookpage.loc[idx,'page']
            update_book_page(browser,book,page)

            Btn(browser,'SearchFormEx1_btnSearch')
            time.sleep(2)
            Btn(browser,"DocList1_GridView_Document_ctl02_ButtonRow_Doc. #_0")
            time.sleep(1)

            pending.append(get_detail(browser))

            # Write full batches as we go so a long run keeps its progress.
            if len(pending) >= batch_size:
                print('Working',idx,book,page)
                _flush(pending)
                pending = []
                time.sleep(5)
    finally:
        # Always end the WebDriver session, even when a step above fails.
        browser.quit()

    # Remainder that did not fill a whole batch.
    _flush(pending)

    return True

In [None]:
# Every other date sent to the search form in this notebook is written as
# month/day/year (the helper's own '%m/%d/%Y' default and the '1/1/<year>'
# bounds in get_deeds_summary), so the date is passed in that format here too.
latest_docno , more_recent_docno = get_latest_most_recent_docnos(cnx,last_business_date='08/31/2023')

deeds_summary = get_deeds_summary(latest_docno,more_recent_docno,cnx)

get_deeds_details(deeds_summary.copy(),cnx)

## Transform

In [None]:
def transform_details_raw(cnx, YEAR=2023):
    """Turn newly scraped rows of ``property.deeds_details_raw`` into clean rows.

    Only rows recorded in ``YEAR`` whose ``Doc. #`` is above the highest
    ``docno`` already present in ``property.deeds_details`` for that year
    (and below 1e6) are selected.

    Parameters
    ----------
    cnx
        Database connection/engine handed to ``pd.read_sql_query``.
    YEAR : int
        Recording year to process.

    Returns
    -------
    pandas.DataFrame
        One row per (docno, deed_type), sorted by ``timestamp``, with the
        columns streetnum, streetname, unit, date, timestamp, docno,
        deed_type, book, page, pages, consideration, deeds_status, name,
        grant_type, refs_bookpage, refs_deed_type, refs_deed_year.
        ``timestamp`` is seconds since 1970-01-01 built from date and time;
        ``book``/``page`` are integers (nullable ``Int64`` when a value is
        missing); empty strings in the name/grant/refs columns become None.
    """
    # int() also guarantees that only a number is interpolated into the SQL.
    year = int(YEAR)

    # coalesce(): when deeds_details holds nothing for the year, max() is NULL
    # and the comparison "Doc. #" > NULL would filter out every row.
    query = """
            select *
                from property.deeds_details_raw d
                where extract(year from d."Rec. Date")::int={year}
                and d."Doc. #">(
                                    select coalesce(max(docno), 0) as docno
                                    from property.deeds_details s
                                    where extract(year from s.date)::int={year}
                                    and docno<1e6
                                )
                and d."Doc. #"<1e6
                ;
        """

    details_raw = pd.read_sql_query(query.format(year=year),cnx)

    # Positional rename: relies on the column order of the raw table.
    details_raw.columns=['streetnum','streetname','unit','docno',
                         'date','time','deed_type','pages',
                         'book_page','consideration','deeds_status',
                         'name','grant_type','refs_bookpage','refs_deed_type','refs_deed_year']

    has_keys = details_raw['date'].notna() & details_raw['docno'].notna()
    details_raw = details_raw[has_keys]
    # .copy() so the column assignments below act on an independent frame.
    details_raw = details_raw[~details_raw.duplicated(['docno','deed_type'])].copy()

    for col in ['docno','pages']:
        details_raw[col]=details_raw[col].astype(int)

    def _value_to_key(item):
        # Map the text values of one int_value_pairs item to their integer keys.
        pairs = pd.read_sql_query(
            f"select key,value from common.int_value_pairs where item='{item}'",cnx)
        return dict(zip(pairs['value'].tolist(), pairs['key'].tolist()))

    def _blank_missing(series):
        # Replace missing entries with '' regardless of the column's dtype.
        return series.astype(object).where(series.notna(), '')

    details_raw['deed_type'] = details_raw['deed_type'].replace(_value_to_key('deed_type'))
    details_raw['deeds_status'] = details_raw['deeds_status'].replace(_value_to_key('deeds_status'))

    # "book/page" -> two integer columns. A blank or absent part stays missing
    # instead of breaking the integer cast.
    book_page = details_raw['book_page'].str.split('/')
    for col, position in (('book', 0), ('page', 1)):
        part = book_page.str[position]
        number = pd.to_numeric(part.mask(part == ''))
        details_raw[col] = number.astype('Int64' if number.isna().any() else int)

    # Seconds since the epoch, computed by timedelta division so the result
    # does not depend on the datetime resolution or the platform's int width.
    recorded = pd.to_datetime(
        pd.to_datetime(details_raw['date']).dt.strftime('%Y-%m-%d ')+details_raw['time'])
    details_raw['timestamp'] = (recorded - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

    cols = ['streetnum', 'streetname', 'unit',
            'date', 'timestamp', 'docno', 'deed_type', 'book', 'page',
            'pages', 'consideration', 'deeds_status',
            'name','grant_type','refs_bookpage','refs_deed_type','refs_deed_year']
    details_raw = details_raw[cols].sort_values('timestamp').reset_index(drop=True)

    details_raw['unit'] = _blank_missing(details_raw['unit'])
    details_raw['streetnum'] = (
        _blank_missing(details_raw['streetnum'])
        .astype(str)
        .str.replace(r'\.0$','',regex=True)   # "12.0" -> "12"
    )

    for col in ['refs_bookpage', 'refs_deed_type', 'refs_deed_year','grant_type','name']:
        mask = details_raw[col]==''
        if mask.any():
            details_raw.loc[mask,col]=[None]*int(mask.sum())

    return details_raw

def match_addresses(details_raw, cnx, YEAR=2023):
    """Build an address -> parcel id (pid) cross-reference for detail rows.

    Addresses are first matched against addresses that already carry a pid in
    ``property.deeds_details`` (dated 2012-01-01 or later); whatever is still
    unmatched is then matched against ``property.assessments`` for ``YEAR``.

    Parameters
    ----------
    details_raw : pandas.DataFrame
        Must provide the columns ``streetnum``, ``streetname`` and ``unit``.
    cnx
        Database connection/engine handed to ``pd.read_sql_query``.
    YEAR : int
        Assessment year used for the second matching pass.

    Returns
    -------
    pandas.DataFrame
        Columns ``streetname``, ``streetnum``, ``unit``, ``pid``: the matched
        addresses (one row each, sorted by address) followed by the distinct
        addresses that could not be matched, whose ``pid`` is missing.
    """
    query = """
        SELECT "streetnum","streetname",d.unit,d.pid
        FROM      property.deeds_details d
        WHERE     d.pid is not null
        AND       d.date>='2012-01-01'
        ;
    """
    deeds = pd.read_sql_query(query,cnx)
    # notna() is an element-wise test; "deeds.streetname is not None" compared
    # the Series object itself and was therefore always True.
    deeds = deeds[(~deeds.duplicated())
                  & (deeds.pid!='')
                  & (deeds.streetname!='')
                  & deeds.streetname.notna()]

    # Pass 1: addresses already known from earlier deeds.
    combo = details_raw.merge(deeds,
                              how='left',
                              on=['streetnum','streetname','unit'],
                              indicator='matched')

    matched = combo[combo.matched=='both'].sort_values(['streetnum','streetname','unit','pid'])
    matched = matched[~matched.duplicated(['streetnum','streetname','unit'])]

    unmatched = combo[combo.matched=='left_only'][['streetnum','streetname','unit']]\
        .sort_values(['streetname','streetnum','unit'])

    # Pass 2: the assessor's parcel list for the requested year.
    query = """
        SELECT streetname, streetnum,unit,pid from property.assessments
        where year = {year}
    """
    parcels = pd.read_sql_query(query.format(year=int(YEAR)),cnx)
    # Missing values become '' so they compare equal to blank address parts.
    parcels = parcels.astype(object).where(parcels.notna(), '')

    ##dangerous
    # NOTE(review): without regex=True, DataFrame.replace only swaps cells whose
    # entire value equals the given text, so these calls do not strip a "UNIT "
    # prefix from longer values — confirm the intended behaviour.
    combo = (unmatched.replace(' UNIT ','').replace('UNIT ',''))\
        .merge(parcels,on=['streetname','streetnum','unit'],how='left',indicator='matched')

    parcel_matches = combo[combo.matched=='both']\
        .drop(['matched'],axis=1)\
        .sort_values(['streetname','streetnum','unit','pid'])
    parcel_matches = parcel_matches[~parcel_matches.duplicated(['streetname','streetnum','unit'])]
    matched = pd.concat([matched,parcel_matches])

    unmatched = combo[combo.matched=='left_only']\
        .drop(['matched','pid'],axis=1)[['streetname','streetnum','unit']]\
        .sort_values(['streetname','streetnum','unit'])

    details_address_pid_xref = matched[['streetname','streetnum','unit','pid']]\
        .sort_values(['streetname','streetnum','unit'])

    unmatched = unmatched[~pd.isnull(unmatched.streetname)]
    unmatched = unmatched.sort_values(['streetnum','streetname','unit'])
    unmatched = unmatched[~unmatched.duplicated()]

    return pd.concat([details_address_pid_xref,unmatched])

In [None]:
# Clean the newly scraped 2023 rows read from the database.
details_raw = transform_details_raw(cnx, YEAR=2023)

# Address -> pid cross-reference; the function receives its own copy.
addr_pid_xref = match_addresses(details_raw.copy(), cnx)

# Working copy that the following cells modify; details_raw stays as returned.
details = details_raw.copy()

## Extra

* needs work

In [None]:
def _blank_missing(frame):
    """Return a copy of ``frame`` in which every missing value is ''."""
    out = frame.copy()
    # Only columns that actually contain missing values are converted, so
    # complete columns keep their dtype.
    for col in out.columns[out.isna().any()]:
        out[col] = out[col].astype(object).where(out[col].notna(), '')
    return out


# Write the cross-reference to a TSV file and read it back.
# dtype=str / keep_default_na=False keep every field exactly as written
# (e.g. "12" stays "12" and blanks stay ''), so the merge keys below still
# compare equal to the string address columns of ``details``.
addr_pid_xref.to_csv('deeds_manual_pids.tsv',sep='\t',index=False)
addr_pid_xref = pd.read_csv('deeds_manual_pids.tsv',sep='\t',
                            dtype=str,keep_default_na=False)

# Lets the cell be re-run without producing pid_x / pid_y columns.
details = details.drop(columns='pid',errors='ignore')

details['consideration'] = details['consideration'].fillna(0).astype(int)
details = _blank_missing(details)
details.loc[details.streetnum=='nan','streetnum']=''

details = details.merge(addr_pid_xref,how='left',
                        on=['streetname','streetnum','unit'])

# Addresses without a cross-reference entry get a blank pid.
details = _blank_missing(details)


## Load

In [None]:
# Append both tables inside a single transaction: if the second insert fails,
# the first is rolled back as well, so a retry does not duplicate summary rows.
with cnx.begin() as connection:
    deeds_summary.to_sql(
        'deeds_summary',schema='property',con=connection,
        if_exists='append',index=False
    )

    details.to_sql(
        'deeds_details',schema='property',con=connection,
        if_exists='append',index=False
    )
