In [2]:
import re
import os
import time
import shutil
import config
from word2number import w2n
from selenium import webdriver
from concurrent.futures import TimeoutError                                  
from tqdm.notebook import tqdm
import pandas as pd
# for browser interaction & load waiting:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [29]:
def listdir_nohidden(path):
    """Lazily yield the entries of ``path`` whose names do not start with '.'.

    This is a generator: the directory is only read once iteration begins,
    and the result has no ``len()`` until it is materialised (e.g. ``list``).
    """
    yield from (entry for entry in os.listdir(path)
                if not entry.startswith('.'))

In [3]:
# returns target element once it is loaded
def wait_for(element, by, timeout=10, multi=False):
    """Wait for an element to be present, then look it up on the global driver ``d``.

    Parameters
    ----------
    element : str
        Locator value (CSS selector, XPath, id, ...), interpreted according to ``by``.
    by : str
        Locator strategy, e.g. a ``By.*`` constant.
    timeout : int or float
        Seconds ``WebDriverWait`` is allowed to wait for presence.
    multi : bool
        If true, return ``d.find_elements(by, element)`` (a list); otherwise
        return the single result of ``d.find_element(by, element)``.

    Notes
    -----
    A ``TimeoutException`` from the wait is reported on stdout and the lookup
    is still attempted, so any error from the lookup itself propagates.
    Every other exception is allowed to propagate: the previous version
    returned from a ``finally`` clause, which silently discarded whatever
    exception was in flight (including ``KeyboardInterrupt``).
    """
    time.sleep(.5)  # short settle delay before polling
    try:
        WebDriverWait(d, timeout).until(
            EC.presence_of_element_located((by, element)))
    except TimeoutException:
        print(f"Timeout limit reached ({timeout} s)")
        print(f"> Seaching for {element} by: {by}")

    time.sleep(.5)  # short settle delay before the actual lookup
    if multi:
        return d.find_elements(by, element)
    return d.find_element(by, element)

In [4]:
def launch_new_search(verb=True):
    """Open the search panel, starting a Chrome session first if none exists.

    Works on the notebook-global driver ``d``:

    * if ``d`` is truthy, the page's "back" button is clicked;
    * otherwise (``d`` is falsy or was never assigned) a new Chrome window is
      created, sized/positioned, and pointed at the site's home URL.

    In both cases the search entry is then clicked via its XPath.

    Parameters
    ----------
    verb : bool
        When true (default) the progress line is printed.

    Returns
    -------
    The global driver ``d``.
    """
    global d
    # open search panel
    url = 'https://scerisecm.boston.gov/ScerIS/CmPublic/#/Home'
    open_search = '/html/body/div[1]/div/div/div[2]/div[3]/search-dashboard/div/div/div[2]/div/div/div/div/ul/li[3]/dashboard-folders-renderer/div/div/div/a[3]'

    # On a fresh kernel `d` may not exist yet; a bare `if d:` would raise
    # NameError, so look it up defensively and treat "missing" as "no driver".
    if globals().get('d'):
        wait_for("button[ng-click='back()']", By.CSS_SELECTOR).click()
    else:
        d = webdriver.Chrome()
        d.set_window_size(1000,800)
        d.set_window_position(15,25)

        d.get(url)
    if verb:
        print('> Launching new search...')
    time.sleep(.8)
    wait_for(open_search, By.XPATH).click()
    return d

In [5]:
def get_search_params(verb=True):
    """Map each search field's label to its title element and text input.

    Collects the ``dynamicTerm`` field containers and the ``input-with-help``
    containers currently on the page (via ``wait_for``) and pairs them up
    positionally.

    Parameters
    ----------
    verb : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{label: {'title': <field element>, 'input': <input element>}}`` where
        ``label`` is the first line of the field element's text.
    """
    # define search params and their corresponding input elements on the page
    # (attribute selector now has its closing bracket)
    params = wait_for("div[ng-repeat='dynamicTerm in dynamicSearchFieldTerms']",
                      By.CSS_SELECTOR, multi=True)
    names = [p.text.split('\n')[0] for p in params]
    input_containers = wait_for('input-with-help', By.CLASS_NAME, multi=True)
    # use the same find_element(By..., ...) form as the rest of the notebook;
    # the find_element_by_* helpers are legacy Selenium API
    inputs = [box.find_element(By.TAG_NAME, 'input') for box in input_containers]

    # list available params; zip() pairs by position and stops at the shortest list
    return {name: {'title': title, 'input': field}
            for name, title, field in zip(names, params, inputs)}

In [6]:
def set_search(param, keys, verb=True):
    """Type ``keys`` into the search field registered under ``param``.

    The field is looked up in the global ``search_param`` mapping. When
    ``verb`` is true the field's title element is scrolled into view on the
    global driver ``d`` before typing.
    """
    field = search_param[param]
    title_el, input_el = field['title'], field['input']
    if verb:
        d.execute_script("arguments[0].scrollIntoView();", title_el)
    input_el.send_keys(keys)

In [None]:
# Scratch cell (its marker is `In [None]`, i.e. it has no execution count).
# NOTE(review): presumably a mock-up of the tree-style `|-` console log the
# functions below print — confirm, or delete the cell if it is no longer needed.
"""
Header 1
 |-test
   |-test
   |-test
 |-test
Header 2
 | test
 |-test
 |-test

Header 3
 | test
 |-test
 |-test

 
 
 """

In [7]:
def _newest_download(download_dir):
    """Return the name of the most recently changed visible file in ``download_dir``.

    Returns ``None`` when the folder has no visible files, or when a file
    vanishes between listing and stat-ing (the folder is simply polled again).
    """
    visible = [f for f in os.listdir(download_dir) if not f.startswith('.')]
    if not visible:
        return None
    try:
        return max(visible,
                   key=lambda name: os.path.getctime(os.path.join(download_dir, name)))
                   # credit to dmb for targeting download file
                   # stackoverflow.com/questions/34548041
    except FileNotFoundError:
        # presumably the browser renamed its temp file mid-scan — retry on next poll
        return None


def organize_file(adco_adrs, adrs, n, verb=True, download_dir='../../Downloads', target_dir='permit_pdfs/'):
    """Rename the newest download to ``n.<ext>`` and move it under ``target_dir``.

    The destination folder is ``target_dir/adco_adrs/adrs`` and is created if
    missing. The newest visible file in ``download_dir`` is polled until its
    name no longer ends in ``.part`` / ``.crdownload``.

    Parameters
    ----------
    adco_adrs, adrs : str
        Two folder levels under ``target_dir``.
    n : object
        New base name for the file (``str(n)``); the extension is whatever
        follows the last '.' in the downloaded file's name.
    verb : bool
        Currently unused; kept so existing callers keep working.
    download_dir : str
        Folder the browser downloads into.
    target_dir : str
        Root folder for organised files (a trailing slash is optional).

    Raises
    ------
    TimeoutError
        If the download has not finished after ~10 s of polling, or the file
        is not at its destination after the move.

    Notes
    -----
    "Newest file in the folder" is a heuristic: if the download has not
    started yet, an older finished file would be picked instead.
    """
    time.sleep(.5)

    # create target download directory (to move file to)
    dest_dir = os.path.join(target_dir, adco_adrs, adrs)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        print('\r    |-Creating parent directory...', end=' '*20)
    time.sleep(.5)
    print('\r    |-' + dest_dir + '/', end=' '*20)

    # target current download in the downloads folder
    # (resolve the name BEFORE printing it)
    filename = _newest_download(download_dir)
    print(f"\r      |-Downloading '{filename}'...", end=' '*20)

    # monitor file for download completion
    waits = 0
    while filename is None or filename.endswith(('.part', '.crdownload')):
        waits += 1
        if waits > 30:  # 30 polls x 0.33 s ~= 10 seconds
            raise TimeoutError("error:|-Download timed out!")
        time.sleep(.33)
        filename = _newest_download(download_dir)
        print(f"\r      |-Downloading '{filename}'...", end=' '*20)

    # rename file
    newname = str(n) + '.' + filename.split('.')[-1]
    renamed_path = os.path.join(download_dir, newname)
    os.rename(os.path.join(download_dir, filename), renamed_path)
    print(f"\r      |-File renamed to '{newname}'...", end=' '*20)

    # move file to this directory
    time.sleep(.2)
    dest_path = os.path.join(dest_dir, newname)
    shutil.move(renamed_path, dest_path)
    print(f"\r      |-File moved: '{newname}'")

    # verify the file itself; counting folder entries would misfire when a
    # re-run overwrites an existing file of the same name
    if not os.path.isfile(dest_path):
        raise TimeoutError('error:|-File not found after download!')
    print(f"\r      |-File saved: '{newname}'")




In [8]:
def download_tif(adco_adrs, adrs, permit, n, n_items, year):
    """Download the ``n``-th search result and file it via ``organize_file``.

    Clicks through the result row, the document viewer and its download
    dialog on the global driver ``d``. Documents reporting more than 10 pages
    are limited to the custom range ``1-10`` and get a ``_first10pg`` suffix.
    The saved base name is ``"<year> - <permit>-<n>[_first10pg]"``. Finally
    the viewer is closed again with the back-to-results button.
    """
    print(f'    |-Downloading file {n+1}/{n_items}...', end=' '*20)

    # open the n-th result row, then walk through the viewer's download buttons
    result_rows = d.find_elements(By.CLASS_NAME, 'ui-grid-row')
    result_rows[n].click()
    for selector in ("button[ng-click='openDocuments()']",
                     "button[data-pcc-toggle='dialog-download']",
                     "button[data-pcc-download='download']"):
        wait_for(selector, By.CSS_SELECTOR).click()
    page_count_el = d.find_element(By.CSS_SELECTOR, 'span[data-pcc-pagecount]')
    pages = int(page_count_el.text)

    wait_for("pcc-overlay-download", By.CLASS_NAME).click()

    # if big document, download first ten pages only
    suffix = str(n)
    if pages > 10:
        print('\r    |-Big file - splicing first 10 pages...', end=' '*20)
        suffix = suffix + '_first10pg'
        wait_for("rdoRangeCustom", By.ID).click()
        wait_for("txtCustomRange", By.ID).send_keys('1-10')
    wait_for("OK", By.ID).click()

    base_name = year + ' - ' + str(permit) + '-' + suffix
    organize_file(adco_adrs, adrs, base_name)

    # give the page a moment, then return to the result list
    time.sleep(2)
    wait_for("btnBackToSearchResult", By.ID).click()
    time.sleep(.5)

In [9]:
def scrape_building_docs(row):
    """Search for one building's C/O documents and download every result.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide 'Primary Street Number', 'Street Name', 'GPSaddress',
        'Permit Number' and 'Permit Date'. The last four characters of
        'Permit Date' are used as the year.

    Side effects
    ------------
    Drives the global browser ``d`` and replaces the global ``search_param``.
    Files are saved by ``download_tif`` under
    ``permit_pdfs/<GPSaddress>/<street number>_<street name>/``.

    Any exception from the search or download steps propagates to the caller.
    """
    global d, search_param

    print('—'*30)
    st_num = row['Primary Street Number']
    st_name = row['Street Name']
    adco_adrs = row['GPSaddress']
    permit = row['Permit Number']
    year = str(row['Permit Date'])[-4:]
    address_dir = str(st_num) + '_' + st_name

    try:
        launch_new_search()
    except Exception:
        # The existing session is unusable: discard it and reset the GLOBAL
        # driver so that launch_new_search() really opens a fresh browser.
        if d:
            d.quit()
        d = False
        launch_new_search()

    print('  |-Defining search parameters...', end=' '*20)
    search_param = get_search_params()
    # make inputs
    set_search('Document Type', 'C/O')

    params = ['Primary Street Number', 'Street Name', 'Permit Number']
    for param in params:
        set_search(param, str(row[param]))
        time.sleep(.2)

    print('\r  |-Getting results...', end=' '*30)
    # get results
    wait_for('btnSearch', By.ID).click()

    # download move rename
    time.sleep(1.5)
    n_items = len(d.find_elements(By.CLASS_NAME, 'ui-grid-row'))

    downloaded = 0  # for logging
    print('\r  |-Downloading C/O files...', end=' '*30)

    for n in range(n_items):
        # has a saving function inside:
        download_tif(str(adco_adrs), address_dir, permit, n, n_items, year)
        downloaded += 1

    # listdir_nohidden() is a generator, so materialise it before counting;
    # the folder does not exist when the search returned no results
    saved_dir = f"permit_pdfs/{str(adco_adrs)}/{address_dir}/"
    file_ct = len(list(listdir_nohidden(saved_dir))) if os.path.isdir(saved_dir) else 0
    print(f'  > Done - {downloaded} files downloaded ({file_ct} total) ')
    print('—'*80)
        

---

# scrape

In [10]:
def launch_scraper(csv_path='sceris_adco_merged.csv', resume_permit='45130', max_tries=5):
    """Run ``scrape_building_docs`` over the target rows, retrying failures.

    Parameters
    ----------
    csv_path : str
        CSV of buildings to search. Rows are de-duplicated on
        ('Primary Street Number', 'Street Name', 'Permit Number').
    resume_permit : str or None
        Start from the first row whose 'Permit Number' equals this value
        (compared as strings). ``None`` processes every row.
    max_tries : int
        Maximum attempts per row. The pause between attempts doubles each
        time (2 s, 4 s, 8 s, ...).

    Side effects
    ------------
    Sets the globals ``to_search`` (the frame being processed) and ``d``
    (the browser driver, reset to ``False`` after every failed attempt).

    Raises
    ------
    IndexError
        If ``resume_permit`` is given but not present in the data.
    """
    global to_search, d

    started = time.time()
    to_search = pd.read_csv(csv_path).drop_duplicates(
        subset=['Primary Street Number', 'Street Name', 'Permit Number'])

    if resume_permit is not None:
        is_resume_row = to_search['Permit Number'].astype(str) == str(resume_permit)
        resume_rows = to_search[is_resume_row].index
        if len(resume_rows) == 0:
            raise IndexError(
                f"resume permit {resume_permit!r} not found in {csv_path!r}")
        to_search = to_search.loc[resume_rows[0]:]

    d = False
    completed = 0

    for ridx in to_search.index:
        row = to_search.loc[ridx]
        st_num = row['Primary Street Number']
        st_name = row['Street Name']
        print(f"Acquiring nearby C/Os for STR building: {st_num} {st_name}...")

        wait_between = 2
        success = False
        for attempt in range(1, max_tries + 1):
            try:
                scrape_building_docs(row)
            except Exception:
                # `except Exception` (not bare) so Ctrl-C still stops the run
                print(f'>>> Failed! Retrying... (try {attempt})')
                # the driver may never have been created, or may already be
                # reset — only quit a live one
                if d:
                    d.quit()
                d = False
                if attempt < max_tries:
                    time.sleep(wait_between)
                    wait_between *= 2  # double, don't square
            else:
                success = True
                time.sleep(.2)
                break

        if success:
            completed += 1
        else:
            print('*'*30, '\nFAILED on ridx:', ridx)
            print('  > Building ID:', row.get('Building ID'))

    elapsed = int(time.time() - started)
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'\n>>> Completed downloading C/Os for {completed}/{len(to_search)} rows from target data.')
    print(f'\n  > Time elapsed: {hours:d}:{minutes:02d}:{seconds:02d}')
    

In [11]:
# Kick off the scrape with launch_scraper()'s defaults; it loops over the rows
# it loads and prints progress as it goes, so expect a long-running cell.
launch_scraper()

——————————————————————————————
Acquiring permits for 126 CANAL                                   
> Launching new search...
> Defining search parameters
> Getting results...
  > Downloading file 1/1
  > Organizing...
> File 2001 - 45130-0.tif saved to 'permit_pdfs/101 Canal Street, Boston MA 02114/126_CANAL                                   /2001 - 45130-0.tif'
>>> Sucess! Saving...
> Finished downloading address C/Os
——————————————————————————————
Acquiring permits for 101 SOUTH HUNTINGTON AVE
> Launching new search...
> Defining search parameters
> Getting results...
  > Downloading file 1/11
  > Organizing...
> File 2018 - COO712195-0.tif saved to 'permit_pdfs/101 South Huntington Ave, Jamaica Plain MA 02130/101_SOUTH HUNTINGTON AVE/2018 - COO712195-0.tif'
  > Downloading file 2/11
  > Big document: splicing first 10 pages
  > Organizing...
> File 2018 - COO712195-1_first10pg.tif saved to 'permit_pdfs/101 South Huntington Ave, Jamaica Plain MA 02130/101_SOUTH HUNTINGTON AVE/2018 - C

KeyboardInterrupt: 