In [5]:
import os, glob, json, time, requests, datetime, contextlib, argparse
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Working directory at start-up; downloaded pages and output files live beneath it.
WD            = os.getcwd()
# Upper bound on tries per property page before the fetch loop gives up.
HTTP_ATTEMPTS = 1000

def isnotebook():
    """Report whether the code is running in a Jupyter notebook or qtconsole.

    Returns:
        True only when the active IPython shell class is named
        'ZMQInteractiveShell'. A terminal IPython session, any other shell
        type, and a plain Python interpreter (where `get_ipython` is not
        defined) all yield False.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # `get_ipython` only exists when IPython injected it.
        return False
    return shell_name == 'ZMQInteractiveShell'

# Pick the progress-bar flavour matching the front end: the plain tqdm bars
# outside a notebook, the notebook-specific ones inside.
if not isnotebook():
    from tqdm import tqdm, trange
else:
    from tqdm.notebook import tqdm, trange

In [14]:
''' Fetching files '''

def get_headers(url='https://propaccess.taylor-cad.org/clientdb/?cid=1', timeout=30):
    """Open a fresh session against the site and build request headers from it.

    Args:
        url: Page requested in order to receive the session cookies.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error, instead of blocking indefinitely.

    Returns:
        dict with a 'cookie' entry (every cookie the session received, joined
        as 'name=value; name=value') and a fixed desktop-browser 'user-agent'.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # The context manager closes the session's pooled connections on exit;
    # only the cookies are needed afterwards.
    with requests.Session() as session:
        session.get(url, timeout=timeout)
        session_cookies = session.cookies.get_dict()

    cookie_string = '; '.join(f'{name}={value}' for name, value in session_cookies.items())
    return {
        'cookie': cookie_string,
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
    }


if __name__ == '__main__':

    # Downloaded pages are stored as <WD>/data/<prop_id>.html
    os.makedirs(f'{WD}/data/', exist_ok=True)
    headers = get_headers()

    if isnotebook():
        # Inside a notebook the id range is fixed rather than parsed from argv.
        begin_id, end_id = 10000, 100000
    else:
        # NOTE(review): the CLI default for -end_id (1000000) is ten times the
        # notebook range above (100000) -- confirm which one is intended.
        parser = argparse.ArgumentParser(description='Fetcher range')
        parser.add_argument('-begin_id', type=int, help='starting id', required=False, default=10000)
        parser.add_argument('-end_id', type=int, help='ending id', required=False, default=1000000)
        args = parser.parse_args()
        begin_id, end_id = args.begin_id, args.end_id

    for prop_id in trange(begin_id, end_id):
        url   = f'https://propaccess.taylor-cad.org/ClientDB/Property.aspx?prop_id={prop_id}'
        fname = f'{WD}/data/{prop_id}.html'

        # Retry on stale sessions (non-OK responses) and on transient network
        # errors alike. Before every retry: wait a second, then start a fresh
        # session. A network exception counts as one failed attempt instead of
        # aborting the whole crawl.
        for trial in range(HTTP_ATTEMPTS):
            if trial:
                time.sleep(1)
                # If the refresh itself fails, keep the old headers; the next
                # attempt will try to refresh again.
                with contextlib.suppress(requests.exceptions.RequestException):
                    headers = get_headers()

            try:
                # The timeout keeps a dead connection from hanging the crawl.
                response = requests.get(url, headers=headers, timeout=30)
            except requests.exceptions.RequestException:
                continue

            if response.ok:
                break
        else:
            # Every attempt failed (also reached when HTTP_ATTEMPTS is 0).
            raise Exception(f'Connection timeout at prop_id={prop_id}')

        if 'Property not found.' in response.text:
            with contextlib.suppress(FileNotFoundError): # delete file, if property was removed from the county website
                os.remove(fname)
        else:
            with open(fname, 'wb') as f:
                f.write(response.content)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=90000.0), HTML(value='')))




In [30]:
''' Parsing files '''

if __name__ == '__main__':

    # One dict per successfully parsed property page; also used by later cells.
    total_list = []

    # Delinquency is read from the previous calendar year's TOTAL row; the
    # label is the same for every file, so build it once.
    prior_year_label = f'{datetime.datetime.now().year-1} TOTAL'

    for fname in tqdm(sorted(glob.glob(f'{WD}/data/*.html'))):

        with open(fname, 'rb') as f:
            html_text = f.read()

        # Skipping personal property, mobile homes, etc.
        if b'Type:</td><td>Real' not in html_text or b'No land segments' in html_text:
            continue

        inactive = b'(INACTIVE)' in html_text

        # Pages without a usable tax table are skipped. `except Exception`
        # (rather than a bare except) lets Ctrl-C / a kernel interrupt stop the run.
        try:
            soup             = BeautifulSoup(html_text, 'html.parser')
            tax_object       = soup.find(id="taxDueDetails_dataSection")
            table_entries    = tax_object.find_all('td')
            second_total_idx = [index for index, s in enumerate(table_entries) if 'TOTAL' in s.text][1]
            # text[1:] drops the first character (presumably a currency sign -- TODO confirm).
            # NOTE(review): float() rejects thousands separators; if amounts can contain
            # commas, such pages are silently skipped here -- confirm against real pages.
            recent_penalty   = float(table_entries[second_total_idx+5].text[1:])
        except Exception:
            continue

        try:
            idx = next(i for i, cell in enumerate(table_entries) if prior_year_label in cell.text)
            recent_delinquency = float(table_entries[idx+7].text[1:])
        except Exception: # mostly ValueError and StopIteration
            recent_delinquency = 0.0

        try:
            school_line = next(x.text for x in table_entries if 'ISD' in x.text)
            # First whitespace-separated token of the first cell mentioning 'ISD'.
            school = school_line.split()[0]
        except StopIteration:
            school = 'Unknown'

        # The tax-delinquency filter is currently disabled: every parsed
        # property is kept, including those with recent_penalty == 0.

        # Fields are picked by fixed position among the <td> cells of the
        # property-details section.
        property_details  = soup.find(id="propertyDetails").find_all('td')
        prop_id           = int(property_details[2].text)
        legal_description = property_details[4].text
        property_use      = property_details[18].text
        prop_address      = property_details[23].text
        owner_name        = property_details[34].text
        owner_address     = ', '.join([s.strip() for s in property_details[38].strings])
        # No 'HS' in the last cell (presumably a homestead exemption code -- TODO confirm).
        absentee          = 'HS' not in property_details[-1].text
        empty_land        = b'No improvements exist for this property.' in html_text

        # The land cells are walked `stride` at a time; offsets 2 and 3 within
        # each group are read as the segment's type and area.
        land_details      = soup.find(id="landDetails").find_all('td')
        land_textarray    = [s.text for s in land_details]
        stride            = 9
        land_types        = land_textarray[2::stride]
        land_areas        = [float(s) for s in land_textarray[3::stride]]
        land_area         = sum(land_areas)
        land_dict         = dict(zip(land_types, land_areas))
        zoning            = land_types[0] if len(land_types)==1 else 'Mixed' if land_types else 'Unknown'

        prop_dict         = {
                                'prop_id'          : prop_id,
                                'legal_description': legal_description,
                                'prop_address'     : prop_address,
                                'owner_name'       : owner_name,
                                'owner_address'    : owner_address,
                                'absentee'         : absentee,
                                'empty_land'       : empty_land,
                                'property_use'     : property_use,
                                'zoning'           : zoning,
                                'land_area'        : land_area,
                                'land_dict'        : land_dict,
                                'recent_penalty'   : recent_penalty,
                                'recent_delinq'    : recent_delinquency,
                                'school'           : school,
                                'inactive'         : inactive
                            }

        total_list.append(prop_dict)

    # Only write the JSON file when at least one property was parsed.
    if total_list:
        with open(f'{WD}/output.json', 'w') as json_f:
            json.dump(total_list, json_f)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60361.0), HTML(value='')))




In [31]:
# One row per parsed property, one column per key of the record dicts.
df = pd.DataFrame(data=total_list)

In [32]:
# Flat exports of the table, without the DataFrame index column.
df.to_csv(f'{WD}/output.csv', index=False)
# Written as .xlsx: pandas 2.0 removed the xlwt engine, so saving to a
# '.xls' path raises there. Needs an xlsx writer engine such as openpyxl.
df.to_excel(f'{WD}/output.xlsx', index=False)