# Property Record Card Scraper

In [154]:
import urllib
from bs4 import BeautifulSoup
import csv
import requests
import cssutils
from neo4j import *
import csv
import time

In [251]:

# Cypher query that gathers the id of every Property node that has one into a
# single list. `IS NOT NULL` is used instead of `EXISTS(p.id)`: it is the
# equivalent test for "property is present" and, unlike the function form,
# is still accepted by Neo4j 5.
query = """
MATCH (p:Property) WHERE p.id IS NOT NULL
RETURN COLLECT(p.id) AS geocodes
"""

# NOTE(review): connection details are hard-coded; confirm these are
# local-development credentials only before sharing this file.
uri = "bolt://localhost:7687"
user = "neo4j"
password = "letmein"

# Fallback so later cells still have a list to work with if no record comes back.
geocodes = []

# Both the driver and the session are context managers; using them this way
# closes the connection pool once the geocodes have been read instead of
# leaving it open for the rest of the run.
with GraphDatabase.driver(uri, auth=(user, password)) as driver, \
        driver.session() as session:
    # COLLECT() without grouping aggregates everything into one row.
    record = session.run(query).single()
    if record is not None:
        geocodes = record.get("geocodes")

In [252]:
# Sanity check: number of geocodes returned by the query above.
len(geocodes)

49504

In [158]:
def format_geocode(id):
    """Return the dashed form of a geocode, e.g. ``06-0062-34-2-14-14-0000``.

    The input is cut into seven consecutive segments of 2, 4, 2, 1, 2, 2 and
    4 characters, which are joined with hyphens. Characters past index 17 are
    ignored; a shorter input yields empty trailing segments.
    """
    # Consecutive slice boundaries: each adjacent pair delimits one segment.
    cuts = (0, 2, 6, 8, 9, 11, 13, 17)
    segments = [id[lo:hi] for lo, hi in zip(cuts, cuts[1:])]
    return '-'.join(segments)

In [159]:
# Label text as it appears on the record card -> short field name.
_LABEL_TO_FIELD = {
    'Property Category:': "category",
    'Subcategory:': "subcategory",
    'PropertyAddress:': "address",
    'Subdivision:': "subdivision",
    'Last Modified:': "modified",
    'Neighborhood:': "neighborhood",
    'Square Feet:': "lot_size",
    'Acres:': "acres",
    'Year Built:': "year_built",
    'Style:': "style",
    'System Type:': "heating",
    'SFLA:': "sqft",
    'Bedrooms:': "bedrooms",
    'Full Baths:': "full_baths",
    'Half Baths:': "half_baths",
}


def get_key(long_key):
    """Translate a record-card label into its short field name.

    Returns ``None`` when ``long_key`` is not one of the known labels.
    """
    return _LABEL_TO_FIELD.get(long_key)

In [210]:
def get_appraisal_obj(appraisal_list):
    """Build an appraisal dict from the first five cells of a table row.

    Each element of ``appraisal_list`` must expose a ``.text`` attribute. The
    cells are read positionally as year, land, building, total and method.
    Raises ``IndexError`` if fewer than five cells are supplied.
    """
    column_names = ('year', 'land', 'building', 'total', 'method')
    # Index explicitly (rather than zip) so a short row still raises IndexError.
    return {
        name: appraisal_list[position].text
        for position, name in enumerate(column_names)
    }

In [243]:
# Example record-card URL:
# http://svc.mt.gov/msl/MTCadastral/PrintPropertyRecordCard/GetPropertyRecordCardData?geocode=06-0062-34-2-14-14-0000&year=2020
def get_appraisals_for_geocode(geocode, year=2020, timeout=30):
    """Fetch the record card for ``geocode`` and return its appraisal rows.

    Args:
        geocode: Undashed geocode string; it is dashed with ``format_geocode``
            before being placed in the URL.
        year: Value sent as the ``year`` query parameter (default 2020).
        timeout: Seconds to wait for the server before giving up, so one
            stalled connection cannot hang a long scraping run.

    Returns:
        A list of dicts, one per complete table row found inside the
        ``appraisalsPane`` div, each with the keys produced by
        ``get_appraisal_obj`` plus ``'id'`` set to ``geocode``. The list is
        empty when the pane is missing or holds no complete row.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    cells_per_row = 5

    url = (
        "http://svc.mt.gov/msl/MTCadastral/PrintPropertyRecordCard/GetPropertyRecordCardData"
        "?geocode=" + format_geocode(geocode) + "&year=" + str(year)
    )
    response = requests.get(url, timeout=timeout)
    # Do not try to parse an error page as if it were a record card.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    pane = soup.find('div', attrs={'id': 'appraisalsPane'})
    if pane is None:
        # No appraisal section on this card: report "no rows" instead of
        # failing with AttributeError on None.
        return []

    cells = pane.find_all('td')
    app_list = []
    # Step through the flat cell list one row at a time. The stop value leaves
    # out a trailing partial row, which would otherwise raise IndexError and
    # cost the caller every appraisal for this property.
    for start in range(0, len(cells) - cells_per_row + 1, cells_per_row):
        app_obj = get_appraisal_obj(cells[start:start + cells_per_row])
        app_obj['id'] = geocode
        app_list.append(app_obj)
    return app_list

In [160]:
# Example record-card URL:
# http://svc.mt.gov/msl/MTCadastral/PrintPropertyRecordCard/GetPropertyRecordCardData?geocode=06-0062-34-2-14-14-0000&year=2020
def get_values_for_geocode(geocode, year=2020, timeout=30):
    """Fetch the record card for ``geocode`` and return its feature fields.

    Args:
        geocode: Undashed geocode string; it is dashed with ``format_geocode``
            before being placed in the URL.
        year: Value sent as the ``year`` query parameter (default 2020).
        timeout: Seconds to wait for the server before giving up, so one
            stalled connection cannot hang a long scraping run.

    Returns:
        A dict with ``'id'`` set to ``geocode`` plus one entry for every
        recognised label on the page, keyed by the short name from
        ``get_key`` and holding the stripped text of the label's next
        sibling ``<span>``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url = (
        "http://svc.mt.gov/msl/MTCadastral/PrintPropertyRecordCard/GetPropertyRecordCardData"
        "?geocode=" + format_geocode(geocode) + "&year=" + str(year)
    )
    response = requests.get(url, timeout=timeout)
    # Do not try to parse an error page as if it were a record card.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    features = {'id': geocode}
    for label in soup.find_all('span', attrs={'class': 'key'}):
        # get_key() returns None for labels that are not of interest, so it is
        # the single source of truth for which labels get collected.
        key = get_key(label.text)
        if key is None:
            continue
        value_span = label.find_next_sibling('span')
        if value_span is not None:
            features[key] = value_span.text.strip()
    return features

In [168]:
# Accumulates one feature dict per successfully scraped geocode.
features_list = []

In [169]:
# Scrape the feature fields for every geocode, one request at a time.
# Geocodes that fail are remembered so they can be inspected or retried later
# instead of disappearing silently.
failed_feature_geocodes = []
for i in geocodes:
    try:
        feats = get_values_for_geocode(i)
    except Exception as exc:
        # `except Exception` rather than a bare `except:` so that an interrupt
        # (KeyboardInterrupt) can still stop this long-running loop.
        failed_feature_geocodes.append(i)
        print("Failed to fetch features for", i, "-", repr(exc))
        time.sleep(20)  # back off before moving on to the next geocode
    else:
        features_list.append(feats)
    time.sleep(0.5)  # pause between requests

In [170]:
# Sanity check: number of geocodes whose features were collected.
len(features_list)

49500

In [171]:
# Write the collected feature dicts to property_features.csv, one row each.
# The encoding is given explicitly so the output does not depend on the
# platform's default locale encoding. Fields missing from a dict are written
# as empty cells (DictWriter's default restval).
with open('property_features.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ["id","category","subcategory", "address", "subdivision", "modified", "neighborhood", "lot_size", "acres", "year_built", "style", "heating", "sqft", "bedrooms", "full_baths", "half_baths"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(features_list)

In [253]:
# Scrape the appraisal rows for every geocode, one request at a time.
# appraisals_list gets one entry (a list of row dicts) per geocode that was
# fetched successfully; failures are remembered so they can be inspected or
# retried later instead of disappearing silently.
appraisals_list = []
failed_appraisal_geocodes = []
for i in geocodes:
    try:
        appraisals = get_appraisals_for_geocode(i)
    except Exception as exc:
        # `except Exception` rather than a bare `except:` so that an interrupt
        # (KeyboardInterrupt) can still stop this long-running loop.
        failed_appraisal_geocodes.append(i)
        print("Failed to fetch appraisals for", i, "-", repr(exc))
        time.sleep(10)  # back off before moving on to the next geocode
    else:
        appraisals_list.append(appraisals)
    time.sleep(0.1)  # pause between requests

In [254]:
# Sanity check: number of geocodes whose appraisals were collected.
len(appraisals_list)

49496

In [255]:
# Flatten the per-property appraisal lists into appraisals_full.csv, one row
# per appraisal. The encoding is given explicitly so the output does not
# depend on the platform's default locale encoding.
with open('appraisals_full.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ["id","year","land", "building", "total", "method"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for prop in appraisals_list:
        # Each entry is the list of appraisal dicts for one property.
        writer.writerows(prop)