In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
# Detail page of a single project, used below to develop the extraction code.
url = "https://www.phius.org/certified-project-database/santaella-gardens"

# Close the HTTP response deterministically instead of leaving the connection
# open until the response object happens to be garbage collected.
with urlopen(url) as response:
    html = response.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the parsed tree can differ
# from one machine to the next.
onesoup = BeautifulSoup(html, "html.parser")

In [213]:
# All elements of the page that carry the CSS class "location"
loc = onesoup.find_all(class_='location')

In [212]:
onesoup

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#"><head>
    <meta charset="utf-8"/>
<link href="https://www.phius.org/certified-project-database/fairplay" rel="canonical"/>
<meta content="website" property="og:type"/>
<meta content="Drupal 9 (https://www.drupal.org)" name="Generator"/>
<meta content="width" name="MobileOptimized"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="https://www.phius.org/themes/custom/phius/assets/images/phius-og-image-1200x630.jpg" property="og:image"/>
<link href="/core/misc/favicon.ico" rel="icon" type="image/vnd.microsoft.icon"/>

    <title>Fairplay | PHIUS</title>

    <link href="https://d1azc1qln24ryf.cloudfront.net/55136/Phiusorg/style-cf.css?iix82e" rel="stylesheet"/>
    <link href="/themes/custom/phius/assets/pwa/icons/favicon.ico" rel="icon"/>
    <link href="/themes/custom/phius/assets/pwa/icons/favicon.svg" rel="icon" type="image/sv

In [None]:
# The code below reads loc[0], so insist on exactly one "location" element;
# zero or several matches mean the page does not look the way we expect.
if len(loc) != 1:
    raise ValueError(f"Expected exactly one location element, found {len(loc)}: {loc}")

In [218]:
loc[0].get_text().strip()

'Fairplay, Colorado'

In [208]:
def extract_structured_data(soup):
    '''Extract all "structured-data" fields from a PHIUS project page

    On the project page, most of the data on a project is stored like this:

      <ul class="structured-data">
        <li>
          <div class="label">Annual Heating Demand</div>
          <div class="value">2.11</div>
        </li>
      [...]

    Some of the data comes in formats that could be made more useful
    (e.g. the date could be parsed into ISO format or the area converted to
    a number), but that is outside the scope of this function: every value
    is returned as the text found on the page.

    Parameters
    ----------
    soup :
        BeautifulSoup4 parsed html page

    Returns
    -------
    extracted_data : dict
        Maps the text of each "label" element to the text of the "value"
        element in the same list item. If a label occurs more than once,
        the last occurrence wins. List items that lack either element are
        skipped.
    '''
    extracted_data = {}
    # Search the page that was passed in, not a notebook-level variable, so
    # the result always belongs to the `soup` argument.
    for block in soup.find_all(attrs={'class': 'structured-data'}):
        for item in block.find_all('li'):
            label = item.find(attrs={'class': 'label'})
            value = item.find(attrs={'class': 'value'})
            if label is None or value is None:
                # Incomplete item without a label/value pair: nothing to record
                continue
            extracted_data[label.text] = value.text
    return extracted_data

In [49]:
# NOTE(review): in the visible cells `extracted_data` is only assigned inside
# functions, so this notebook-level name presumably comes from a cell that is
# no longer here - confirm before relying on a fresh top-to-bottom run.
extracted_data

{'Building Function': 'Multifamily',
 'Project Type': 'New Construction',
 'ASHRAE Climate Zone': '4A - Mixed - Humid',
 'Construction Completion': '2019',
 'Status': 'Final Certified',
 'INT. Conditioned Floor Area': '214543 sq. ft.',
 'Phius Certified Verifier': 'Avery Gray',
 'Phius CPHC (Lead)': 'Carmel Pratt',
 'Project Submitter': 'Jordana Viuker',
 'Construction Company': '1675 JV Associates LLC',
 'Construction Type': 'Masonry',
 'Number of Units': '249',
 'Number of Stories': '11',
 'Program Version': 'PHIUS+ 2015',
 'Design Certification': 'March 29, 2019',
 'Final Certification': 'March 25, 2022',
 'Region': 'Northeast',
 'Annual Heating Demand': '2.11',
 'Annual Cooling Demand': '5.18',
 'Peak Heating Load': '3.36',
 'Peak Cooling Load': '2.69',
 'Site Energy Use Index (EUI)': '24.8',
 'Site Energy Use Index (w/Renewables)': '21.82',
 'Compliance Software': 'WUFI Passive',
 'Source Energy (Residential)': '6327',
 'Source Energy (Residential w/Renewables)': '5360',
 'Air Tig

In [82]:
# Overview page of the project database; the query asks for page 1 with a
# limit of 10000 entries.
url = "https://www.phius.org/certified-project-database?_page=1&keywords=&_limit=10000"

# Close the HTTP response deterministically instead of leaving the connection
# open until the response object happens to be garbage collected.
with urlopen(url) as response:
    html = response.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the parsed tree can differ
# from one machine to the next.
allsoup = BeautifulSoup(html, "html.parser")

In [199]:
def extract_project_data(soup):
    '''Extract data from a single project in the overview page

    On the overview page, each project is stored like this:
    ```
    <article class="teaser js-link-event" data-has-image="false" data-result-type="project">
    [...]
    <span class="status final-certified">Final Certified</span></div>
    [...]
    <a class="js-link-event-link" href="/certified-project-database/tenney-residence">Tenney Residence</a>
    [...]
    <span class="building-function">Single-Family</span>
    <span class="project-type">New Construction</span></span>
    <span class="climate-zone">5B - Cool - Dry</span><div class="stats">
    <span class="sq-ft">3450 sq. ft.</span>
    [...]
    ```

    Most of the fields in the input data are just copied verbatim into a dict,
    but some fields are processed: the URL of the project detail page is made
    absolute, and the completion year and floor area are converted to numbers.

    Parameters
    ----------
    soup :
        BeautifulSoup4 element holding a single project entry

    Returns
    -------
    extracted_data : dict
        Always contains 'link' and 'title'. The remaining keys are only
        present when the corresponding element exists in the entry;
        'Construction Completion' (int) additionally requires a four-digit
        year in the text, and 'Floor area' (float) is derived from
        'INT. Conditioned Floor Area'.
    '''
    import re  # local import so this cell does not depend on a later cell

    extracted_data = {}
    atag = soup.find('a')
    extracted_data['link'] = 'https://www.phius.org' + atag.get('href')
    extracted_data['title'] = atag.get_text()
    # Each pair is (key used in "structured-data" on the project detail pages,
    #               CSS class used for the same field on this overview page)
    for key, css_class in [('Status', 'status'),
                           ('Building Function', 'building-function'),
                           ('Project Type', 'project-type'),
                           ('ASHRAE Climate Zone', 'climate-zone'),
                           ('INT. Conditioned Floor Area', 'sq-ft'),
                           ]:
        tag = soup.find(attrs={'class': css_class})
        if tag:
            extracted_data[key] = tag.get_text()

    completion = soup.find(attrs={'class': 'completion-date'})
    # Text is something like "Completed 2019", but we only want the year.
    # Searching for the digits (instead of splitting on single spaces) keeps
    # this working when the text carries extra whitespace or line breaks.
    if completion:
        year = re.search(r'\d{4}', completion.get_text())
        if year:
            extracted_data['Construction Completion'] = int(year.group())
    if 'INT. Conditioned Floor Area' in extracted_data:
        # Drop the unit and any thousands separators before converting
        area_text = extracted_data['INT. Conditioned Floor Area']
        area_text = area_text.replace('sq. ft.', '').replace(',', '')
        extracted_data['Floor area'] = float(area_text)
    return extracted_data

In [200]:
# NOTE(review): `projectlist` is only assigned in a later cell of this
# notebook, so this cell fails on a fresh top-to-bottom run.
extract_project_data(projectlist[123])

{'link': 'https://www.phius.org/certified-project-database/modern-ocean-view-pei',
 'title': 'Modern Ocean View PEI',
 'Status': 'Final Certified',
 'Building Function': 'Single-Family',
 'Project Type': 'New Construction',
 'ASHRAE Climate Zone': '6A - Cool - Humid',
 'INT. Conditioned Floor Area': '3470 sq. ft.',
 'Floor area': 3470.0}

In [201]:
projectlist = allsoup.find_all('article', attrs={'data-result-type': "project"})

In [202]:
assert len(projectlist) > 600

In [203]:
# Index the overview entries by project title.
all_projects = {}
for p in projectlist:
    out = extract_project_data(p)
    title = out.pop('title')
    if title in all_projects:
        # Titles are the dict keys, so a repeated title replaces the earlier
        # project; report it instead of losing that entry without a trace.
        print(f'Duplicate project title {title!r}: overwriting earlier entry')
    all_projects[title] = out

In [206]:
v

{'link': 'https://www.phius.org/certified-project-database/santaella-gardens',
 'Status': 'Final Certified',
 'Building Function': 'Multifamily',
 'Project Type': 'New Construction',
 'ASHRAE Climate Zone': '4A - Mixed - Humid',
 'INT. Conditioned Floor Area': '214543 sq. ft.',
 'Construction Completion': 2019,
 'Floor area': 214543.0}

In [209]:
# Fetch the detail page of every project and merge its structured data into
# the entry collected from the overview page.
for p, v in all_projects.items():
    # This loop opens one connection per project, so close each response
    # right away instead of waiting for garbage collection.
    with urlopen(v['link']) as response:
        html = response.read()
    # Explicit parser, so the result does not depend on which optional
    # parsers are installed. This rebinds the notebook-level `onesoup`
    # on every iteration.
    onesoup = BeautifulSoup(html, "html.parser")
    # update() lets detail-page text replace same-named overview values, e.g.
    # 'Construction Completion' changes from the int 2019 to the string '2019'.
    v.update(extract_structured_data(onesoup))

In [210]:
all_projects

{'Santaella Gardens': {'link': 'https://www.phius.org/certified-project-database/santaella-gardens',
  'Status': 'Final Certified',
  'Building Function': 'Multifamily',
  'Project Type': 'New Construction',
  'ASHRAE Climate Zone': '4A - Mixed - Humid',
  'INT. Conditioned Floor Area': '214543 sq. ft.',
  'Construction Completion': '2019',
  'Floor area': 214543.0,
  'Phius Certified Verifier': 'Avery Gray',
  'Phius CPHC (Lead)': 'Carmel Pratt',
  'Project Submitter': 'Jordana Viuker',
  'Construction Company': '1675 JV Associates LLC',
  'Construction Type': 'Masonry',
  'Number of Units': '249',
  'Number of Stories': '11',
  'Program Version': 'PHIUS+ 2015',
  'Design Certification': 'March 29, 2019',
  'Final Certification': 'March 25, 2022',
  'Region': 'Northeast',
  'Annual Heating Demand': '2.11',
  'Annual Cooling Demand': '5.18',
  'Peak Heating Load': '3.36',
  'Peak Cooling Load': '2.69',
  'Site Energy Use Index (EUI)': '24.8',
  'Site Energy Use Index (w/Renewables)': '

In [211]:
len(all_projects)

622

In [219]:
from geopy.geocoders import Nominatim
from bs4 import BeautifulSoup

geolocator = Nominatim(user_agent="PassiveHouseDatabaseHarvester")


In [228]:
geolocator.geocode("Narragansett, Rhode Island, USA")

Location(Narragansett, South County, Rhode Island, United States, (41.4501021, -71.4495005, 0.0))

In [225]:
geolocator.geocode("Somerville, MA, USA")

Location(Somerville, Middlesex County, Massachusetts, United States, (42.3875968, -71.0994968, 0.0))

In [None]:
https://www.phius.org/certified-project-database/hawkins-yurgalevitch-passive-house
    Naragansett, Rhode Island -> Narragansett, Rhode Island (double "r")
https://www.phius.org/certified-project-database/schickler-lane-adu
    Belingham, Washington -> Bellingham, Washington (double "l")
https://www.phius.org/certified-project-database/295-painter-hill
    Wurtsburo, New York -> Wurtsboro, New York ("o" instead of "u")
https://www.phius.org/certified-project-database/summer-park
    and
https://www.phius.org/certified-project-database/summer-park-phase-2
    Hannover, New Hampshire -> Hanover, New Hampshire (only one "n")
https://www.phius.org/certified-project-database/stone-ridge-building
    Springettbury, Pennsylvania -> Springettsbury, Pennsylvania (missing "s")
    note that the spelling is correct for building B in the same project: https://www.phius.org/certified-project-database/stone-ridge-building-b
https://www.phius.org/certified-project-database/whisper-house
    Barnegate, New Jersey -> Barnegat, New Jersey (one "e" needs to be removed)
https://www.phius.org/certified-project-database/esta-bien-clarence-trust-property    
    Geneva (Jainsville/Rock County, WI climate data set), Wisconsin ->
    This one is in Geneva (Jainsville/Rock County), Wisconsin, but for some reason
    the words "climate data set" and an extra "WI" also appear in the location

In [232]:
import json

def add_misspelled_location(misspelled, correct,
                            filename="data/known_coords.json", geocoder=None):
    '''Store coordinates under a location name the geocoder cannot resolve

    The correctly spelled name is geocoded and the resulting point is saved
    in a JSON file under the misspelled name, so that a later lookup of the
    misspelled string finds coordinates.

    Parameters
    ----------
    misspelled : str
        Location string to use as key in the JSON file
    correct : str
        Query string that is sent to the geocoder
    filename : str
        JSON file that maps location strings to GeoJSON "Point" objects.
        The file must already exist.
    geocoder : object, optional
        Object with a ``geocode(query, timeout=...)`` method whose result has
        ``longitude`` and ``latitude`` attributes. Defaults to the
        notebook-level ``geolocator``.

    Returns
    -------
    None
        The geocoding result is printed; the file is only rewritten when
        the geocoder found something.
    '''
    if geocoder is None:
        geocoder = geolocator

    geoloc = geocoder.geocode(correct, timeout=10)
    print(f'Found: {geoloc}')
    if not geoloc:
        # Nothing to add, so leave the file exactly as it is
        return

    with open(filename, 'r') as f:
        known_locs = json.load(f)

    # Coordinates are stored as [longitude, latitude]
    known_locs[misspelled] = {"type": "Point",
                              "coordinates": [geoloc.longitude, geoloc.latitude]}

    with open(filename, 'w') as f:
        json.dump(known_locs, f, indent=2)


In [233]:
add_misspelled_location("Naragansett, Rhode Island", "Narragansett, Rhode Island")

Found: Narragansett, South County, Rhode Island, United States


In [236]:
add_misspelled_location("Belingham, Washington", "Bellingham, Washington")

Found: Bellingham, Whatcom County, Washington, 98225-3243, United States


In [237]:
add_misspelled_location("Wurtsburo, New York", "Wurtsboro, New York")

Found: Wurtsboro, Town of Mamakating, Sullivan County, New York, United States


In [238]:
add_misspelled_location("Unicorporated Adams County, Colorado", "Adams County, Colorado")

Found: Adams County, Colorado, United States


In [239]:
add_misspelled_location("Shaw Island, San Juan Islands, Washington", "Shaw Island, Washington")

Found: Shaw Island, San Juan County, Washington, 98280, United States


In [240]:
add_misspelled_location("Valle de Guadalupe, Ensenada, BC", "Valle de Guadalupe, Ensenada")

Found: Valle de Guadalupe, Ensenada, Municipio de Ensenada, Baja California, 22785, México


In [241]:
add_misspelled_location("Hannover, New Hampshire", "Hanover, New Hampshire")

Found: Hanover, Grafton County, New Hampshire, United States


In [242]:
add_misspelled_location("Springettbury, Pennsylvania", "Springettsbury, Pennsylvania")

Found: Springettsbury Township, York County, Pennsylvania, United States


In [243]:
add_misspelled_location("Barnegate, New Jersey", "Barnegat, New Jersey")

Found: Barnegat, Barnegat Township, Ocean County, New Jersey, United States


In [247]:
add_misspelled_location("Lake Geneva (Jainsville/Rock County, WI climate data set), Wisconsin", 
                       "Geneva, Wisconsin")

Found: Lake Geneva, Walworth County, Wisconsin, United States


In [245]:
# Read the stored project data back from disk.
with open("data/PHIUS.json", 'r') as project_file:
    phius = json.load(project_file)

In [246]:
phius

{'Santaella Gardens': {'link': 'https://www.phius.org/certified-project-database/santaella-gardens',
  'Status': 'Final Certified',
  'Building Function': 'Multifamily',
  'Project Type': 'New Construction',
  'ASHRAE Climate Zone': '4A - Mixed - Humid',
  'INT. Conditioned Floor Area': '214543 sq. ft.',
  'Construction Completion': 2019,
  'Floor area': 214543.0},
 'Oak Tree Village - Building A': {'link': 'https://www.phius.org/certified-project-database/oak-tree-village-building',
  'Status': 'Final Certified',
  'Building Function': 'Multifamily',
  'Project Type': 'New Construction',
  'ASHRAE Climate Zone': '5A - Cool - Humid',
  'INT. Conditioned Floor Area': '45195 sq. ft.',
  'Construction Completion': 2021,
  'Floor area': 45195.0},
 'Silvergreen': {'link': 'https://www.phius.org/certified-project-database/silvergreen',
  'Status': 'Final Certified',
  'Building Function': 'Multifamily',
  'Project Type': 'New Construction',
  'ASHRAE Climate Zone': '4A - Mixed - Humid',
  'I

In [253]:
# Project data and the stored coordinates, both read from their JSON files.
with open("data/PHIUS.json", 'r') as project_file:
    known_projects = json.load(project_file)
with open("data/known_coords.json", 'r') as coords_file:
    known_locs = json.load(coords_file)


In [257]:
known_projects

{'Santaella Gardens': {'link': 'https://www.phius.org/certified-project-database/santaella-gardens',
  'Status': 'Final Certified',
  'Building Function': 'Multifamily',
  'Project Type': 'New Construction',
  'ASHRAE Climate Zone': '4A - Mixed - Humid',
  'INT. Conditioned Floor Area': '214543 sq. ft.',
  'Construction Completion': '2019',
  'Floor area': 214543.0,
  'Phius Certified Verifier': 'Avery Gray',
  'Phius CPHC (Lead)': 'Carmel Pratt',
  'Project Submitter': 'Jordana Viuker',
  'Construction Company': '1675 JV Associates LLC',
  'Construction Type': 'Masonry',
  'Number of Units': '249',
  'Number of Stories': '11',
  'Program Version': 'PHIUS+ 2015',
  'Design Certification': 'March 29, 2019',
  'Final Certification': 'March 25, 2022',
  'Region': 'Northeast',
  'Annual Heating Demand': '2.11',
  'Annual Cooling Demand': '5.18',
  'Peak Heating Load': '3.36',
  'Peak Cooling Load': '2.69',
  'Site Energy Use Index (EUI)': '24.8',
  'Site Energy Use Index (w/Renewables)': '

In [258]:
known_projects['Silvergreen']['Location']

{'type': 'Point', 'coordinates': [-75.07212510775798, 40.14273955]}

In [331]:
def point2multipoint(loc):
    '''Return a GeoJSON geometry with a "Point" turned into a "MultiPoint"

    Parameters
    ----------
    loc : dict
        GeoJSON-style geometry with at least the keys 'type' and
        'coordinates'.

    Returns
    -------
    dict
        For a 'Point', a new dict of type 'MultiPoint' whose coordinates are
        a one-element list holding the original coordinates. Any other
        geometry is returned unchanged. The input dict is never modified, so
        converting a location does not alter the project data it came from.
    '''
    if loc['type'] != 'Point':
        return loc
    # Shallow copy keeps any additional keys while leaving `loc` untouched
    converted = dict(loc)
    converted['type'] = 'MultiPoint'
    converted['coordinates'] = [loc['coordinates']]
    return converted

In [277]:
import datetime
import re

# Matches dates written like "March 25, 2022" and exposes the pieces as the
# named groups "month", "day" and "year".
datematch = re.compile(r"(?P<month>[a-zA-Z]*) (?P<day>\d{1,2}), (?P<year>\d{4})")

In [286]:
monthnames = ['January', 'February', 'March', 'April', 'May', 'June', 
              'July', 'August', 'September', 'October', 'November', 'December']

In [325]:
def parse_date(v):
    '''Return the final certification date of a project as ISO timestamp

    Parameters
    ----------
    v : dict
        Project data. The 'Final Certification' entry, if present, is
        expected to look like "March 25, 2022" (English month name).

    Returns
    -------
    str
        Midnight of the certification date in ISO format, e.g.
        '2022-03-25T00:00:00'. If the project has no 'Final Certification'
        entry, or the entry is not in the expected format, a placeholder
        date one year from now is returned instead.
    '''
    if 'Final Certification' in v:
        m = datematch.match(v['Final Certification'])
        # Only trust the match if the month is one of the known names;
        # otherwise fall through to the placeholder instead of crashing.
        if m and m['month'] in monthnames:
            return datetime.datetime(year=int(m['year']), day=int(m['day']),
                                     month=monthnames.index(m['month']) + 1).isoformat()
    # No usable certification date: place the project one year (365 days) in
    # the future, i.e. after every date that has actually been certified.
    return (datetime.datetime.now() + datetime.timedelta(days=365)).isoformat()

In [326]:
import folium
import folium.plugins

In [332]:
# Imported here under its own name because this notebook rebinds the plain
# name `html` to downloaded page content.
from html import escape

# Marker colour per certification status; the None entry doubles as the
# colour for projects without a status and for status values not listed here.
statuscolor = {'Pre-certified': "#C4F09E", 'Design Certified': "#C4F09E",
               'Certified': "#79BD9A", 'Final Certified': '#79BD9A',
               'Registered': "#FFFFFF", None: "#FFFFFF"}

# Build a GeoJSON FeatureCollection with one feature per project that has a
# known location.
out = {'type': "FeatureCollection", 'features': []}
for k, v in known_projects.items():
    if 'Location' not in v:
        print(f'Skipping {k} - unknown location')
        continue
    prop = {}
    status = v.get('Status', None)
    # An unexpected status string falls back to the default colour instead
    # of aborting the whole loop with a KeyError.
    prop["marker-color"] = statuscolor.get(status, statuscolor[None])

    # Projects with a floor area above 10000 get a large marker, all others
    # the "building" symbol.
    if v.get('Floor area', 0) > 10000:
        prop["marker-size"] = "large"
    else:
        prop["marker-symbol"] = "building"

    # Title, link and field values are scraped text, so escape them before
    # they are placed into HTML.
    link = v['link']
    prop['name'] = f"<a href='{escape(link, quote=True)}'>{escape(k)}</a>"

    rows = ['<tr><td>Certified by</td><td><a href="https://www.phius.org">PHIUS</a></td></tr>']
    for col in ['Project Type', 'Building Function', 'Construction Type', 'INT. Conditioned Floor Area']:
        if col in v:
            rows.append(f'<tr><td><strong>{col}</strong></td><td>{escape(str(v[col]))}</td></tr>')
    prop['description'] = '<table>' + ''.join(rows) + '</table>'
    # One timestamp for the single coordinate of the feature
    prop['times'] = [parse_date(v)]

    out['features'].append({"type": "Feature",
                            "geometry": point2multipoint(v['Location']),
                            "properties": prop})

Skipping Heritage Point - unknown location
Skipping test project - unknown location
Skipping Far View PH Residence - unknown location
Skipping Robles Residence - unknown location
Skipping Norman home addition - unknown location
Skipping OPRC 7 Van Buren - unknown location
Skipping 3 Van Horne - unknown location
Skipping Wharton-Kelleher Residence - unknown location
Skipping Harvey West Studios - unknown location
Skipping Brooks Residence - unknown location
Skipping Russell Home - unknown location
Skipping Bronzeville Estates - Bronzeville House - unknown location
Skipping Bronzeville Estates - 6th St Duplex(2) - unknown location
Skipping Bronzeville Estates - 6th St Duplex(1) - unknown location
Skipping Bronzeville Estates - 5th St Duplex(2) - unknown location
Skipping Bronzeville Estates - 5th St Duplex(1) - unknown location
Skipping Pan American Square - unknown location
Skipping ZBrains Project - unknown location
Skipping Vaughan Residence - unknown location


In [333]:
out

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'geometry': {'type': 'MultiPoint',
    'coordinates': [[-73.8785937, 40.8466508]]},
   'properties': {'marker-color': '#79BD9A',
    'marker-size': 'large',
    'name': "<a href='https://www.phius.org/certified-project-database/santaella-gardens'>Santaella Gardens</a>",
    'description': '<table><tr><td>Certified by</td><td><a href="https://www.phius.org">PHIUS</a></td></tr><tr><td><strong>Project Type</strong></td><td>New Construction</td></tr><tr><td><strong>Building Function</strong></td><td>Multifamily</td></tr><tr><td><strong>Construction Type</strong></td><td>Masonry</td></tr><tr><td><strong>INT. Conditioned Floor Area</strong></td><td>214543 sq. ft.</td></tr></table>',
    'times': ['2022-03-25T00:00:00']}},
  {'type': 'Feature',
   'geometry': {'type': 'MultiPoint',
    'coordinates': [[-71.9622443, 41.6035026]]},
   'properties': {'marker-color': '#79BD9A',
    'marker-size': 'large',
    'name': "<a href='htt

In [336]:
layer

<folium.plugins.timestamped_geo_json.TimestampedGeoJson at 0x10fe13160>

In [338]:
# Base map with a scale bar, centred on (40, -75).
m = folium.Map(location=(40, -75), zoom_start=4, control_scale=True)

# Options of the time slider layer, collected in one place.
timeline_options = dict(
    transition_time=200,
    loop=True,
    auto_play=True,
    add_last_point=True,
    period='P1M',
    min_speed=0.1,
    max_speed=10,
    loop_button=False,
    date_options='YYYY-MM-DD HH:mm:ss',
    time_slider_drag_update=False,
    duration=None,
)
layer = folium.plugins.TimestampedGeoJson(out, **timeline_options)
m.add_child(layer)

# Fullscreen control on top of the timeline layer
fullscreen = folium.plugins.Fullscreen()
m.add_child(fullscreen)

# Last expression of the cell: display the map
m

In [None]:
def folium_map(df, color={0: '#55AAFF', 1: '#00F'},
               tformat='%a %d.%-m.%Y %-H:%M', timestamped=True):
    '''Draw the positions in ``df`` as coloured line segments on a folium map

    Parameters
    ----------
    df :
        Table with the columns 'LAT', 'LON' and 'Segel' and an index whose
        entries provide ``strftime`` (presumably a pandas DataFrame with a
        datetime index - confirm with the caller).
    color : dict
        Maps the median 'Segel' value of a segment to the line colour.
    tformat : str
        ``strftime`` format for the start and end time in the tooltip.
    timestamped : bool
        If true, also add a time slider layer built by ``geojson``.

    Returns
    -------
    The folium map, centred on the mean of 'LAT' and 'LON'.
    '''
    import folium
    import folium.plugins

    # The centre is computed from all rows, before duplicates are dropped
    center = (df['LAT'].mean(), df['LON'].mean())
    track_map = folium.Map(location=center, control_scale=True, zoom_start=14)

    positions = df.drop_duplicates(subset=['LAT', 'LON'])
    # The running sum of |change in 'Segel'| stays constant while 'Segel'
    # does, so it labels consecutive rows with the same 'Segel' value.
    # NOTE(review): the label of the first row is presumably NaN (no
    # predecessor to diff against) - check whether groupby drops that row.
    segment_label = positions['Segel'].diff().abs().cumsum()
    for _, segment in positions.groupby(segment_label):
        first_time = segment.index[0].strftime(tformat)
        last_time = segment.index[-1].strftime(tformat)
        line = folium.PolyLine(list(zip(segment['LAT'], segment['LON'])),
                               popup=None,
                               tooltip=first_time + ' to ' + last_time,
                               color=color[segment['Segel'].median()])
        line.add_to(track_map)

    if timestamped:
        timelayer = folium.plugins.TimestampedGeoJson(geojson(positions),
                                                      period='PT30S',
                                                      duration='PT1M')
        track_map.add_child(timelayer)

    return track_map


# NOTE(review): the same two statements already appear at the end of the map
# cell above; they rely on the notebook-level `folium` import and on the map
# `m` created in that cell.
fullscreen = folium.plugins.Fullscreen()
m.add_child(fullscreen)