In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [3]:
# Exploratory check: fetch a single report page and inspect its table markup
# before scraping the full date range.
#
# Site:    https://www.rebgv.org/
# Section: /market-watch/
#
# Old URL format:
#   /MLS-HPI-home-price-comparison.hpi.<area>.<sub-area>.<property type>.<YYYY-m-d>.html
#   e.g. https://www.rebgv.org/market-watch/MLS-HPI-home-price-comparison.hpi.greater_vancouver.detached.2020-2-1.html
#   NOTE(review): the example above has only two segments between "hpi." and the
#   date, so the old pattern may really have been <area>.<property type> -- confirm.
#
# New URL format (as of August 23rd, 2020):
#   /MLS-HPI-home-price-comparison.hpi.<area>.<sub-area>.<property type, nullable>.<YYYY-m-d>.html
#   e.g. https://www.rebgv.org/market-watch/MLS-HPI-home-price-comparison.hpi.greater_vancouver.all.detached.2020-7-1.html

# Sample page: every area, sub-area and property type for July 2020.
linked = 'https://www.rebgv.org/market-watch/MLS-HPI-home-price-comparison.hpi.all.all.all.2020-7-1.html'

# Download the page and parse its HTML.
response = requests.get(linked)
page_html = response.text
soup = BeautifulSoup(page_html, 'html.parser')

# Per the original notes, the page's main div has class "hpi-graphics" and the
# table content lives in div(s) with class "table-wrapper"; collect the latter.
result = soup.find_all('div', class_='table-wrapper')
result

[<div class="table-wrapper">
 <div class="table-title-wrapper short">
 <h2 class="table-title">Home Price Index for Greater Vancouver</h2>
 <h3 class="table-subtitle">all</h3>
 <div class="table-date">July 2020</div>
 </div>
 <div id="hpiTable_wrapper">
 <div class="tbl-scroll">
 <table id="hpiTable">
 <!-- table will have 'Area' column if region == all -->
 <thead>
 <tr role="row">
 <th>Area</th>
 <th>Benchmark</th>
 <th>Price Index</th>
 <th>1 Month +/-</th>
 <th>6 Month +/-</th>
 <th>1 Year +/-</th>
 <th>3 Year +/-</th>
 <th>5 Year +/-</th>
 <th>Property Type</th>
 </tr>
 </thead>
 <tbody>
 <tr class="gv">
 <td>Greater Vancouver</td>
 <td>$1,031,400</td>
 <td>270.6</td>
 <td>0.6</td>
 <td>3.0</td>
 <td>4.5</td>
 <td>1.1</td>
 <td>45.5</td>
 <td>Residential - All Types</td>
 </tr>
 <tr class="">
 <td>Whistler</td>
 <td>$886,900</td>
 <td>206.7</td>
 <td>-2.9</td>
 <td>-2.8</td>
 <td>-0.7</td>
 <td>10.6</td>
 <td>62.6</td>
 <td>Residential - All Types</td>
 </tr>
 <tr class="">
 <td>W

In [None]:
# Scrape the MLS HPI comparison table for every month from 2005-1 through
# 2020-12 and collect one record (a dict) per table row in `home_dict`.
#
# The loop is best-effort: a month whose page cannot be downloaded, or whose
# page has no "table-wrapper" div, is reported and skipped so that one bad
# month does not abort the whole run. Only network errors are caught --
# unlike a bare `except:`, this leaves Ctrl-C (KeyboardInterrupt) and genuine
# programming errors visible.
home_dict = []

# Column names, in the order the <td> cells appear in each table row.
HPI_COLUMNS = [
    "Area",
    "Benchmark",
    "Price Index",
    "1 Month +/-",
    "6 Month +/-",
    "1 Year +/-",
    "3 Year +/-",
    "5 Year +/-",
    "Property Type",
]

# Seconds to wait for the server before giving up on a single request;
# without a timeout, requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

for year in range(2005, 2021):
    for month in range(1, 13):
        print(str(year) + "-" + str(month))
        url = 'https://www.rebgv.org/market-watch/MLS-HPI-home-price-comparison.hpi.all.all.all.' + str(year) + '-' + str(month) + '-1.html'
        print(url)

        try:
            r = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Connection problems and timeouts: skip this month, keep going.
            print("  skipped (request failed): " + str(exc))
            continue

        soup = BeautifulSoup(r.text, 'html.parser')
        home_table = soup.find('div', class_="table-wrapper")
        if home_table is None:
            # No report table on this page (e.g. a month with no published data).
            print("  skipped (no table-wrapper div found)")
            continue

        for home in home_table.find_all('tbody'):
            for row in home.find_all('tr'):
                # Look the cells up once per row instead of once per column.
                cells = row.find_all('td')
                if len(cells) < len(HPI_COLUMNS):
                    # Row does not have the expected nine columns; indexing
                    # it would fail, so report it and move on to the next row.
                    print("  skipped a row with " + str(len(cells)) + " cells")
                    continue

                # zip() stops after the last named column, so any extra
                # trailing cells are ignored.
                home_obj = {
                    column: cell.text
                    for column, cell in zip(HPI_COLUMNS, cells)
                }
                home_obj["Report Month"] = month
                home_obj["Report Year"] = year
                home_dict.append(home_obj)

In [None]:
# Build one DataFrame from the scraped records, then write one CSV per
# property type (without the pandas index, with the header row).
vhd = pd.DataFrame(home_dict)


def _export_property_type(frame, property_type, label):
    """Filter `frame` to one property type, sort it, and write it to CSV.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing at least the "Property Type" and "Area" columns.
    property_type : str
        Exact value of the "Property Type" column to keep.
    label : str
        Text placed in parentheses in the output file name.

    Returns
    -------
    pandas.DataFrame
        The filtered and sorted rows that were written to disk.
    """
    subset = frame.loc[frame["Property Type"] == property_type]
    subset = subset.sort_values(["Area", "Property Type"], ascending=True)
    subset.to_csv(
        "Vancouver Real Estate Board Data (" + label + ") 200501 - 202007.csv",
        index=False,
        header=True,
    )
    return subset


# Four property types: Residential - All Types, Apartment, Detached and Townhouse.
# Each result keeps its own name because `alltypes` is used by a later cell.
alltypes = _export_property_type(vhd, "Residential - All Types", "All Types")
apartment = _export_property_type(vhd, "Apartment", "Apartment")
detached = _export_property_type(vhd, "Detached", "Detached")
townhouse = _export_property_type(vhd, "Townhouse", "Townhouse")

In [None]:
# Show the rows of `alltypes` that belong to the most recent report:
# first narrow to the latest "Report Year", then to the latest
# "Report Month" within that year.
# NOTE(review): the original heading said this cell saves the most recent
# five years (2016-2020) to a file, but the export below is commented out
# and `vancouver_home_data` is not defined in the visible cells -- confirm
# what was intended.
latest_year = max(alltypes["Report Year"])
maxyear = alltypes.loc[alltypes["Report Year"] == latest_year]

latest_month = max(maxyear["Report Month"])
latestrecord = maxyear.loc[maxyear["Report Month"] == latest_month]
latestrecord
#vancouver_home_data.to_csv("Vancouver Real Estate Board Data 201601 - 202007.csv", index=False, header=True)

In [2]:
# Standalone entry point: reload a previously exported CSV so the site does
# not have to be scraped again.
# NOTE(review): no visible cell writes this file (the matching to_csv call is
# commented out), and the displayed output has an "Unnamed: 0" column, so the
# file was presumably saved with the pandas index -- confirm.
csv_path = "Vancouver Real Estate Board Data 201601 - 202007.csv"
vhd = pd.read_csv(csv_path)
vhd

Unnamed: 0,Area,Benchmark,Price Index,1 Month +/-,6 Month +/-,1 Year +/-,3 Year +/-,5 Year +/-,Property Type,Report Month,Report Year
0,Greater Vancouver,"$784,000",205.7,2.3,10.6,20.9,32.1,36.1,Residential - All Types,1,2016
1,Lower Mainland,"$694,600",194.6,2.2,10.0,19.4,28.5,32.3,Residential - All Types,1,2016
2,Bowen Island,"$649,400",140.8,1.1,5.7,11.8,13.4,16.8,Residential - All Types,1,2016
3,Burnaby East,"$720,000",199.7,1.0,5.4,18.1,28.2,33.6,Residential - All Types,1,2016
4,Burnaby North,"$648,100",195.7,2.4,8.1,18.0,27.8,32.3,Residential - All Types,1,2016
...,...,...,...,...,...,...,...,...,...,...,...
4560,Coquitlam,"$529,000",290.3,0.5,1.4,2.4,15.2,84.4,Apartment,7,2020
4561,Burnaby South,"$674,500",270.8,0.6,1.9,3.2,5.7,64.2,Apartment,7,2020
4562,Burnaby North,"$610,900",260.8,0.9,0.7,0.2,10.9,72.3,Apartment,7,2020
4563,Burnaby East,"$733,200",281.3,-1.3,2.6,1.7,-0.7,54.4,Apartment,7,2020
