# ECO475 Group 2 Notebook Code

### Author: Shih-Chieh Lee, Lingyun Ma, Yuwen Zhao

# 1. Basic Setting

## a. Package Install

In [1]:
#!pip install stats-can
#!pip install pandas
#!pip install numpy
#!pip install matplotlib
#!pip install statsmodels
#!pip install linearmodels
#!pip install tabula-py #Note: Pls install tabula-py, not tabula——血的教训
#!pip install warnings

## b. Package Import 

In [2]:
# Data Collection Packages
from stats_can import StatsCan #read StatsCan data 


sc = StatsCan(data_folder="/Users/changanlee/Documents/GitHub/Housing_Price_Immigration/Input") 
#Create an instance of StatsCan class

In [3]:
# Import tabula and check java environment
from tabula.io import read_pdf  #Scrape table from pdf files
import requests 
from datetime import datetime
import calendar
import re

In [4]:
# Data Processing Packages
import pandas as pd #pandas
import numpy as np 
import matplotlib.pyplot as plt #data visualization
%matplotlib inline
# activate plot theme
import qeds

In [5]:
# Stats Model Packages
import statsmodels.api as sm # statistical model
from statsmodels.iolib.summary2 import summary_col # summary table for regression result
from linearmodels.iv import IV2SLS # IV 

In [6]:
# Silence all the warnings cuz they're absolutely annoying if you loop it multiple times
import warnings
#warnings.filterwarnings('ignore')

# 2. Data Collection

## A. StatsCan Data 

Let's start with datasets from Statistics Canada, as it's earier to collect directly using StatsCan library

In [6]:
CMA_pop = sc.table_to_df("17-10-0135-01")
CMA_pop.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Sex,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2001-01-01,Canada,2016A00001124,Both sexes,All ages,Persons,249,units,0,v1462635598,178.1.1,31020902.0,,,,0
1,2001-01-01,Canada,2016A00001124,Both sexes,0 to 4 years,Persons,249,units,0,v1462635576,178.1.2,1754354.0,,,,0
2,2001-01-01,Canada,2016A00001124,Both sexes,0 years,Persons,249,units,0,v1462635484,178.1.3,328494.0,,,,0
3,2001-01-01,Canada,2016A00001124,Both sexes,1 year,Persons,249,units,0,v1462635485,178.1.4,345259.0,,,,0
4,2001-01-01,Canada,2016A00001124,Both sexes,2 years,Persons,249,units,0,v1462635486,178.1.5,349563.0,,,,0


## B. CREA Monthly House Price Index Data

### 1) Define Functions

In [238]:
# Define func to generate URLs based on months and years
def generate_url(month, year, cma):
    if cma == "Toronto":
        if month < 10:
            return f"https://trreb.ca/files/market-stats/home-price-index/TREB_MLS_HPI_Public_Tables_0{month}{year}.pdf"
        else:
            return f"https://trreb.ca/files/market-stats/home-price-index/TREB_MLS_HPI_Public_Tables_{month}{year}.pdf"
            #return the corresponding GTA HPI monthly report pdf for scrapping
    elif cma == "Vancouver":
        

IndentationError: expected an indented block (1887011157.py, line 10)

### 1) GTA Data

#### a) Define Functions

In [229]:
# Define func to extract table from pdf monthly report & data cleaning
def extract_table(url, month, year):

    # Extract table from pdf
    tables = read_pdf(url, pages="2", lattice = "True", multiple_tables = "False", 
                      area = [10,0,97,100],relative_area = "True", silent = "True") 
        # Note: lattice should be set as True for our case 
            #to read everything on page 2 as one table
            
    table = tables[0]
    
    # Clean the data table
    new_header = table.iloc[0] # Set new header with the first row of the table
    table = table[1:]  # Take the data below the header row
    table.columns = new_header  # Set the new header
    
    # We only want data about composite / residential property, which is the first four columns
    table = table.iloc[:, :4]
    
    # Rename the first column to "Location" 
    table = table.rename(columns={table.columns[0]: "Location"})

    # New Column for Month-Year
    if month < 10:
        table["Month_Year"] = f'0{month}_20{year}'
    else:
        table["Month_Year"] = f'{month}_20{year}'
        
    table.reset_index(drop=True, inplace=True)
    
    return table

#### b) Loop to Scrape Data

In [230]:
data_list = []
cma = "Toronto"

for year in range(16, 23):
    if year != 22:
        for month in range(1, 13):
            url = generate_url(month, year, cma)
            extracted_data = extract_table(url, month, year)
        
            if extracted_data is not None:
                extracted_data.reset_index(drop = True, inplace = True)
                data_list.append(extracted_data)

    else:
        for month in range(1, 6):
            url = generate_url(month, year)
            extracted_data = extract_table(url, month, year)
        
            if extracted_data is not None:
                extracted_data.reset_index(drop = True, inplace = True)
                data_list.append(extracted_data)
            
HPI_GTA = pd.concat(data_list, ignore_index = True)

TypeError: generate_url() takes 2 positional arguments but 3 were given

In [166]:
HPI_GTA.head()

Unnamed: 0,Location,Index,Benchmark,Yr./Yr. % Chg.,Month_Year
0,TREB Total,190.4,"$581,100",11.28%,01_2016
1,Halton Region,206.6,"$675,800",15.48%,01_2016
2,Burlington,202.9,"$601,200",11.12%,01_2016
3,Halton Hills,184.5,"$537,300",12.43%,01_2016
4,Milton,210.4,"$580,500",24.06%,01_2016


In [162]:
HPI_GTA.to_csv("/Users/changanlee/Desktop/University/Undergrad/4th-Year/Winter Semester/ECO475/Term Paper/Raw Data/HPI_GTA.csv",
               index = False)

For some pdf formatting reasons that I cannot solve right now, the last monthly report that can be extracted using read_pdf is May 2022. May need to mannually extract the rest of the data. See example below:

In [168]:
#Take June 2022 for example

url = generate_url(6,22) 
tables = read_pdf(url, pages = "2", lattice = "True", multiple_tables = "False", area = [10,0,97,100],relative_area = "True")
tables[0].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Composite,Single Family Detached,Single Family Atached,Townhouse,Apartment,Unnamed: 2,Unnamed: 3,Unnamed: 4,...,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,,,Index,Benchmark,Yr./Yr. % Chg,Index,Benchmark,Yr./Yr. % Chg,Index,Benchmark,...,,,,,,,,,,
1,Al TRRE,Areas,3,.8,"$1,2",4900,17,9%,3,.6,...,$89,",100",20.0,9%,3.0,0.2,$77,",500",23.0,6%
2,Halton R,ion,3,.2,"$1,2",8000,11,6%,4,.1,...,$81,",000",10.0,0%,4.0,0.4,$70,",200",17.0,8%
3,Burlingto,,3,.8,"$1,0",4800,9,%,4,.9,...,$77,",200",9.0,%,4.0,0.9,$67,",300",19.0,0%
4,Halton H,,4,.1,"$1,2",9000,15,9%,4,.7,...,$70,",400",12.0,3%,4.0,0.7,$75,",700",16.0,1%


The column heading format is correct, but the value it reads is completely nonsense.

Whatever... Let's move on to GTA data first

### 2) GVA Data

First of all, Jan. 2016 ~ July 2016 monthly report is missing...

Second, GVA report url has multiple formats over time:

https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/REBGV-Stats-Pkg-August-2016.pdf

https://www.gvrealtors.ca/content/dam/rebgv_org_content/monthly-market-reports/2018-Dec-stats-pkg.pdf

https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/marketwatch/monthly_reports/REBGV-Stats-Pkg-November-2019-F.pdf

OK, let's try accessing the report using all 3 formats for all month-year combination and see how it goes

In [31]:
# Define the base URLs for each format
base_urls = [
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/monthly-market-reports/{Year}-{Month}-stats-pkg.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/marketwatch/monthly_reports/REBGV-Stats-Pkg-{Month}-{Year}-F.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/REBGV-Stats-Pkg-{Month}-{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/monthly-market-reports/REBGV-Stats-Pkg-{Month}-{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/REBGV-Stats-Package-{Month}-{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/REBGV-Stats-Pkg-{Month}-{Year}-Updated%20HPI.pdf",
    "https://members.rebgv.org/news/REBGV-Stats-Pkg-{Month}-{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/REBGV-Stats-Pkg-{Month}-{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/{YearMonth}-REBGV-Stats-Pkg-{Month}-{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/REBGV-Stats-Pkg-{Month}-{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/REBGV%20Stats%20Package%20{Month}%20{Year}.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/{Year}-01-{Month}-Stats-Package.pdf",
    "https://www.gvrealtors.ca/content/dam/rebgv_org_content/pdfs/monthly-stats-packages/{Year}-01-{Month}-Stats--Package.pdf"
]

# Initialize list to store URLs that return status code 200
available_urls_info = []

# Loop through the years and months
for year in range(2016, 2024):
    for month in range(1, 13):
        # Get the month name and abbreviation
        month_name = calendar.month_name[month]
        month_abbr = calendar.month_abbr[month]
        year_month = f"{year}{month:02d}"

        # Iterate through each URL pattern
        for base_url in base_urls:
            # Replace placeholders in the URL pattern
            url_1 = base_url.format(Year=str(year), Month=month_name, YearMonth=year_month)
            url_2 = base_url.format(Year=str(year), Month=month_abbr, YearMonth=year_month)

            # Try fetching the first URL
            if requests.get(url_1).status_code == 200:
                available_urls_info.append({"Year": year, "Month": month, "URL": url_1})
                break  # Exit the loop on successful fetch
            
            # Try fetching the second URL if the first one fails
            elif requests.get(url_2).status_code == 200:
                available_urls_info.append({"Year": year, "Month": month, "URL": url_2})
                break  # Exit the loop on successful fetch
            
            # Special case for September URLs
            if month == 9:
                url_3 = base_url.format(Year=str(year), Month="Sept", YearMonth=year_month)
                if requests.get(url_3).status_code == 200:
                    available_urls_info.append({"Year": year, "Month": 9, "URL": url_3})
                    break  # Exit the loop on successful fetch

# Convert the list of dictionaries into a DataFrame
available_urls = pd.DataFrame(available_urls_info)


In [32]:
available_urls.head()

Unnamed: 0,Year,Month,URL
0,2016,3,https://www.gvrealtors.ca/content/dam/rebgv_or...
1,2016,4,https://www.gvrealtors.ca/content/dam/rebgv_or...
2,2016,5,https://www.gvrealtors.ca/content/dam/rebgv_or...
3,2016,6,https://www.gvrealtors.ca/content/dam/rebgv_or...
4,2016,8,https://www.gvrealtors.ca/content/dam/rebgv_or...
...,...,...,...
88,2023,8,https://members.rebgv.org/news/REBGV-Stats-Pkg...
89,2023,9,https://members.rebgv.org/news/REBGV-Stats-Pkg...
90,2023,10,https://members.rebgv.org/news/REBGV-Stats-Pkg...
91,2023,11,https://members.rebgv.org/news/REBGV-Stats-Pkg...


In [11]:
def extract_table_van(url, month, year):    
    tables = read_pdf(url, pages = "3", multiple_tables = "False")
    
    if not tables or tables[0].empty:
        return None
    
    else:
        table = tables[0]
        new_header = table.iloc[1] # Set new header with the second row of the table
        new_header[0] = "Property Type"
        new_header[1] = "Location"
        table = table[2:]  # Take the data below the header row
        table.columns = new_header

        end_here_index = (table['Property Type'] == 'Single Family Detached').idxmax()
        table = table.iloc[:end_here_index, 1:]

        if month < 10:
            table["Month_Year"] = f'0{month}_{year}'
        else:
            table["Month_Year"] = f'{month}_{year}'

    return table

In [33]:
data_list = []

for index, row in available_urls.iterrows():
    extracted_data = extract_table_van(row['URL'], row['Month'], row['Year'])
    
    if extracted_data is not None:
        extracted_data.reset_index(drop=True, inplace=True)
        data_list.append(extracted_data)
        
HPI_GVA = pd.concat(data_list, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[0] = "Property Type"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_header[1] = "Location"
A value is trying to be set on a copy of a slice f

In [34]:
columns_name = ["Location", "Price", "Index", "%1M_Change", "%3M_Change", "%6M_Change", 
               "%1Y_Change", "%3Y_Change", "%5Y_Change", "%10Y_Change", "Month_Year"]
HPI_GVA.columns = columns_name

HPI_GVA
#1536 rows

Unnamed: 0,Location,Price,Index,%1M_Change,%3M_Change,%6M_Change,%1Y_Change,%3Y_Change,%5Y_Change,%10Y_Change,Month_Year
0,Lower Mainland,"$719,500",202.9,2.7%,7.5%,13.1%,22.8%,34.2%,36.7%,72.2%,03_2016
1,Greater Vancouver,"$815,000",213.3,2.4%,7.0%,12.7%,23.2%,37.2%,39.0%,80.3%,03_2016
2,Bowen Island,"$653,500",141.7,1.4%,1.9%,1.6%,10.5%,14.2%,15.5%,19.5%,03_2016
3,Burnaby East,"$759,000",209.5,4.0%,7.1%,10.8%,22.2%,36.3%,41.6%,76.8%,03_2016
4,Burnaby North,"$675,000",202.9,2.9%,7.8%,11.0%,21.2%,33.0%,36.6%,72.4%,03_2016
...,...,...,...,...,...,...,...,...,...,...,...
1915,Vancouver West,"$1,325,900",311.7,-0.4%,-1.5%,0.5%,6.0%,12.0%,8.4%,44.2%,11_2023
1916,West Vancouver,"$2,560,500",276.0,-3.7%,-3.8%,-0.2%,-1.3%,6.4%,8.8%,45.0%,11_2023
1917,Whistler,"$1,388,200",311.9,1.0%,-0.8%,-0.1%,4.0%,32.2%,52.9%,174.8%,11_2023
1918,Lower Mainland,"$1,764,500",386.4,-0.8%,-2.1%,1.1%,6.9%,29.8%,38.6%,105.3%,11_2023


In [36]:
HPI_GVA.to_csv("/Users/changanlee/Desktop/University/Undergrad/4th-Year/Winter Semester/ECO475/Term Paper/Raw Data/HPI_GVA.csv",
               index = False)

In [25]:
unique_month_year = HPI_GVA['Month_Year'].unique().tolist()
unique_month_year
#2017.9
#2017.12
#2018.9

['03_2016',
 '04_2016',
 '05_2016',
 '06_2016',
 '08_2016',
 '10_2016',
 '11_2016',
 '12_2016',
 '01_2017',
 '02_2017',
 '03_2017',
 '04_2017',
 '05_2017',
 '06_2017',
 '07_2017',
 '08_2017',
 '10_2017',
 '11_2017',
 '01_2018',
 '02_2018',
 '03_2018',
 '04_2018',
 '05_2018',
 '06_2018',
 '07_2018',
 '08_2018',
 '10_2018',
 '11_2018',
 '01_2019',
 '02_2019',
 '03_2019',
 '04_2019',
 '06_2019',
 '07_2019',
 '08_2019',
 '10_2019',
 '11_2019',
 '01_2020',
 '02_2020',
 '03_2020',
 '04_2020',
 '06_2020',
 '07_2020',
 '08_2020',
 '10_2020',
 '11_2020',
 '01_2021',
 '02_2021',
 '03_2021',
 '06_2021',
 '07_2021',
 '08_2021',
 '10_2021',
 '11_2021',
 '01_2022',
 '02_2022',
 '03_2022',
 '04_2022',
 '06_2022',
 '07_2022',
 '09_2022',
 '10_2022',
 '05_2023',
 '09_2023']

In [24]:
(2023-2016)*12

84