# ECO475 Group 2_CMA level dataset

### Authors: Shih-Chieh Lee, Lingyun Ma, Yuwen Zhao

# 1. Basic Setting

## a. Package Install

In [None]:
# Optional one-time setup: uncomment the lines below to install the packages this notebook imports.
#!pip install stats-can
#!pip install pandas
#!pip install numpy
#!pip install matplotlib
#!pip install statsmodels
#!pip install linearmodels
#!pip install tabula-py # Note: install tabula-py, not tabula (a lesson learned the hard way)
#!pip install warnings # NOTE(review): warnings is part of the Python standard library, so this install should not be needed

## b. Package Import 

In [1]:
# Data Collection Packages
from stats_can import StatsCan #read StatsCan data 


# Create the StatsCan client instance used for the Statistics Canada downloads.
# NOTE(review): data_folder presumably sets where stats_can stores retrieved tables; confirm against the stats_can docs.
sc = StatsCan(
    data_folder="/Users/changanlee/Documents/GitHub/Housing_Price_Immigration/Input"
)

In [2]:
# Import tabula and check java environment
from tabula.io import read_pdf  #Scrape table from pdf files
import requests 
from datetime import datetime
import calendar
import re

In [3]:
# Data Processing Packages
import pandas as pd #pandas
import numpy as np 
import matplotlib.pyplot as plt #data visualization
%matplotlib inline
# activate plot theme
import qeds

In [4]:
# Stats Model Packages
import statsmodels.api as sm # statistical model
from statsmodels.iolib.summary2 import summary_col # summary table for regression result
from linearmodels.iv import IV2SLS # IV 

In [5]:
# Silence all the warnings cuz they're absolutely annoying if you loop it multiple times
import warnings
warnings.filterwarnings('ignore')

# 2. Data Collection

## A. HPI Data

### 1) Load Excel File

Let's start with the MLS HPI dataset from the Canadian Real Estate Association (CREA).

We have downloaded the CMA / city-level HPI panel data from https://www.crea.ca/housing-market-stats/mls-home-price-index/hpi-tool/. 

We choose to use the seasonally unadjusted HPI dataset for our analysis to better match data from other sources (mainly Statistics Canada), which record monthly data with seasonality embedded.

In [30]:
# Read the xlsx-format HPI data: every worksheet is loaded, tagged with its sheet name
# in a "Location" column, and all sheets are stacked into one long DataFrame.
# NOTE(review): the accompanying markdown says the seasonally-unadjusted series is used,
# but this path points at "Seasonally Adjusted.xlsx" -- confirm which file is intended.
# TODO: the absolute path below is machine-specific; consider a configurable data directory.

hpi_frames = []

# The context manager closes the workbook handle once every sheet has been read.
with pd.ExcelFile("/Users/changanlee/Desktop/University/Undergrad/4th-Year/Winter Semester/ECO475/Term Paper/Data/Raw/CMA Level/HPI/Seasonally Adjusted.xlsx") as HPI_excel:
    for sheet_name in HPI_excel.sheet_names:
        df = pd.read_excel(HPI_excel, sheet_name=sheet_name)

        # Record which worksheet each row came from
        df["Location"] = sheet_name

        hpi_frames.append(df)

# Concatenate once at the end instead of re-copying the accumulated frame on every iteration.
HPI = pd.concat(hpi_frames, ignore_index=True)

# Relabel the rows that came from the "AGGREGATE" sheet as "Canada"
HPI.loc[HPI['Location'] == "AGGREGATE", 'Location'] = "Canada"

In [32]:
# Display the Location column to check the labels assigned to each row
HPI["Location"]

0             Canada
1             Canada
2             Canada
3             Canada
4             Canada
            ...     
13964    ST_JOHNS_NL
13965    ST_JOHNS_NL
13966    ST_JOHNS_NL
13967    ST_JOHNS_NL
13968    ST_JOHNS_NL
Name: Location, Length: 13969, dtype: object

### 2) Data Cleaning

We now perform some basic data cleaning to prepare for later analysis.

First, we notice that the regions in the CREA HPI dataset are not named exactly according to the official census naming. We will modify the Location naming based on a CREA-census region mapping dataset to support future merging.

In [25]:
# Load the CREA-census region mapping file and keep only the location_name and geo_gdp columns
mapping_path = "/Users/changanlee/Desktop/University/Undergrad/4th-Year/Winter Semester/ECO475/Term Paper/Data/Raw/CMA Level/HPI/mapping file for census.csv"
CREA_census_map = pd.read_csv(mapping_path)[["location_name", "geo_gdp"]]

# Preview the first rows of the mapping
CREA_census_map.head()

Unnamed: 0,location_name,geo_gdp
0,ALBERTA,Alberta
1,BANCROFT_AND_AREA,"Area outside census metropolitan areas, Ontario"
2,BARRIE_AND_DISTRICT,"Barrie (CMA), Ontario"
3,BRANTFORD_REGION,"Brantford (CMA), Ontario"
4,BRITISH_COLUMBIA,British Columbia


In [34]:
# Left-join the mapping onto the HPI panel: every HPI row is kept, and rows whose
# Location has no counterpart in location_name get NaN in the mapping columns.
# NOTE(review): in the saved output the "Canada" rows come through unmatched. Location was
# relabelled from "AGGREGATE" before this merge -- confirm how the mapping file keys the national aggregate.
merged_df = pd.merge(
    HPI,
    CREA_census_map,
    how='left',
    left_on='Location',
    right_on='location_name',
)
merged_df

Unnamed: 0,Date,Composite_HPI_SA,Single_Family_HPI_SA,One_Storey_HPI_SA,Two_Storey_HPI_SA,Townhouse_HPI_SA,Apartment_HPI_SA,Composite_Benchmark_SA,Single_Family_Benchmark_SA,One_Storey_Benchmark_SA,Two_Storey_Benchmark_SA,Townhouse_Benchmark_SA,Apartment_Benchmark_SA,Location,location_name,geo_gdp
0,2005-01-01,100.0,100.0,100.0,100.0,100.0,100.0,241000,261300,208700,303900,202500.0,174500.0,Canada,,
1,2005-02-01,100.3,100.2,100.3,100.1,100.3,100.6,241800,261900,209400,304300,203100.0,175600.0,Canada,,
2,2005-03-01,100.7,100.6,100.7,100.5,100.6,101.1,242600,262800,210100,305300,203700.0,176400.0,Canada,,
3,2005-04-01,101.0,100.8,101.1,100.6,100.8,101.5,243300,263500,211100,305800,204200.0,177200.0,Canada,,
4,2005-05-01,101.2,101.0,101.3,100.8,101.0,102.1,243800,264000,211500,306400,204600.0,178100.0,Canada,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13964,2023-09-01,223.6,230.1,245.6,222.3,213.1,212.8,331800,344500,328900,360800,275900.0,228800.0,ST_JOHNS_NL,ST_JOHNS_NL,"St. John's (CMA), Newfoundland and Labrador"
13965,2023-10-01,222.8,228.9,243.8,221.4,211.0,217.4,330700,342700,326400,359300,273300.0,233700.0,ST_JOHNS_NL,ST_JOHNS_NL,"St. John's (CMA), Newfoundland and Labrador"
13966,2023-11-01,225.3,231.3,244.7,224.7,211.9,222.7,334400,346300,327700,364700,274400.0,239400.0,ST_JOHNS_NL,ST_JOHNS_NL,"St. John's (CMA), Newfoundland and Labrador"
13967,2023-12-01,226.3,232.4,244.6,226.8,215.7,221.8,335900,347900,327500,368100,279300.0,238400.0,ST_JOHNS_NL,ST_JOHNS_NL,"St. John's (CMA), Newfoundland and Labrador"


## B. StatsCan Population Data 

Let's start with datasets from Statistics Canada, as they are easier to collect directly using the StatsCan library.