### 1. Data Cleaning and Merging into One Big Dataset

In [11]:
import pandas as pd

# Join key shared by all three tables, and the text format it must have
# in each of them (abbreviated month name + 4-digit year, e.g. 'Jan-2005').
MONTH_KEY = 'Month-Year'
MONTH_KEY_FORMAT = '%b-%Y'

# --- Read the three processed input files --------------------------------
housing_data = pd.read_csv('../data/processed/Housing_Prices_All_Data_2005_2024.csv')
combined_data = pd.read_csv('../data/processed/combined_withNaN.csv')
interest_rate_data = pd.read_csv('../data/processed/Bank_of_Canada_interest_rate_1980_2024_.csv')

# --- Housing: parse 'Date' in place, then derive the month key from it ----
housing_data['Date'] = pd.to_datetime(housing_data['Date'])
housing_data[MONTH_KEY] = housing_data['Date'].dt.strftime(MONTH_KEY_FORMAT)

# --- combined_data: its 'Month-Year' column is already in 'Jan-YYYY' form,
#     so it is used as-is.

# --- Interest rates: the key is stored as 'YYYY-MM'; parse it and re-render
#     it in the shared 'Jan-2005' style so the string keys line up.
interest_rate_months = pd.to_datetime(interest_rate_data[MONTH_KEY], format='%Y-%m')
interest_rate_data[MONTH_KEY] = interest_rate_months.dt.strftime(MONTH_KEY_FORMAT)

# --- Inner-join everything on the month key -------------------------------
# Step 1: housing + combined indicators; step 2: add the interest-rate columns.
merged_data_1 = housing_data.merge(combined_data, on=MONTH_KEY, how='inner')
final_merged_data = merged_data_1.merge(interest_rate_data, on=MONTH_KEY, how='inner')

# Show the first rows as a quick sanity check of the merge
final_merged_data.head()

Unnamed: 0,Date,AGGREGATE_Composite_HPI_SA,AGGREGATE_Single_Family_HPI_SA,AGGREGATE_One_Storey_HPI_SA,AGGREGATE_Two_Storey_HPI_SA,AGGREGATE_Townhouse_HPI_SA,AGGREGATE_Apartment_HPI_SA,AGGREGATE_Composite_Benchmark_SA,AGGREGATE_Single_Family_Benchmark_SA,AGGREGATE_One_Storey_Benchmark_SA,...,Province,Unemployment Rate,Minimum Wage,Immigration Num,All-items CPI,All industries GDP,Active businesses,Opening businesses,Closing businesses,Overnight money market financing
0,2005-01-01,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Canada,6.9,,,105.3,1591349.0,,,,2.498661
1,2005-01-01,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Newfoundland and Labrador,14.2,$6.00,,,,,,,2.498661
2,2005-01-01,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Prince Edward Island,10.7,$6.80,,,,,,,2.498661
3,2005-01-01,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Nova Scotia,8.8,$6.50,,,,,,,2.498661
4,2005-01-01,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,New Brunswick,9.7,$6.30,,,,,,,2.498661


In [18]:
# Inspect the column labels of the merged DataFrame.
final_merged_data.columns

Index(['Date', 'AGGREGATE_Composite_HPI_SA', 'AGGREGATE_Single_Family_HPI_SA',
       'AGGREGATE_One_Storey_HPI_SA', 'AGGREGATE_Two_Storey_HPI_SA',
       'AGGREGATE_Townhouse_HPI_SA', 'AGGREGATE_Apartment_HPI_SA',
       'AGGREGATE_Composite_Benchmark_SA',
       'AGGREGATE_Single_Family_Benchmark_SA',
       'AGGREGATE_One_Storey_Benchmark_SA',
       ...
       'Province', 'Unemployment Rate', 'Minimum Wage', 'Immigration Num',
       'All-items CPI', 'All industries GDP', 'Active businesses',
       'Opening businesses', 'Closing businesses',
       'Overnight money market financing'],
      dtype='object', length=692)

In [19]:
# Check the (rows, columns) dimensions of the merged DataFrame.
final_merged_data.shape

(2563, 692)

In [22]:
# Preview the 'Month-Year' column, the key the datasets were merged on.
final_merged_data['Month-Year']

0       Jan-2005
1       Jan-2005
2       Jan-2005
3       Jan-2005
4       Jan-2005
          ...   
2558    May-2024
2559    May-2024
2560    May-2024
2561    May-2024
2562    May-2024
Name: Month-Year, Length: 2563, dtype: object

In [23]:
# Drop the 'Date' column; the month is already captured by 'Month-Year'.
# errors='ignore' keeps this cell safe to re-run: it reassigns
# `final_merged_data` from itself, so on a second execution 'Date' is already
# gone and a plain drop() would raise KeyError.
final_merged_data = final_merged_data.drop(columns=['Date'], errors='ignore')

# Reorder columns to make 'Month-Year' the first column; every other column
# keeps its existing relative order.
columns = ['Month-Year'] + [col for col in final_merged_data.columns if col != 'Month-Year']
final_merged_data = final_merged_data[columns]

# Display the first few rows to verify the changes
final_merged_data.head()


Unnamed: 0,Month-Year,AGGREGATE_Composite_HPI_SA,AGGREGATE_Single_Family_HPI_SA,AGGREGATE_One_Storey_HPI_SA,AGGREGATE_Two_Storey_HPI_SA,AGGREGATE_Townhouse_HPI_SA,AGGREGATE_Apartment_HPI_SA,AGGREGATE_Composite_Benchmark_SA,AGGREGATE_Single_Family_Benchmark_SA,AGGREGATE_One_Storey_Benchmark_SA,...,Province,Unemployment Rate,Minimum Wage,Immigration Num,All-items CPI,All industries GDP,Active businesses,Opening businesses,Closing businesses,Overnight money market financing
0,Jan-2005,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Canada,6.9,,,105.3,1591349.0,,,,2.498661
1,Jan-2005,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Newfoundland and Labrador,14.2,$6.00,,,,,,,2.498661
2,Jan-2005,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Prince Edward Island,10.7,$6.80,,,,,,,2.498661
3,Jan-2005,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,Nova Scotia,8.8,$6.50,,,,,,,2.498661
4,Jan-2005,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,New Brunswick,9.7,$6.30,,,,,,,2.498661


In [25]:
# Boolean mask selecting the rows whose 'Province' value is exactly 'Canada'
is_canada_row = final_merged_data['Province'].eq('Canada')

# Keep only those rows, then remove 'Province', which is constant after the
# filter. The original row index is kept (no reset).
final_merged_data_cleaned_canada = (
    final_merged_data
    .loc[is_canada_row]
    .drop(columns=['Province'])
)

# Show the first rows to confirm the filter and the column removal
final_merged_data_cleaned_canada.head()


Unnamed: 0,Month-Year,AGGREGATE_Composite_HPI_SA,AGGREGATE_Single_Family_HPI_SA,AGGREGATE_One_Storey_HPI_SA,AGGREGATE_Two_Storey_HPI_SA,AGGREGATE_Townhouse_HPI_SA,AGGREGATE_Apartment_HPI_SA,AGGREGATE_Composite_Benchmark_SA,AGGREGATE_Single_Family_Benchmark_SA,AGGREGATE_One_Storey_Benchmark_SA,...,ST_JOHNS_NL_Apartment_Benchmark_SA,Unemployment Rate,Minimum Wage,Immigration Num,All-items CPI,All industries GDP,Active businesses,Opening businesses,Closing businesses,Overnight money market financing
0,Jan-2005,100.0,100.0,100.0,100.0,100.0,100.0,239800,260400,207700,...,106500,6.9,,,105.3,1591349,,,,2.498661
11,Feb-2005,100.3,100.2,100.3,100.1,100.3,100.6,240500,261000,208400,...,106500,7.0,,,105.7,1592499,,,,2.495561
22,Mar-2005,100.6,100.6,100.7,100.5,100.6,101.1,241300,261900,209200,...,106500,6.9,,,106.3,1587122,,,,2.486494
33,Apr-2005,100.9,100.8,101.2,100.6,100.8,101.5,242000,262600,210100,...,106500,6.7,,,106.6,1598907,,,,2.492997
44,May-2005,101.2,101.1,101.4,100.8,101.0,102.0,242600,263200,210600,...,106500,7.0,,,106.7,1605776,,,,2.490639


In [26]:
# Write the final merged dataset to a CSV file
# NOTE(review): this saves `final_merged_data` (rows for every 'Province' value),
# not the Canada-only `final_merged_data_cleaned_canada` built in the previous
# cell — confirm that exporting the unfiltered frame is intended.
output_file_path = '../data/processed/final_merged_housing_data.csv'
# index=False leaves the DataFrame index out of the written file
final_merged_data.to_csv(output_file_path, index=False)

# Echo the destination path as the cell's output
output_file_path


'../data/processed/final_merged_housing_data.csv'

### Handle Missing Values: Address any missing data in the merged dataset.