### This notebook addes a Housing Price Index dataset 1981-2024

- Raw File in data/raw/Housing Price Index 1981-2024.csv

- CSV file in data/processed/Housing Price Index 1981-2024.csv

- SQL: table Housing_Price_Index

In [1]:
import pandas as pd

# Load the CSV file
file_path = '../data/raw/Housing Price Index 1981-2024.csv'
df = pd.read_csv(file_path)

# Replace missing values ('..' or 'x') with NaN
df.replace({'..': pd.NA, 'x': pd.NA}, inplace=True)

# Display the first few rows to understand the structure
df.head()

Unnamed: 0,Geography,Jan-81,Feb-81,Mar-81,Apr-81,May-81,Jun-81,Jul-81,Aug-81,Sep-81,...,Sep-23,Oct-23,Nov-23,Dec-23,Jan-24,Feb-24,Mar-24,Apr-24,May-24,Jun-24
0,Canada,36.1,36.5,37.3,38.1,38.9,39.1,39.2,39.0,39.0,...,127.8,127.6,127.2,127.2,127.1,127.2,127.2,127.6,128.0,127.7
1,Newfoundland and Labrador,,,,,,,,,,...,110.0,110.0,108.6,108.6,108.6,108.6,108.4,108.4,108.8,108.8
2,"St. John's, Newfoundland and Labrador",37.5,37.5,37.5,37.5,37.7,37.7,37.7,37.8,37.8,...,110.0,110.0,108.6,108.6,108.6,108.6,108.4,108.4,108.8,108.8
3,Prince Edward Island,,,,,,,,,,...,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0
4,"Charlottetown, Prince Edward Island",,,,,,,,,,...,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0


In [8]:
# Convert values to numeric, coercing errors to NaN
df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

# Melt the DataFrame to convert columns to rows
df_melted = df.melt(id_vars=[df.columns[0]], var_name='Date', value_name='Value')

# Convert 'Date' column to datetime format and extract 'Month-Year'
df_melted['Date'] = pd.to_datetime(df_melted['Date'], errors='coerce')
df_melted['Month-Year'] = df_melted['Date'].dt.to_period('M')

# Pivot the DataFrame to have 'Month-Year' as rows and geography as columns
df_pivoted = df_melted.pivot_table(index='Month-Year', columns=df.columns[0], values='Value', aggfunc='mean')

# Reset index to flatten the DataFrame
df_pivoted.reset_index(inplace=True)

# Save the transformed dataset to a new CSV file
output_path = '../data/processed/Housing_Price_Index_1981_2024.csv'
df_pivoted.to_csv(output_path, index=False)

# Display the first few rows of the transformed dataset
print("Transformed DataFrame:")
print(df_pivoted.head())

  df_melted['Date'] = pd.to_datetime(df_melted['Date'], errors='coerce')


Transformed DataFrame:
Geography Month-Year Alberta British Columbia Calgary, Alberta Canada  \
0            1981-01     NaN              NaN             27.8   36.1   
1            1981-02     NaN              NaN             28.1   36.5   
2            1981-03     NaN              NaN             28.6   37.3   
3            1981-04     NaN              NaN             30.1   38.1   
4            1981-05     NaN              NaN             30.1   38.9   

Geography Charlottetown, Prince Edward Island Edmonton, Alberta  \
0                                         NaN              36.5   
1                                         NaN              36.8   
2                                         NaN              36.8   
3                                         NaN              36.9   
4                                         NaN              38.2   

Geography Greater Sudbury, Ontario Halifax, Nova Scotia Hamilton, Ontario  \
0                             54.6                  NaN   

In [9]:
import pandas as pd
import sqlite3

# Load the transformed CSV file
transformed_file_path = '../data/processed/Housing_Price_Index_1981_2024.csv'
transformed_df = pd.read_csv(transformed_file_path)

# Connect to the SQLite database
db_path = '../data/sql/Canada_Housing_Prices_2005_2024.db'
conn = sqlite3.connect(db_path)

# Add the transformed data to the database
table_name = 'Housing_Price_Index'
transformed_df.to_sql(table_name, conn, if_exists='replace', index=False)

# Close the connection
conn.close()

print(f"Data has been added to the database as table '{table_name}'")


Data has been added to the database as table 'Housing_Price_Index'
