In [1]:
# Written in JupyterLab Version 3.6.3
# Python 3.10
# pip 23.1.2
import numpy as np  # Version 1.23.5
import pandas as pd  # Version 2.0.1
import calendar
import os

In [2]:
# Load the primary dataset from a local copy of the repository file
# (accessing the repository file directly ran into formatting issues).
prime_filepath = "./data/cleaned_data/primary_df.csv"
prime_df = pd.read_csv(prime_filepath)

# The 'month' column holds "<year>-<month>" strings; split it on the dash and
# store the two parts as separate 'Year' and 'Month' columns.
year_month_parts = prime_df["month"].str.split("-", expand=True)
prime_df[["Year", "Month"]] = year_month_parts

# 'Year' must be an integer so it can be compared numerically below
prime_df = prime_df.astype({"Year": int})

# Keep only 1990 onwards (older years aren't as relevant to the project), then
# discard the first column of the file (presumably a saved index - verify
# against the CSV).
from_1990_onwards = prime_df["Year"] >= 1990
prime_df = prime_df.loc[from_1990_onwards].drop(columns=prime_df.columns[0])

prime_df

Unnamed: 0,quarter,month,FEDFUNDS,RSAHORUSQ156S,MORTGAGE30US,CPIAUCSL,Year,Month
426,1990Q1,1990-01,8.23,64.1,9.8950,127.500,1990,01
427,1990Q1,1990-02,8.24,64.1,10.1975,128.000,1990,02
428,1990Q1,1990-03,8.28,64.1,10.2680,128.600,1990,03
429,1990Q2,1990-04,8.26,63.9,10.3700,128.900,1990,04
430,1990Q2,1990-05,8.18,63.9,10.4775,129.100,1990,05
...,...,...,...,...,...,...,...,...
834,2024Q1,2024-01,5.33,65.6,6.6425,309.685,2024,01
835,2024Q1,2024-02,5.33,65.6,6.7760,311.054,2024,02
836,2024Q1,2024-03,5.33,65.6,6.8200,312.230,2024,03
837,2024Q2,2024-04,5.33,,6.9925,,2024,04


In [3]:
# Tidy the primary frame in a single chain of non-mutating steps. Nothing is
# assigned onto an `iloc` slice (which can raise SettingWithCopyWarning), and the
# frame displayed by the previous cell is not modified in place.
# This cell consumes the 'quarter' column, so re-running it requires re-running
# the cell that loads `prime_df` first.
prime_df = (
    prime_df
    # Keep only the last two characters of 'quarter' (e.g. "1990Q1" -> "Q1")
    .assign(Quarter=prime_df["quarter"].str[-2:])
    # The combined 'month' and 'quarter' columns are now redundant
    .drop(columns=["month", "quarter"])
    # Drop the last two rows by position.
    # NOTE(review): this is positional, not null-based; in the data displayed
    # above only the final row has missing values - confirm that both trailing
    # rows are meant to be removed.
    .iloc[:-2]
    # Convert numeric month strings ("01".."12") to calendar month names
    .assign(
        Month=lambda frame: frame["Month"].map(
            lambda month_number: calendar.month_name[int(month_number)]
        )
    )
)

prime_df

Unnamed: 0,FEDFUNDS,RSAHORUSQ156S,MORTGAGE30US,CPIAUCSL,Year,Month,Quarter
426,8.23,64.1,9.8950,127.500,1990,January,Q1
427,8.24,64.1,10.1975,128.000,1990,February,Q1
428,8.28,64.1,10.2680,128.600,1990,March,Q1
429,8.26,63.9,10.3700,128.900,1990,April,Q2
430,8.18,63.9,10.4775,129.100,1990,May,Q2
...,...,...,...,...,...,...,...
832,5.33,65.7,7.4420,308.024,2023,November,Q4
833,5.33,65.7,6.8150,308.742,2023,December,Q4
834,5.33,65.6,6.6425,309.685,2024,January,Q1
835,5.33,65.6,6.7760,311.054,2024,February,Q1


In [4]:
# Load the secondary dataset from a local copy of the repository file
# (accessing the repository file directly ran into formatting issues).
#
# Rows can look like duplicates at first glance, but they are not: every year has
# 12 months, every month has 5 region values, and every region has three distinct
# home sizes.
secondary_filepath = "./data/cleaned_data/secondary_df.csv"
secondary_df = pd.read_csv(filepath_or_buffer=secondary_filepath)

secondary_df

Unnamed: 0,Year,Month,Region,Home Size,Average Sales Price,Number of Households (Thousands),Median Income - Current Dollars,Median Income - 2022 Dollars,Mean Income - Current Dollars,Mean Income - 2022 Dollars
0,2022,December,Midwest,Double,144300.0,28280.0,73070.0,73070.0,102400.0,102400.0
1,2022,December,Midwest,Single,82300.0,28280.0,73070.0,73070.0,102400.0,102400.0
2,2022,December,Midwest,Total1,104700.0,28280.0,73070.0,73070.0,102400.0,102400.0
3,2022,December,Northeast,Double,158300.0,22630.0,80360.0,80360.0,115300.0,115300.0
4,2022,December,Northeast,Single,75300.0,22630.0,80360.0,80360.0,115300.0,115300.0
...,...,...,...,...,...,...,...,...,...,...
1570,2014,January,United States,Single,48000.0,124600.0,53660.0,64900.0,75740.0,91610.0
1571,2014,January,United States,Total1,68300.0,124600.0,53660.0,64900.0,75740.0,91610.0
1572,2014,January,West,Double,93900.0,27910.0,57690.0,69780.0,79610.0,96300.0
1573,2014,January,West,Single,55400.0,27910.0,57690.0,69780.0,79610.0,96300.0


In [5]:
# Merge the two DataFrames on the shared 'Year' and 'Month' columns.
# An inner join keeps only the months present in both frames.
# `validate="one_to_many"` makes pandas raise a MergeError if a (Year, Month)
# pair ever appears more than once in `prime_df`; without it such a duplicate
# would silently multiply the rows of the merged result.
full_df = pd.merge(
    prime_df,
    secondary_df,
    on=["Year", "Month"],
    how="inner",
    validate="one_to_many",
)
full_df

Unnamed: 0,FEDFUNDS,RSAHORUSQ156S,MORTGAGE30US,CPIAUCSL,Year,Month,Quarter,Region,Home Size,Average Sales Price,Number of Households (Thousands),Median Income - Current Dollars,Median Income - 2022 Dollars,Mean Income - Current Dollars,Mean Income - 2022 Dollars
0,0.07,64.9,4.432,235.288,2014,January,Q1,Midwest,Double,88200.0,27460.0,54270.0,65640.0,73480.0,88890.0
1,0.07,64.9,4.432,235.288,2014,January,Q1,Midwest,Single,51400.0,27460.0,54270.0,65640.0,73480.0,88890.0
2,0.07,64.9,4.432,235.288,2014,January,Q1,Midwest,Total1,72000.0,27460.0,54270.0,65640.0,73480.0,88890.0
3,0.07,64.9,4.432,235.288,2014,January,Q1,Northeast,Double,92600.0,22180.0,59210.0,71620.0,83720.0,101300.0
4,0.07,64.9,4.432,235.288,2014,January,Q1,Northeast,Single,48700.0,22180.0,59210.0,71620.0,83720.0,101300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1570,4.10,65.9,6.364,298.812,2022,December,Q4,United States,Single,80200.0,131400.0,74580.0,74580.0,106400.0,106400.0
1571,4.10,65.9,6.364,298.812,2022,December,Q4,United States,Total1,122100.0,131400.0,74580.0,74580.0,106400.0,106400.0
1572,4.10,65.9,6.364,298.812,2022,December,Q4,West,Double,162400.0,29440.0,82890.0,82890.0,117500.0,117500.0
1573,4.10,65.9,6.364,298.812,2022,December,Q4,West,Single,77600.0,29440.0,82890.0,82890.0,117500.0,117500.0


In [6]:
# Add a combined "<Year>-<Quarter>-<Month>" string label (e.g. "2014-Q1-January")
# and give the FRED series columns readable names.
# 'Month' already holds calendar month names at this point, so it is concatenated
# as-is: zero-padding it would have no effect. The label is a plain string, not a
# datetime, and does not sort chronologically.
# `assign` and `rename` return new frames, so the frame displayed by the previous
# cell is not mutated in place.
full_df = full_df.assign(
    **{
        "Year-Quarter-Month": (
            full_df["Year"].astype(str)
            + "-"
            + full_df["Quarter"]
            + "-"
            + full_df["Month"]
        )
    }
).rename(
    columns={
        "RSAHORUSQ156S": "Owner-Occupied-Home-Proportion",
        "MORTGAGE30US": "30-Year-Mortgage-Rate",
        "CPIAUCSL": "Consumer-Price-Index",
    }
)

full_df

Unnamed: 0,FEDFUNDS,Owner-Occupied-Home-Proportion,30-Year-Mortgage-Rate,Consumer-Price-Index,Year,Month,Quarter,Region,Home Size,Average Sales Price,Number of Households (Thousands),Median Income - Current Dollars,Median Income - 2022 Dollars,Mean Income - Current Dollars,Mean Income - 2022 Dollars,Year-Quarter-Month
0,0.07,64.9,4.432,235.288,2014,January,Q1,Midwest,Double,88200.0,27460.0,54270.0,65640.0,73480.0,88890.0,2014-Q1-January
1,0.07,64.9,4.432,235.288,2014,January,Q1,Midwest,Single,51400.0,27460.0,54270.0,65640.0,73480.0,88890.0,2014-Q1-January
2,0.07,64.9,4.432,235.288,2014,January,Q1,Midwest,Total1,72000.0,27460.0,54270.0,65640.0,73480.0,88890.0,2014-Q1-January
3,0.07,64.9,4.432,235.288,2014,January,Q1,Northeast,Double,92600.0,22180.0,59210.0,71620.0,83720.0,101300.0,2014-Q1-January
4,0.07,64.9,4.432,235.288,2014,January,Q1,Northeast,Single,48700.0,22180.0,59210.0,71620.0,83720.0,101300.0,2014-Q1-January
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1570,4.10,65.9,6.364,298.812,2022,December,Q4,United States,Single,80200.0,131400.0,74580.0,74580.0,106400.0,106400.0,2022-Q4-December
1571,4.10,65.9,6.364,298.812,2022,December,Q4,United States,Total1,122100.0,131400.0,74580.0,74580.0,106400.0,106400.0,2022-Q4-December
1572,4.10,65.9,6.364,298.812,2022,December,Q4,West,Double,162400.0,29440.0,82890.0,82890.0,117500.0,117500.0,2022-Q4-December
1573,4.10,65.9,6.364,298.812,2022,December,Q4,West,Single,77600.0,29440.0,82890.0,82890.0,117500.0,117500.0,2022-Q4-December


In [7]:
# Write the merged DataFrame next to the other cleaned datasets, without the
# row index.
full_filepath = "./data/cleaned_data/full_df.csv"
full_df.to_csv(full_filepath, index=False)