In [1]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('/Users/ernestgaisie/Desktop/Final Projects/CANADA_MORTGAGE_RATES_ANALYSIS/cleaned_34100133.csv', low_memory=False)

# Drop irrelevant columns (e.g., STATUS, TERMINATED, DGUID, UOM, SCALAR_FACTOR, VECTOR, COORDINATE)
data_cleaned = data.drop(columns=['STATUS', 'TERMINATED', 'DGUID', 'UOM', 'SCALAR_FACTOR', 'VECTOR', 'COORDINATE'])

# One-Hot Encode categorical variables
data_encoded = pd.get_dummies(data_cleaned, columns=['GEO', 'Type of structure', 'Type of unit'])

# Define features (X) and target (y)
X = data_encoded.drop(columns=['VALUE'])
y = data_encoded['VALUE']

# Convert all numerical data to float32 to ensure compatibility with TensorFlow
X = X.astype('float32')
y = y.astype('float32')

# Check the first few rows and data types
print(X.head())
print(X.dtypes)


   REF_DATE  UOM_ID  SCALAR_ID  DECIMALS  \
0    1987.0    81.0        0.0       0.0   
1    1987.0    81.0        0.0       0.0   
2    1987.0    81.0        0.0       0.0   
3    1987.0    81.0        0.0       0.0   
4    1987.0    81.0        0.0       0.0   

   GEO_Abbotsford-Mission, British Columbia  GEO_Alma, Quebec  \
0                                       0.0               0.0   
1                                       0.0               0.0   
2                                       0.0               0.0   
3                                       0.0               0.0   
4                                       0.0               0.0   

   GEO_Amos, Quebec  GEO_Asbestos, Quebec  GEO_Baie-Comeau, Quebec  \
0               0.0                   0.0                      0.0   
1               0.0                   0.0                      0.0   
2               0.0                   0.0                      0.0   
3               0.0                   0.0                      0

In [2]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure the data is in the correct format
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)


In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the TensorFlow model
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # Define the input shape
    layers.Dense(128, activation='relu'),  # Hidden layer with 128 neurons
    layers.Dense(64, activation='relu'),  # Hidden layer with 64 neurons
    layers.Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    layers.Dense(1)  # Output layer for regression (single output)
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, batch_size=32)


Epoch 1/100
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 473us/step - loss: 94658.4141 - mae: 229.8898 - val_loss: 90769.9766 - val_mae: 214.5868
Epoch 2/100
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 422us/step - loss: 83151.0391 - mae: 215.2847 - val_loss: 72153.5469 - val_mae: 185.4716
Epoch 3/100
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 421us/step - loss: 65990.4453 - mae: 187.4460 - val_loss: 55080.1133 - val_mae: 169.8248
Epoch 4/100
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 424us/step - loss: 55199.8555 - mae: 171.5765 - val_loss: 49234.6133 - val_mae: 159.9022
Epoch 5/100
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 425us/step - loss: 51847.5234 - mae: 165.0280 - val_loss: 51593.7773 - val_mae: 178.0674
Epoch 6/100
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 423us/step - loss: 49679.0547 - mae: 163.1927 - val_loss: 54739.9883 - val_

In [4]:
# Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
print(f"Mean Absolute Error on test data: {mae}")

[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218us/step - loss: 14011.4824 - mae: 73.3191
Mean Absolute Error on test data: 72.4484634399414


In [6]:
# Make predictions on the test set
predictions = model.predict(X_test)

# Save the trained model
model.save('tf_model.keras')

[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241us/step


In [7]:
# Evaluate the model on the test set
loss, mae = model.evaluate(X_test, y_test)
print(f"Mean Absolute Error on test data: {mae}")

[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221us/step - loss: 14011.4824 - mae: 73.3191
Mean Absolute Error on test data: 72.4484634399414


In [8]:
# Make predictions on the test set
predictions = model.predict(X_test)

# Compare the first few predictions with the actual values
for i in range(5):
    print(f"Predicted: {predictions[i][0]}, Actual: {y_test[i]}")

[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441us/step
Predicted: 1278.5657958984375, Actual: 1261.0
Predicted: 427.5155029296875, Actual: 464.0
Predicted: 354.2796630859375, Actual: 402.0
Predicted: 608.6024169921875, Actual: 875.0
Predicted: 216.60919189453125, Actual: 260.0


In [9]:
# Print the model summary
model.summary()

In [15]:
print(column_names)

Index(['REF_DATE', 'UOM_ID', 'SCALAR_ID', 'DECIMALS',
       'GEO_Abbotsford-Mission, British Columbia', 'GEO_Alma, Quebec',
       'GEO_Amos, Quebec', 'GEO_Asbestos, Quebec', 'GEO_Baie-Comeau, Quebec',
       'GEO_Barrie, Ontario',
       ...
       'GEO_Yellowknife, Northwest Territories', 'GEO_Yorkton, Saskatchewan',
       'Type of structure_Apartment structures of six units and over',
       'Type of structure_Apartment structures of three units and over',
       'Type of structure_Row and apartment structures of three units and over',
       'Type of structure_Row structures of three units and over',
       'Type of unit_Bachelor units', 'Type of unit_One bedroom units',
       'Type of unit_Three bedroom units', 'Type of unit_Two bedroom units'],
      dtype='object', length=231)


In [14]:
import numpy as np

# Example feature vector for "Labrador City two bedroom units"
feature_vector = np.zeros((1, len(column_names)))

# Set the specific features to 1 using the saved column names
feature_vector[0, column_names.get_loc('GEO_Labrador City')] = 1
feature_vector[0, column_names.get_loc('Type of unit_Two bedroom units')] = 1
# Adjust this depending on the type of structure you're interested in
feature_vector[0, column_names.get_loc('Type of structure_Apartment')] = 1

# Convert to float32 if necessary
feature_vector = feature_vector.astype('float32')


KeyError: 'GEO_Labrador City'