In [None]:
import os
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.metrics import mean_absolute_error

# Model Training
## Pipeline Setup

In [None]:
# Read the CSV that every model in this notebook is fitted on.
training_csv_path = './data/combined/total_clean.csv'
training_data = pd.read_csv(training_csv_path)

In [None]:
# active_listings['hypothetical_soldDate'] = pd.to_datetime('today') + pd.DateOffset(months=6)
# active_listings['days_from_listing_to_sold'] = (active_listings['hypothetical_soldDate'] - pd.to_datetime(active_listings['listingDate'])).dt.days
# active_listings['listingYear'] = pd.to_datetime(active_listings['listingDate']).dt.year
# active_listings['listingMonth'] = pd.to_datetime(active_listings['listingDate']).dt.month
# active_listings['hypothetical_soldYear'] = active_listings['hypothetical_soldDate'].dt.year
# active_listings['hypothetical_soldMonth'] = active_listings['hypothetical_soldDate'].dt.month

In [None]:
# Columns fed to the numeric branch of the preprocessor.
# NOTE(review): 'priceToTaxRatio' looks like it may be derived from the
# target ('price') -- confirm it does not leak the label into the features.
numeric_features = [
    'beds',
    'baths',
    'sqFt',
    'parking',
    'taxes',
    'age',
    'lat',
    'lon',
    'daysOnMarket',
    'priceToTaxRatio',
    'listingYear',
    'listingMonth',
    'distanceToCenter',
    'distanceToSchool',
]

# Columns fed to the categorical branch of the preprocessor.
categorical_features = [
    'propertyType',
    'streetAddress',
    'nearestSchool',
    'postalCode',
    'neighborhood',
]

# Model inputs (numeric columns first, then categorical) and the label column.
features = [*numeric_features, *categorical_features]
target = 'price'

In [None]:
# Separate the model inputs from the label column.
X = training_data[features]
y = training_data[target]

# Seed the shuffle so the train/validation partition is identical on every
# run. Without a fixed seed, pipelines reloaded from disk later in the
# notebook could be scored against a "validation" set containing rows they
# were trained on in an earlier session.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)

In [None]:
# Numeric branch: standardize each column with StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time.
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch. The branch names ('num', 'cat') and
# their order determine the layout of the transformed feature matrix.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
# Directory the fitted pipelines are serialized into. It is created up front
# so that joblib.dump does not fail when the folder is missing (e.g. on a
# fresh checkout). The trailing slash is kept because paths are built with
# plain string concatenation (out_dir + filename).
out_dir = 'models/'
os.makedirs(out_dir, exist_ok=True)

## Random Forest Regressor

In [None]:
# File name the fitted random-forest pipeline is saved under.
name = 'rfr.joblib'

# Preprocessing + random forest. random_state is fixed because the forest is
# built from random bootstrap samples / feature draws; without a seed the
# scores and feature importances differ from run to run.
rfr = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42)
)

In [None]:
# Time only the fit: the clock stops before serialization so the printed
# figure is the training time, not training + disk write. perf_counter is
# used because it is a monotonic clock intended for measuring intervals.
fit_start = time.perf_counter()
rfr.fit(X_train, y_train)
fit_seconds = time.perf_counter() - fit_start

# The file name is spelled out instead of read from the shared `name`
# variable, which every model-definition cell reassigns; re-running this cell
# out of order would otherwise overwrite another model's file.
joblib.dump(rfr, out_dir + 'rfr.joblib')
print(f'Random Forest Regressor training time: {fit_seconds} seconds')

## Gradient Boosting Regressor

In [None]:
# File name the fitted gradient-boosting pipeline is saved under.
name = 'gbr.joblib'

# Preprocessing + gradient boosting. random_state is fixed so that repeated
# fits on the same data produce the same model (the estimator uses its random
# state when choosing among candidate splits).
gbr = make_pipeline(
    preprocessor,
    GradientBoostingRegressor(n_estimators=200, max_depth=8, random_state=42)
)

In [None]:
# Time only the fit: the clock stops before serialization so the printed
# figure is the training time, not training + disk write. perf_counter is
# used because it is a monotonic clock intended for measuring intervals.
fit_start = time.perf_counter()
gbr.fit(X_train, y_train)
fit_seconds = time.perf_counter() - fit_start

# The file name is spelled out instead of read from the shared `name`
# variable, which every model-definition cell reassigns; re-running this cell
# out of order would otherwise overwrite another model's file.
joblib.dump(gbr, out_dir + 'gbr.joblib')
print(f'Gradient Boosting Regressor training time: {fit_seconds} seconds')

## kNN Regressor

In [None]:
# File name the fitted k-nearest-neighbours pipeline is saved under.
name = 'knnr.joblib'

# Preprocessing followed by a 5-neighbour kNN regressor.
knn_estimator = KNeighborsRegressor(n_neighbors=5)
knnr = make_pipeline(preprocessor, knn_estimator)

In [None]:
# Time only the fit: the clock stops before serialization so the printed
# figure is the training time, not training + disk write. perf_counter is
# used because it is a monotonic clock intended for measuring intervals.
fit_start = time.perf_counter()
knnr.fit(X_train, y_train)
fit_seconds = time.perf_counter() - fit_start

# The file name is spelled out instead of read from the shared `name`
# variable, which every model-definition cell reassigns; re-running this cell
# out of order would otherwise overwrite another model's file.
joblib.dump(knnr, out_dir + 'knnr.joblib')
print(f'kNN Regressor training time: {fit_seconds} seconds')

## MultiLayer Perceptron Regressor

In [None]:
# File name the fitted multi-layer-perceptron pipeline is saved under.
name = 'mlpr.joblib'

# Preprocessing followed by an MLP with two hidden layers of 55 units each.
# The seed is fixed so weight initialisation is repeatable.
mlp_estimator = MLPRegressor(
    hidden_layer_sizes=(55, 55),
    alpha=0.01,
    max_iter=500,
    random_state=50,
)
mlpr = make_pipeline(preprocessor, mlp_estimator)

In [None]:
# Time only the fit: the clock stops before serialization so the printed
# figure is the training time, not training + disk write. perf_counter is
# used because it is a monotonic clock intended for measuring intervals.
fit_start = time.perf_counter()
mlpr.fit(X_train, y_train)
fit_seconds = time.perf_counter() - fit_start

# The file name is spelled out instead of read from the shared `name`
# variable, which every model-definition cell reassigns; re-running this cell
# out of order would otherwise overwrite another model's file.
joblib.dump(mlpr, out_dir + 'mlpr.joblib')
print(f'MLP Regressor training time: {fit_seconds} seconds')

# Model Validation

In [None]:
# Reload the persisted pipelines so validation exercises exactly what was
# written to disk. NOTE: joblib files are pickle-based -- only load files
# from a trusted source.
rf_regressor, gb_regressor, knn_regressor, mlp_regressor = (
    joblib.load(model_path)
    for model_path in (
        'models/rfr.joblib',
        'models/gbr.joblib',
        'models/knnr.joblib',
        'models/mlpr.joblib',
    )
)

In [None]:
# Pipeline.score for a regressor is R^2; compare fit on seen vs. held-out rows.
rf_train_score = rf_regressor.score(X_train, y_train)
rf_valid_score = rf_regressor.score(X_valid, y_valid)
print(f'Random Forest Regressor Train: {rf_train_score}')
print(f'Random Forest Regressor Valid: {rf_valid_score}')

In [None]:
# Pipeline.score for a regressor is R^2; compare fit on seen vs. held-out rows.
gb_train_score = gb_regressor.score(X_train, y_train)
gb_valid_score = gb_regressor.score(X_valid, y_valid)
print(f'Gradient Boosting Regressor Train: {gb_train_score}')
print(f'Gradient Boosting Regressor Valid: {gb_valid_score}')

In [None]:
# Pipeline.score for a regressor is R^2; compare fit on seen vs. held-out rows.
knn_train_score = knn_regressor.score(X_train, y_train)
knn_valid_score = knn_regressor.score(X_valid, y_valid)
print(f'KNN Regressor Train: {knn_train_score}')
print(f'KNN Regressor Valid: {knn_valid_score}')

In [None]:
# Pipeline.score for a regressor is R^2; compare fit on seen vs. held-out rows.
mlp_train_score = mlp_regressor.score(X_train, y_train)
mlp_valid_score = mlp_regressor.score(X_valid, y_valid)
print(f'MLP Regressor Train: {mlp_train_score}')
print(f'MLP Regressor Valid: {mlp_valid_score}')

# Model Predictions

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Read the listings to price; 'listingDate' is parsed to datetimes because
# the next step does date arithmetic on it.
prediction_csv_path = './data/predict/total_clean.csv'
prediction_data = pd.read_csv(prediction_csv_path, parse_dates=['listingDate'])

In [None]:
# daysOnMarket is filled in as the number of whole days between each row's
# listingDate and one fixed, hypothetical sold date.
hypothetical_sold_date = datetime(2025, 1, 1)
time_to_hypothetical_sale = hypothetical_sold_date - prediction_data['listingDate']
prediction_data['daysOnMarket'] = time_to_hypothetical_sale.dt.days

# Keep only the columns the pipelines were trained on.
X_predict = prediction_data[features]

In [None]:
# Predict from the model's input columns only, so this cell can be re-run
# after 'soldPricePredicted' has already been attached to X_predict.
predicted = gb_regressor.predict(X_predict[features])

# Attach the predictions positionally via assign(): this returns a new frame
# (no SettingWithCopyWarning from writing into a frame sliced out of
# prediction_data) and, unlike wrapping the array in a default-indexed
# pd.Series, cannot introduce NaNs through index alignment.
X_predict = X_predict.assign(soldPricePredicted=predicted)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('data/predict/predicted', exist_ok=True)
X_predict.to_csv('data/predict/predicted/output.csv')

## Feature Importance Analysis

In [None]:
# Column names of the matrix the estimators actually see: the numeric columns
# come first (the 'num' branch), followed by the one-hot columns generated by
# the fitted 'cat' branch of the column transformer.
fitted_cat_branch = rf_regressor.named_steps['columntransformer'].named_transformers_['cat']
one_hot_names = fitted_cat_branch.get_feature_names_out(categorical_features)
transformed_feature_names = numeric_features + list(one_hot_names)

In [None]:
# Importance scores exposed by the fitted random forest, one per transformed column.
rf_importances = rf_regressor.named_steps['randomforestregressor'].feature_importances_

# Pair each score with its column name and rank from most to least important.
rf_importance_df = (
    pd.DataFrame({'feature': transformed_feature_names, 'importance': rf_importances})
    .sort_values(by='importance', ascending=False)
)

# Keep only the original numeric columns (drops the one-hot columns).
rf_is_numeric = rf_importance_df['feature'].isin(numeric_features)
rf_numeric_importance = rf_importance_df[rf_is_numeric]

In [None]:
# Bar chart of the random forest's importances for the numeric features,
# drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(rf_numeric_importance['feature'], rf_numeric_importance['importance'])
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Random Forest Feature Importances (Numeric Features)')
# Tilt the feature names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
# Importance scores exposed by the fitted gradient-boosting model, one per
# transformed column.
gb_importances = gb_regressor.named_steps['gradientboostingregressor'].feature_importances_

# Pair each score with its column name and rank from most to least important.
gb_ranked = pd.DataFrame(
    {'feature': transformed_feature_names, 'importance': gb_importances}
).sort_values(by='importance', ascending=False)

# Keep only the original numeric columns (drops the one-hot columns).
gb_importances_df = gb_ranked[gb_ranked['feature'].isin(numeric_features)]

In [None]:
# Bar chart of the gradient-boosting model's importances for the numeric
# features, saved to graphs/gbr_importance.png.
plt.figure(figsize=(8, 4))
# Plot the gradient-boosting frame: this chart previously drew the
# random-forest importances (rf_numeric_importance) under a Gradient Boosting
# title.
plt.bar(gb_importances_df['feature'], gb_importances_df['importance'])
plt.xlabel('Feature')
plt.ylabel('Importance Score')
plt.title('Gradient Boosting Feature Importances (Numeric Features)')
plt.xticks(rotation=45, ha='right')
# tight_layout keeps the rotated tick labels inside the saved image.
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs('graphs', exist_ok=True)
plt.savefig('graphs/gbr_importance.png')