***Imports***

In [26]:
import pandas as pd
import warnings

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
import xgboost as xgb

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

# Disable all warnings
warnings.filterwarnings('ignore')



# **One Hot Encoding**

In [27]:
# Read the processed dataset from CSV; this frame is the input to the one-hot encoding below
df_final = pd.read_csv(filepath_or_buffer='../data/processed/df_final_processed_no_bins.csv')

In [28]:
# One-hot encode the two categorical columns. dtype=int makes get_dummies emit
# 0/1 integer indicators directly, so no second casting pass over the frame is
# needed, and columns that merely share the 'city_'/'type_' prefix are not
# force-cast to integers as a side effect.
df_final_ohe = pd.get_dummies(df_final, columns=['type', 'city'], dtype=int)

# The foreclosure flag is not created by get_dummies, so it is cast explicitly.
df_final_ohe['is_foreclosure'] = df_final_ohe['is_foreclosure'].astype(int)

# Labels of all columns whose name starts with 'city_' or 'type_'; kept under
# the same name in case other cells refer to it.
city_type_columns = df_final_ohe.filter(regex='^(city_|type_)').columns

In [29]:
# Write the encoded frame to CSV without the row index
df_final_ohe.to_csv(path_or_buf='../data/processed/df_final_processed_OHE.csv', index=False)

# **Model Selection**

***Train Test Split***

In [30]:
# Target is the 'price' column; every remaining column is used as a feature.
y = df_final_ohe['price']
X = df_final_ohe.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Write each split to its own CSV file, without the row index.
# The pairs are processed in the order listed.
csv_exports = (
    (X_train, '../data/training/X_train_v2_ohe.csv'),
    (X_test, '../data/testing/X_test_v2_ohe.csv'),
    (y_train, '../data/training/y_train_v2_ohe.csv'),
    (y_test, '../data/testing/y_test_v2_ohe.csv'),
)
for split_data, csv_path in csv_exports:
    split_data.to_csv(csv_path, index=False)

In [32]:
# Candidate regressors keyed by the display name used in the printed report.
# Insertion order is the order in which the models are later iterated.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Support Vector Machine', SVR()),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('XGBoost', xgb.XGBRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('AdaBoost', AdaBoostRegressor(random_state=42)),
    ('Ridge Regression', Ridge(random_state=42)),
    ('Lasso Regression', Lasso(random_state=42)),
    ('ElasticNet Regression', ElasticNet(random_state=42)),
])

In [34]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, score it on the test split, and report the metrics.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place,
        so the caller's object is trained after this call.
    X_train, y_train
        Features and target passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the target its predictions are
        compared against.

    Returns
    -------
    dict
        The three metrics keyed by the same labels that are printed:
        ``'Mean Squared Error'``, ``'Mean Absolute Error'`` and ``'R^2 Score'``.
        Returning them lets callers build a comparison table from the values
        instead of copying numbers out of the printed text.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # One mapping drives both the printed report and the return value, so the
    # two can never disagree.
    metrics = {
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'R^2 Score': r2_score(y_test, y_pred),
    }

    print(f'{name} Performance:')
    for label, value in metrics.items():
        print(f'{label}: {value}')
    print('')

    return metrics


# Fit and score every candidate, in dictionary order
for name in models:
    model = models[name]
    evaluate_model(
        name=name,
        model=model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

Linear Regression Performance:
Mean Squared Error: 185504913290.16754
Mean Absolute Error: 183278.2075664464
R^2 Score: 0.4392068230097783

Support Vector Machine Performance:
Mean Squared Error: 346285720808.7064
Mean Absolute Error: 220627.45328381535
R^2 Score: -0.04684380631419338

Random Forest Performance:
Mean Squared Error: 1160219013.535166
Mean Absolute Error: 11767.488193624557
R^2 Score: 0.9964925839695302

XGBoost Performance:
Mean Squared Error: 2504522904.0866137
Mean Absolute Error: 30133.225119850853
R^2 Score: 0.9924286603927612

Decision Tree Performance:
Mean Squared Error: 413223140.4958678
Mean Absolute Error: 2066.115702479339
R^2 Score: 0.9987508001073693

K-Nearest Neighbors Performance:
Mean Squared Error: 26662809917.355373
Mean Absolute Error: 70867.76859504133
R^2 Score: 0.9193966261278933

Gradient Boosting Performance:
Mean Squared Error: 24850275668.066753
Mean Absolute Error: 105168.107779518
R^2 Score: 0.9248760326947275

AdaBoost Performance:
Mean Squ

In [35]:
# NOTE(review): everything in this cell is wrapped in a triple-quoted string, so
# none of it executes. If it were re-enabled as written, the `def` inside would
# replace the six-parameter evaluate_model defined earlier in the notebook with a
# four-parameter version that fits and scores on the same training data.
# Presumably the output shown under this cell comes from a run made while the
# code was still active -- confirm before relying on those numbers.
''' #  Using Train data to do predictions
def evaluate_model(name, model, X_train, y_train):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)

    mse = mean_squared_error(y_train, y_pred)
    mae = mean_absolute_error(y_train, y_pred)
    r2 = r2_score(y_train, y_pred)

    print(f'{name} Performance:')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'R^2 Score: {r2}')
    print('')


# Evaluate each model

for name, model in models.items():
    evaluate_model(name, model, X_train, y_train)
'''

Linear Regression Performance:
Mean Squared Error: 178303546867.79648
Mean Absolute Error: 172670.28044776473
R^2 Score: 0.464220676338149

Support Vector Machine Performance:
Mean Squared Error: 352700638692.63055
Mean Absolute Error: 225903.08391167206
R^2 Score: -0.059820250204851044

Random Forest Performance:
Mean Squared Error: 302116962.93060225
Mean Absolute Error: 4568.789221114803
R^2 Score: 0.9990921772173957

XGBoost Performance:
Mean Squared Error: 1203621478.817942
Mean Absolute Error: 24794.90871416432
R^2 Score: 0.9963832497596741

Decision Tree Performance:
Mean Squared Error: 25839793.281653747
Mean Absolute Error: 258.3979328165375
R^2 Score: 0.9999223547303954

K-Nearest Neighbors Performance:
Mean Squared Error: 13967958656.33075
Mean Absolute Error: 34945.73643410853
R^2 Score: 0.9580280730625375

Gradient Boosting Performance:
Mean Squared Error: 23725681106.220478
Mean Absolute Error: 100084.75473600383
R^2 Score: 0.9287073667360487

AdaBoost Performance:
Mean S

### **Model Performance Comparison**

Based on the selected performance metrics, the **Decision Tree Regressor** performed best of all the models tested. See the detailed results and comparison below, along with a summary and the next steps for Part 3 of the project:

##### ***Best Performing Model: Decision Tree Regressor***
- **Mean Squared Error (MSE)**: 413,223,140.50
- **Mean Absolute Error (MAE)**: 2,066.12
- **R² Score**: 0.9988

##### ***Comparison with Other Models***

| Model                         | Mean Squared Error (MSE)  | Mean Absolute Error (MAE)  | R² Score  |
|-------------------------------|---------------------------|----------------------------|-----------|
| **Decision Tree**             | 413,223,140.50            | 2,066.12                   | 0.9988    |
| **Random Forest**             | 1,160,219,013.54          | 11,767.49                  | 0.9965    |
| **XGBoost**                   | 2,504,522,904.09          | 30,133.23                  | 0.9924    |
| **Linear Regression**         | 185,504,913,290.17        | 183,278.21                 | 0.4392    |
| **Support Vector Machine**    | 346,285,720,808.71        | 220,627.45                 | -0.0468   |
| **K-Nearest Neighbors**       | 26,662,809,917.36         | 70,867.77                  | 0.9194    |
| **Gradient Boosting**         | 24,850,275,668.07         | 105,168.11                 | 0.9249    |
| **AdaBoost**                  | 92,525,334,363.78         | 256,593.85                 | 0.7203    |
| **Ridge Regression**          | 185,530,659,977.92        | 183,475.83                 | 0.4391    |
| **Lasso Regression**          | 185,503,888,825.00        | 183,296.20                 | 0.4392    |
| **ElasticNet Regression**     | 234,388,902,656.75        | 189,973.07                 | 0.2914    |

##### ***Summary***

The **Decision Tree Regressor** achieved the best performance in terms of Mean Squared Error, Mean Absolute Error, and R² Score, indicating that it is the most accurate model among those tested for this particular dataset.

The **Random Forest Regressor** also performed very well, closely following the Decision Tree Regressor. The other models, especially the simpler linear models (Linear Regression, Ridge, Lasso, and ElasticNet), did not perform as well, which might suggest that the dataset benefits from more complex, non-linear models.

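Based on these results, the Decision Tree and Random Forest models are the strongest candidates for further tuning and optimization. Note, however, that a near-perfect test R² from an untuned Decision Tree is unusual and may indicate duplicated or near-duplicated rows shared between the training and test splits; this should be checked (for example, by de-duplicating the data and using cross-validation) before relying on the ranking.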

# **Next Steps for Part 3**

Based on these results, we'll focus on the Decision Tree and Random Forest models for further tuning and optimization.