# Kaggle House Price Competition - Baseline Performance Testing

This notebook tests the baseline performance of various models on the data with no feature engineering. The only exception is using LabelEncoder from sklearn to convert categorical data to numerical values that can be read by the model.

## Models tried 

- Support Vector Regression
- Random Forest
- AdaBoost
- XGBoost
- Neural Network

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import random

# Set random seed for reproducibility and stability of comparisons
np.random.seed(42)
random.seed(42)

# Load data
print('LOAD DATA')
training_data = pd.read_csv('train.csv') 

# Print basics
n_features = len(training_data.columns)
print(f'Number of features: {n_features}')
n_entries = training_data.shape[0]
print(f'Number of data points: {n_entries}')

# Map features with string datatype entries to integers
# Identify columns with object data type
object_columns = training_data.select_dtypes(include=['object']).columns

# Map object entries to integers for each object column
for col in object_columns:
    training_data[col], _ = pd.factorize(training_data[col])

# Training data with missing features deleted
training_data_deleted = training_data.copy()
training_data_deleted = training_data_deleted.drop(columns=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

# Fill missing training data with imputed values
lf = training_data['LotFrontage'].dropna()
mva = training_data['MasVnrArea'].dropna()
gyb = training_data['GarageYrBlt'].dropna()

lf_mean = lf[lf < 250].mean() # Drop outlier around 300
mva_med = mva.median()
gyb_mean = gyb.mean()

training_data_filled = training_data.copy()
training_data_filled['LotFrontage'] = training_data_filled['LotFrontage'].fillna(lf_mean)
training_data_filled['MasVnrArea'] = training_data_filled['MasVnrArea'].fillna(mva_med)
training_data_filled['GarageYrBlt'] = training_data_filled['GarageYrBlt'].fillna(gyb_mean)

# Split data into features and target
X_deleted = training_data_deleted.drop(columns='SalePrice')
y_deleted = training_data_deleted['SalePrice']
X_imputed = training_data_filled.drop(columns='SalePrice')
y_imputed = training_data_filled['SalePrice']

# Standardize data
scaler = StandardScaler()
X_deleted_scaled = scaler.fit_transform(X_deleted)
X_imputed_scaled = scaler.fit_transform(X_imputed)

# Split data into training and test set
Xd_train, Xd_test, yd_train, yd_test = train_test_split(X_deleted_scaled, y_deleted, test_size=0.2)
Xi_train, Xi_test, yi_train, yi_test = train_test_split(X_imputed_scaled, y_imputed, test_size=0.2)

# Initialize models
svr_model = SVR(kernel='rbf')  # Support Vector Regression
rf_model = RandomForestRegressor()  # Random Forest
ada_model = AdaBoostRegressor()  # AdaBoost
xgb_model = XGBRegressor()  # XGBoost

# Neural Network (using TensorFlow/Keras)
nn_model = MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', solver='adam')

############################
# DELETED FEATURES SECTION #
############################

# Train models on data w/ deletions
print('TRAIN SVR')
svr_model.fit(Xd_train, yd_train)
print('TRAIN RANDOM FOREST')
rf_model.fit(Xd_train, yd_train)
print('TRAIN ADABOOST')
ada_model.fit(Xd_train, yd_train)
print('TRAIN XGBOOST')
xgb_model.fit(Xd_train, yd_train)
print('TRAIN NEURAL NET (128/64 NEURONS)')
nn_model.fit(Xd_train, yd_train)

# Generate predictions (deleted)
svr_pred = svr_model.predict(Xd_test)
rf_pred = rf_model.predict(Xd_test)
ada_pred = ada_model.predict(Xd_test)
xgb_pred = xgb_model.predict(Xd_test)
nn_pred = nn_model.predict(Xd_test)

# Evaluation: Mean Squared Error (MSE) on test set (deleted)
svr_mse_d = mean_squared_error(yd_test, svr_pred)
rf_mse_d = mean_squared_error(yd_test, rf_pred)
ada_mse_d = mean_squared_error(yd_test, ada_pred)
xgb_mse_d = mean_squared_error(yd_test, xgb_pred)
nn_mse_d = mean_squared_error(yd_test, nn_pred)

# Print results (deleted)
print('RESULTS (DELETED)')
print('-----------------')
print(f"Support Vector Regression MSE: {svr_mse_d:.2e}")
print(f"Random Forest MSE: {rf_mse_d:.2e}")
print(f"AdaBoost MSE: {ada_mse_d:.2e}")
print(f"XGBoost MSE: {xgb_mse_d:.2e}")
print(f"Neural Network MSE: {nn_mse_d:.2e}")

print('===========================================')

############################
# IMPUTED FEATURES SECTION #
############################

# Train models on data w/ imputations
svr_model.fit(Xi_train, yi_train)
rf_model.fit(Xi_train, yi_train)
ada_model.fit(Xi_train, yi_train)
xgb_model.fit(Xi_train, yi_train)
nn_model.fit(Xi_train, yi_train)

# Generate predictions (imputed)
svr_pred = svr_model.predict(Xi_test)
rf_pred = rf_model.predict(Xi_test)
ada_pred = ada_model.predict(Xi_test)
xgb_pred = xgb_model.predict(Xi_test)
nn_pred = nn_model.predict(Xi_test)

# Evaluation: Mean Squared Error (MSE) on test set (imputed)
svr_mse_i = mean_squared_error(yi_test, svr_pred)
rf_mse_i = mean_squared_error(yi_test, rf_pred)
ada_mse_i = mean_squared_error(yi_test, ada_pred)
xgb_mse_i = mean_squared_error(yi_test, xgb_pred)
nn_mse_i = mean_squared_error(yi_test, nn_pred)

# Print results (imputed)
print('RESULTS (IMPUTED)')
print('-----------------')
print(f"Support Vector Regression MSE: {svr_mse_i:.2e}")
print(f"Random Forest MSE: {rf_mse_i:.2e}")
print(f"AdaBoost MSE: {ada_mse_i:.2e}")
print(f"XGBoost MSE: {xgb_mse_i:.2e}")
print(f"Neural Network MSE: {nn_mse_i:.2e}")

LOAD DATA
Number of features: 81
Number of data points: 1460
TRAIN SVR
TRAIN RANDOM FOREST
TRAIN ADABOOST
TRAIN XGBOOST
TRAIN NEURAL NET (128/64 NEURONS)




RESULTS (DELETED)
-----------------
Support Vector Regression MSE: 7.86e+09
Random Forest MSE: 8.32e+08
AdaBoost MSE: 1.35e+09
XGBoost MSE: 7.64e+08
Neural Network MSE: 5.88e+09
RESULTS (IMPUTED)
-----------------
Support Vector Regression MSE: 7.73e+09
Random Forest MSE: 8.59e+08
AdaBoost MSE: 1.34e+09
XGBoost MSE: 1.08e+09
Neural Network MSE: 7.69e+09




RESULTS (DELETED)
-----------------
- Support Vector Regression MSE: 7.86e+09
- Random Forest MSE: 8.32e+08
- AdaBoost MSE: 1.35e+09
- XGBoost MSE: 7.64e+08
- Neural Network MSE: 5.88e+09

RESULTS (IMPUTED)
-----------------
- Support Vector Regression MSE: 7.73e+09
- Random Forest MSE: 8.59e+08
- AdaBoost MSE: 1.34e+09
- XGBoost MSE: 1.08e+09
- Neural Network MSE: 7.69e+09

## Initial thoughts:
The square of the mean sale price (with no corrections for outliers) of the training dataset is 3.24e10 squared dollars (mean of $180k)

In [12]:
# Compute percentage errors between MSE and average squared sale price
m2 = 3.24e10 # avg sq sale price

svrd = svr_mse_d/m2
rfd = rf_mse_d/m2
adad = ada_mse_d/m2
xgbd = xgb_mse_d/m2
nnd = nn_mse_d/m2
svri = svr_mse_i/m2
rfi = rf_mse_i/m2
adai = ada_mse_i/m2
xgbi = xgb_mse_i/m2
nni = nn_mse_i/m2

print('RESULTS (DELETED)')
print('-----------------')
print(f"Support Vector Regression MSE: {svrd:.4f}")
print(f"Random Forest MSE: {rfd:.4f}")
print(f"AdaBoost MSE: {adad:.4f}")
print(f"XGBoost MSE: {xgbd:.4f}")
print(f"Neural Network MSE: {nnd:.4f}")

# Print results (imputed)
print('RESULTS (IMPUTED)')
print('-----------------')
print(f"Support Vector Regression MSE: {svri:.4f}")
print(f"Random Forest MSE: {rfi:.4f}")
print(f"AdaBoost MSE: {adai:.4f}")
print(f"XGBoost MSE: {xgbi:.4f}")
print(f"Neural Network MSE: {nni:.4f}")

RESULTS (DELETED)
-----------------
Support Vector Regression MSE: 0.2425
Random Forest MSE: 0.0257
AdaBoost MSE: 0.0416
XGBoost MSE: 0.0236
Neural Network MSE: 0.1813
RESULTS (IMPUTED)
-----------------
Support Vector Regression MSE: 0.2386
Random Forest MSE: 0.0265
AdaBoost MSE: 0.0415
XGBoost MSE: 0.0334
Neural Network MSE: 0.2373


RESULTS (DELETED)
-----------------
Support Vector Regression MSE: 0.2425
Random Forest MSE: 0.0257
AdaBoost MSE: 0.0416
XGBoost MSE: 0.0236
Neural Network MSE: 0.1813

RESULTS (IMPUTED)
-----------------
Support Vector Regression MSE: 0.2386
Random Forest MSE: 0.0265
AdaBoost MSE: 0.0415
XGBoost MSE: 0.0334
Neural Network MSE: 0.2373

## Further thoughts:
The results are better nearly across the board for the models trained on datasets where features missing information were deleted vs imputed. The exception is the AdaBoost, although the results are nearly identical in that case.

The overall best performing model was XGBoost on deleted data with an error of 2.36%, although Random Forest did comparably well for both datasets. Notably it outperforms XGBoost on the imputed dataset. This suggests to me that focusing on **XGBoost** and **RF models** is the way to go for further optimizations.