# Kaggle House Price Competition - Baseline Performance Testing

This notebook tests the baseline performance of various models on the data with no feature engineering. The only exception is integer-encoding the categorical (string-valued) columns with `pandas.factorize`, so that they become numerical values the models can read.

## Models tried 

- Support Vector Regression
- Random Forest
- AdaBoost
- XGBoost
- Neural Network

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Read the competition training set from the working directory
training_data = pd.read_csv('train.csv')

# Report the dimensions of the raw table; every column in the file is
# counted as a "feature" here, and every row as a data point.
n_entries, n_features = training_data.shape
print(f'Number of features: {n_features}')
print(f'Number of data points: {n_entries}')

# Integer-encode every string-valued (object dtype) column in place.
object_columns = training_data.select_dtypes(include='object').columns

# pd.factorize returns (codes, uniques); only the integer codes are kept.
# With factorize's default settings, missing entries receive the code -1.
for col in object_columns:
    training_data[col] = pd.factorize(training_data[col])[0]

# Variant of the training data with the three incomplete columns removed.
# The explicit copy keeps this frame independent of `training_data`.
columns_with_missing = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
training_data_deleted = training_data.copy().drop(columns=columns_with_missing)

# Fill missing training data with imputed values.
# The three source columns are bound to short names first; previously
# `lf`, `mva` and `gyb` were used without being defined in this cell,
# which raises NameError on a fresh run.
lf = training_data['LotFrontage']
mva = training_data['MasVnrArea']
gyb = training_data['GarageYrBlt']

# Imputation statistics (NaNs are skipped by pandas' mean/median):
lf_mean = lf[lf < 250].mean()  # Mean excluding the outlier around 300
mva_med = mva.median()
gyb_mean = gyb.mean()

# fillna with a column -> value mapping returns a new frame, leaving
# `training_data` itself untouched.
training_data_filled = training_data.fillna({
    'LotFrontage': lf_mean,
    'MasVnrArea': mva_med,
    'GarageYrBlt': gyb_mean,
})