# Import Packages

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

# Data Exploration & Analysis

In [None]:
# Load the training and test sets from the CSV files in the working directory
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Show column names, dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
train_data.info()

In [None]:
# Print descriptive statistics for the training data
train_summary = train_data.describe()
print(train_summary)

In [None]:
# Count the missing (NaN) entries in every column of the training data
missing_values_df = (
    train_data.isnull()
    .sum()
    .rename_axis('column')
    .reset_index(name='count')
)

# Show only the columns that have gaps, largest count first
print(
    missing_values_df.loc[missing_values_df['count'] > 0]
    .sort_values(by='count', ascending=False)
)

In [None]:
# Keep only the numeric columns of the training data
num_train = train_data.select_dtypes(include='number')

# Pairwise correlations between the numeric columns, drawn as a heatmap
corr = num_train.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    cmap='PiYG',
    cbar=True,
    linewidth=0.4,
    linecolor='grey',
)
plt.title('Correlation Matrix', fontsize=18)
plt.show()

In [None]:
# create density plot
def dollars(x, _):
    """Format a tick value as whole dollars with thousands separators.

    Takes the ``(value, position)`` pair that a ``FuncFormatter`` passes to
    its function; the second argument is ignored. The value is truncated to
    an integer before formatting, e.g. ``1234567.89`` -> ``'$1,234,567'``.
    """
    whole_dollars = int(x)
    return '${:,}'.format(whole_dollars)

# Kernel density estimate of the raw sale prices, with dollar-formatted x ticks
plt.figure(figsize=(14, 6))
sns.kdeplot(data=train_data['SalePrice'], fill=True, color='limegreen')
plt.title('Sales Price Density Plot', fontsize=18)
plt.xlabel('Sales Price')
plt.gca().xaxis.set_major_formatter(FuncFormatter(dollars))
plt.show()

In [None]:
# Take the natural log of the sale price
train_data_y = np.log(train_data['SalePrice'])

# Kernel density estimate of the log-transformed sale price
plt.figure(figsize=(14, 6))
sns.kdeplot(data=train_data_y, fill=True, color='limegreen')
plt.title('Log Transformed Sales Price Density Plot', fontsize=18)
plt.xlabel('Sales Price (log transformed)')
plt.show()

# Data Preprocessing

In [None]:
# Remove the Id and GarageYrBlt columns from the training data
train_data = train_data.drop(['Id', 'GarageYrBlt'], axis=1)

# Set the test Ids aside so predictions can be matched to their records later
test_data_id = test_data.loc[:, 'Id']

# Remove the same two columns from the test predictors
test_data = test_data.drop(['Id', 'GarageYrBlt'], axis=1)

In [None]:
# Columns that contain NaN values, mapped to the value used to fill them:
# 'No' for the categorical columns and 0 for the numeric ones.
fill_values = {
    'PoolQC': 'No',
    'MiscFeature': 'No',
    'Alley': 'No',
    'Fence': 'No',
    'MasVnrType': 'No',
    'MasVnrArea': 0,
    'FireplaceQu': 'No',
    'LotFrontage': 0,
    'GarageCond': 'No',
    'GarageType': 'No',
    'GarageFinish': 'No',
    'GarageQual': 'No',
    'BsmtFinType2': 'No',
    'BsmtExposure': 'No',
    'BsmtQual': 'No',
    'BsmtCond': 'No',
    'BsmtFinType1': 'No',
    'Electrical': 'No',
}

# Fill the missing values in both the train and test data.
# A dict passed to DataFrame.fillna fills each listed column with its own
# value and leaves every other column untouched. The result is assigned back
# rather than calling frame[column].fillna(..., inplace=True): that form is a
# chained in-place update on a temporary column object, which pandas has
# deprecated and which does not modify the frame under Copy-on-Write.
train_data = train_data.fillna(value=fill_values)
test_data = test_data.fillna(value=fill_values)

In [None]:
# Split the training data into predictors and the log-transformed sale price
train_data_x = train_data.drop('SalePrice', axis=1)
train_data_y = np.log(train_data['SalePrice'])

In [None]:
# Columns routed to the ordinal-encoding pipeline
ordinal_cols = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'BsmtQual',
    'BsmtFinType1', 'CentralAir', 'Functional', 'PoolQC', 'Fence',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'ExterCond', 'KitchenQual', 'BsmtExposure', 'HeatingQC', 'ExterQual',
    'BsmtCond', 'SaleCondition', 'BsmtFinType2',
]

# Columns routed to the one-hot-encoding pipeline
one_hot_cols = [
    'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'Electrical', 'SaleType', 'MSZoning',
    'Heating', 'GarageType', 'RoofMatl', 'MiscFeature', 'Alley',
]

# Numeric predictor columns: every numeric column except the SalePrice target.
# select_dtypes expects the selector 'number' for all numeric dtypes;
# 'numeric' is not a recognised dtype name and raises a TypeError.
num_cols = train_data.select_dtypes(include='number').columns
num_cols = num_cols.drop('SalePrice')

In [None]:
# Numeric features: fill gaps with the column mean, then standardise
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Ordinal features: fill gaps with the most frequent value, then encode as
# integers; categories not seen during fitting are encoded as -1
ordinal_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ode', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

# Nominal features: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories not seen during fitting are ignored
one_hot_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [None]:
# Route each group of columns through its own pipeline; any column not named
# in one of the groups is passed through unchanged
col_trans = ColumnTransformer(
    [
        ('num_p', num_pipeline, num_cols),
        ('ode_p', ordinal_pipeline, ordinal_cols),
        ('ohe_p', one_hot_pipeline, one_hot_cols),
    ],
    n_jobs=-1,
    remainder='passthrough',
)

In [None]:
# Fit the column transformer on the training features and transform them
train_data_x = col_trans.fit_transform(train_data_x)

# Apply the already-fitted transformer to the test features and wrap the
# result in a DataFrame
test_data = pd.DataFrame(data=col_trans.transform(test_data))

# Model Training and Selection

In [None]:
# Define the Random Forest model.
# random_state is pinned so repeated runs of the search give the same result;
# the seed value itself is arbitrary.
random_forest_model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Hyperparameter values to search over
random_forest_param_grid = {
    'max_depth': range(5, 20, 5),
    'min_samples_split': range(2, 10, 1),
    'min_samples_leaf': range(1, 5, 1)
}

# 5-fold cross-validated grid search scored by negative mean squared error,
# run on all available cores
random_forest_grid_search = GridSearchCV(estimator=random_forest_model,
                           param_grid=random_forest_param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5, n_jobs=-1)

# Fit every parameter combination on the training data
random_forest_grid_search.fit(train_data_x, train_data_y)

# Report the best parameters; best_score_ is a negative MSE, so negate it
# before taking the square root to get an RMSE
print('Random Forest best parameters:', random_forest_grid_search.best_params_)
print('RMSE:', np.sqrt(-1 * random_forest_grid_search.best_score_))

In [None]:
# XGBoost regressor used as the base estimator for the search
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    eval_metric='rmse',
    enable_categorical=False,
)

# Hyperparameter values to search over
xgb_param_grid = {
    'max_depth': range(3, 10, 1),
    'learning_rate': np.arange(0.01, 0.05, 0.005),
}

# 5-fold cross-validated grid search scored by negative mean squared error,
# run on all available cores
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit every parameter combination on the training data
xgb_grid_search.fit(train_data_x, train_data_y)

# Report the best parameters; best_score_ is a negative MSE, so negate it
# before taking the square root to get an RMSE
print('XGBoost best parameters:', xgb_grid_search.best_params_)
print('RMSE:', np.sqrt(-xgb_grid_search.best_score_))

In [None]:
# Support vector regressor with a radial basis function kernel
svr_model = SVR(kernel='rbf')

# Hyperparameter values to search over
svr_param_grid = {
    'C': np.arange(0.1, 2.0, 0.1),
    'epsilon': np.arange(0.01, 0.1, 0.01),
}

# 5-fold cross-validated grid search scored by negative mean squared error,
# run on all available cores
svr_grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid=svr_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit every parameter combination on the training data
svr_grid_search.fit(train_data_x, train_data_y)

# Report the best parameters; best_score_ is a negative MSE, so negate it
# before taking the square root to get an RMSE
print('SVR best parameters:', svr_grid_search.best_params_)
print('RMSE:', np.sqrt(-svr_grid_search.best_score_))

# Test Predictions

In [None]:
# Use the test data as the feature input for prediction
test_data_x = test_data

In [None]:
# Final Random Forest regressor with fixed hyperparameters, fitted on the
# full training set
final_random_forest_model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=15,
    min_samples_leaf=2,
    min_samples_split=2,
)

final_random_forest_model.fit(train_data_x, train_data_y)

In [None]:
# Final XGBoost regressor with fixed hyperparameters, fitted on the full
# training set.
# enable_categorical stays False to match the configuration used in the
# XGBoost grid search; train_data_x is the column transformer's output
# (scaled and encoded), so the native categorical path is not needed.
final_xgboost_model=xgb.XGBRegressor(n_estimators = 1000,
                                     max_depth=3,
                                     learning_rate=0.035,
                                     eval_metric = 'rmse',
                                     enable_categorical=False)

final_xgboost_model.fit(train_data_x, train_data_y)

In [None]:
# Final support vector regressor (RBF kernel) with fixed hyperparameters,
# fitted on the full training set
final_svr_model = SVR(kernel='rbf', epsilon=0.03, C=1.2)

final_svr_model.fit(train_data_x, train_data_y)

In [None]:
# Predict the log sale price of the test records with each fitted model
y_hat_rf = final_random_forest_model.predict(test_data_x)
y_hat_xgb = final_xgboost_model.predict(test_data_x)
y_hat_svr = final_svr_model.predict(test_data_x)

# Undo the log transform and line the predictions up with the test Ids
pred_df = pd.DataFrame({
    'Id': test_data_id,
    'rf': np.exp(y_hat_rf),
    'xgb': np.exp(y_hat_xgb),
    'svr': np.exp(y_hat_svr),
})

# The submitted price is the row-wise mean of the three model predictions
pred_df['SalePrice'] = pred_df.loc[:, ['rf', 'xgb', 'svr']].mean(axis=1)

# Keep only the Id and SalePrice columns and write them to the submission file
final_df = pred_df.loc[:, ['Id', 'SalePrice']]
final_df.to_csv('submission.csv', index=False)