In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

!python -m pip install plotly xgboost
import plotly as pl

from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p

from sklearn.preprocessing import RobustScaler, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

import xgboost as xgb

# Apply seaborn's theme and set the default figure size to 12 x 12 for every plot below.
sns.set(rc={"figure.figsize":(12, 12)})

In [None]:
def imputinator(df:'pandas.DataFrame')->'pandas.DataFrame':
    """Fill missing values in the known-incomplete columns of ``df``.

    Two fixed groups of columns are handled:

    * columns whose missing entries are replaced by the literal string
      ``'None'``;
    * columns whose missing entries are replaced by that column's most
      frequent value (its mode).

    The frame is modified in place (each column is reassigned) and the same
    object is returned so the call can also be used in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``df``.
    """
    none_columns = [
        'PoolQC', 'MiscFeature', 'Alley', 'FireplaceQu', 'GarageCond',
        'GarageType', 'GarageFinish', 'GarageQual', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinType2', 'BsmtCond', 'BsmtQual',
        'MasVnrType', 'Fence',
    ]
    mode_columns = [
        'Electrical', 'LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'MSZoning',
        'Functional', 'BsmtFullBath', 'BsmtHalfBath', 'Utilities', 'KitchenQual',
        'BsmtFinSF1', 'Exterior2nd', 'GarageCars', 'GarageArea', 'BsmtFinSF2',
        'SaleType', 'TotalBsmtSF', 'BsmtUnfSF', 'Exterior1st',
    ]

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: the in-place form operates on a temporary selection and is not
    # guaranteed to update the parent frame.
    for col in none_columns:
        df[col] = df[col].fillna('None')

    for col in mode_columns:
        modes = df[col].mode()
        if modes.empty:
            # Column holds no observed value at all, so there is no mode to use.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df


def rmse(pred:'numpy.array', truth:'numpy.array')->'float':
    """Return the root-mean-squared error between ``pred`` and ``truth``.

    The mean squared error is computed with scikit-learn's
    ``mean_squared_error`` and its square root is returned.
    """
    squared_error = mean_squared_error(pred, truth)
    return np.sqrt(squared_error)

In [None]:
# Load the training and test splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the training ids for reference.
train_id = train['Id']
# The original cell also called train.drop(columns=['Id']) without assigning
# the result, which had no effect. 'Id' is intentionally left in `train`
# here; the later cells drop it from the concatenated frames themselves.

# Row counts, used later to split the concatenated frame back apart.
train_nrows = train.shape[0]
test_nrows = test.shape[0]

In [None]:
# Stack train and test into a single frame with a fresh 0..n-1 index.
df_concat = pd.concat([train, test], ignore_index=True)

# Pull the target out of the feature frame and keep it separately.
sale_concat = df_concat.pop('SalePrice')

# The identifier column is not a feature.
df_concat = df_concat.drop(columns='Id')
df_concat.head()

In [None]:
# Number of missing entries per column, largest first.
missing_values_count = (
    df_concat
    .isna()
    .sum()
    .sort_values(ascending=False)
)
# Keep only the columns that actually have something missing.
missing_values = missing_values_count.loc[missing_values_count.gt(0)]

missing_values

In [None]:
# Bar chart of the missing-value count per affected column.
sns.barplot(x=missing_values.index, y=missing_values)
# Rotate the column names so they do not overlap. The angle is passed as a
# number: matplotlib documents `rotation` as a float or one of
# 'vertical' / 'horizontal', not a numeric string such as '90'.
plt.xticks(rotation=90)

In [None]:
# Fill the missing values of the combined frame with imputinator().
df_concat = imputinator(df_concat)

In [None]:
# Numeric-only view of train + test (this frame still contains SalePrice and Id).
df_numerical = (
    pd.concat([train, test], ignore_index=True)
    .select_dtypes(exclude='object')
)

# Absolute correlation of every numeric column with SalePrice, strongest first.
df_numerical.corr()['SalePrice'].abs().sort_values(ascending=False)

In [None]:
# Derive the combined features; columns are appended in the order listed.
df_numerical = df_numerical.assign(
    # Full baths count as 1, half baths as 0.5.
    TotalBathrooms=lambda d: d['FullBath'] + d['BsmtFullBath'] + 0.5*(d['HalfBath'] + d['BsmtHalfBath']),
    # Basement + both floors + garage.
    TotalArea=lambda d: d['TotalBsmtSF'] + d['1stFlrSF'] + d['2ndFlrSF'] + d['GarageArea'],
    # Ages are measured relative to the fixed reference year 2022.
    HouseAge=lambda d: 2022 - d['YearBuilt'],
    YrSncRemod=lambda d: 2022 - d['YearRemodAdd'],
)

In [None]:
# Absolute correlation with SalePrice again, now including the derived columns.
df_numerical.corr().apply(np.abs).SalePrice.sort_values(ascending=False)

The combined variables seem to have a much better correlation.

In [None]:
# Remove the identifier and the raw columns that were folded into the
# derived features.
df_numerical = df_numerical.drop(
    columns=[
        'FullBath', 'BsmtFullBath', 'HalfBath', 'BsmtHalfBath', 'Id',
        '1stFlrSF', '2ndFlrSF', 'GarageArea', 'YearBuilt', 'YearRemodAdd',
    ]
)

In [None]:
# Absolute correlation with SalePrice for the columns that remain.
df_numerical.corr().apply(np.abs).SalePrice.sort_values(ascending=False)

Drop seemingly uninformative variables (absolute correlation with `SalePrice` below 0.15).

In [None]:
# True for every column whose absolute correlation with SalePrice is below 0.15
# (ordered from strongest to weakest correlation).
mask = (
    df_numerical.corr()['SalePrice']
    .abs()
    .sort_values(ascending=False)
    .lt(0.15)
)
# Labels of the columns flagged by the mask.
index_to_be_dropped = mask[mask].index

In [None]:
# Remove the weakly correlated columns collected in index_to_be_dropped.
df_numerical.drop(columns=index_to_be_dropped, inplace=True)

In [None]:
# Annotated correlation matrix of the remaining numeric columns,
# colour scale fixed to the full [-1, 1] range.
sns.heatmap(
    df_numerical.corr(),
    cmap='icefire', vmin=-1.0, vmax=1.0,
    annot=True, fmt='1.1f',
    linewidths=0.5,
)

Now look at the correlations between the variables, including the dependent variable, showing only the strong ones (0.65 and above).

In [None]:
# Same correlation heatmap, but only strong relationships are left visible.
# Cells where the mask is True are hidden:
#   * np.triu zeroes the lower triangle, so it is always masked;
#   * in the upper triangle a cell is masked when |correlation| < 0.65.
# The absolute value is taken of the correlation matrix itself (not of the
# underlying data), so strong negative correlations stay visible too.
sns.heatmap(df_numerical.corr(), 
            vmin=-1.0, 
            vmax=1.0, 
            annot=True, 
            linewidths=0.5, 
            fmt='1.1f', 
            cmap='icefire', 
            mask=np.triu(df_numerical.corr().abs()) < 0.65)

- `TotRmsAbvGrd`, `GrLivArea` and `BedroomAbvGr` give overlapping information. `SalePrice` is highly correlated with `GrLivArea`, `TotalArea` and `OverallQual`. We could do PCA but first let's try keeping only the most general.

- Keep: `TotalArea` and `OverallQual`
- Drop `TotRmsAbvGrd`, `GrLivArea`, `BedroomAbvGr`, and  `TotalBsmtSF`

This of course doesn't fix the issue of the high correlation between `OverallQual` and `TotalArea`.

In [None]:
# Remove the overlapping size-related columns together with the two garage columns.
df_numerical.drop(columns=['TotRmsAbvGrd', 'GrLivArea', 'BedroomAbvGr', 'TotalBsmtSF', 'GarageYrBlt' ,'GarageCars'], inplace=True)

In [None]:
# Correlation matrix of the numeric columns after the reduction above.
sns.heatmap(
    df_numerical.corr(),
    cmap='icefire', vmin=-1.0, vmax=1.0,
    annot=True, fmt='1.1f',
    linewidths=0.5,
)

Apply changes to full dataset

In [None]:
# Repeat on the full feature frame the steps explored on df_numerical:
# add the derived features, then remove the columns they replace, the weakly
# correlated columns, and the overlapping ones.
df_concat = (
    df_concat
    .assign(
        TotalBathrooms=lambda d: d['FullBath'] + d['BsmtFullBath'] + 0.5*(d['HalfBath'] + d['BsmtHalfBath']),
        TotalArea=lambda d: d['TotalBsmtSF'] + d['1stFlrSF'] + d['2ndFlrSF'] + d['GarageArea'],
        HouseAge=lambda d: 2022 - d['YearBuilt'],
        YrSncRemod=lambda d: 2022 - d['YearRemodAdd'],
    )
    .drop(columns=[
        'FullBath', 'BsmtFullBath', 'HalfBath', 'BsmtHalfBath',
        '1stFlrSF', '2ndFlrSF', 'GarageArea', 'YearBuilt', 'YearRemodAdd',
    ])
    .drop(columns=index_to_be_dropped)
    .drop(columns=['TotRmsAbvGrd', 'GrLivArea', 'BedroomAbvGr', 'TotalBsmtSF', 'GarageYrBlt', 'GarageCars'])
)

In [None]:
# Correlation matrix of the numeric features of the full frame.
# df_concat still holds its string-valued (categorical) columns at this point;
# they are excluded explicitly because DataFrame.corr() does not skip
# non-numeric columns on its own in recent pandas versions.
sns.heatmap(df_concat.select_dtypes(exclude='object').corr(), 
            vmin=-1.0, 
            vmax=1.0, 
            annot=True, 
            linewidths=0.5, 
            fmt='1.1f', 
            cmap='icefire')

In [None]:
# Labels of the five entries with the largest absolute correlation to SalePrice
# (the ranking is taken over the SalePrice column of the correlation matrix,
# so it includes SalePrice itself).
cols = df_numerical.corr().SalePrice.apply(np.abs).sort_values(ascending=False)[:5].index

In [None]:
import plotly.express as px

# Pairwise scatter plots of the columns selected in `cols`.
fig = px.scatter_matrix(df_numerical, dimensions=cols)
fig.show()

In [None]:
# Display the selected column labels.
cols

In [None]:
import plotly.express as px

# NOTE(review): `y` is not read in this cell; the plot below takes its data
# from df_numerical by column name.
y = df_numerical.SalePrice

# SalePrice against TotalArea, with marginal histograms and a fitted trendline.
# NOTE(review): trendline='ols' presumably needs statsmodels installed, which
# the pip install at the top of the notebook does not add — confirm.
fig = px.scatter(
    df_numerical, x='TotalArea', y='SalePrice',
    marginal_x='histogram', marginal_y='histogram',
    trendline='ols'
)

fig.show()

In [None]:
# SalePrice against OverallQual, with marginal histograms and a fitted trendline.
fig = px.scatter(
    df_numerical, x='OverallQual', y='SalePrice',
    marginal_x='histogram', marginal_y='histogram',
    trendline='ols'
)

fig.show()

In [None]:
# SalePrice against TotalBathrooms, with marginal histograms and a fitted trendline.
fig = px.scatter(
    df_numerical, x='TotalBathrooms', y='SalePrice',
    marginal_x='histogram', marginal_y='histogram',
    trendline='ols'
)

fig.show()

In [None]:
# SalePrice against HouseAge, with marginal histograms and a fitted trendline.
fig = px.scatter(
    df_numerical, x='HouseAge', y='SalePrice',
    marginal_x='histogram', marginal_y='histogram',
    trendline='ols'
)

fig.show()

In [None]:
# Rows whose TotalArea exceeds 8697 are treated as outliers and removed from
# the exploratory numeric frame (rows with a missing TotalArea are kept).
is_outlier = df_numerical['TotalArea'].gt(8697)
outlier_index = df_numerical.loc[is_outlier].index
df_numerical = df_numerical.drop(index=outlier_index)

In [None]:
# log1p-transformed SalePrice against log1p-transformed TotalArea.
y = np.log1p(df_numerical['SalePrice'])
x = np.log1p(df_numerical['TotalArea'])

fig = px.scatter(
    df_numerical,
    x=x,
    y=y,
    trendline='ols',
    marginal_x='histogram',
    marginal_y='histogram',
)

fig.show()

In [None]:
# log1p-transformed SalePrice against log1p-transformed OverallQual.
y = np.log1p(df_numerical['SalePrice'])
x = np.log1p(df_numerical['OverallQual'])

fig = px.scatter(
    df_numerical,
    x=x,
    y=y,
    trendline='ols',
    marginal_x='histogram',
    marginal_y='histogram',
)

fig.show()

In [None]:
# log1p-transformed SalePrice against log1p-transformed TotalBathrooms.
y = np.log1p(df_numerical['SalePrice'])
x = np.log1p(df_numerical['TotalBathrooms'])

fig = px.scatter(
    df_numerical,
    x=x,
    y=y,
    trendline='ols',
    marginal_x='histogram',
    marginal_y='histogram',
)

fig.show()

In [None]:
# log1p-transformed SalePrice against log1p-transformed HouseAge.
y = np.log1p(df_numerical['SalePrice'])
x = np.log1p(df_numerical['HouseAge'])

fig = px.scatter(
    df_numerical,
    x=x,
    y=y,
    trendline='ols',
    marginal_x='histogram',
    marginal_y='histogram',
)

fig.show()

In [None]:
# Apply log1p to the selected numeric features of the full frame.
# Every listed column is written back: the original cell computed the
# LotFrontage transform but never assigned it, so that column stayed
# untransformed while all the others were updated.
# Note: re-running this cell applies the transform a second time.
log_columns = [
    'LotFrontage', 'OverallQual', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
    'WoodDeckSF', 'OpenPorchSF', 'HouseAge', 'YrSncRemod',
]
df_concat[log_columns] = df_concat[log_columns].apply(np.log1p)

In [None]:
# Put the target on the log1p scale as well.
sale_concat = np.log1p(sale_concat)

Next look at the non-numerical values and determine how they should be encoded numerically. There will be two categories:

- Variables that need a ranked encoding - Ordinal
- Variables that need a binary encoding - OneHotTransform


In [None]:
# List the columns that still hold non-numeric (object) values.
df_concat.select_dtypes(include='object').columns

In [None]:
# One-hot encode the categorical columns that get a binary encoding; every
# listed column is replaced by one indicator column per category.
df_concat = pd.get_dummies(
    data=df_concat,
    columns=[
        'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
        'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
        'Electrical', 'PavedDrive', 'Fence', 'MiscFeature',
        'SaleType', 'BsmtFinType1', 'BsmtFinType2', 'Functional',
        'GarageType', 'GarageFinish',
    ],
)


In [None]:
# Whatever object columns are left after one-hot encoding are treated as ordinal.
ordinal_columns = df_concat.select_dtypes(include='object').columns

In [None]:
# Separate frame holding only the ordinal columns.
df_ordinal = df_concat[ordinal_columns]

In [None]:
# Encode each ordinal column as numeric codes.
# NOTE(review): the encoder is created with default settings, i.e. no explicit
# category order is given, so the codes follow the order the encoder
# determines on its own rather than a stated quality ranking — confirm that
# this matches the intended "ranked" encoding.
ordinal = OrdinalEncoder()

# The result is rebuilt as a frame with a fresh default index.
df_ordinal = pd.DataFrame(ordinal.fit_transform(df_ordinal), columns=ordinal_columns)

In [None]:
# Display the encoded ordinal columns.
df_ordinal

In [None]:
# Remove the original (string-valued) ordinal columns from the feature frame.
df_concat.drop(columns=ordinal_columns, inplace=True)

In [None]:
# Attach the encoded ordinal columns row by row, joining on df_concat's index values.
# NOTE(review): passing an array as `on` appears to make pandas add a 'key_0'
# column to the result; a later cell drops a column of that name.
df_concat = df_concat.merge(df_ordinal, how='inner', on=df_concat.index)

In [None]:
# Display the merged feature frame.
df_concat

In [None]:
# Scale every column of the feature frame with RobustScaler.
# The scaler is fitted on the whole of df_concat, which was built from train
# and test together, so its statistics are computed over all rows.
scaler = RobustScaler()

# fit_transform's output is wrapped back into a frame with the original column names.
df_concat = pd.DataFrame(scaler.fit_transform(df_concat), columns=df_concat.columns)

In [None]:
# Remove the 'key_0' helper column (it is not a feature), then display the frame.
df_concat.drop(columns=['key_0'], inplace=True)
df_concat

Now we need to check the Gauss-Markov Assumptions

In [None]:
#transformer = PowerTransformer(method='yeo-johnson')

#df_concat.OverallQual = df_concat.OverallQual.transform(func = lambda x: transformer.fit_transform(x.values.reshape(-1, 1)))

In [None]:
# The first train_nrows rows of the combined data are the training rows.
X = df_concat.iloc[:train_nrows]
y = sale_concat.iloc[:train_nrows]

# Hold out a quarter of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Empty frame that collects one column of validation predictions per model.
y_results = pd.DataFrame()

In [None]:
# Validation RMSE per model, keyed by model name.
results = {}

# Plain least-squares baseline.
# Fit on the training split only: fitting on the full X, y (as the original
# cell did) lets the model see the validation rows it is then scored on,
# which makes the validation error optimistic and not comparable with the
# other models, all of which are fitted on X_train.
linear = LinearRegression()
linear.fit(X_train, y_train)

y_pred = linear.predict(X_val)
results['linear'] = rmse(y_pred, y_val)

y_results['linear'] = y_pred

In [None]:
# 10-fold shuffled cross-validation shared by all grid searches below.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

ridge = Ridge()
# Log-spaced, strictly increasing regularisation strengths. The original grid
# listed 0.001 twice and out of order, which refitted the same model a second
# time and made the alpha-vs-score line plot double back on itself.
param = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

ridge_grid_search = GridSearchCV(estimator=ridge, 
                                 param_grid=param, 
                                 scoring='r2', 
                                 cv=kfolds, 
                                 verbose=True, 
                                 return_train_score=True)
ridge_grid_search.fit(X_train, y_train)

# Predict the validation split with the selected model.
y_pred = ridge_grid_search.predict(X_val)

results['ridge'] = rmse(y_pred, y_val)
y_results['ridge'] = y_pred

ridge_grid_search.best_estimator_

In [None]:
# Mean cross-validated training score against alpha ...
plt.plot(ridge_grid_search.cv_results_['param_alpha'].data,
         ridge_grid_search.cv_results_['mean_train_score'].data)

# ... and mean cross-validated test score against alpha, on the same axes.
plt.plot(ridge_grid_search.cv_results_['param_alpha'].data,
         ridge_grid_search.cv_results_['mean_test_score'].data)

In [None]:
# Lasso regression, tuned over alpha with the shared k-fold splitter.
lasso = Lasso()
param = {'alpha': [0.001, 0.01, 0.03, 0.05, 0.09, 0.7, 0.9, 5, 10, 20]}

lasso_grid_search = GridSearchCV(
    lasso,
    param,
    scoring='r2',
    cv=kfolds,
    return_train_score=True,
    verbose=True,
)
lasso_grid_search.fit(X_train, y_train)

# Score the selected model on the held-out validation split.
y_pred = lasso_grid_search.predict(X_val)
y_results['lasso'] = y_pred
results['lasso'] = rmse(y_pred, y_val)

lasso_grid_search.best_estimator_

In [None]:
# Mean cross-validated training score against alpha ...
plt.plot(lasso_grid_search.cv_results_['param_alpha'].data,
         lasso_grid_search.cv_results_['mean_train_score'].data)

# ... and mean cross-validated test score against alpha, on the same axes.
plt.plot(lasso_grid_search.cv_results_['param_alpha'].data,
         lasso_grid_search.cv_results_['mean_test_score'].data)

In [None]:
# Elastic net, tuned jointly over alpha and the L1/L2 mixing ratio.
elastic = ElasticNet()
param = {
    'alpha': [0.001, 0.01, 0.03, 0.05, 0.09, 0.7, 0.9, 5, 10, 20],
    'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}

elastic_grid_search = GridSearchCV(
    elastic,
    param,
    scoring='r2',
    cv=kfolds,
    return_train_score=True,
    verbose=True,
)
elastic_grid_search.fit(X_train, y_train)

# Score the selected model on the held-out validation split.
y_pred = elastic_grid_search.predict(X_val)
y_results['elastic_net'] = y_pred
results['elastic_net'] = rmse(y_pred, y_val)

In [None]:
# Support-vector regression with an RBF kernel, tuned over C and epsilon.
# No `scoring` is passed, so the search uses the estimator's own default score.
svr = SVR()
param = {
    'kernel': ['rbf'],
    'C': [0.1, 1, 10, 100, 500],
    'epsilon': [0.1, 0.5, 1, 5, 10, 100],
}

svr_grid_search = GridSearchCV(
    svr,
    param,
    cv=kfolds,
    return_train_score=True,
    verbose=True,
)
svr_grid_search.fit(X_train, y_train)

# Score the selected model on the held-out validation split.
y_pred = svr_grid_search.predict(X_val)
y_results['svr'] = y_pred
results['svr'] = rmse(y_pred, y_val)

In [None]:
# Gradient-boosted trees with fixed hyper-parameters.
# The `silent` argument was dropped: logging is already controlled by
# `verbosity`, and current xgboost releases no longer use `silent`.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213,
                             random_state =0, nthread = -1,
                             verbosity = 1)

model_xgb.fit(X_train, y_train)
xgb_train_pred = model_xgb.predict(X_train)

# Validation predictions on the same (log1p) scale as y_val.
y_pred = model_xgb.predict(X_val)
# The same predictions converted back to the original price scale.
xgb_pred = np.expm1(y_pred)

results['xgb'] = rmse(y_pred, y_val)
# Store this model's own predictions. The original cell stored `y_pred`
# without reassigning it, i.e. the previous cell's SVR predictions, which
# then corrupted the model average.
y_results['xgb'] = y_pred

In [None]:
# Display the per-model validation predictions.
y_results

In [None]:
# Simple ensemble: row-wise mean of the per-model prediction columns.
y_average_pred = y_results.mean(axis=1).values

In [None]:
# Validation RMSE of the averaged prediction.
rmse(y_average_pred, y_val)

In [None]:
# Display the validation RMSE recorded for each model.
results