In [1659]:
import re
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt

from sklearn import model_selection

from sklearn.metrics import confusion_matrix

%matplotlib inline 

### Global variables

In [1660]:
random_state = 100
n_jobs = 6
cv = 5

k_best = 200
do_pca = True
pca_n_components = 0.9

scoring = 'neg_mean_absolute_error'

In [1661]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

### Load data

In [1662]:
train_df = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')
print('Train data shape: ', train_df.shape)

test_df = pd.read_csv('./house-prices-advanced-regression-techniques/test.csv')
print('Test data shape: ', test_df.shape)

Train data shape:  (1460, 81)
Test data shape:  (1459, 80)


In [1663]:
y_train = train_df['SalePrice']

train_df.drop(['SalePrice'], axis=1, inplace=True)

train_df.shape

(1460, 80)

In [1664]:
all_df = pd.concat([train_df, test_df])

all_df.shape

(2919, 80)

### Handle columns with nulls

In [1665]:
train_df.isnull().sum()[train_df.isnull().sum() > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [1666]:
test_df.isnull().sum()[test_df.isnull().sum() > 0]

MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64

In [1667]:
# numeric features

numeric_features_fill_mean = [ 'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                             'TotalBsmtSF', 'GarageArea']

for feature in numeric_features_fill_mean:
    all_df[feature]=all_df[feature].fillna(all_df[feature].mean())

In [1668]:
numeric_features_fill_mode = ['GarageYrBlt', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars']

for feature in numeric_features_fill_mode:
    all_df[feature]=all_df[feature].fillna(all_df[feature].mode()[0])

In [1669]:
# categorical features

categorical_features_fill_none = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                                 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
                                 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
                                  'MiscFeature']

for feature in categorical_features_fill_none:
    all_df[feature]=all_df[feature].fillna('None')

In [1670]:
categorical_features_fill_mode = ['Electrical', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
                                 'KitchenQual', 'Functional', 'SaleType']

for feature in categorical_features_fill_mode:
    all_df[feature]=all_df[feature].fillna(all_df[feature].mode()[0])

In [1671]:
test_id = test_df['Id']

train_df.drop(['Id'],axis=1,inplace=True)
test_df.drop(['Id'],axis=1,inplace=True)
all_df.drop(['Id'],axis=1,inplace=True)

In [1672]:
non_linear_train_df = all_df.iloc[:train_df.shape[0],:]
non_linear_test_df = all_df.iloc[train_df.shape[0]:,:]

In [1673]:
print('Train data shape: ', train_df.shape)
print('Test data shape: ', test_df.shape)
print('All df data shape: ', all_df.shape)

Train data shape:  (1460, 79)
Test data shape:  (1459, 79)
All df data shape:  (2919, 79)


### Encode categorical features

In [1674]:
all_df = pd.get_dummies(all_df)

all_df.shape

(2919, 302)

In [1675]:
colsToBeDummified = pd.Series(
    ['MSSubClass',
    'OverallQual',
    'OverallCond'])

for col in colsToBeDummified:
    dummies = pd.get_dummies(all_df[col], drop_first=True, prefix=col)
    all_df[dummies.columns] = dummies
    
all_df.drop(colsToBeDummified, axis=1, inplace=True)

In [1676]:
categorical_features = [ 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
                        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
                        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
                        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
                        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

In [1677]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()

In [1678]:
for col in categorical_features:
    non_linear_train_df[col] = ord_enc.fit_transform(non_linear_train_df[[col]]) 

In [1679]:
for col in categorical_features:
    non_linear_test_df[col] = ord_enc.fit_transform(non_linear_test_df[[col]]) 

In [1680]:
print('For Non-linear model train data shape: ', non_linear_train_df.shape)
print('For Non-linear model test data shape: ', non_linear_test_df.shape)

For Non-linear model train data shape:  (1460, 79)
For Non-linear model test data shape:  (1459, 79)


### Normalize data

In [1681]:
from sklearn.preprocessing import PowerTransformer

scaler = PowerTransformer()

In [1682]:
all_df = pd.DataFrame(scaler.fit_transform(all_df), columns=all_df.columns)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)


In [1683]:
all_train_df = all_df.iloc[:train_df.shape[0],:]
all_test_df = all_df.iloc[train_df.shape[0]:,:]

In [1684]:
print('All Train data shape: ', all_train_df.shape)
print('All Test data shape: ', all_test_df.shape)
print('All df data shape: ', all_df.shape)

All Train data shape:  (1460, 331)
All Test data shape:  (1459, 331)
All df data shape:  (2919, 331)


In [1685]:
non_linear_train_df = pd.DataFrame(scaler.fit_transform(non_linear_train_df), columns=non_linear_train_df.columns)
non_linear_test_df = pd.DataFrame(scaler.fit_transform(non_linear_test_df), columns=non_linear_test_df.columns)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  loglike = -n_samples / 2 * np.log(x_trans.var())
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)


### Feature selection and reduction

In [1686]:
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression

#fs = SelectKBest(score_func=f_regression, k=k_best)
fs = SelectKBest(score_func=mutual_info_regression, k=k_best)

x_train = all_df.iloc[:train_df.shape[0],:]

X_selected = fs.fit_transform(x_train, y_train)

mask = fs.get_support()

In [1687]:
selected_features = []

for bool, feature in zip(mask, all_df.columns):
    if bool:
        selected_features.append(feature)        

selected_features_df = pd.DataFrame(all_df[selected_features])

selected_features_df.shape

(2919, 200)

In [1688]:
other_features = []

for bool, feature in zip(~mask, all_df.columns):
    if bool:
        other_features.append(feature)
        
other_features_df = pd.DataFrame(all_df[other_features])

other_features_df.shape

(2919, 131)

In [1689]:
from sklearn.decomposition import PCA
pca = PCA(n_components = pca_n_components)

In [1690]:
data_rescaled = scaler.fit_transform(other_features_df)

pca.fit(data_rescaled)

reduced = pca.transform(data_rescaled)
reduced_df = pd.DataFrame(reduced)

reduced_df.shape

  loglike = -n_samples / 2 * np.log(x_trans.var())


(2919, 92)

In [1691]:
if do_pca:
    selected_features_df = selected_features_df.join(reduced_df)

selected_features_df.shape

(2919, 292)

In [1692]:
reduced_linear_train_df = selected_features_df.iloc[:train_df.shape[0],:]
reduced_linear_test_df = selected_features_df.iloc[train_df.shape[0]:,:]

In [1693]:
print('Reduced Linear Train data shape: ', reduced_linear_train_df.shape)
print('Reduced Linear Test data shape: ', reduced_linear_test_df.shape)
print('All selected df data shape: ', selected_features_df.shape)

Reduced Linear Train data shape:  (1460, 292)
Reduced Linear Test data shape:  (1459, 292)
All selected df data shape:  (2919, 292)


## Machine learning application

### Linear models

### ElasticNet

In [1694]:
from sklearn.linear_model import ElasticNetCV

elasticNet = ElasticNetCV(l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                          alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 
                          max_iter = 50000,
                          cv = 5,
                          random_state=random_state)

In [1695]:
start = time.time()

elasticNet.fit(reduced_linear_train_df, y_train)

print('Time elapsed: ', time.time()-start)

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


Time elapsed:  211.11562991142273


  tol, rng, random, positive)


In [1696]:
elasticnet_pred = elasticNet.predict(reduced_linear_test_df)

In [1697]:
data = {'Id': test_id, 'SalePrice': elasticnet_pred}
submission_df = pd.DataFrame.from_dict(data)
submission_df.to_csv('elasticNet_sample_submission.csv', index=False)

### Stochastic gradient regressor

In [1235]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()

In [1236]:
parameters = {
    'loss': ['squared_loss'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'l1_ratio' : [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'max_iter' : [ 50000 ],
    'epsilon': [1e-3, 1e-5, 1e-7],
    }

In [1237]:
random_cv = model_selection.RandomizedSearchCV(
            estimator=sgd,
            param_distributions=parameters,
            cv=cv,
            n_iter=50,
            scoring = scoring,
            n_jobs = n_jobs,
            verbose = 5, 
            return_train_score = True,
            random_state=random_state)

In [1238]:
start = time.time()

random_cv.fit(reduced_linear_train_df, y_train)

print('Time elapsed: ', time.time()-start)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    0.1s


Time elapsed:  1.2440452575683594


[Parallel(n_jobs=6)]: Done 213 tasks      | elapsed:    1.1s
[Parallel(n_jobs=6)]: Done 250 out of 250 | elapsed:    1.2s finished


In [1239]:
sgd_pred = random_cv.predict(reduced_linear_test_df)

In [1240]:
data = {'Id': test_id, 'SalePrice': sgd_pred}
submission_df = pd.DataFrame.from_dict(data)
submission_df.to_csv('sgd_sample_submission.csv', index=False)

### Non-linear models

### Support vector regressor

In [1087]:
from sklearn.svm import SVR

svr = SVR()

In [1110]:
parameters = {
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [1, 2, 3],
    'gamma' : [0.001, 0.01, 0.1, 1, 10],
    'C': [125000, 150000, 175000],
    'epsilon': [0.001, 0.01, 0.1, 1, 10],
    'max_iter' : [ 50000 ],
    }

In [1111]:
random_cv = model_selection.RandomizedSearchCV(
            estimator=svr,
            param_distributions=parameters,
            cv=cv,
            n_iter=50,
            scoring = scoring,
            n_jobs = n_jobs,
            verbose = 5, 
            return_train_score = True,
            random_state=random_state)

In [1112]:
start = time.time()

random_cv.fit(non_linear_train_df, y_train)

print('Time elapsed: ', time.time()-start)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    0.5s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:    7.0s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   22.9s
[Parallel(n_jobs=6)]: Done 250 out of 250 | elapsed:   39.5s finished


Time elapsed:  40.24771475791931


In [1113]:
svr_pred = random_cv.predict(non_linear_test_df)

In [1114]:
data = {'Id': test_id, 'SalePrice': svr_pred}
submission_df = pd.DataFrame.from_dict(data)
submission_df.to_csv('svr_sample_submission.csv', index=False)

### XGBoost

In [405]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

regressor=xgb.XGBRegressor()

In [406]:
n_estimators = [250, 500, 750, 1000]
max_depth = [5, 7, 10, 12]
booster = ['gbtree','gblinear']
learning_rate = [0.05,0.1,0.15]
min_child_weight = [1,2,3]
base_score = [0.25,0.5,0.75,1]

parameters = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'min_child_weight':min_child_weight,
    'booster': booster,
    'base_score': base_score
    }

In [407]:
random_cv = model_selection.RandomizedSearchCV(
            estimator=regressor,
            param_distributions=parameters,
            cv=cv,
            n_iter=50,
            scoring = scoring,
            n_jobs = n_jobs,
            verbose = 5, 
            return_train_score = True,
            random_state=random_state)

In [408]:
#random_cv.fit(non_linear_train_df, y_train)

In [409]:
#xbg_pred = random_cv.predict(non_linear_test_df)

In [410]:
#data = {'Id': test_id, 'SalePrice': xbg_pred}
#submission_df = pd.DataFrame.from_dict(data)
#submission_df.to_csv('xgb_sample_submission.csv', index=False)

### Neural network

In [411]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sklearn
import time
import scipy

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import initializers

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

from keras import backend as K

from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from keras.optimizers import Adam

In [412]:
num_observations = non_linear_train_df.shape[0]
num_features     = non_linear_train_df.shape[1]

In [413]:
model = Sequential()

start = time.time()
tf.set_random_seed(random_state)

kernel_initializer = initializers.TruncatedNormal(mean=0.0,stddev=0.1)
bias_initializer   = initializers.Constant(0.1)
    
model.add(Dense(num_features*2, input_dim=num_features, input_shape=(num_features,),
                activation=tf.nn.relu, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
model.add(Dropout(0.5))

model.add(Dense(num_features, activation=tf.nn.relu))
model.add(Dropout(0.35))

model.add(Dense(num_features/2, activation=tf.nn.relu))
model.add(Dropout(0.2))

model.add(Dense(num_features/4, activation=tf.nn.relu))
model.add(Dropout(0.1))

model.add(Dense(units=1, activation=tf.nn.relu))

adam = Adam(learning_rate=0.01) 

#model.compile(loss=root_mean_squared_error, optimizer=adam)

#model.fit(non_linear_train_df, y_train, epochs=1000, batch_size=10, verbose=0)

print('Time elapsed: ', time.time()-start)

Time elapsed:  0.12975692749023438


In [414]:
#nn_pred = model.predict(non_linear_test_df)

In [415]:
#pred_sales_price = []
#
#for price in nn_pred:
#    pred_sales_price.append(price[0])

In [416]:
#data = {'Id': test_id, 'SalePrice': pred_sales_price}
#submission_df = pd.DataFrame.from_dict(data)
#submission_df.to_csv('nn_sample_submission.csv', index=False)