In [40]:
import re
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt

from sklearn import model_selection

from sklearn.metrics import confusion_matrix

%matplotlib inline 

### Global variables

In [41]:
# --- Feature selection / reduction ---
k_best = 200             # `k` passed to SelectKBest
do_pca = True            # when True, PCA components are joined onto the selected features
pca_n_components = 0.9   # `n_components` passed to PCA

# --- Model search ---
scoring = 'neg_mean_absolute_error'   # metric used by the randomized search
cv = 5                                # cross-validation folds
n_jobs = 6                            # parallel workers for the search
random_state = 100                    # seed for the search and for TensorFlow

### Load data

In [42]:
# Read the training CSV (path is relative to the notebook's working directory)
# and report its (rows, columns) shape.
train_df = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')
print('Train data shape: ', train_df.shape)

# Same for the test CSV.
test_df = pd.read_csv('./house-prices-advanced-regression-techniques/test.csv')
print('Test data shape: ', test_df.shape)

Train data shape:  (1460, 81)
Test data shape:  (1459, 80)


In [43]:
# Separate the target from the features: `pop` returns the 'SalePrice' column
# and removes it from train_df in place.
y_train = train_df.pop('SalePrice')

train_df.shape

(1460, 80)

In [44]:
# Stack train on top of test; the imputation and encoding cells that follow
# operate on this combined frame.
# ignore_index=True gives the result a unique 0..n-1 index. Without it the
# train and test row labels repeat, which makes any label-based alignment
# (assigning columns from another frame, joins) ambiguous.
all_df = pd.concat([train_df, test_df], ignore_index=True)

all_df.shape

(2919, 80)

### Handle columns with nulls

In [45]:
# Null count per training column, restricted to columns that have at least one null.
train_df.isnull().sum().loc[lambda null_counts: null_counts > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [46]:
# Null count per test column, restricted to columns that have at least one null.
test_df.isnull().sum().loc[lambda null_counts: null_counts > 0]

MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64

In [47]:
# numeric features

numeric_features_fill_mean = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'GarageArea',
]

# Missing values in each listed column are replaced by that column's mean,
# computed over the combined train+test frame.
mean_by_feature = {name: all_df[name].mean() for name in numeric_features_fill_mean}
all_df = all_df.fillna(mean_by_feature)

In [48]:
numeric_features_fill_mode = ['GarageYrBlt', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars']

# Missing values take the column's most frequent value (entry 0 of `mode()`).
mode_by_feature = {name: all_df[name].mode()[0] for name in numeric_features_fill_mode}
all_df = all_df.fillna(mode_by_feature)

In [49]:
# categorical features

categorical_features_fill_none = [
    'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
    'MiscFeature',
]

# A missing value in any of these columns is replaced by the literal string 'None'.
all_df[categorical_features_fill_none] = (
    all_df[categorical_features_fill_none].fillna('None')
)

In [50]:
categorical_features_fill_mode = [
    'Electrical', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
    'KitchenQual', 'Functional', 'SaleType',
]

# Missing values take the most frequent category of the column.
most_common_category = {
    name: all_df[name].mode().iloc[0] for name in categorical_features_fill_mode
}
all_df = all_df.fillna(most_common_category)

In [51]:
# Keep the test ids (they go into the submission files); `pop` also removes
# the column from test_df.
test_id = test_df.pop('Id')

# 'Id' is removed from the other two frames as well.
for frame in (train_df, all_df):
    frame.drop(columns=['Id'], inplace=True)

In [52]:
# Report the shape of each frame after the 'Id' column was removed.
for message, frame in (
    ('Train data shape: ', train_df),
    ('Test data shape: ', test_df),
    ('All df data shape: ', all_df),
):
    print(message, frame.shape)

Train data shape:  (1460, 79)
Test data shape:  (1459, 79)
All df data shape:  (2919, 79)


### Dummify categorical features

In [53]:
# One-hot encode the combined frame using pandas' default column selection
# (no `columns=` given), then show the widened shape.
all_df = pd.get_dummies(all_df)

all_df.shape

(2919, 302)

In [54]:
# Columns that are still present after the generic get_dummies pass and are
# encoded explicitly here (presumably integer-coded categories — confirm).
colsToBeDummified = pd.Series(['MSSubClass', 'OverallQual', 'OverallCond'])

# Each listed column is replaced by its indicator columns, prefixed with the
# column name and with the first level dropped; the indicators are appended
# after the remaining columns.
all_df = pd.get_dummies(all_df, columns=list(colsToBeDummified), drop_first=True)

### Normalize data

In [55]:
from sklearn.preprocessing import PowerTransformer

# Fit a default PowerTransformer on the whole (train+test) frame and transform
# it, then rewrap the resulting array with the original column names. The new
# frame gets a fresh default index.
scaler = PowerTransformer()
transformed_values = scaler.fit_transform(all_df)

all_df = pd.DataFrame(transformed_values, columns=all_df.columns)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)


In [56]:
# Train rows were stacked first, so the first len(train_df) rows are the
# training set and the rest are the test set.
train_row_count = len(train_df)
all_train_df = all_df.iloc[:train_row_count]
all_test_df = all_df.iloc[train_row_count:]

In [57]:
# Report the shapes of the train/test halves and of the full encoded frame.
for message, frame in (
    ('All Train data shape: ', all_train_df),
    ('All Test data shape: ', all_test_df),
    ('All df data shape: ', all_df),
):
    print(message, frame.shape)

All Train data shape:  (1460, 331)
All Test data shape:  (1459, 331)
All df data shape:  (2919, 331)


### Feature selection and reduction

In [58]:
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Training rows of the transformed frame (train was stacked first).
x_train = all_df.iloc[:len(train_df)]

# Score every feature against the target with f_regression and keep the
# k_best highest-scoring ones.
fs = SelectKBest(score_func=f_regression, k=k_best)
X_selected = fs.fit(x_train, y_train).transform(x_train)

# Boolean mask over the columns: True where the feature was kept.
mask = fs.get_support()

  n_samples * X_means ** 2)
  corr /= X_norms


In [59]:
# Names of the columns kept by SelectKBest. Boolean-mask indexing on the column
# Index picks them directly, so no loop variable is needed (and nothing named
# `bool` is left behind in the kernel to shadow the builtin).
selected_features = all_df.columns[mask].tolist()

# Selected columns for all rows (train and test).
selected_features_df = pd.DataFrame(all_df[selected_features])

selected_features_df.shape

(2919, 200)

In [60]:
# Names of the columns SelectKBest did NOT keep: index the column Index with
# the inverted mask. No loop variable is needed, so the builtin `bool` is not
# shadowed in the kernel namespace.
other_features = all_df.columns[~mask].tolist()

# Unselected columns for all rows (train and test); these feed the PCA step.
other_features_df = pd.DataFrame(all_df[other_features])

other_features_df.shape

(2919, 131)

In [61]:
from sklearn.decomposition import PCA
# PCA configured from the global `pca_n_components` setting.
pca = PCA(n_components = pca_n_components)

In [62]:
# NOTE(review): all_df was already passed through `scaler` earlier; this call
# refits the same PowerTransformer object on the unselected columns and
# transforms them a second time — confirm that is intended.
data_rescaled = scaler.fit_transform(other_features_df)

pca.fit(data_rescaled)

reduced = pca.transform(data_rescaled)

# Give the components string names and the source frame's index. Integer
# column labels would otherwise mix with the string feature names once this
# frame is joined onto the selected features, and reusing the index makes the
# row alignment of that join explicit.
reduced_df = pd.DataFrame(
    reduced,
    index=other_features_df.index,
    columns=[f'pca_{i}' for i in range(reduced.shape[1])],
)

reduced_df.shape

  loglike = -n_samples / 2 * np.log(x_trans.var())


(2919, 92)

In [63]:
# Optionally widen the selected-feature frame with the PCA components.
# Rows are matched on the index; `how='left'` is DataFrame.join's default.
if do_pca:
    selected_features_df = selected_features_df.join(other=reduced_df, how='left')

selected_features_df.shape

(2919, 292)

In [64]:
# Split the final feature frame back into its train rows (the first
# len(train_df) rows) and test rows (everything after).
train_row_count = len(train_df)
reduced_train_df = selected_features_df.iloc[:train_row_count]
reduced_test_df = selected_features_df.iloc[train_row_count:]

In [65]:
# Report the shapes of the reduced train/test frames and of their parent frame.
for message, frame in (
    ('Reduced Train data shape: ', reduced_train_df),
    ('Reduced Test data shape: ', reduced_test_df),
    ('All selected df data shape: ', selected_features_df),
):
    print(message, frame.shape)

Reduced Train data shape:  (1460, 292)
Reduced Test data shape:  (1459, 292)
All selected df data shape:  (2919, 292)


## Machine learning application

### XGBoost

In [66]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# XGBoost regressor created with library defaults; it is the base estimator
# handed to the randomized hyper-parameter search.
regressor=xgb.XGBRegressor()

In [67]:
# Candidate values for each hyper-parameter that the search samples from.
n_estimators = [250, 500, 750, 1000]
max_depth = [5, 7, 10, 12]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15]
min_child_weight = [1, 2, 3]
base_score = [0.25, 0.5, 0.75, 1]

# Search space keyed by estimator parameter name.
parameters = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)

In [68]:
# Randomized hyper-parameter search over `parameters`: 50 sampled candidates,
# each evaluated with `cv`-fold cross-validation on the `scoring` metric.
search_settings = dict(
    estimator=regressor,
    param_distributions=parameters,
    n_iter=50,
    cv=cv,
    scoring=scoring,
    n_jobs=n_jobs,
    verbose=5,
    return_train_score=True,
    random_state=random_state,
)
random_cv = model_selection.RandomizedSearchCV(**search_settings)

In [None]:
# Run the search on the reduced training features against the sale prices.
random_cv.fit(reduced_train_df, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   34.5s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  3.6min


In [None]:
# Predict test-set prices with the fitted search object.
# The variable name `xbg_pred` (sic) is what the submission cell reads.
xbg_pred = random_cv.predict(reduced_test_df)

In [None]:
# Write the XGBoost submission file: one row per test id with its predicted price.
submission_df = pd.DataFrame({'Id': test_id, 'SalePrice': xbg_pred})
submission_df.to_csv('xgb_sample_submission.csv', index=False)

### Neural network

In [None]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sklearn
import time
import scipy

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import initializers

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

from keras import backend as K

from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from keras.optimizers import Adam

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return sqrt(mean((y_pred - y_true)^2)) computed with the Keras backend `K`.

    Args:
        y_true: target values.
        y_pred: predicted values.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Rows and columns of the reduced training frame; the column count sizes the network layers.
num_observations, num_features = reduced_train_df.shape

In [None]:
model = Sequential()

# Time the build + training, and seed TensorFlow's graph-level RNG.
start = time.time()
tf.set_random_seed(random_state)

# Explicit initializers are used for the first layer only; the remaining
# layers keep the Keras defaults.
kernel_initializer = initializers.TruncatedNormal(mean=0.0,stddev=0.1)
bias_initializer   = initializers.Constant(0.1)

# Funnel-shaped MLP: 2x -> 1x -> 1/2 -> 1/4 of the input width, with dropout
# decreasing as the layers narrow.
model.add(Dense(num_features*2, input_dim=num_features, input_shape=(num_features,),
                activation=tf.nn.relu, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
model.add(Dropout(0.5))

model.add(Dense(num_features, activation=tf.nn.relu))
model.add(Dropout(0.35))

# Floor division keeps the unit counts integers: Dense expects an integer
# number of units, and `/` would hand it a float (e.g. 146.0).
model.add(Dense(num_features//2, activation=tf.nn.relu))
model.add(Dropout(0.2))

model.add(Dense(num_features//4, activation=tf.nn.relu))
model.add(Dropout(0.1))

# Single-unit output for the predicted price.
# NOTE(review): ReLU on the output clamps predictions at zero and can stop
# learning if the unit goes inactive — confirm a linear output is not wanted.
model.add(Dense(units=1, activation=tf.nn.relu))

adam = Adam(learning_rate=0.01) 

model.compile(loss=root_mean_squared_error, optimizer=adam)

model.fit(reduced_train_df, y_train, epochs=1000, batch_size=10, verbose=0)

print('Time elapsed: ', time.time()-start)

In [None]:
# Predict test-set prices with the trained network; the flattening cell that
# follows reads element [0] of each returned row.
nn_pred = model.predict(reduced_test_df)

In [None]:
# Flatten the network output: keep the first element of each prediction row.
pred_sales_price = [price[0] for price in nn_pred]

In [None]:
# Write the neural-network submission file: one row per test id with its predicted price.
submission_df = pd.DataFrame({'Id': test_id, 'SalePrice': pred_sales_price})
submission_df.to_csv('nn_sample_submission.csv', index=False)