In [6]:
# Load in Data
import pandas as pd
import numpy as np

df_train = pd.read_csv("train.csv")
y_train = df_train['SalePrice'].to_numpy()
df_train = df_train.drop('SalePrice', 1)
id_train = df_train['Id'].to_numpy()
df_test = pd.read_csv("test.csv")
id_test = df_test['Id'].to_numpy()

print('df_train:', df_train.shape)
print('y_train:', y_train.shape)
print('df_test:', df_test.shape)

print('Data has been loaded')

df_train: (1460, 80)
y_train: (1460,)
df_test: (1459, 80)
Data has been loaded


In [2]:
# Preprocessing

num_cols = list(df_train._get_numeric_data().columns)
cat_cols=list(set(df_train.columns) - set(num_cols))

# Imputation

# From a post by AJ Welch here
# https://chartio.com/resources/tutorials/how-to-check-if-any-value-is-nan-in-a-pandas-dataframe/
# I got the df.isnull().sum().sum() part of the code below

#print('Null values train:', df_train.isnull().sum().sum())
#print('Null values test:', df_test.isnull().sum().sum())

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(df_train[cat_cols])

df_train[cat_cols] = imputer.transform(df_train[cat_cols])
df_test[cat_cols] = imputer.transform(df_test[cat_cols])

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df_train[num_cols])
df_train[num_cols] = imputer.transform(df_train[num_cols])
df_test[num_cols] = imputer.transform(df_test[num_cols])

# I tried iterative imputer for my numeric data in other submissions
'''
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# try different imputation_orders
imputer = IterativeImputer(missing_values=np.nan, initial_strategy='mean', imputation_order='random')
imputer.fit(df_train[num_cols])

df_train[num_cols] = imputer.transform(df_train[num_cols])
df_test[num_cols] = imputer.transform(df_test[num_cols])
'''

print('Imputing missing data done')

Imputing missing data done


In [3]:
# Encoding

# Encoding - Ordinal
from sklearn.preprocessing import OrdinalEncoder

ordinal_cols_dict = {'Street': ['Grvl', 'Pave'],
                'Alley': ['Grvl', 'Pave'],
                'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
                'LandContour': ['Low', 'HLS', 'Bnk', 'Lvl'],
                'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
                'LandSlope': ['Sev', 'Mod', 'Gtl'],
                'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'BsmtQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'BsmtCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'], 
                'BsmtFinType1': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
                'BsmtFinType2': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
                'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'CentralAir': ['N', 'Y'],
                'Electrical': ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],
                'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
                'FireplaceQu': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'GarageType': ['NA', 'Detchd', 'CarPort', 'BuiltIn', 'Basment', 'Attchd', '2Types'],
                'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
                'GarageQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'GarageCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                'PavedDrive': ['N', 'P', 'Y'],
                'PoolQC': ['NA', 'Fa', 'TA', 'Gd', 'Ex'],
                'Fence': ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']
               }

ordinal_cols = ordinal_cols_dict.keys() # list of columns which I'm ordinal encoding

cats = list(ordinal_cols_dict.values()) # Categories corresponding to each column which I'm ordinal encoding

print('Categorical cols:', len(cat_cols))
print('Numerical cols:', len(num_cols))

ord_encoder = OrdinalEncoder(categories=cats)
ordinal_train = ord_encoder.fit_transform(df_train[ordinal_cols])
ordinal_test = ord_encoder.transform(df_test[ordinal_cols])


# Encoding - Onehot
from sklearn.preprocessing import OneHotEncoder

onehot_cols = list(set(cat_cols) - set(ordinal_cols))

onehot = OneHotEncoder(sparse=False) # handle_unknown='ignore'
onehot.fit(df_train[onehot_cols])
onehot_train = onehot.transform(df_train[onehot_cols])
onehot_test = onehot.transform(df_test[onehot_cols])

print('Encoding done')

Categorical cols: 43
Numerical cols: 37
Encoding done


In [4]:
# Making datasets

# Making X
X_train = np.concatenate([df_train[num_cols], ordinal_train, onehot_train], axis=1)
X_test = np.concatenate([df_test[num_cols], ordinal_test, onehot_test], axis=1)

# Normalization

from sklearn.preprocessing import MinMaxScaler
normalizer = MinMaxScaler()
normalizer.fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)


# Other data transformation techniques tried:

# PowerTransformer (log transform)
'''
from sklearn.preprocessing import PowerTransformer
log = PowerTransformer()
log.fit(X_train)
X_train = log.transform(X_train)
X_test = log.transform(X_test)
'''

# Standardization
'''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
'''

'''
# Shuffling data is always good, especially since I'm making a val set
from sklearn.utils import shuffle

X_train, y_train = shuffle(X_train, y_train, random_state=0)


# Making additional validation set to evaluate performance beforehand
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

print('\nX_train shape:', X_train.shape)
print(' X_test shape:', X_test.shape)
print('  X_val shape:', X_val.shape)
print('  y_val shape:', y_val.shape)
'''

print("Preprocessing done")

Preprocessing done


In [5]:
# Making root mean squared log error scorer for my models
# since Kaggle is using this metric for evaluation
from sklearn.metrics import make_scorer

def log_rmsle(pred, true):
    return -np.sqrt(np.mean(np.square((np.log(pred + 1) - np.log(true + 1)))))

RMSLE = make_scorer(log_rmsle, greater_is_better=False)
print('Root mean squared log error (RMSLE) custom scorer done')

Root mean squared log error (RMSLE) custom scorer done


In [6]:
# RandomForestRegressor (NOT USING)
'''
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': range(1, 300, 10),
    'max_depth': range(1, 300, 1),
    'min_samples_split': np.arange(0.01, 1.01, 0.01),
    'min_samples_leaf': range(1, 300, 1),
    'max_leaf_nodes': range(1, 300, 1),
    'min_impurity_decrease': np.arange(0.01, 1.01, 0.01)
}

for i in range(5):
    random_search_RFR = RandomizedSearchCV(estimator = RandomForestRegressor(random_state=0), 
                                 param_distributions = param_grid, cv = 5, n_jobs = -1,
                                 verbose = 2, scoring=RMSLE, error_score='raise')

    random_search_RFR.fit(X_train, y_train)

    print(f'Randomized Search Results {i}:')
    print('Best parameters:', random_search_RFR.best_params_)
    print("Lowest RMSLE: ", random_search_RFR.best_score_, '\n\n')

# If chosen as final model

p = random_search_RFR.best_params_

best_model_RFR = RandomForestRegressor(n_estimators=p['n_estimators'], max_depth=p['max_depth'],
                                   min_samples_split=p['min_samples_split'],
                                   min_samples_leaf=p['min_samples_leaf'],
                                   max_leaf_nodes=p['max_leaf_nodes'],
                                   min_impurity_decrease=p['min_impurity_decrease'])

best_model_RFR.fit(X_train, y_train)

y_pred = best_model_RFR.predict(X_val)

from sklearn.metrics import mean_squared_error
print('\nRMSE for validation set:', mean_squared_error(y_val, y_pred, squared=False))
print('\nRMSE for log of validation set:', mean_squared_error(np.log(y_val), np.log(y_pred), squared=False))


print('Random Forest Model done')
'''

'\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\n\nparam_grid = {\n    \'n_estimators\': range(1, 300, 10),\n    \'max_depth\': range(1, 300, 1),\n    \'min_samples_split\': np.arange(0.01, 1.01, 0.01),\n    \'min_samples_leaf\': range(1, 300, 1),\n    \'max_leaf_nodes\': range(1, 300, 1),\n    \'min_impurity_decrease\': np.arange(0.01, 1.01, 0.01)\n}\n\nfor i in range(5):\n    random_search_RFR = RandomizedSearchCV(estimator = RandomForestRegressor(random_state=0), \n                                 param_distributions = param_grid, cv = 5, n_jobs = -1,\n                                 verbose = 2, scoring=RMSLE, error_score=\'raise\')\n\n    random_search_RFR.fit(X_train, y_train)\n\n    print(f\'Randomized Search Results {i}:\')\n    print(\'Best parameters:\', random_search_RFR.best_params_)\n    print("Lowest RMSLE: ", random_search_RFR.best_score_, \'\n\n\')\n\n# If chosen as final model\n\np = random_search_RFR.best_

In [7]:
# GradientBoostingRegressor
'''
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor

param_grid = {
    'learning_rate': np.arange(0.01, 1.01, 0.01),
    'n_estimators': range(100, 300, 1),
    'max_depth': range(1, 300, 1),
    'min_samples_split': np.arange(0.01, 1.0, 0.01),
    'min_samples_leaf': range(1, 300, 1),
    'max_leaf_nodes': range(2, 100, 1),
    'min_impurity_decrease': np.arange(0.01, 1.01, 0.01)
}

for i in range(5):
    random_search_GBR = RandomizedSearchCV(estimator = GradientBoostingRegressor(random_state=0), 
                                 param_distributions = param_grid, cv = 5, n_jobs = -1,
                                 verbose = 2, scoring=RMSLE, error_score='raise')

    random_search_GBR.fit(X_train, y_train)

    print(f'Randomized Search Results {i}:')
    print('Best parameters:', random_search_GBR.best_params_)
    print("Best RMSLE: ", random_search_GBR.best_score_, '\n\n')

# If chosen as final model

p = random_search_GBR.best_params_

best_model_GBR = GradientBoostingRegressor(learning_rate=p['learning_rate'], n_estimators=p['n_estimators'],
                                       max_depth=p['max_depth'], min_samples_split=p['min_samples_split'],
                                       min_samples_leaf=p['min_samples_leaf'],
                                       max_leaf_nodes=p['max_leaf_nodes'],
                                       min_impurity_decrease=p['min_impurity_decrease'])

best_model_GBR.fit(X_train, y_train)

y_pred = best_model_GBR.predict(X_val)

print('\nRMSE for validation set:', mean_squared_error(y_val, y_pred, squared=False))
print('\nRMSE for log of validation set:', mean_squared_error(np.log(y_val), np.log(y_pred), squared=False))

print('GradientBoosting Model done')
'''

'\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import GradientBoostingRegressor\n\nparam_grid = {\n    \'learning_rate\': np.arange(0.01, 1.01, 0.01),\n    \'n_estimators\': range(100, 300, 1),\n    \'max_depth\': range(1, 300, 1),\n    \'min_samples_split\': np.arange(0.01, 1.0, 0.01),\n    \'min_samples_leaf\': range(1, 300, 1),\n    \'max_leaf_nodes\': range(2, 100, 1),\n    \'min_impurity_decrease\': np.arange(0.01, 1.01, 0.01)\n}\n\nfor i in range(5):\n    random_search_GBR = RandomizedSearchCV(estimator = GradientBoostingRegressor(random_state=0), \n                                 param_distributions = param_grid, cv = 5, n_jobs = -1,\n                                 verbose = 2, scoring=RMSLE, error_score=\'raise\')\n\n    random_search_GBR.fit(X_train, y_train)\n\n    print(f\'Randomized Search Results {i}:\')\n    print(\'Best parameters:\', random_search_GBR.best_params_)\n    print("Best RMSLE: ", random_search_GBR.best_score_, \'\n\n\')\n

In [9]:
# 1D Convolutional Neural Network
'''
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
import keras.optimizers
import warnings
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
import keras.regularizers
warnings.filterwarnings("ignore")
from keras.wrappers.scikit_learn import KerasRegressor
from keras import backend as K

# Old Architectures Used

#model = Sequential()
#model.add(Dense(233, input_dim=233,
#             activation='relu'))
#model.add(Dropout(0.1)) <= also tried w/o dropout
#model.add(Dense(233, activation='relu'))
#model.add(Dense(1))


# I learned about the keras backend and used the code provided by users Germán Sanchis and Eric Aya 
# from https://stackoverflow.com/questions/43855162/rmse-rmsle-loss-function-in-keras
# to make the rmsle function right below

def rmsle(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(y_pred + 1) - K.log(y_true + 1))))


# Building Neural Network
def build_model():
    model = Sequential()
    model.add(Dense(233, input_dim=233,
                 activation='relu'))
    model.add(Dense(233, activation='relu'))
    model.add(Dense(1))
    
    model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=rmsle)
    
    return model

# Early Stopping
stop_early = EarlyStopping(monitor='val_rmsle', patience=2, verbose=0)

param_grid = {
    'batch_size': range(100, 300, 1),
}

# I got guidance from Jason Brownlee's article on
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/
# to set up the Keras neural network to work with Scikit Learn.

model_NN = KerasRegressor(build_fn=lambda: build_model(), epochs=100, 
                          callbacks=[stop_early], shuffle=True, validation_split=0.1, verbose=1)

random_search_NN = RandomizedSearchCV(estimator = model_NN, 
                                 param_distributions = param_grid, cv = 5, n_jobs = -1,
                                 verbose = 1, scoring=RMSLE, error_score='raise')

rs_results = random_search_NN.fit(X_train, y_train)

print('Randomized Search Results:')
print("Best parameters:", rs_results.best_params_)
print("Best RMSLE: ", rs_results.best_score_)

# If chosen as final model

p = random_search_NN.best_params_

model = build_model()
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=p['batch_size'],
          epochs=300, shuffle=True, callbacks=[stop_early])

y_pred = model.predict(X_val)

print('\nRMSE for validation set:', mean_squared_error(y_val, y_pred, squared=False))
print('\nRMSE for log of validation set:', mean_squared_error(np.log(y_val), np.log(y_pred), squared=False))

print('\n Neural Network Model done')
'''



In [11]:
#!pip install xgboost
import xgboost
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# https://xgboost.readthedocs.io/en/latest/parameter.html

'''
param_grid = {
    'learning_rate': np.arange(0.01, 1.01, 0.05),
    'n_estimators': range(1, 1000, 50),
    'max_depth': range(1, 50, 1),
    'colsample_bytree': np.arange(0.01, 1.0, 0.1),
    'subsample': np.arange(0.01, 1.01, 0.1),
    'alpha': range(0, 100, 5),
    'lambda': range(0, 100, 5),
    'gamma': range(0, 100, 5)
}
'''

# Current best
'''
Best parameters: {'subsample': 0.33,
'n_estimators': 711, 'max_depth': 5,
'learning_rate': 0.08, 'lambda': 19,
'gamma': 40, 'colsample_bytree': 0.23,
'alpha': 17}
'''

# 0.1262054666438511 
param_grid = {
    'learning_rate': [0.065],
    'n_estimators': [711],
    'max_depth': [5],
    'min_child_weight': [1],
    'colsample_bytree': [0.23],
    'subsample': [0.33],
    'alpha': [17],
    'lambda': [19],
    'gamma': [49],
}



for i in range(1):
    # Grid Search
    #random_search_XGB = GridSearchCV(estimator = XGBRegressor(seed=0), 
    #                             param_grid = param_grid, cv = 5, n_jobs = -1,
    #                             verbose = 2, scoring=RMSLE, error_score='raise')
    # Random Search

    random_search_XGB = RandomizedSearchCV(estimator = XGBRegressor(seed=0), 
                                 param_distributions = param_grid, cv = 5, n_jobs = -1,
                                 verbose = 2, scoring=RMSLE, error_score='raise')
    
    random_search_XGB.fit(X_train, y_train)

    print(f'Randomized Search Results {i}:')
    # From https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663
    print("Best parameters:", random_search_XGB.best_params_)
    print("Best RMSLE: ", random_search_XGB.best_score_, '\n\n')


# If chosen as final model

'''
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

p = random_search_XGB.best_params_

best_model_XGB = xgb.XGBRegressor(subsample=0.33, n_estimators=711, max_depth=5, learning_rate=0.08,
                      reg_lambda=19, gamma=40, colsample_bytree=0.23, alpha=17)

best_model_XGB.fit(X_train, y_train)

y_pred = best_model_XGB.predict(X_val)

print('\nRMSE for validation set:', mean_squared_error(y_val, y_pred, squared=False))
print('\nRMSE for log of validation set:', mean_squared_error(np.log(y_val), np.log(y_pred), squared=False))
'''

print('XGBoost Model done')

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Randomized Search Results 0:
Best parameters: {'subsample': 0.33, 'n_estimators': 711, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.065, 'lambda': 19, 'gamma': 49, 'colsample_bytree': 0.23, 'alpha': 17}
Best RMSLE:  0.12498681689303856 


XGBoost Model done


In [12]:
# Final Model

model = xgb.XGBRegressor(subsample=0.33, n_estimators=711, max_depth=5, min_child_weight=1, learning_rate=0.08,
                      reg_lambda=19, gamma=49, colsample_bytree=0.23, alpha=17)
model.fit(X_train, y_train)
ypred = model.predict(X_test)
print('ypred', model.score(X_train, y_train))
print(len(ypred))

ypred 0.9894072449977998
1459


In [10]:
# Create submission file.
submission = pd.DataFrame(ypred, columns=['SalePrice']) # Create new dataframe.
submission['Id'] = df_test['Id'].astype('Int32') # Kaggle expects two columns: Id, SalePrice.
submission.to_csv('LR_submission6.csv', index=False)