# Installing Libraries

In [None]:
!pip install vecstack

Collecting vecstack
  Downloading vecstack-0.4.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... [?25l[?25hdone
  Created wheel for vecstack: filename=vecstack-0.4.0-py3-none-any.whl size=19863 sha256=55a20d5490f7942fab0a3d24448f5cd47701e7cb455619379db1f265cd8615d2
  Stored in directory: /root/.cache/pip/wheels/b8/d8/51/3cf39adf22c522b0a91dc2208db4e9de4d2d9d171683596220
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0


# Importing Libraries

In [None]:
# import the libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import SelectFromModel

from sklearn.preprocessing import MinMaxScaler

from vecstack import stacking

# Reading Data

In [None]:
# read the data
train_data = pd.read_csv('train.csv')
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [None]:
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


# Data Preprocessing

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
# plotting the number of null values present in each column
num_null_values = []
for col in train_data.columns:
  num_null_values.append(train_data[col].isnull().sum())

null_values_df = pd.DataFrame({'column_name':train_data.columns,  'num_null_values':num_null_values})
fig = px.bar(null_values_df, x = 'column_name', y = 'num_null_values', barmode='group')
fig.show()

print("Total number of records in the dataset that have atleast 1 null value: ", train_data.isna().any(axis=1).values.sum())

Total number of records in the dataset that have atleast 1 null value:  1460


In [None]:
# seprating the target column from the training dataset before any preprocessing is done
train_target_col = train_data["SalePrice"] #make copy of target column
train_data = train_data.drop(["SalePrice"], axis=1) #extracting training data without the target column

In [None]:
# combining the training and testing data so that the same preprocessing steps can be applied to both.
combined_data = pd.concat([train_data, test_data], keys=[0,1])
combined_data

Unnamed: 0,Unnamed: 1,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
0,1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
0,2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
0,3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
0,4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1,1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1,1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1,1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [None]:
# a column should have 60% of its values as non-nulls to be in the dataset. if a column has lesser than 60% of the values as non-nulls, then the column is
# deleted.
percent=int(0.6*(combined_data.shape[0]))
print(percent)

# a column should have atleast 1751 non-null values. if not, then the column is deleted.
cd1 =combined_data.dropna(thresh=percent,axis=1)
print(cd1.shape)

1751
(2919, 75)


In [None]:
cd1

Unnamed: 0,Unnamed: 1,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,61,0,0,0,0,0,2,2008,WD,Normal
0,1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,0,5,2007,WD,Normal
0,2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,42,0,0,0,0,0,9,2008,WD,Normal
0,3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,35,272,0,0,0,0,2,2006,WD,Abnorml
0,4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1454,2915,160,RM,21.0,1936,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,6,2006,WD,Normal
1,1455,2916,160,RM,21.0,1894,Pave,Reg,Lvl,AllPub,Inside,...,24,0,0,0,0,0,4,2006,WD,Abnorml
1,1456,2917,20,RL,160.0,20000,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,9,2006,WD,Abnorml
1,1457,2918,85,RL,62.0,10441,Pave,Reg,Lvl,AllPub,Inside,...,32,0,0,0,0,700,7,2006,WD,Normal


In [None]:
# plotting the number of null values present in each column again
num_null_values = []
for col in cd1.columns:
  num_null_values.append(cd1[col].isnull().sum())

null_values_df = pd.DataFrame({'column_name':cd1.columns,  'num_null_values':num_null_values})
fig = px.bar(null_values_df, x = 'column_name', y = 'num_null_values', barmode='group')
fig.show()

In [None]:
# there are still some columns that have null values. But these columns have lesser number of null values. We can fill in these null values, instead of
# deleting these columns.


# for numeric columns, the null value is replaced with the mean of the values in that column

# selecting the columns that have a numeric datatype
numeric_cols = cd1.select_dtypes(include=['int','float64']).columns
for col in numeric_cols:
  cd1[col]=cd1[col].fillna(cd1[col].mean())

#IMPUTE (SUBSTITUTE) MODE VALUES FOR NaN IN CATEGORICAL COLUMNS
train_cat_cols = cd1.select_dtypes(exclude=['int','float64']).columns#selecting the categorical columns
for colss in train_cat_cols:
  cd1[colss]=cd1[colss].fillna(cd1[colss].mode()[0])

print(cd1)

          Id  MSSubClass MSZoning  LotFrontage  LotArea Street LotShape  \
0 0        1          60       RL         65.0     8450   Pave      Reg   
  1        2          20       RL         80.0     9600   Pave      Reg   
  2        3          60       RL         68.0    11250   Pave      IR1   
  3        4          70       RL         60.0     9550   Pave      IR1   
  4        5          60       RL         84.0    14260   Pave      IR1   
...      ...         ...      ...          ...      ...    ...      ...   
1 1454  2915         160       RM         21.0     1936   Pave      Reg   
  1455  2916         160       RM         21.0     1894   Pave      Reg   
  1456  2917          20       RL        160.0    20000   Pave      Reg   
  1457  2918          85       RL         62.0    10441   Pave      Reg   
  1458  2919          60       RL         74.0     9627   Pave      Reg   

       LandContour Utilities LotConfig  ... OpenPorchSF EnclosedPorch  \
0 0            Lvl    AllP

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cd1[col]=cd1[col].fillna(cd1[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cd1[colss]=cd1[colss].fillna(cd1[colss].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cd1[colss]=cd1[colss].fillna(cd1[colss].mode()[0])
A value is trying to be set on a copy of a slice from a DataF

In [None]:
# checking if there are any columns with null values left
null_cols = cd1.isnull().sum().sort_values(ascending = False).to_frame()
null_cols[null_cols[0]!=0]

# there are no more columns with null values left

Unnamed: 0,0


In [None]:
cd1.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2919 entries, (0, 0) to (1, 1458)
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2919 non-null   object 
 3   LotFrontage    2919 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   LotShape       2919 non-null   object 
 7   LandContour    2919 non-null   object 
 8   Utilities      2919 non-null   object 
 9   LotConfig      2919 non-null   object 
 10  LandSlope      2919 non-null   object 
 11  Neighborhood   2919 non-null   object 
 12  Condition1     2919 non-null   object 
 13  Condition2     2919 non-null   object 
 14  BldgType       2919 non-null   object 
 15  HouseStyle     2919 non-null   object 
 16  OverallQual    2919 non-null   int64  
 17  OverallCond    2919 non-null   int64  
 18

In [None]:
# performing one-hot encoding on categorical features

# retrieving columns that are not numeric
train_cat_cols = cd1.select_dtypes(exclude=['float','int']).columns #selecting the categorical columns
print(train_cat_cols.shape)
print(train_cat_cols)

#If there are categorical columns which are encoded as numeric ones
#then we need to explicitly enter the column names in a list and concatenate the two lists in python.


combined_Data = pd.get_dummies(cd1,train_cat_cols)
combined_Data

(38,)
Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')


Unnamed: 0,Unnamed: 1,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,1,60,65.0,8450,7,5,2003,2003,196.0,706.0,...,0,0,0,1,0,0,0,0,1,0
0,1,2,20,80.0,9600,6,8,1976,1976,0.0,978.0,...,0,0,0,1,0,0,0,0,1,0
0,2,3,60,68.0,11250,7,5,2001,2002,162.0,486.0,...,0,0,0,1,0,0,0,0,1,0
0,3,4,70,60.0,9550,7,5,1915,1970,0.0,216.0,...,0,0,0,1,1,0,0,0,0,0
0,4,5,60,84.0,14260,8,5,2000,2000,350.0,655.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1454,2915,160,21.0,1936,4,7,1970,1970,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,1455,2916,160,21.0,1894,4,5,1970,1970,0.0,252.0,...,0,0,0,1,1,0,0,0,0,0
1,1456,2917,20,160.0,20000,5,7,1960,1996,0.0,1224.0,...,0,0,0,1,1,0,0,0,0,0
1,1457,2918,85,62.0,10441,5,5,1992,1992,0.0,337.0,...,0,0,0,1,0,0,0,0,1,0


In [None]:
# removing hierarchical index
cd2 = combined_Data.reset_index()
cd2

Unnamed: 0,level_0,level_1,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,1,60,65.0,8450,7,5,2003,2003,...,0,0,0,1,0,0,0,0,1,0
1,0,1,2,20,80.0,9600,6,8,1976,1976,...,0,0,0,1,0,0,0,0,1,0
2,0,2,3,60,68.0,11250,7,5,2001,2002,...,0,0,0,1,0,0,0,0,1,0
3,0,3,4,70,60.0,9550,7,5,1915,1970,...,0,0,0,1,1,0,0,0,0,0
4,0,4,5,60,84.0,14260,8,5,2000,2000,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,1,1454,2915,160,21.0,1936,4,7,1970,1970,...,0,0,0,1,0,0,0,0,1,0
2915,1,1455,2916,160,21.0,1894,4,5,1970,1970,...,0,0,0,1,1,0,0,0,0,0
2916,1,1456,2917,20,160.0,20000,5,7,1960,1996,...,0,0,0,1,1,0,0,0,0,0
2917,1,1457,2918,85,62.0,10441,5,5,1992,1992,...,0,0,0,1,0,0,0,0,1,0


In [None]:
# normalizing the data

scaler = MinMaxScaler()

model = scaler.fit(cd2.iloc[:,3:]) # not including the 'level_0', 'level_1', and 'id' column

scaled_data = model.transform(cd2.iloc[:,3:])

scaled_train_X_df = pd.DataFrame(scaled_data)

scaled_train_X_df.columns = cd2.iloc[:,3:].columns

scaled_train_X_df['level'] = cd2['level_0'] # adding the level column again to the normalized data so that train and test can be deleted

scaled_train_X_df['Id'] = cd2['Id'] # adding the id column back to the normalized data

scaled_train_X_df


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,level,Id
0,0.235294,0.150685,0.033420,0.666667,0.500,0.949275,0.883333,0.12250,0.125089,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1
1,0.000000,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.00000,0.173281,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,2
2,0.235294,0.160959,0.046507,0.666667,0.500,0.934783,0.866667,0.10125,0.086109,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,3
3,0.294118,0.133562,0.038561,0.666667,0.500,0.311594,0.333333,0.00000,0.038271,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0,4
4,0.235294,0.215753,0.060576,0.777778,0.500,0.927536,0.833333,0.21875,0.116052,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,0.823529,0.000000,0.002973,0.333333,0.750,0.710145,0.333333,0.00000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,2915
2915,0.823529,0.000000,0.002776,0.333333,0.500,0.710145,0.333333,0.00000,0.044649,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1,2916
2916,0.000000,0.476027,0.087406,0.444444,0.750,0.637681,0.766667,0.00000,0.216867,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1,2917
2917,0.382353,0.140411,0.042726,0.444444,0.500,0.869565,0.700000,0.00000,0.059709,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,2918


In [None]:
# separating the training and testing data

trainData = scaled_train_X_df[scaled_train_X_df['level']==0]
testData = scaled_train_X_df[scaled_train_X_df['level']==1]
trainData = trainData.drop('level',axis=1) # deleting the level column from the training data
testData = testData.drop('level',axis=1) # deleting the level column from the testing data
trainData1=pd.concat([trainData, train_target_col], axis=1)


In [None]:
trainData1

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id,SalePrice
0,0.235294,0.150685,0.033420,0.666667,0.500,0.949275,0.883333,0.122500,0.125089,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,208500
1,0.000000,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.000000,0.173281,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2,181500
2,0.235294,0.160959,0.046507,0.666667,0.500,0.934783,0.866667,0.101250,0.086109,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,223500
3,0.294118,0.133562,0.038561,0.666667,0.500,0.311594,0.333333,0.000000,0.038271,0.000000,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,4,140000
4,0.235294,0.215753,0.060576,0.777778,0.500,0.927536,0.833333,0.218750,0.116052,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.235294,0.140411,0.030929,0.555556,0.500,0.920290,0.833333,0.000000,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1456,175000
1456,0.000000,0.219178,0.055505,0.555556,0.625,0.768116,0.633333,0.074375,0.139972,0.106815,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1457,210000
1457,0.294118,0.154110,0.036187,0.666667,1.000,0.500000,0.933333,0.000000,0.048724,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1458,266500
1458,0.000000,0.160959,0.039342,0.444444,0.625,0.565217,0.766667,0.000000,0.008682,0.674312,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1459,142125


In [None]:
testData

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id
1460,0.000000,0.202055,0.048246,0.444444,0.625,0.644928,0.183333,0.00000,0.082920,0.094364,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1461
1461,0.000000,0.205479,0.060609,0.555556,0.625,0.623188,0.133333,0.06750,0.163536,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1462
1462,0.235294,0.181507,0.058566,0.444444,0.500,0.905797,0.800000,0.00000,0.140149,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1463
1463,0.235294,0.195205,0.040562,0.555556,0.625,0.913043,0.800000,0.01250,0.106662,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1464
1464,0.588235,0.075342,0.017318,0.777778,0.500,0.869565,0.700000,0.00000,0.046598,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,0.823529,0.000000,0.002973,0.333333,0.750,0.710145,0.333333,0.00000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2915
2915,0.823529,0.000000,0.002776,0.333333,0.500,0.710145,0.333333,0.00000,0.044649,0.000000,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2916
2916,0.000000,0.476027,0.087406,0.444444,0.750,0.637681,0.766667,0.00000,0.216867,0.000000,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2917
2917,0.382353,0.140411,0.042726,0.444444,0.500,0.869565,0.700000,0.00000,0.059709,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2918


In [None]:
# splitting input and output of the training data:

X = trainData1.drop('SalePrice', axis=1)
y = trainData1['SalePrice']

In [None]:
X

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id
0,0.235294,0.150685,0.033420,0.666667,0.500,0.949275,0.883333,0.122500,0.125089,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0.000000,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.000000,0.173281,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2
2,0.235294,0.160959,0.046507,0.666667,0.500,0.934783,0.866667,0.101250,0.086109,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3
3,0.294118,0.133562,0.038561,0.666667,0.500,0.311594,0.333333,0.000000,0.038271,0.000000,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,4
4,0.235294,0.215753,0.060576,0.777778,0.500,0.927536,0.833333,0.218750,0.116052,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.235294,0.140411,0.030929,0.555556,0.500,0.920290,0.833333,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1456
1456,0.000000,0.219178,0.055505,0.555556,0.625,0.768116,0.633333,0.074375,0.139972,0.106815,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1457
1457,0.294118,0.154110,0.036187,0.666667,1.000,0.500000,0.933333,0.000000,0.048724,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1458
1458,0.000000,0.160959,0.039342,0.444444,0.625,0.565217,0.766667,0.000000,0.008682,0.674312,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1459


In [None]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [None]:
# splitting the kaggle training data into training and testing data to evaluate performance
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)

In [None]:
X_train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id
135,0.000000,0.202055,0.042534,0.666667,0.625,0.710145,0.333333,0.18000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,136
1452,0.941176,0.047945,0.011101,0.444444,0.500,0.963768,0.916667,0.05000,0.096917,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1453
762,0.235294,0.174658,0.034308,0.666667,0.500,0.992754,0.983333,0.00000,0.004252,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,763
932,0.000000,0.215753,0.048470,0.888889,0.500,0.971014,0.933333,0.18875,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,933
435,0.235294,0.075342,0.043782,0.666667,0.625,0.898551,0.766667,0.00000,0.068214,0.225426,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.000000,0.195205,0.037472,0.555556,0.500,0.971014,0.933333,0.00000,0.004252,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1096
1130,0.176471,0.150685,0.030400,0.333333,0.250,0.405797,0.000000,0.00000,0.110206,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1131
1294,0.000000,0.133562,0.032120,0.444444,0.750,0.601449,0.666667,0.00000,0.029589,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1295
860,0.176471,0.116438,0.029643,0.666667,0.875,0.333333,0.800000,0.00000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,861


In [None]:
X_test

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id
892,0.000000,0.167808,0.033252,0.555556,0.875,0.659420,0.883333,0.000000,0.117470,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,893
1105,0.235294,0.263699,0.051209,0.777778,0.500,0.884058,0.750000,0.226250,0.182849,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1106
413,0.058824,0.119863,0.035804,0.444444,0.625,0.398551,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,414
522,0.176471,0.099315,0.017294,0.555556,0.750,0.543478,0.000000,0.000000,0.070695,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,523
1036,0.000000,0.232877,0.054210,0.888889,0.500,0.978261,0.966667,0.043750,0.181077,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,0.000000,0.167808,0.032139,0.444444,0.625,0.623188,0.700000,0.000000,0.149894,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,332
323,0.000000,0.095890,0.021127,0.222222,0.875,0.601449,0.916667,0.000000,0.045358,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,324
650,0.235294,0.150685,0.031901,0.666667,0.625,0.978261,0.950000,0.063876,0.000000,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,651
439,0.176471,0.157534,0.051667,0.555556,0.875,0.347826,0.833333,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,440


# Building Models

## Random Forest Regressor

In [None]:
# random forest regressor

rfc = RandomForestRegressor()
rfc.fit(X_train.iloc[:,:-1], y_train)
rfc_predict_Test=rfc.predict(X_test.iloc[:,:-1])
rfc_predict_Train=rfc.predict(X_train.iloc[:,:-1])


print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(y_train,rfc_predict_Train)))
print("RMSE (Test Data) for Random Forest:{0:10f}".format(mean_squared_error(y_test,rfc_predict_Test)))

print()

# Accessing the individual decision trees in the Random Forest
individual_trees = rfc.estimators_

# Extracting the depth and number of leaf nodes of each tree
tree_depths = [tree.tree_.max_depth for tree in individual_trees]
tree_leaf_nodes = [tree.tree_.n_leaves for tree in individual_trees]

print("Tree depths:")
print('unique depths:', set(tree_depths))
print('max max_depth used:', max(tree_depths))
print('min max_depths used:', min(tree_depths))
print('avg depth:', sum(tree_depths)/len(tree_depths))

print()
print("leaf nodes:")
print('unique:', set(tree_leaf_nodes))
print('max:', max(tree_leaf_nodes))
print('min:', min(tree_leaf_nodes))


RMSE (training) for Random Forest:137765896.299486
RMSE (Test Data) for Random Forest:735422325.879572

Tree depths:
unique depths: {17, 18, 19, 20, 21, 22, 23, 24, 25}
max max_depth used: 25
min max_depths used: 17
avg depth: 20.2

leaf nodes:
unique: {642, 649, 653, 599, 604, 605, 606, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 633, 634, 635, 636, 637, 639}
max: 653
min: 599


## Random Forest Regressor - Hyperparameter tuning

In [None]:
param_dist = {
    'n_estimators': range(10,100,10),  # Number of trees in the forest
    'max_depth': range(10, 25,3),  # Maximum depth of the tree
    'max_leaf_nodes': range(500,800,50)
}

# Create a Random Forest model
rf = RandomForestRegressor(random_state=42)

# Set up the random search
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(X_train.iloc[:,:-1], y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final Random Forest model
best_rf = RandomForestRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_rf.fit(X_train.iloc[:,:-1], y_train)

rfc_predict_Test_1=best_rf.predict(X_test.iloc[:,:-1])
rfc_predict_Train_1=best_rf.predict(X_train.iloc[:,:-1])

print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(y_train,rfc_predict_Train_1)))
print("RMSE (Test Data) for Random Forest:{0:10f}".format(mean_squared_error(y_test,rfc_predict_Test_1)))

Best Hyperparameters: {'n_estimators': 20, 'max_leaf_nodes': 550, 'max_depth': 22}
RMSE (training) for Random Forest:145814512.590389
RMSE (Test Data) for Random Forest:757755945.315792


## Decsision Tree Regressor

In [None]:
# decision tree regressor

dt = DecisionTreeRegressor()
dt.fit(X_train.iloc[:,:-1], y_train)
dt_predict_Test=dt.predict(X_test.iloc[:,:-1])
dt_predict_Train=dt.predict(X_train.iloc[:,:-1])


print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(y_train,dt_predict_Train)))
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(y_test,dt_predict_Test)))

print()

tree_depth = dt.tree_.max_depth
num_leaves = dt.tree_.n_leaves

print("max depth:", tree_depth)
print("num leaves:", num_leaves)


RMSE (training) for Decision Tree:  0.000000
RMSE (Test Data) for Decision Tree:2044164473.751142

max depth: 20
num leaves: 979


## Decision Tree Regressor - Hyperparameter tuning

In [None]:
param_dist = {

    'max_depth': range(5, 25, 5),  # Maximum depth of the tree
    'max_leaf_nodes': range(100,300,25)
}


# Set up the random search
random_search = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(X_train.iloc[:,:-1], y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final Decision Tree model
best_dt = DecisionTreeRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_dt.fit(X_train.iloc[:,:-1], y_train)

dt_predict_Test_1=best_dt.predict(X_test.iloc[:,:-1])
dt_predict_Train_1=best_dt.predict(X_train.iloc[:,:-1])

print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(y_train,dt_predict_Train_1)))
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(y_test,dt_predict_Test_1)))



Best Hyperparameters: {'max_leaf_nodes': 275, 'max_depth': 20}
RMSE (training) for Decision Tree:35625280.510986
RMSE (Test Data) for Decision Tree:1519671225.729430


## MLP Regressor

In [None]:
# MLP regressor

mlp = MLPRegressor()
mlp.fit(X_train.iloc[:,:-1], y_train)
mlp_predict_Test=mlp.predict(X_test.iloc[:,:-1])
mlp_predict_Train=mlp.predict(X_train.iloc[:,:-1])


print("RMSE (training) for MLP:{0:10f}".format(mean_squared_error(y_train,mlp_predict_Train)))
print("RMSE (Test Data) for MLP:{0:10f}".format(mean_squared_error(y_test,mlp_predict_Test)))

RMSE (training) for MLP:35952472707.786926
RMSE (Test Data) for MLP:36462882487.212471




In [None]:
# performing feature selection to improve performance

#Do feature selection using RANDOM FOREST
print(rfc.feature_importances_)

#print sorted list of important features
importances = rfc.feature_importances_
indices = np.argsort(importances)[-30:] #the list is in ascending order
print("\n top 40 features = ", indices, "\n")


[0.00360839 0.01037378 0.0176548  0.5406828  0.00492121 0.01300404
 0.01139485 0.00527641 0.03144248 0.00657992 0.03307463 0.02345121
 0.03298419 0.12819663 0.00426132 0.00274863 0.00629461 0.00557041
 0.0063178  0.02128234 0.01889209 0.00783189 0.00720783 0.00635133
 0.00150045 0.00411044 0.00180554 0.00153135 0.00108983 0.00137245
 0.0137398  0.00389925 0.00127705 0.00199878 0.0011965  0.00141192
 0.00239384 0.0029156  0.00951873 0.00083486]

 top 40 features =  [36 15 37  0 31 25 14  4  7 17 16 18 23  9 22 21 38  1  6  5 30  2 20 19
 11  8 12 10 13  3] 



In [None]:
rfc = RandomForestRegressor()
rfc.fit(X_train.iloc[:,:-1], y_train)
model = SelectFromModel(rfc, prefit=True, max_features=30, threshold=-np.inf)
X_new = model.transform(X_train.iloc[:,:-1])
X_new_SelectedFeatures_RF= pd.DataFrame(X_new)
print(X_new_SelectedFeatures_RF)


            0         1         2         3      4         5         6   \
0     0.000000  0.202055  0.042534  0.666667  0.625  0.710145  0.333333   
1     0.941176  0.047945  0.011101  0.444444  0.500  0.963768  0.916667   
2     0.235294  0.174658  0.034308  0.666667  0.500  0.992754  0.983333   
3     0.000000  0.215753  0.048470  0.888889  0.500  0.971014  0.933333   
4     0.235294  0.075342  0.043782  0.666667  0.625  0.898551  0.766667   
...        ...       ...       ...       ...    ...       ...       ...   
1017  0.000000  0.195205  0.037472  0.555556  0.500  0.971014  0.933333   
1018  0.176471  0.150685  0.030400  0.333333  0.250  0.405797  0.000000   
1019  0.000000  0.133562  0.032120  0.444444  0.750  0.601449  0.666667   
1020  0.176471  0.116438  0.029643  0.666667  0.875  0.333333  0.800000   
1021  0.588235  0.109589  0.011143  0.666667  0.500  0.978261  0.950000   

           7         8         9   ...        20        21        22   23  \
0     0.18000  0.00000



In [None]:
print(model.get_support())

#Get column names
cols = model.get_support(indices=True) #get column indices
print("\n cols = ", cols, "\n")

features_df_new = X_train.iloc[:,cols] #Get cols of X-Train with col names
print(features_df_new.shape)
print ("\n features_df_new= ", features_df_new)

[ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True False False  True False  True False  True  True
  True  True  True  True  True False False  True False False  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

In [None]:
rfc1 = RandomForestRegressor()
features_df_new_test = X_test.iloc[:,cols]
rfc.fit(features_df_new, y_train)
rfc_predict_Test=rfc.predict(features_df_new_test)
rfc_predict_Train=rfc.predict(features_df_new)


print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(y_train,rfc_predict_Train)))
print("RMSE (Test Data) for Random Forest:{0:10f}".format(mean_squared_error(y_test,rfc_predict_Test)))

RMSE (training) for Decision Tree:131344243.466030
RMSE (Test Data) for Decision Tree:718002362.956667


## Stacking

In [None]:
#STACKING MODELS


models = [RandomForestRegressor(), DecisionTreeRegressor(), MLPRegressor()]

S_Train, S_Test = stacking(models,
                           X_train.iloc[:,:-1], y_train, X_test.iloc[:,:-1],
                           regression=True,

                           mode='oof_pred_bag',

                           needs_proba=False,

                           save_dir=None,

                           n_folds=4,

                           verbose=2,
                           shuffle = True)



task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [RandomForestRegressor]
    fold  0:  [20502.77464844]
    fold  1:  [16850.21199219]
    fold  2:  [19875.06960784]
    fold  3:  [17600.62745098]
    ----
    MEAN:     [18707.17092486] + [1521.58757578]
    FULL:     [18707.11089041]

model  1:     [DecisionTreeRegressor]
    fold  0:  [28296.98828125]
    fold  1:  [25890.67578125]
    fold  2:  [27086.69019608]
    fold  3:  [26317.13333333]
    ----
    MEAN:     [26897.87189798] + [914.45069600]
    FULL:     [26898.25538160]

model  2:     [MLPRegressor]




    fold  0:  [180420.73979630]




    fold  1:  [171521.16585759]




    fold  2:  [178074.94642529]
    fold  3:  [179359.07409013]
    ----
    MEAN:     [177343.98154233] + [3462.89371365]
    FULL:     [177341.29459768]





## Stacked Model - Random Forest

In [None]:
# stacked model - random forest
model = RandomForestRegressor()

model = model.fit(S_Train, y_train)
y_pred_train = model.predict(S_Train)
y_pred_test = model.predict(S_Test)
mean_squared_error(y_train,y_pred_train)
print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(y_train,y_pred_train)))
mean_squared_error(y_test,y_pred_test)
print("RMSE (Test Data) for Random Forest:{0:10f}".format(mean_squared_error(y_test,y_pred_test)))

RMSE (training) for Decision Tree:152626433.259264
RMSE (Test Data) for Decision Tree:1129381069.605777


In [None]:
# Accessing the individual decision trees in the Random Forest
individual_trees = model.estimators_

# Extracting the depth and number of leaf nodes of each tree
tree_depths = [tree.tree_.max_depth for tree in individual_trees]
tree_leaf_nodes = [tree.tree_.n_leaves for tree in individual_trees]

print("Tree depths:")
print('unique depths:', set(tree_depths))
print('max max_depth used:', max(tree_depths))
print('min max_depths used:', min(tree_depths))
print('avg depth:', sum(tree_depths)/len(tree_depths))

print()
print("leaf nodes:")
print('unique:', set(tree_leaf_nodes))
print('max:', max(tree_leaf_nodes))
print('min:', min(tree_leaf_nodes))

Tree depths:
unique depths: {32, 19, 20, 21, 22, 23, 24, 25, 26, 27}
max max_depth used: 32
min max_depths used: 19
avg depth: 21.61

leaf nodes:
unique: {640, 641, 642, 643, 644, 645, 646, 647, 649, 651, 652, 656, 658, 607, 611, 613, 614, 619, 620, 621, 622, 623, 624, 626, 627, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639}
max: 658
min: 607


## Stacked Model - Random Forest with hyperparameter tuning

In [None]:
param_dist = {
    'n_estimators': range(10,100,10),  # Number of trees in the forest
    'max_depth': range(10, 25,3),  # Maximum depth of the tree
    'max_leaf_nodes': range(500,800,50)
}



# Set up the random search
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(S_Train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final Random Forest model
best_rf = RandomForestRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_rf.fit(S_Train, y_train)

rfc_predict_Test_1=best_rf.predict(S_Test)
rfc_predict_Train_1=best_rf.predict(S_Train)

print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(y_train,rfc_predict_Train_1)))
print("RMSE (Test Data) for Random Forest:{0:10f}".format(mean_squared_error(y_test,rfc_predict_Test_1)))

Best Hyperparameters: {'n_estimators': 60, 'max_leaf_nodes': 550, 'max_depth': 13}
RMSE (training) for Random Forest:164022847.148339
RMSE (Test Data) for Random Forest:1187158467.959053


# Stacked model - MLP Regressor

In [None]:
model = MLPRegressor()

model = model.fit(S_Train, y_train)
y_pred_train = model.predict(S_Train)
y_pred_test = model.predict(S_Test)
mean_squared_error(y_train,y_pred_train)
print("RMSE (training) for MLP:{0:10f}".format(mean_squared_error(y_train,y_pred_train)))
mean_squared_error(y_test,y_pred_test)
print("RMSE (Test Data) for MLP:{0:10f}".format(mean_squared_error(y_test,y_pred_test)))

RMSE (training) for Decision Tree:948611944.556073
RMSE (Test Data) for Decision Tree:812018275.406983


## Stacked model - Decision Tree Regressor

In [None]:
model = DecisionTreeRegressor()

model = model.fit(S_Train, y_train)
y_pred_train = model.predict(S_Train)
y_pred_test = model.predict(S_Test)
mean_squared_error(y_train,y_pred_train)
print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(y_train,y_pred_train)))
mean_squared_error(y_test,y_pred_test)
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(y_test,y_pred_test)))

RMSE (training) for Decision Tree:  0.000000
RMSE (Test Data) for Decision Tree:1901396688.331050


In [None]:
tree_depth = model.tree_.max_depth
num_leaves = model.tree_.n_leaves

print("max depth:", tree_depth)
print("num leaves:", num_leaves)


max depth: 24
num leaves: 998


## Stacked Model - Decision Tree Hyperparameter tuning

In [None]:
param_dist = {

    'max_depth': range(5, 25, 5),  # Maximum depth of the tree
    'max_leaf_nodes': range(100,300,25)
}



# Set up the random search
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(S_Train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final Decision Tree model
best_dt = DecisionTreeRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_dt.fit(S_Train, y_train)

dt_predict_Test_1=best_dt.predict(S_Test)
dt_predict_Train_1=best_dt.predict(S_Train)

print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(y_train,dt_predict_Train_1)))
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(y_test,dt_predict_Test_1)))



Best Hyperparameters: {'max_leaf_nodes': 100, 'max_depth': 5}
RMSE (training) for Decision Tree:669807391.739342
RMSE (Test Data) for Decision Tree:1655951910.276129


# Kaggle Scores

## MLP Regressor

In [None]:
model = MLPRegressor()

model = model.fit(X.iloc[:,:-1], y)
y_pred_test = model.predict(testData.iloc[:,:-1])
y_pred_test



array([14816.53027464, 15374.09201977, 15691.14381451, ...,
       15392.7554027 , 14816.52589885, 15865.9726766 ])

In [None]:
result = pd.DataFrame({'Id':testData['Id'],'SalePrice':y_pred_test})
result

Unnamed: 0,Id,SalePrice
1460,1461,14816.530275
1461,1462,15374.092020
1462,1463,15691.143815
1463,1464,15931.412963
1464,1465,15423.451978
...,...,...
2914,2915,14662.566764
2915,2916,14561.970029
2916,2917,15392.755403
2917,2918,14816.525899


In [None]:
result.to_csv('mlp_kaggle_result.csv', index=False)

## Decision Tree Regressor

In [None]:
model = DecisionTreeRegressor()

model = model.fit(X.iloc[:,:-1], y)
y_pred_test = model.predict(testData.iloc[:,:-1])

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':y_pred_test})

result.to_csv('dt_kaggle_result.csv', index=False)

In [None]:
tree_depth = model.tree_.max_depth
num_leaves = model.tree_.n_leaves

print("max depth:", tree_depth)
print("num leaves:", num_leaves)


max depth: 30
num leaves: 1406


## Decision Tree Regressor - Hyperparameter tuning

In [None]:
param_dist = {

    'max_depth': range(5, 25, 5),  # Maximum depth of the tree
    'max_leaf_nodes': range(100,500,25)
}



# Set up the random search
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(X.iloc[:,:-1], y)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final model Decision Tree model
best_dt = DecisionTreeRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_dt.fit(X.iloc[:,:-1], y)

dt_predict_Test_1=best_dt.predict(testData.iloc[:,:-1])

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':dt_predict_Test_1})

result.to_csv('dt_ht_kaggle_result.csv', index=False)

Best Hyperparameters: {'max_leaf_nodes': 200, 'max_depth': 15}


## Random Forest Regressor

In [None]:
model = RandomForestRegressor()

model = model.fit(X.iloc[:,:-1], y)
y_pred_test = model.predict(testData.iloc[:,:-1])

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':y_pred_test})

result.to_csv('rf_kaggle_result.csv', index=False)

# Accessing the individual decision trees in the Random Forest
individual_trees = model.estimators_

# Extracting the depth and number of leaf nodes of each tree
tree_depths = [tree.tree_.max_depth for tree in individual_trees]
tree_leaf_nodes = [tree.tree_.n_leaves for tree in individual_trees]

print("Tree depths:")
print('unique depths:', set(tree_depths))
print('max max_depth used:', max(tree_depths))
print('min max_depths used:', min(tree_depths))
print('avg depth:', sum(tree_depths)/len(tree_depths))

print()
print("leaf nodes:")
print('unique:', set(tree_leaf_nodes))
print('max:', max(tree_leaf_nodes))
print('min:', min(tree_leaf_nodes))

Tree depths:
unique depths: {19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}
max max_depth used: 29
min max_depths used: 19
avg depth: 22.68

leaf nodes:
unique: {896, 897, 898, 899, 900, 902, 903, 904, 906, 909, 910, 919, 857, 860, 865, 866, 867, 869, 870, 871, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 888, 889, 890, 891, 892, 893, 894, 895}
max: 919
min: 857


## Random Forest - Hyperparameter tuning

In [None]:
param_dist = {
    'n_estimators': range(10,100,10),  # Number of trees in the forest
    'max_depth': range(10, 30,5),  # Maximum depth of the tree
    'max_leaf_nodes': range(500,800,50)
}



# Set up the random search
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(X.iloc[:,:-1], y)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final Random Forest model
best_rf = RandomForestRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_rf.fit(X.iloc[:,:-1], y)

rfc_predict_Test_1=best_rf.predict(testData.iloc[:,:-1])

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':rfc_predict_Test_1})

result.to_csv('rf_ht_kaggle_result.csv', index=False)

Best Hyperparameters: {'n_estimators': 60, 'max_leaf_nodes': 650, 'max_depth': 15}


## Stacking

In [None]:
#STACKING MODELS


models = [RandomForestRegressor(), DecisionTreeRegressor(), MLPRegressor()]

S_Train, S_Test = stacking(models,
                           X.iloc[:,:-1], y, testData.iloc[:,:-1],
                           regression=True,

                           mode='oof_pred_bag',

                           needs_proba=False,

                           save_dir=None,

                           n_folds=4,

                           verbose=2,
                           shuffle = True)


task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [RandomForestRegressor]
    fold  0:  [16959.86101370]
    fold  1:  [18330.73304110]
    fold  2:  [17580.86572603]
    fold  3:  [16111.01334247]
    ----
    MEAN:     [17245.61828082] + [815.29909762]
    FULL:     [17245.61828082]

model  1:     [DecisionTreeRegressor]
    fold  0:  [25583.37260274]
    fold  1:  [26018.59726027]
    fold  2:  [26976.69863014]
    fold  3:  [23427.78904110]
    ----
    MEAN:     [25501.61438356] + [1299.09399071]
    FULL:     [25501.61438356]

model  2:     [MLPRegressor]




    fold  0:  [172944.41490366]




    fold  1:  [169897.51145769]




    fold  2:  [179620.85345727]
    fold  3:  [165800.60543321]
    ----
    MEAN:     [172065.84631296] + [5044.92158128]
    FULL:     [172065.84631296]





## Stacked model - MLP

In [None]:
model = MLPRegressor()

model = model.fit(S_Train, y)
y_pred_test = model.predict(S_Test)

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':y_pred_test})

result.to_csv('mlp_stacked_kaggle_result.csv', index=False)

## Stacking - base models hyperparameter tuned

In [None]:
#STACKING MODELS


models = [RandomForestRegressor(n_estimators = 70, max_leaf_nodes= 750, max_depth= 25), DecisionTreeRegressor(max_leaf_nodes= 200, max_depth= 15), MLPRegressor()]

S_Train, S_Test = stacking(models,
                           X.iloc[:,:-1], y, testData.iloc[:,:-1],
                           regression=True,

                           mode='oof_pred_bag',

                           needs_proba=False,

                           save_dir=None,

                           n_folds=4,

                           verbose=2,
                           shuffle = True)

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [RandomForestRegressor]
    fold  0:  [16938.76297456]
    fold  1:  [18008.60665362]
    fold  2:  [17557.26283105]
    fold  3:  [16170.77056751]
    ----
    MEAN:     [17168.85075669] + [690.13736992]
    FULL:     [17168.85075669]

model  1:     [DecisionTreeRegressor]
    fold  0:  [26070.97855561]
    fold  1:  [24708.23818285]
    fold  2:  [26099.73697820]
    fold  3:  [23391.56457763]
    ----
    MEAN:     [25067.62957357] + [1119.18619059]
    FULL:     [25067.62957357]

model  2:     [MLPRegressor]




    fold  0:  [172808.98818939]




    fold  1:  [171242.07360451]




    fold  2:  [179804.44030781]
    fold  3:  [166245.67097749]
    ----
    MEAN:     [172525.29326980] + [4851.46300748]
    FULL:     [172525.29326980]





## Stacked model - MLP

In [None]:
model = MLPRegressor()

model = model.fit(S_Train, y)
y_pred_test = model.predict(S_Test)

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':y_pred_test})

result.to_csv('mlp_base_ht_stacked_kaggle_result.csv', index=False)

## Stacked model - Decision Tree

In [None]:
model = DecisionTreeRegressor()

model = model.fit(S_Train, y)
y_pred_test = model.predict(S_Test)

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':y_pred_test})

result.to_csv('dt_stacked_kaggle_result.csv', index=False)

tree_depth = model.tree_.max_depth
num_leaves = model.tree_.n_leaves

print("max depth:", tree_depth)
print("num leaves:", num_leaves)

max depth: 23
num leaves: 1427


## Stacked model - hyperparameter tuned decision tree

In [None]:
param_dist = {

    'max_depth': range(5, 20, 5),  # Maximum depth of the tree
    'max_leaf_nodes': range(100,900,25)
}

# Set up the random search
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(S_Train, y)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final Decision Tree model
best_dt = DecisionTreeRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_dt.fit(S_Train, y)

dt_predict_Test_1=best_dt.predict(S_Test)

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':dt_predict_Test_1})

result.to_csv('dt_ht_stacked_kaggle_result.csv', index=False)

Best Hyperparameters: {'max_leaf_nodes': 100, 'max_depth': 5}


## Stacked model - Random Forest Regressor

In [None]:
model = RandomForestRegressor()

model = model.fit(S_Train, y)
y_pred_test = model.predict(S_Test)

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':y_pred_test})

result.to_csv('rf_stacked_kaggle_result.csv', index=False)

# Accessing the individual decision trees in the Random Forest
individual_trees = model.estimators_

# Extracting the depth and number of leaf nodes of each tree
tree_depths = [tree.tree_.max_depth for tree in individual_trees]
tree_leaf_nodes = [tree.tree_.n_leaves for tree in individual_trees]

print("Tree depths:")
print('unique depths:', set(tree_depths))
print('max max_depth used:', max(tree_depths))
print('min max_depths used:', min(tree_depths))
print('avg depth:', sum(tree_depths)/len(tree_depths))

print()
print("leaf nodes:")
print('unique:', set(tree_leaf_nodes))
print('max:', max(tree_leaf_nodes))
print('min:', min(tree_leaf_nodes))

Tree depths:
unique depths: {20, 21, 22, 23, 24, 25, 26, 27, 28, 29}
max max_depth used: 29
min max_depths used: 20
avg depth: 23.65

leaf nodes:
unique: {896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 921, 922, 923, 924, 927, 928, 929, 931, 933, 872, 873, 875, 877, 879, 885, 888, 890, 891, 892, 893, 894, 895}
max: 933
min: 872


## Stacked model - Hyperparameter tuned Random Forest

In [None]:
param_dist = {
    'n_estimators': range(10,100,10),  # Number of trees in the forest
    'max_depth': range(10, 30,5),  # Maximum depth of the tree
    'max_leaf_nodes': range(500,800,50)
}



# Set up the random search
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(S_Train, y)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Use the best hyperparameters to build the final Random Forest model
best_rf = RandomForestRegressor(**random_search.best_params_, random_state=42)

# Fit the model to the training data
best_rf.fit(S_Train, y)

rfc_predict_Test_1=best_rf.predict(S_Test)

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':rfc_predict_Test_1})

result.to_csv('rf_ht_stacked_kaggle_result.csv', index=False)

Best Hyperparameters: {'n_estimators': 10, 'max_leaf_nodes': 550, 'max_depth': 10}


## Feature Selection

In [None]:
print(model.feature_importances_) # model here is hyperparameter tuned random forest model

#print sorted list of important features
importances = model.feature_importances_
indices = np.argsort(importances)[-15:] #the list is in ascending order
print("\n top 15 features = ", indices, "\n")

[1.36340344e-03 6.95371318e-03 1.41576884e-02 5.84824857e-01
 4.58326900e-03 8.55590015e-03 6.83991196e-03 5.71236322e-03
 2.90402765e-02 5.60592938e-04 4.61094027e-03 3.63337521e-02
 2.23478378e-02 3.28302768e-02 1.67940767e-04 1.10329338e-01
 9.52057027e-04 9.00906435e-04 9.32947340e-03 1.02357938e-03
 1.89756826e-03 7.71695070e-04 8.26695187e-03 2.59394668e-03
 3.95982174e-03 1.44707143e-02 1.47098081e-02 4.44720632e-03
 4.07985153e-03 6.52005617e-04 2.15252738e-04 1.32526905e-03
 7.52797251e-05 1.36639475e-04 3.09281600e-03 1.35864353e-03
 2.68850518e-04 4.25467854e-05 3.27979074e-05 5.48355598e-04
 1.10081139e-03 2.89553334e-06 1.47839506e-06 2.67288759e-04
 3.45991570e-04 3.79970246e-06 1.53109579e-03 5.09319023e-04
 2.38542250e-04 9.66454073e-05 3.31371111e-04 5.21353601e-09
 4.37778374e-09 5.91301662e-04 2.84555921e-04 7.89953349e-05
 3.92283385e-05 2.68368586e-04 3.55961341e-04 1.37438707e-04
 7.32163961e-05 1.15802699e-05 9.63827966e-07 4.35233011e-06
 1.18251370e-04 2.328808

In [None]:
model = SelectFromModel(model, prefit=True, max_features=15, threshold=-np.inf)
X_new = model.transform(X.iloc[:,:-1])
X_new_SelectedFeatures= pd.DataFrame(X_new)
print(X_new_SelectedFeatures)

            0         1         2         3         4         5         6   \
0     0.150685  0.033420  0.666667  0.949275  0.883333  0.122500  0.125089   
1     0.202055  0.038795  0.555556  0.753623  0.433333  0.000000  0.173281   
2     0.160959  0.046507  0.666667  0.934783  0.866667  0.101250  0.086109   
3     0.133562  0.038561  0.666667  0.311594  0.333333  0.000000  0.038271   
4     0.215753  0.060576  0.777778  0.927536  0.833333  0.218750  0.116052   
...        ...       ...       ...       ...       ...       ...       ...   
1455  0.140411  0.030929  0.555556  0.920290  0.833333  0.000000  0.000000   
1456  0.219178  0.055505  0.555556  0.768116  0.633333  0.074375  0.139972   
1457  0.154110  0.036187  0.666667  0.500000  0.933333  0.000000  0.048724   
1458  0.160959  0.039342  0.444444  0.565217  0.766667  0.000000  0.008682   
1459  0.184932  0.040370  0.444444  0.673913  0.250000  0.000000  0.147059   

            7         8         9         10    11        12   



In [None]:
#Get column names
print(model.get_support())

#Get column names
cols = model.get_support(indices=True) #get column indices
print("\n cols = ", cols, "\n")

features_df_new = X.iloc[:,cols] #Get cols of X-Train with col names
print(features_df_new.shape)
print ("\n features_df_new= ", features_df_new)

[False  True  True  True False  True  True  True  True False False  True
  True  True False  True False False  True False False False  True False
 False  True  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

In [None]:
rf = RandomForestRegressor()
rf.fit(features_df_new, y)

#Get corresponding test set columns
features_df_new_test = testData.iloc[:,cols] #Get cols of X-Train with col names

rf_predict=rf.predict(features_df_new_test)

result = pd.DataFrame({'Id':testData['Id'],'SalePrice':rf_predict})

result.to_csv('rf_fs_kaggle_result.csv', index=False)

In [None]:
# feature selection using forward search
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

rf = RandomForestRegressor()

sfs1 = SFS(rf,
           k_features=15,
           forward=True,
           floating=False,
           verbose=2,
           scoring='neg_mean_squared_error',
           cv=3)

sfs1 = sfs1.fit(X.iloc[:,:-1],y)
sfs1.subsets_

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   13.3s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:   44.3s

[2023-11-24 00:59:07] Features: 1/15 -- score: -2015970318.0975056[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   13.7s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:   47.6s

[2023-11-24 01:00:25] Features: 2/15 -- score: -1703111955.0799692[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   14.5s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:   51.4s

[2023-11-24 01:01:49] Features: 3/15 -- score: -1528900491.6564867[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   22.7s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:  1.4min

[2023-11-24 01:04:12] Features: 4/15 -- score: -1108689566.0792315[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   27.3s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:  1.8min

[2023-11-24 01:07:05] Features: 5/15 -- score: -962121236.5927027[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   2

{1: {'feature_idx': (3,),
  'cv_scores': array([-1.74851890e+09, -2.28737230e+09, -2.01201975e+09]),
  'avg_score': -2015970318.0975056,
  'feature_names': ('OverallQual',)},
 2: {'feature_idx': (3, 25),
  'cv_scores': array([-1.41087620e+09, -2.02979688e+09, -1.66866279e+09]),
  'avg_score': -1703111955.0799692,
  'feature_names': ('OverallQual', 'GarageCars')},
 3: {'feature_idx': (3, 15, 25),
  'cv_scores': array([-1.43306155e+09, -1.81708042e+09, -1.33655950e+09]),
  'avg_score': -1528900491.6564867,
  'feature_names': ('OverallQual', 'GrLivArea', 'GarageCars')},
 4: {'feature_idx': (3, 8, 15, 25),
  'cv_scores': array([-9.42825644e+08, -1.43356435e+09, -9.49678701e+08]),
  'avg_score': -1108689566.0792315,
  'feature_names': ('OverallQual', 'BsmtFinSF1', 'GrLivArea', 'GarageCars')},
 5: {'feature_idx': (3, 8, 13, 15, 25),
  'cv_scores': array([-8.00552441e+08, -1.16616746e+09, -9.19643804e+08]),
  'avg_score': -962121236.5927027,
  'feature_names': ('OverallQual',
   'BsmtFinSF1',