In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegressionCV,LogisticRegression
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.metrics import r2_score,f1_score,accuracy_score,recall_score,precision_score,mean_absolute_error,mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from statsmodels.api import OLS
from xgboost import XGBClassifier,XGBRegressor

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [55]:
Most_null_features = ['Alley','PoolQC','Fence','MiscFeature','FireplaceQu']

In [56]:
train.drop(Most_null_features,axis = 1 , inplace=True)

In [57]:
numerical_features = train.select_dtypes(['int64','float64'])
numerical_features

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,349,0,0,0,0,0,0,2,2010,210000
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,60,0,0,0,0,2500,5,2010,266500
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,366,0,112,0,0,0,0,4,2010,142125


In [58]:
num_col=numerical_features.columns.tolist()
num_col

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [59]:
categorical_features = train.select_dtypes('object')
categorical_features

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1456,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,...,SBrkr,TA,Min1,Attchd,Unf,TA,TA,Y,WD,Normal
1457,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1458,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,FuseA,Gd,Typ,Attchd,Unf,TA,TA,Y,WD,Normal


In [60]:
cat_col = categorical_features.columns.tolist()
cat_col

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [61]:
correlation = train.corr()["SalePrice"]
correlation

Id              -0.021917
MSSubClass      -0.084284
LotFrontage      0.351799
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.477493
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageYrBlt      0.486362
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePr

In [62]:
high_corr_features = []
for i in correlation.iteritems():
    if (i[1] > 0.25) | (i[1] < -0.25) : 
        high_corr_features.append(i[0])
high_corr_features

['LotFrontage',
 'LotArea',
 'OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'SalePrice']

In [63]:
features_to_drop = [x for x in num_col if x not in high_corr_features]
features_to_drop

['Id',
 'MSSubClass',
 'OverallCond',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [64]:
new_train = train.drop(features_to_drop,axis=1)
new_train

Unnamed: 0,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition,SalePrice
0,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,2,548,TA,TA,Y,0,61,WD,Normal,208500
1,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,2,460,TA,TA,Y,298,0,WD,Normal,181500
2,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,2,608,TA,TA,Y,0,42,WD,Normal,223500
3,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,3,642,TA,TA,Y,0,35,WD,Abnorml,140000
4,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,3,836,TA,TA,Y,192,84,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,...,2,460,TA,TA,Y,0,40,WD,Normal,175000
1456,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,...,2,500,TA,TA,Y,349,0,WD,Normal,210000
1457,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,...,1,252,TA,TA,Y,0,60,WD,Normal,266500
1458,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,1,240,TA,TA,Y,366,0,WD,Normal,142125


In [17]:
new_train['PavedDrive'].value_counts()

Y    1340
N      90
P      30
Name: PavedDrive, dtype: int64

In [18]:
test['SaleCondition'].value_counts()

Normal     1204
Partial     120
Abnorml      89
Family       26
Alloca       12
AdjLand       8
Name: SaleCondition, dtype: int64

In [19]:
new_train.select_dtypes('object').columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [65]:
cat_features_drop = ['Street','Utilities','Neighborhood','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl'
                     ,'Exterior1st','MasVnrType','BsmtCond','BsmtFinType2','Heating','Electrical','Functional','GarageQual'
                    ,'GarageCond','PavedDrive','Exterior2nd']

In [66]:
final_train_data = new_train.drop(cat_features_drop,axis = 1 )
final_train_data

Unnamed: 0,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Condition1,OverallQual,YearBuilt,...,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition,SalePrice
0,RL,65.0,8450,Reg,Lvl,Inside,Gtl,Norm,7,2003,...,Attchd,2003.0,RFn,2,548,0,61,WD,Normal,208500
1,RL,80.0,9600,Reg,Lvl,FR2,Gtl,Feedr,6,1976,...,Attchd,1976.0,RFn,2,460,298,0,WD,Normal,181500
2,RL,68.0,11250,IR1,Lvl,Inside,Gtl,Norm,7,2001,...,Attchd,2001.0,RFn,2,608,0,42,WD,Normal,223500
3,RL,60.0,9550,IR1,Lvl,Corner,Gtl,Norm,7,1915,...,Detchd,1998.0,Unf,3,642,0,35,WD,Abnorml,140000
4,RL,84.0,14260,IR1,Lvl,FR2,Gtl,Norm,8,2000,...,Attchd,2000.0,RFn,3,836,192,84,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,62.0,7917,Reg,Lvl,Inside,Gtl,Norm,6,1999,...,Attchd,1999.0,RFn,2,460,0,40,WD,Normal,175000
1456,RL,85.0,13175,Reg,Lvl,Inside,Gtl,Norm,6,1978,...,Attchd,1978.0,Unf,2,500,349,0,WD,Normal,210000
1457,RL,66.0,9042,Reg,Lvl,Inside,Gtl,Norm,7,1941,...,Attchd,1941.0,RFn,1,252,0,60,WD,Normal,266500
1458,RL,68.0,9717,Reg,Lvl,Inside,Gtl,Norm,5,1950,...,Attchd,1950.0,Unf,1,240,366,0,WD,Normal,142125


In [67]:
final_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 40 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSZoning       1460 non-null   object 
 1   LotFrontage    1201 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   LotShape       1460 non-null   object 
 4   LandContour    1460 non-null   object 
 5   LotConfig      1460 non-null   object 
 6   LandSlope      1460 non-null   object 
 7   Condition1     1460 non-null   object 
 8   OverallQual    1460 non-null   int64  
 9   YearBuilt      1460 non-null   int64  
 10  YearRemodAdd   1460 non-null   int64  
 11  MasVnrArea     1452 non-null   float64
 12  ExterQual      1460 non-null   object 
 13  ExterCond      1460 non-null   object 
 14  Foundation     1460 non-null   object 
 15  BsmtQual       1423 non-null   object 
 16  BsmtExposure   1422 non-null   object 
 17  BsmtFinType1   1423 non-null   object 
 18  BsmtFinS

In [68]:
final_train_data.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,FullBath,HalfBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,SalePrice
count,1201.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,70.049958,10516.828082,6.099315,1971.267808,1984.865753,103.685262,443.639726,1057.429452,1162.626712,346.992466,...,1.565068,0.382877,6.517808,0.613014,1978.506164,1.767123,472.980137,94.244521,46.660274,180921.19589
std,24.284752,9981.264932,1.382997,30.202904,20.645407,181.066207,456.098091,438.705324,386.587738,436.528436,...,0.550916,0.502885,1.625393,0.644666,24.689725,0.747315,213.804841,125.338794,66.256028,79442.502883
min,21.0,1300.0,1.0,1872.0,1950.0,0.0,0.0,0.0,334.0,0.0,...,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,34900.0
25%,59.0,7553.5,5.0,1954.0,1967.0,0.0,0.0,795.75,882.0,0.0,...,1.0,0.0,5.0,0.0,1961.0,1.0,334.5,0.0,0.0,129975.0
50%,69.0,9478.5,6.0,1973.0,1994.0,0.0,383.5,991.5,1087.0,0.0,...,2.0,0.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,163000.0
75%,80.0,11601.5,7.0,2000.0,2004.0,166.0,712.25,1298.25,1391.25,728.0,...,2.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,68.0,214000.0
max,313.0,215245.0,10.0,2010.0,2010.0,1600.0,5644.0,6110.0,4692.0,2065.0,...,3.0,2.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,755000.0


In [69]:
filled_train = final_train_data.fillna(final_train_data.median())
filled_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 40 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSZoning       1460 non-null   object 
 1   LotFrontage    1460 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   LotShape       1460 non-null   object 
 4   LandContour    1460 non-null   object 
 5   LotConfig      1460 non-null   object 
 6   LandSlope      1460 non-null   object 
 7   Condition1     1460 non-null   object 
 8   OverallQual    1460 non-null   int64  
 9   YearBuilt      1460 non-null   int64  
 10  YearRemodAdd   1460 non-null   int64  
 11  MasVnrArea     1460 non-null   float64
 12  ExterQual      1460 non-null   object 
 13  ExterCond      1460 non-null   object 
 14  Foundation     1460 non-null   object 
 15  BsmtQual       1423 non-null   object 
 16  BsmtExposure   1422 non-null   object 
 17  BsmtFinType1   1423 non-null   object 
 18  BsmtFinS

  filled_train = final_train_data.fillna(final_train_data.median())


In [70]:
filled_train = filled_train.apply(lambda x: x.fillna(x.value_counts().index[0]))
filled_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 40 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSZoning       1460 non-null   object 
 1   LotFrontage    1460 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   LotShape       1460 non-null   object 
 4   LandContour    1460 non-null   object 
 5   LotConfig      1460 non-null   object 
 6   LandSlope      1460 non-null   object 
 7   Condition1     1460 non-null   object 
 8   OverallQual    1460 non-null   int64  
 9   YearBuilt      1460 non-null   int64  
 10  YearRemodAdd   1460 non-null   int64  
 11  MasVnrArea     1460 non-null   float64
 12  ExterQual      1460 non-null   object 
 13  ExterCond      1460 non-null   object 
 14  Foundation     1460 non-null   object 
 15  BsmtQual       1460 non-null   object 
 16  BsmtExposure   1460 non-null   object 
 17  BsmtFinType1   1460 non-null   object 
 18  BsmtFinS

In [71]:
features = filled_train.drop("SalePrice",axis = 1)
price = filled_train['SalePrice']
features

Unnamed: 0,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Condition1,OverallQual,YearBuilt,...,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition
0,RL,65.0,8450,Reg,Lvl,Inside,Gtl,Norm,7,2003,...,0,Attchd,2003.0,RFn,2,548,0,61,WD,Normal
1,RL,80.0,9600,Reg,Lvl,FR2,Gtl,Feedr,6,1976,...,1,Attchd,1976.0,RFn,2,460,298,0,WD,Normal
2,RL,68.0,11250,IR1,Lvl,Inside,Gtl,Norm,7,2001,...,1,Attchd,2001.0,RFn,2,608,0,42,WD,Normal
3,RL,60.0,9550,IR1,Lvl,Corner,Gtl,Norm,7,1915,...,1,Detchd,1998.0,Unf,3,642,0,35,WD,Abnorml
4,RL,84.0,14260,IR1,Lvl,FR2,Gtl,Norm,8,2000,...,1,Attchd,2000.0,RFn,3,836,192,84,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,62.0,7917,Reg,Lvl,Inside,Gtl,Norm,6,1999,...,1,Attchd,1999.0,RFn,2,460,0,40,WD,Normal
1456,RL,85.0,13175,Reg,Lvl,Inside,Gtl,Norm,6,1978,...,2,Attchd,1978.0,Unf,2,500,349,0,WD,Normal
1457,RL,66.0,9042,Reg,Lvl,Inside,Gtl,Norm,7,1941,...,2,Attchd,1941.0,RFn,1,252,0,60,WD,Normal
1458,RL,68.0,9717,Reg,Lvl,Inside,Gtl,Norm,5,1950,...,0,Attchd,1950.0,Unf,1,240,366,0,WD,Normal


In [72]:
features_cat_num = pd.get_dummies(features)
features_cat_num

Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,7,2003,2003,196.0,706,856,856,854,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,6,1976,1976,0.0,978,1262,1262,0,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,7,2001,2002,162.0,486,920,920,866,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,7,1915,1970,0.0,216,756,961,756,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,8,2000,2000,350.0,655,1145,1145,1053,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,6,1999,2000,0.0,0,953,953,694,...,0,0,0,1,0,0,0,0,1,0
1456,85.0,13175,6,1978,1988,119.0,790,1542,2073,0,...,0,0,0,1,0,0,0,0,1,0
1457,66.0,9042,7,1941,2006,0.0,275,1152,1188,1152,...,0,0,0,1,0,0,0,0,1,0
1458,68.0,9717,5,1950,1996,0.0,49,1078,1078,0,...,0,0,0,1,0,0,0,0,1,0


In [73]:
scaller = StandardScaler()
data_scalled = scaller.fit_transform(features_cat_num)
data_scalled_pd = pd.DataFrame(data=data_scalled, columns=features_cat_num.columns)
data_scalled_pd

Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.220875,-0.207142,0.651479,1.050994,0.878668,0.514104,0.575425,-0.459303,-0.793434,1.161852,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1,0.460320,-0.091886,-0.071836,0.156734,-0.429577,-0.570750,1.171992,0.466465,0.257140,-0.795163,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
2,-0.084636,0.073480,0.651479,0.984752,0.830215,0.325915,0.092907,-0.313369,-0.627826,1.189351,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
3,-0.447940,-0.096897,0.651479,-1.863632,-0.720298,-0.570750,-0.499274,-0.687324,-0.521734,0.937276,...,-0.058621,-0.301962,-0.045376,0.390293,3.668167,-0.052414,-0.091035,-0.117851,-2.138345,-0.305995
4,0.641972,0.375148,1.374795,0.951632,0.733308,1.366489,0.463568,0.199680,-0.045611,1.617877,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.357114,-0.260560,-0.071836,0.918511,0.733308,-0.570750,-0.973018,-0.238122,-0.542435,0.795198,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1456,0.687385,0.266407,-0.071836,0.222975,0.151865,0.087911,0.759659,1.104925,2.355701,-0.795163,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1457,-0.175462,-0.147810,0.651479,-1.002492,1.024029,-0.570750,-0.369871,0.215641,0.065656,1.844744,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1458,-0.084636,-0.080160,-0.795151,-0.704406,0.539493,-0.570750,-0.865548,0.046905,-0.218982,-0.795163,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995


In [29]:
price.skew()

1.8828757597682129

In [30]:
np.log(price).skew()

0.12133506220520406

In [80]:
price_log = np.log(price)

In [75]:
X_train, X_val, y_train, y_val = train_test_split(data_scalled_pd,price_log,test_size=0.5,random_state=32)

In [76]:
simple = LinearRegression()
simple.fit(X_train,y_train)
pcs = simple.predict(X_val)
r2_score(y_val,pcs)

-1.2778393365100613e+21

In [77]:
tree = DecisionTreeRegressor()
tree.fit(X_train,y_train)
pct = tree.predict(X_val)
r2_score(y_val,pct)

0.6933173223205737

In [78]:
forest = RandomForestRegressor()
forest.fit(X_train,y_train)
pcf = forest.predict(X_val)
r2_score(y_val,pcf)

0.8404235248800155

In [79]:
xgb = XGBRegressor()
xgb.fit(X_train,y_train)
pcx = xgb.predict(X_val)
r2_score(y_val,pcx)

0.848953836086964

In [81]:
fcoll = final_train_data.columns.tolist()
del fcoll[-1]

In [82]:
final_test = test[fcoll]
final_test

Unnamed: 0,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Condition1,OverallQual,YearBuilt,...,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition
0,RH,80.0,11622,Reg,Lvl,Inside,Gtl,Feedr,5,1961,...,0,Attchd,1961.0,Unf,1.0,730.0,140,0,WD,Normal
1,RL,81.0,14267,IR1,Lvl,Corner,Gtl,Norm,6,1958,...,0,Attchd,1958.0,Unf,1.0,312.0,393,36,WD,Normal
2,RL,74.0,13830,IR1,Lvl,Inside,Gtl,Norm,5,1997,...,1,Attchd,1997.0,Fin,2.0,482.0,212,34,WD,Normal
3,RL,78.0,9978,IR1,Lvl,Inside,Gtl,Norm,6,1998,...,1,Attchd,1998.0,Fin,2.0,470.0,360,36,WD,Normal
4,RL,43.0,5005,IR1,HLS,Inside,Gtl,Norm,8,1992,...,0,Attchd,1992.0,RFn,2.0,506.0,0,82,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,RM,21.0,1936,Reg,Lvl,Inside,Gtl,Norm,4,1970,...,0,,,,0.0,0.0,0,0,WD,Normal
1455,RM,21.0,1894,Reg,Lvl,Inside,Gtl,Norm,4,1970,...,0,CarPort,1970.0,Unf,1.0,286.0,0,24,WD,Abnorml
1456,RL,160.0,20000,Reg,Lvl,Inside,Gtl,Norm,5,1960,...,1,Detchd,1960.0,Unf,2.0,576.0,474,0,WD,Abnorml
1457,RL,62.0,10441,Reg,Lvl,Inside,Gtl,Norm,5,1992,...,0,,,,0.0,0.0,80,32,WD,Normal


In [83]:
fill_test_num = final_test.fillna(final_test.median())
fill_test_num

  fill_test_num = final_test.fillna(final_test.median())


Unnamed: 0,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Condition1,OverallQual,YearBuilt,...,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition
0,RH,80.0,11622,Reg,Lvl,Inside,Gtl,Feedr,5,1961,...,0,Attchd,1961.0,Unf,1.0,730.0,140,0,WD,Normal
1,RL,81.0,14267,IR1,Lvl,Corner,Gtl,Norm,6,1958,...,0,Attchd,1958.0,Unf,1.0,312.0,393,36,WD,Normal
2,RL,74.0,13830,IR1,Lvl,Inside,Gtl,Norm,5,1997,...,1,Attchd,1997.0,Fin,2.0,482.0,212,34,WD,Normal
3,RL,78.0,9978,IR1,Lvl,Inside,Gtl,Norm,6,1998,...,1,Attchd,1998.0,Fin,2.0,470.0,360,36,WD,Normal
4,RL,43.0,5005,IR1,HLS,Inside,Gtl,Norm,8,1992,...,0,Attchd,1992.0,RFn,2.0,506.0,0,82,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,RM,21.0,1936,Reg,Lvl,Inside,Gtl,Norm,4,1970,...,0,,1979.0,,0.0,0.0,0,0,WD,Normal
1455,RM,21.0,1894,Reg,Lvl,Inside,Gtl,Norm,4,1970,...,0,CarPort,1970.0,Unf,1.0,286.0,0,24,WD,Abnorml
1456,RL,160.0,20000,Reg,Lvl,Inside,Gtl,Norm,5,1960,...,1,Detchd,1960.0,Unf,2.0,576.0,474,0,WD,Abnorml
1457,RL,62.0,10441,Reg,Lvl,Inside,Gtl,Norm,5,1992,...,0,,1979.0,,0.0,0.0,80,32,WD,Normal


In [84]:
fill_test_full = fill_test_num.apply(lambda x: x.fillna(x.value_counts().index[0]))
fill_test_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSZoning       1459 non-null   object 
 1   LotFrontage    1459 non-null   float64
 2   LotArea        1459 non-null   int64  
 3   LotShape       1459 non-null   object 
 4   LandContour    1459 non-null   object 
 5   LotConfig      1459 non-null   object 
 6   LandSlope      1459 non-null   object 
 7   Condition1     1459 non-null   object 
 8   OverallQual    1459 non-null   int64  
 9   YearBuilt      1459 non-null   int64  
 10  YearRemodAdd   1459 non-null   int64  
 11  MasVnrArea     1459 non-null   float64
 12  ExterQual      1459 non-null   object 
 13  ExterCond      1459 non-null   object 
 14  Foundation     1459 non-null   object 
 15  BsmtQual       1459 non-null   object 
 16  BsmtExposure   1459 non-null   object 
 17  BsmtFinType1   1459 non-null   object 
 18  BsmtFinS

In [85]:
ftest = pd.get_dummies(fill_test_full)
ftest

Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,80.0,11622,5,1961,1961,0.0,468.0,882.0,896,0,...,0,0,0,1,0,0,0,0,1,0
1,81.0,14267,6,1958,1958,108.0,923.0,1329.0,1329,0,...,0,0,0,1,0,0,0,0,1,0
2,74.0,13830,5,1997,1998,0.0,791.0,928.0,928,701,...,0,0,0,1,0,0,0,0,1,0
3,78.0,9978,6,1998,1998,20.0,602.0,926.0,926,678,...,0,0,0,1,0,0,0,0,1,0
4,43.0,5005,8,1992,1992,0.0,263.0,1280.0,1280,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,21.0,1936,4,1970,1970,0.0,0.0,546.0,546,546,...,0,0,0,1,0,0,0,0,1,0
1455,21.0,1894,4,1970,1970,0.0,252.0,546.0,546,546,...,0,0,0,1,1,0,0,0,0,0
1456,160.0,20000,5,1960,1996,0.0,1224.0,1224.0,1224,0,...,0,0,0,1,1,0,0,0,0,0
1457,62.0,10441,5,1992,1992,0.0,337.0,912.0,970,0,...,0,0,0,1,0,0,0,0,1,0


In [86]:
scaller2 = StandardScaler()
new_ftest  = scaller2.fit_transform(ftest)
new_ftest_pd = pd.DataFrame(data=new_ftest,columns=ftest.columns)
new_ftest_pd

Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.567330,0.363929,-0.751101,-0.340945,-1.072885,-0.563316,0.063428,-0.370716,-0.654561,-0.775254,...,-0.045392,-0.295268,-0.052432,0.398568,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
1,0.615963,0.897861,-0.054877,-0.439695,-1.214908,0.047057,1.063511,0.639230,0.433298,-0.775254,...,-0.045392,-0.295268,-0.052432,0.398568,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
2,0.275532,0.809646,-0.751101,0.844059,0.678742,-0.563316,0.773377,-0.266784,-0.574165,0.891944,...,-0.045392,-0.295268,-0.052432,0.398568,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
3,0.470064,0.032064,-0.054877,0.876976,0.678742,-0.450284,0.357958,-0.271303,-0.579190,0.837243,...,-0.045392,-0.295268,-0.052432,0.398568,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
4,-1.232092,-0.971808,1.337571,0.679475,0.394694,-0.563316,-0.387160,0.528520,0.310192,-0.775254,...,-0.045392,-0.295268,-0.052432,0.398568,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-2.302019,-1.591330,-1.447325,-0.044694,-0.646813,-0.563316,-0.965230,-1.129871,-1.533893,0.523306,...,-0.045392,-0.295268,-0.052432,0.398568,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
1455,-2.302019,-1.599808,-1.447325,-0.044694,-0.646813,-0.563316,-0.411338,-1.129871,-1.533893,0.523306,...,-0.045392,-0.295268,-0.052432,0.398568,3.923424,-0.074253,-0.091066,-0.134699,-2.172917,-0.299365
1456,4.457971,2.055150,-0.751101,-0.373861,0.584059,-0.563316,1.725105,0.401995,0.169499,-0.775254,...,-0.045392,-0.295268,-0.052432,0.398568,3.923424,-0.074253,-0.091066,-0.134699,-2.172917,-0.299365
1457,-0.308065,0.125527,-0.751101,0.679475,0.394694,-0.563316,-0.224509,-0.302935,-0.468645,-0.775254,...,-0.045392,-0.295268,-0.052432,0.398568,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365


0       169277.052498
1       187758.393989
2       183583.683570
3       179317.477511
4       150730.079977
            ...      
1454    167081.220949
1455    164788.778231
1456    219222.423400
1457    184924.279659
1458    187741.866657
Name: SalePrice, Length: 1459, dtype: float64

In [118]:
test_price_final.mean()

179183.91824266282

0       12.039292
1       12.142911
2       12.120426
3       12.096913
4       11.923246
          ...    
1454    12.026235
1455    12.012420
1456    12.297842
1457    12.127702
1458    12.142823
Name: SalePrice, Length: 1459, dtype: float64

In [159]:
p2=linear.predict(new_ftest_pd)

Feature names unseen at fit time:
- BsmtExposure_Av
- BsmtExposure_Gd
- BsmtExposure_Mn
- BsmtExposure_No
- BsmtFinType1_ALQ
- ...
Feature names seen at fit time, yet now missing:
- 3SsnPorch
- BedroomAbvGr
- BsmtFinSF2
- BsmtFullBath
- BsmtHalfBath
- ...



ValueError: X has 114 features, but LinearRegression is expecting 37 features as input.

In [143]:
p2 = Simple.predict(new_ftest_pd)

In [151]:
p2.mean()

12.024101335762134

In [145]:
price_log.mean()

12.024050901109373

In [154]:
p2[:3]

array([ 2.05931619e+10, -6.17477143e+09, -4.76318137e+09])

In [149]:
price_log[:3]

0    12.247694
1    12.109011
2    12.317167
Name: SalePrice, dtype: float64

In [146]:
r2_score(test_log,p2)

-3.185980868303851e+22

array([ 2.05931619e+10, -6.17477143e+09, -4.76318137e+09, ...,
       -1.46037191e+10,  4.03329298e+09, -9.36994153e+09])

In [93]:
Tree = DecisionTreeRegressor()
Tree.fit(data_scalled_pd,price_log)
predictt=Tree.predict(new_ftest_pd)
r2_score(test_log,predictt)

-19.553657011909927

In [None]:
Tree = DecisionTreeRegressor()
Tree.fit(data_scalled_pd,price_log)
predictt=Tree.predict(new_ftest_pd)
r2_score(test_log,predictt)

In [305]:
X_train

Unnamed: 0,MSSubClass,LotFrontage,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,MoSold,YrSold
133,-0.872563,-0.039223,1.374795,-0.517200,0.984752,0.830215,0.182006,1.107810,0.789741,-0.761621,-1.062465,-0.318683,-0.951226,0.934226,0.311725,-0.119110,0.891994
290,0.073375,2.276839,1.374795,-0.517200,1.150356,1.024029,-0.570750,-0.819964,0.789741,1.227585,0.163779,0.912210,0.600495,1.142656,0.311725,0.250891,-1.367655
24,-0.872563,-0.039223,-0.795151,2.179628,-0.108232,0.781761,-0.570750,1.107810,-1.026041,-0.761621,0.163779,-0.318683,0.600495,-0.441416,-1.026858,-0.489110,1.645210
1093,-0.872563,0.051603,-0.795151,2.179628,-0.207594,0.636400,0.348055,1.107810,-1.026041,1.227585,-2.288708,-0.318683,-0.951226,-0.066241,0.311725,1.360892,-1.367655
1288,1.492282,-1.356200,1.374795,-0.517200,0.951632,0.733308,-0.570750,1.107810,0.789741,-0.761621,-1.062465,-0.934130,0.600495,0.892540,0.311725,1.360892,0.891994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802,0.073375,-0.311701,0.651479,-0.517200,1.117235,0.975575,-0.570750,1.107810,0.789741,1.227585,0.163779,0.296763,0.600495,1.100970,0.311725,1.360892,0.138777
53,-0.872563,-0.084636,2.098110,-0.517200,0.322337,0.103412,-0.570750,3.035585,-2.841822,1.227585,-3.514952,-0.934130,0.600495,0.100504,1.650307,1.730892,-1.367655
350,1.492282,-0.084636,2.098110,-0.517200,1.183477,1.072482,1.432909,-0.819964,0.789741,-0.761621,-1.062465,-0.318683,0.600495,1.184343,0.311725,2.100892,-0.614439
79,-0.163109,-0.447940,-0.795151,0.381743,-2.029235,-0.187309,-0.570750,-0.819964,-1.026041,1.227585,-1.062465,-0.934130,-0.951226,-0.524788,0.311725,-0.489110,0.891994


In [306]:
max_depthX = [int(x) for x in np.linspace(start=2,stop=20,num=10)]
learning_rate = [float(x) for x in np.linspace(start=0.01,stop=0.3,num=10)]
n_estimatorsX = [int(x) for x in np.linspace(start=100,stop=1000,num=20)]
params = { 'max_depth': max_depthX,
           'learning_rate': learning_rate,
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': n_estimatorsX
}



xgb_tunned = XGBRegressor()
grid_xgb = RandomizedSearchCV(estimator=xgb_tunned, param_distributions=params,verbose=2, n_iter=100,cv=3,n_jobs=-1)
grid_xgb.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, feature_types=None,
                                          gamma=None, gpu_id=None,
                                          grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=...
                                                          0.042222222222222223,
                           

In [308]:
p2x = grid_xgb.best_estimator_.predict(dtpd)

In [309]:
r2_score(test_log,p2x)

-16.049904278513576

In [51]:
n_estimators = [int(x) for x in np.linspace(start=100,stop=2000,num=20)]
max_depth = [int(x) for x in np.linspace(start=10,stop=100,num=10)]
max_depth.append(None)
max_features = ['auto','sqrt']
bootstrap = [False,True]
min_samples_leaf = [1,2,5]
min_samples_split = [2,5,10]
random_grid={
    'n_estimators' : n_estimators,
    'max_depth': max_depth,
    'max_features' : max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features' : max_features,
    'bootstrap' : bootstrap
}

Forest_model_tunned = RandomForestRegressor()
grid_forest = RandomizedSearchCV(estimator=Forest_model_tunned, param_distributions=random_grid, verbose=2 , cv=3 , n_iter= 100,n_jobs=-1)
grid_forest.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [False, True],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100, 1200,
                                                         1300, 1400, 1500, 1600,
                                                         1700, 1800, 1900,
                                                         2000]},
 

In [52]:
ee =grid_forest.best_estimator_.predict(new_ftest_pd)
r2_score(test_log,ee)

-13.708921463110034

# JUST NUMERICAL FEATURES


In [6]:
JUST_NUMERICAL = train.select_dtypes(['int64','float64'])


In [7]:
JUST_NUMERICAL.fillna(JUST_NUMERICAL.median(),inplace=True)

In [8]:
JUST_NUMERICAL.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,69.863699,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,22.027677,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [9]:
f = JUST_NUMERICAL.drop('SalePrice',axis=1)
p = JUST_NUMERICAL['SalePrice']

In [10]:
sc = StandardScaler()
fs = sc.fit_transform(f)
fpd = pd.DataFrame(fs , columns=f.columns)
fpd.drop('Id',axis=1,inplace=True)
fpd

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0.073375,-0.220875,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.514104,0.575425,-0.288653,...,0.351000,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777
1,-0.872563,0.460320,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.570750,1.171992,-0.288653,...,-0.060731,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.489110,-0.614439
2,0.073375,-0.084636,0.073480,0.651479,-0.517200,0.984752,0.830215,0.325915,0.092907,-0.288653,...,0.631726,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777
3,0.309859,-0.447940,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.570750,-0.499274,-0.288653,...,0.790804,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655
4,0.073375,0.641972,0.375148,1.374795,-0.517200,0.951632,0.733308,1.366489,0.463568,-0.288653,...,1.698485,0.780197,0.563760,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.357114,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.570750,-0.973018,-0.288653,...,-0.060731,-0.752176,-0.100558,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.620891,-0.614439
1456,-0.872563,0.687385,0.266407,-0.071836,0.381743,0.222975,0.151865,0.087911,0.759659,0.722112,...,0.126420,2.033231,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,1.645210
1457,0.309859,-0.175462,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.570750,-0.369871,-0.288653,...,-1.033914,-0.752176,0.201405,-0.359325,-0.116339,-0.270208,-0.068692,4.953112,-0.489110,1.645210
1458,-0.872563,-0.084636,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.570750,-0.865548,6.092188,...,-1.090059,2.168910,-0.704483,1.473789,-0.116339,-0.270208,-0.068692,-0.087688,-0.859110,1.645210


In [13]:
p_log = np.log(p)
p_log

0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
          ...    
1455    12.072541
1456    12.254863
1457    12.493130
1458    11.864462
1459    11.901583
Name: SalePrice, Length: 1460, dtype: float64

In [14]:
X_train , X_val , y_train, y_val = train_test_split(fpd ,p_log,test_size=0.2,random_state=100)

In [16]:
linear = LinearRegression()
linear.fit(X_train,y_train)
p1 = linear.predict(X_val)


In [17]:
r2_score(y_val,p1)

0.8719060574899569

In [18]:
dt = test.select_dtypes(['int64','float64'])
dt

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,506.0,0,82,0,0,144,0,0,1,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,21.0,1936,4,7,1970,1970,0.0,0.0,...,0.0,0,0,0,0,0,0,0,6,2006
1455,2916,160,21.0,1894,4,5,1970,1970,0.0,252.0,...,286.0,0,24,0,0,0,0,0,4,2006
1456,2917,20,160.0,20000,5,7,1960,1996,0.0,1224.0,...,576.0,474,0,0,0,0,0,0,9,2006
1457,2918,85,62.0,10441,5,5,1992,1992,0.0,337.0,...,0.0,80,32,0,0,0,0,700,7,2006


In [19]:
dt.fillna(dt.median(),inplace=True)

In [20]:
scc = StandardScaler()
dtt = scc.fit_transform(dt)
dtpd = pd.DataFrame(dtt,columns=dt.columns)
dtpd.drop('Id',axis=1, inplace=True)
dtpd

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,-0.874711,0.567330,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.563316,0.063428,0.517537,...,1.185921,0.366678,-0.701628,-0.360738,-0.088827,1.818960,-0.057227,-0.092244,-0.038281,1.713905
1,-0.874711,0.615963,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.047057,1.063511,-0.297689,...,-0.741235,2.347867,-0.178826,-0.360738,-0.088827,-0.301543,-0.057227,19.730438,-0.038281,1.713905
2,0.061351,0.275532,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.563316,0.773377,-0.297689,...,0.042537,0.930495,-0.207871,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-1.140614,1.713905
3,0.061351,0.470064,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.450284,0.357958,-0.297689,...,-0.012788,2.089451,-0.178826,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-0.038281,1.713905
4,1.465443,-1.232092,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.563316,-0.387160,-0.297689,...,0.153187,-0.729632,0.489198,-0.360738,-0.088827,2.243060,-0.057227,-0.092244,-1.875504,1.713905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.401505,-2.302019,-1.591330,-1.447325,1.298950,-0.044694,-0.646813,-0.563316,-0.965230,-0.297689,...,-2.179687,-0.729632,-0.701628,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-0.038281,-1.359958
1455,2.401505,-2.302019,-1.599808,-1.447325,-0.497418,-0.044694,-0.646813,-0.563316,-0.411338,-0.297689,...,-0.861106,-0.729632,-0.353093,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-0.773170,-1.359958
1456,-0.874711,4.457971,2.055150,-0.751101,1.298950,-0.373861,0.584059,-0.563316,1.725105,-0.297689,...,0.475916,2.982161,-0.701628,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,1.064053,-1.359958
1457,0.646389,-0.308065,0.125527,-0.751101,-0.497418,0.679475,0.394694,-0.563316,-0.224509,-0.297689,...,-2.179687,-0.103169,-0.236915,-0.360738,-0.088827,-0.301543,-0.057227,1.017827,0.329164,-1.359958


In [21]:
#dtpd.drop(fckers,axis=1,inplace=True)
#dtpd.drop(fckers2,axis=1,inplace=True)

In [22]:
#fckers = dtpd.columns[8:16].tolist()
#fckers2=['BsmtHalfBath','LotArea', 'KitchenAbvGr', 'GarageArea', 'WoodDeckSF' ,'OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']

In [23]:
test_price = pd.read_csv("sample_submission.csv")
test_price_final = test_price['SalePrice']
test_log = np.log(test_price_final)
test_log

0       12.039292
1       12.142911
2       12.120426
3       12.096913
4       11.923246
          ...    
1454    12.026235
1455    12.012420
1456    12.297842
1457    12.127702
1458    12.142823
Name: SalePrice, Length: 1459, dtype: float64

In [24]:
p2=linear.predict(dtpd)
r2_score(test_log,p2)

-6.6715695454902385e+22