In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.feature_selection import RFECV
import time
from sklearn.decomposition import PCA


### train set

In [58]:
import os

In [102]:
path=os.getcwd().replace('\\','/')+'/housing_competition/train.csv'

In [105]:
df = pd.read_csv(path)
df = df.set_index('Id')
#df.isna().sum()
df.sample(3)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1215,85,RL,69.0,10205,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,5,2006,WD,Normal,134500
967,50,RL,130.0,9600,Pave,,IR1,HLS,AllPub,Inside,...,0,,,,0,6,2009,WD,Normal,160000
826,20,RL,114.0,14803,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2008,New,Partial,385000


In [3]:
df.shape

(1460, 80)

In [4]:
df[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].max()

YearBuilt       2010.0
YearRemodAdd    2010.0
GarageYrBlt     2010.0
dtype: float64

In [5]:
df['Age'] = 2010 - df.YearBuilt 
df['AgeRemod'] = 2010 - df.YearRemodAdd   
df['AgeGarage'] = 2010 - df.GarageYrBlt   
df.drop(labels=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], axis=1, inplace=True)

In [6]:
y = df['SalePrice'].copy()
X = df.drop(columns=['SalePrice']).copy()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### final test set for prediction

In [8]:
path=os.getcwd().replace('\\','/')+'/housing_competition/test.csv'
df_test = pd.read_csv(path)
df_test = df_test.set_index('Id')
#df.isna().sum()
df_test.sample(3)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2385,20,RL,88.0,10738,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,MnPrv,Shed,400,11,2007,WD,Normal
1877,20,RL,65.0,10739,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,4,2009,WD,Normal
2811,20,RL,,9535,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,7,2006,WD,Normal


In [9]:
df_test.shape

(1459, 79)

In [10]:
df_test['Age'] = 2010 - df_test.YearBuilt   
df_test['AgeRemod'] = 2010 - df_test.YearRemodAdd   
df_test['AgeGarage'] = 2010 - df_test.GarageYrBlt   
df_test.drop(labels=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], axis=1, inplace=True)

### preprocessing pipeline

In [11]:
X_num_col = X.select_dtypes(include="number").copy().columns
X_cat_ord_col = ['LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtFinType1','BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu','GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC']
X_cat_col = [col for col in X.select_dtypes(exclude="number").copy().columns if col not in X_cat_ord_col]

In [12]:
cat_list1 = ['Reg', 'IR1', 'IR2', 'IR3']
cat_list2 = ['AllPub', 'NoSewr', 'NoSeWa', 'ELO']
cat_list3 = ['Gtl', 'Mod', 'Sev']
cat_list42 = ['Ex', 'Gd', 'TA', 'Fa']
cat_list43 = ['Gd', 'TA', 'Fa', 'Po']
cat_list4 = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
cat_list5 = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']
cat_list6 = ['Gd', 'Av', 'Mn', 'No']
cat_list7 = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf']
cat_list8 = ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']
cat_list9 = ['Fin', 'RFn', 'Unf']
cat_list10 = ['Ex', 'Gd', 'Fa']

In [13]:
num_pipe = make_pipeline(
    SimpleImputer(),
    StandardScaler())

ord_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=[cat_list1, cat_list2, cat_list3, cat_list4, cat_list4, cat_list4, cat_list7, cat_list7,cat_list4, cat_list4, cat_list8, cat_list4, cat_list9, cat_list4, cat_list4, cat_list10])
    )


cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop="first", handle_unknown="ignore", sparse=False))
    
cat_pipe_pca = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop="first", handle_unknown="ignore", sparse=False),
    PCA(n_components=0.95))

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", num_pipe, X_num_col),
        ("ord_pipe", ord_pipe, X_cat_ord_col),
        ("cat_pipe", cat_pipe_pca, X_cat_col)])

## models

In [15]:
performances = {}

### baseline decision tree

In [16]:
# Decision tree
from sklearn.tree import DecisionTreeRegressor
full_pipe_dt = make_pipeline(
    preprocessor,
    DecisionTreeRegressor())

param_grid_dt = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "decisiontreeregressor__max_depth": range(2, 8),
    "decisiontreeregressor__min_samples_leaf": range(3, 12)
}

dt_search = RandomizedSearchCV(full_pipe_dt,
                                   param_grid_dt,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

dt_search.fit(X_train, y_train)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                                                                                'MiscFeature',
          

In [21]:
dt_search.best_params_

{'decisiontreeregressor__min_samples_leaf': 6,
 'decisiontreeregressor__max_depth': 5,
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [26]:
y_pred = dt_search.predict(X_test)



In [30]:
performances["baseline_tree"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455}

### baseline Knearest

In [32]:
# K Nearest neighbors
from sklearn.neighbors import KNeighborsRegressor
full_pipe_kn = make_pipeline(
    preprocessor,
    KNeighborsRegressor())

param_grid_kn = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "kneighborsregressor__n_neighbors": range(3, 8, 2) ,
    "kneighborsregressor__weights": ['uniform', 'distance']
}

kn_search = RandomizedSearchCV(full_pipe_kn,
                                   param_grid_kn,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

In [33]:
kn_search.fit(X_train, y_train)

kn_search.best_params_

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

{'kneighborsregressor__weights': 'distance',
 'kneighborsregressor__n_neighbors': 7,
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [35]:
y_pred = kn_search.predict(X_test)



In [37]:
performances["baseline_KNear"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455, 'baseline_KNear': 30674.17660600554}

### baseline OLS

In [38]:
# OLS
from sklearn.linear_model import LinearRegression
full_pipe_ols = make_pipeline(
    preprocessor,
    LinearRegression())

param_grid_ols = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median']
}

ols_search = RandomizedSearchCV(full_pipe_ols,
                                   param_grid_ols,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

In [39]:
ols_search.fit(X_train, y_train)

2 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                                                                                'Foundation',
           

In [40]:
ols_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [308]:
#tree_pred = ols_search.predict(df_test)

In [41]:
y_pred = ols_search.predict(X_test)



In [42]:
performances["baseline_OLS"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73}

### feature selection from model

In [106]:
from sklearn.feature_selection import SelectFromModel

### Decision tree

In [108]:
# Decision tree
from sklearn.tree import DecisionTreeRegressor
full_pipe_dt = make_pipeline(
    preprocessor,
    SelectFromModel(DecisionTreeRegressor()),
    DecisionTreeRegressor())

param_grid_dt = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "selectfrommodel__estimator__max_depth": range(2,8),
    "selectfrommodel__estimator__min_samples_leaf": range(3,12),
    "decisiontreeregressor__max_depth": range(2, 8),
    "decisiontreeregressor__min_samples_leaf": range(3, 12)
}

dt_search = RandomizedSearchCV(full_pipe_dt,
                                   param_grid_dt,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

dt_search.fit(X_train, y_train)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                   param_distributions={'columntransformer__num_pipe__simpleimputer__strategy': ['mean',


In [109]:
dt_search.best_params_

{'selectfrommodel__estimator__min_samples_leaf': 3,
 'selectfrommodel__estimator__max_depth': 4,
 'decisiontreeregressor__min_samples_leaf': 6,
 'decisiontreeregressor__max_depth': 3,
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [111]:
y_pred = dt_search.predict(X_test)



In [112]:
performances["modelselect_tree"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767}

### KNearest

In [113]:
# K Nearest neighbors
full_pipe_kn = make_pipeline(
    preprocessor,
    SelectFromModel(DecisionTreeRegressor()),
    KNeighborsRegressor())

param_grid_kn = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "selectfrommodel__estimator__max_depth": range(2,8),
    "selectfrommodel__estimator__min_samples_leaf": range(3,12),
    "kneighborsregressor__n_neighbors": range(3, 8, 2) ,
    "kneighborsregressor__weights": ['uniform', 'distance']
}

kn_search = RandomizedSearchCV(full_pipe_kn,
                                   param_grid_kn,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

kn_search.fit(X_train, y_train)

kn_search.best_params_

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

{'selectfrommodel__estimator__min_samples_leaf': 7,
 'selectfrommodel__estimator__max_depth': 5,
 'kneighborsregressor__weights': 'distance',
 'kneighborsregressor__n_neighbors': 3,
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [115]:
y_pred = kn_search.predict(X_test)



In [116]:
performances["modelselect_KNear"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028}

### OLS

In [121]:
# OLS
full_pipe_ols = make_pipeline(
    preprocessor,
    SelectFromModel(DecisionTreeRegressor()),
    LinearRegression())

param_grid_ols = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'],
    "selectfrommodel__estimator__max_depth": range(2,8),
    "selectfrommodel__estimator__min_samples_leaf": range(3,12),
    "selectfrommodel__threshold": ['mean', 'median']
}

ols_search = RandomizedSearchCV(full_pipe_ols,
                                   param_grid_ols,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

In [122]:
ols_search.fit(X_train, y_train)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                                             ('linearregression',
                                       

In [123]:
ols_search.best_params_

{'selectfrommodel__threshold': 'median',
 'selectfrommodel__estimator__min_samples_leaf': 7,
 'selectfrommodel__estimator__max_depth': 5,
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [124]:
y_pred = ols_search.predict(X_test)



In [125]:
performances["modelselect_OLS"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan}

## RFE

In [126]:
from sklearn.feature_selection import RFECV

### Decision Tree

In [127]:
# Decision tree
from sklearn.tree import DecisionTreeRegressor
full_pipe_dt = make_pipeline(
    preprocessor,
    RFECV(DecisionTreeRegressor(), scoring="neg_root_mean_squared_error"))

param_grid_dt = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "rfecv__estimator__max_depth": range(2, 8),
    "rfecv__estimator__min_samples_leaf": range(3, 12)
}

dt_search = RandomizedSearchCV(full_pipe_dt,
                                   param_grid_dt,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

dt_search.fit(X_train, y_train)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                                                                                'SaleCondition'])])),
   

In [128]:
dt_search.best_params_

{'rfecv__estimator__min_samples_leaf': 6,
 'rfecv__estimator__max_depth': 5,
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [129]:
y_pred = dt_search.predict(X_test)



In [130]:
performances["RFE_tree"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan,
 'RFE_tree': 36041.3362870785}

### OLS

In [131]:
# OLS
from sklearn.linear_model import LinearRegression
full_pipe_ols = make_pipeline(
    preprocessor,
    RFECV(LinearRegression(), scoring="neg_root_mean_squared_error"))

param_grid_ols = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median']
}

ols_search = RandomizedSearchCV(full_pipe_ols,
                                   param_grid_ols,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

In [132]:
ols_search.fit(X_train, y_train)

2 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                                                                                'CentralAir',
           

In [133]:
ols_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [134]:
y_pred = ols_search.predict(X_test)



In [135]:
performances["RFE_OLS"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan,
 'RFE_tree': 36041.3362870785,
 'RFE_OLS': 4145836444633610.0}

## PCA for whole df

### Decision Tree

In [136]:
# Decision tree
full_pipe_dt = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    DecisionTreeRegressor())

param_grid_dt = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "decisiontreeregressor__max_depth": range(2, 8),
    "decisiontreeregressor__min_samples_leaf": range(3, 12)
}

dt_search = RandomizedSearchCV(full_pipe_dt,
                                   param_grid_dt,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

dt_search.fit(X_train, y_train)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                                                                                'SaleCondition'])])),
   

In [137]:
dt_search.best_params_

{'decisiontreeregressor__min_samples_leaf': 6,
 'decisiontreeregressor__max_depth': 5,
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [138]:
y_pred = dt_search.predict(X_test)



In [139]:
performances["PCA_tree"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan,
 'RFE_tree': 36041.3362870785,
 'RFE_OLS': 4145836444633610.0,
 'PCA_tree': 34196.93924897161}

### KNearest

In [140]:
# K Nearest neighbors
full_pipe_kn = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    KNeighborsRegressor())

param_grid_kn = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "kneighborsregressor__n_neighbors": range(3, 8, 2) ,
    "kneighborsregressor__weights": ['uniform', 'distance']
}

kn_search = RandomizedSearchCV(full_pipe_kn,
                                   param_grid_kn,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

kn_search.fit(X_train, y_train)

kn_search.best_params_

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

{'kneighborsregressor__weights': 'distance',
 'kneighborsregressor__n_neighbors': 7,
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [141]:
y_pred = kn_search.predict(X_test)



In [143]:
performances["PCA_KNear"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan,
 'RFE_tree': 36041.3362870785,
 'RFE_OLS': 4145836444633610.0,
 'PCA_tree': 34196.93924897161,
 'PCA_KNear': 30569.46190602987}

### OLS

In [144]:
# OLS
full_pipe_ols = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    LinearRegression())

param_grid_ols = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median']
}

ols_search = RandomizedSearchCV(full_pipe_ols,
                                   param_grid_ols,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

In [145]:
ols_search.fit(X_train, y_train)

2 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer()),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF'...
                                                                                'BsmtExposure',
         

In [146]:
ols_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [None]:
y_pred = ols_search.predict(X_test)

In [148]:
performances["PCA_OLS"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan,
 'RFE_tree': 36041.3362870785,
 'RFE_OLS': 4145836444633610.0,
 'PCA_tree': 34196.93924897161,
 'PCA_KNear': 30569.46190602987,
 'PCA_OLS': 26674.981051124178}

### PCA for all, then model select

In [151]:
# K Nearest neighbors
full_pipe_kn = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    SelectFromModel(DecisionTreeRegressor()),
    KNeighborsRegressor())

param_grid_kn = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "selectfrommodel__estimator__max_depth": range(2, 8),
    "selectfrommodel__estimator__min_samples_leaf": range(3, 12),
    "kneighborsregressor__n_neighbors": range(3, 8, 2) ,
    "kneighborsregressor__weights": ['uniform', 'distance']
}

kn_search = RandomizedSearchCV(full_pipe_kn,
                                   param_grid_kn,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

kn_search.fit(X_train, y_train)

kn_search.best_params_

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

{'selectfrommodel__estimator__min_samples_leaf': 7,
 'selectfrommodel__estimator__max_depth': 5,
 'kneighborsregressor__weights': 'distance',
 'kneighborsregressor__n_neighbors': 3,
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [152]:
y_pred = kn_search.predict(X_test)



In [153]:
performances["PCA_modelselect_KNear"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan,
 'RFE_tree': 36041.3362870785,
 'RFE_OLS': 4145836444633610.0,
 'PCA_tree': 34196.93924897161,
 'PCA_KNear': 30569.46190602987,
 'PCA_OLS': 26674.981051124178,
 'PCA_modelselect_KNear': 30887.26034321068}

In [154]:
# OLS
full_pipe_ols = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    SelectFromModel(DecisionTreeRegressor()),
    LinearRegression())

param_grid_ols = {
    "columntransformer__num_pipe__simpleimputer__strategy":['mean','median'], 
    "selectfrommodel__threshold": ['mean', 'median'],
    "selectfrommodel__estimator__max_depth": range(2, 8),
    "selectfrommodel__estimator__min_samples_leaf": range(3, 12)
}

ols_search = RandomizedSearchCV(full_pipe_ols,
                                   param_grid_ols,
                                   scoring="neg_root_mean_squared_error",
                                   n_iter=10,
                                   cv=5,
                                   n_jobs=4,
                                   random_state=123)

ols_search.fit(X_train, y_train)

ols_search.best_params_

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\chicmachina\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func

{'selectfrommodel__threshold': 'median',
 'selectfrommodel__estimator__min_samples_leaf': 7,
 'selectfrommodel__estimator__max_depth': 5,
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [155]:
y_pred = ols_search.predict(X_test)



In [156]:
performances["PCA_modelselect_OLS"]= mean_squared_error(y_test, y_pred, squared=False)
performances

{'baseline_tree': 43879.873518573455,
 'baseline_KNear': 30674.17660600554,
 'baseline_OLS': 6337158731047.73,
 'modelselect_tree': 42606.33604125767,
 'modelselect_KNear': 30262.37680691028,
 'modelselect_OLS': nan,
 'RFE_tree': 36041.3362870785,
 'RFE_OLS': 4145836444633610.0,
 'PCA_tree': 34196.93924897161,
 'PCA_KNear': 30569.46190602987,
 'PCA_OLS': 26674.981051124178,
 'PCA_modelselect_KNear': 30887.26034321068,
 'PCA_modelselect_OLS': 26665.329808676965}

In [161]:
y_pred_submit = ols_search.predict(df_test)



In [401]:
df_test.head(1)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Age,AgeRemod,AgeGarage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,MnPrv,,0,6,2010,WD,Normal,49,49,49.0


## submission

In [159]:
df_test = pd.read_csv(path)
df_test = df_test.set_index('Id')
#df.isna().sum()
df_test.sample(3)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
712,50,C (all),66.0,8712,Pave,Pave,Reg,HLS,AllPub,Inside,...,0,,,,0,1,2010,WD,Abnorml,102776
1267,190,RM,60.0,10120,Pave,,IR1,Bnk,AllPub,Inside,...,0,,MnPrv,,0,1,2007,WD,Normal,122000
834,20,RL,100.0,10004,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2009,WD,Normal,167000


In [None]:
df_test.shape

In [160]:
df_test['Age'] = 2010 - df_test.YearBuilt   
df_test['AgeRemod'] = 2010 - df_test.YearRemodAdd   
df_test['AgeGarage'] = 2010 - df_test.GarageYrBlt   
df_test.drop(labels=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], axis=1, inplace=True)

In [162]:
df_test['SalePrice']=y_pred_submit
df_test.sample(3)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Age,AgeRemod,AgeGarage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
158,60,RL,92.0,12003,Pave,,Reg,Lvl,AllPub,Corner,...,,0,5,2010,New,Partial,255975.48849,1,0,1.0
556,45,RM,58.0,6380,Pave,,Reg,Lvl,AllPub,Inside,...,,0,8,2006,WD,Normal,107160.22129,88,60,88.0
1142,60,RL,,10304,Pave,,IR1,Lvl,AllPub,CulDSac,...,,0,10,2009,WD,Normal,183827.433773,34,34,34.0


In [163]:
#submit= df_test.rename_axis('').rename_axis("Id", axis="columns").loc[:,['Id','SalePrice']]
submit= df_test.reset_index(level=0).loc[:,['Id','SalePrice']]
submit.head()

Unnamed: 0,Id,SalePrice
0,1,221343.322214
1,2,184840.186988
2,3,225543.263733
3,4,204586.369706
4,5,285541.415985


In [404]:
submit.to_csv('Prediction_SalePrice.csv',index=False)