# My entry for the [House Prices - Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques) competition! #


In [95]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import FunctionTransformer

np.random.seed(0)

from category_encoders import CatBoostEncoder
from category_encoders.wrapper import NestedCVWrapper

from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

# Set Matplotlib defaults.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 dropped the old spellings, so use whichever name this install provides.
_whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",  # neither spelling is installed: keep Matplotlib's stock style
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
#warnings.filterwarnings('ignore')

def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw housing frame.

    Two repairs are applied:

    * inconsistent spellings in ``Exterior1st`` / ``Exterior2nd`` are
      normalised to a single spelling each;
    * ``GarageYrBlt`` values that are not ``<= 2010`` are replaced with the
      row's ``YearBuilt``.

    The input frame is left untouched. The function is used as a pipeline
    step, so mutating the argument would silently alter the caller's data.
    """
    df = df.copy()

    exterior_cols = ["Exterior1st", "Exterior2nd"]
    df[exterior_cols] = df[exterior_cols].replace(
        {
            "Brk Cmn": "BrkComm",
            "BrkCmn": "BrkComm",
            "Wd Sdng": "WdSdng",
            "Wd Shng": "WdShng",
        }
    )

    # Some values of GarageYrBlt are corrupt, so we replace them with the year
    # the house was built. A missing value fails the `<= 2010` test as well,
    # so it also falls back to YearBuilt.
    df["GarageYrBlt"] = df["GarageYrBlt"].where(
        df["GarageYrBlt"] <= 2010, df["YearBuilt"]
    )
    return df

def load_data():
    """Read the competition CSVs and return ``(X_train, y_train, X_test)``.

    Both files are indexed by their ``Id`` column. ``SalePrice`` is split off
    the training frame as the target. The training rows are shuffled with
    NumPy's global random state and re-indexed from zero; the test frame
    keeps its ``Id`` index.
    """
    data_dir = Path("../input/house-prices-advanced-regression-techniques/")
    train_frame = pd.read_csv(data_dir / "train.csv", index_col="Id")
    test_frame = pd.read_csv(data_dir / "test.csv", index_col="Id")

    features = train_frame.copy()
    target = features.pop("SalePrice")

    # One permutation is applied to both objects so rows stay paired.
    order = np.random.permutation(len(features))
    shuffled_features = features.iloc[order].reset_index(drop=True)
    shuffled_target = target.iloc[order].reset_index(drop=True)

    return shuffled_features, shuffled_target, test_frame.copy()

# The nominal (unordered) categorical features
features_nom = [
    "MSSubClass",
    "MSZoning",
    "Street",
    "Alley",
    "LandContour",
    "LotConfig",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "Foundation",
    "Heating",
    "GarageType",
    "MiscFeature",
    "SaleType",
    "SaleCondition",
]

# The ordinal (ordered) categorical features, each mapped to its levels
# listed from lowest to highest.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1, 11))

# Key order matters: it is reused as the column order further down the file.
ordered_levels = {
    **dict.fromkeys(["OverallQual", "OverallCond"], ten_levels),
    **dict.fromkeys(
        [
            "ExterQual",
            "ExterCond",
            "BsmtQual",
            "BsmtCond",
            "HeatingQC",
            "KitchenQual",
            "FireplaceQu",
            "GarageQual",
            "GarageCond",
            "PoolQC",
        ],
        five_levels,
    ),
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Every feature gets an extra lowest level, "None", which is the placeholder
# that missing values are filled with.
ordered_levels = {name: ["None", *levels] for name, levels in ordered_levels.items()}


def preprocess_pipeline_steps():
    """Build the ``(name, transformer)`` steps of the preprocessing pipeline.

    The steps are, in order:

    1. ``clean``  - apply ``clean`` to the incoming frame;
    2. ``impute`` - fill gaps with the string ``"None"`` in the ordinal and
       nominal categorical columns and with ``0.0`` in every other column;
    3. ``encode`` - ordinal-encode the categorical columns, using the explicit
       level order from ``ordered_levels`` for the ordinal ones and
       ``-1`` for unseen nominal categories; other columns pass through.

    The ``encode`` step selects its columns by position: the first
    ``len(ordered_levels)`` positions are treated as ordinal and the next
    ``len(features_nom)`` as nominal, matching the order in which the
    ``impute`` step lists its transformers.
    """
    ordinal_names = list(ordered_levels.keys())
    n_ordinal = len(ordinal_names)
    n_nominal = len(features_nom)

    def constant_fill(value):
        # Each slot gets its own imputer instance.
        return SimpleImputer(strategy="constant", fill_value=value, copy=False)

    imputer = ColumnTransformer(
        transformers=[
            ("ordered_cats", constant_fill("None"), ordinal_names),
            ("nominal_cats", constant_fill("None"), features_nom),
        ],
        remainder=constant_fill(0.0),
    )

    ordinal_positions = list(range(n_ordinal))
    nominal_positions = list(range(n_ordinal, n_ordinal + n_nominal))
    encoder = ColumnTransformer(
        transformers=[
            (
                "ordered_cats",
                OrdinalEncoder(categories=list(ordered_levels.values())),
                ordinal_positions,
            ),
            (
                "nominal_cats",
                OrdinalEncoder(
                    categories='auto',
                    handle_unknown='use_encoded_value',
                    unknown_value=-1,
                ),
                nominal_positions,
            ),
        ],
        remainder='passthrough',
    )

    return [
        ("clean", FunctionTransformer(clean, check_inverse=False)),
        ("impute", imputer),
        ('encode', encoder),
    ]

class PreprocessPipeline(Pipeline):
    """Pipeline whose transformed output is a labelled float64 DataFrame.

    The parent ``Pipeline`` result is wrapped in a DataFrame whose columns are
    the ordinal features (in ``ordered_levels`` order), then ``features_nom``,
    then every remaining input column in its original order.
    """

    # Column labels of the frame most recently passed to ``_fit``; None before that.
    feature_names_in = None

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, before the digit-prefix renames.

        ``input_features`` defaults to the columns recorded by ``_fit``.
        Raises ``ValueError`` (from ``list.remove``) if one of the expected
        categorical columns is not among the input features.
        """
        source = input_features if input_features is not None else self.feature_names_in
        remainder_columns = list(source)
        for col in ordered_levels.keys():
            remainder_columns.remove(col)
        for col in features_nom:
            remainder_columns.remove(col)

        return list(ordered_levels.keys()) + features_nom + remainder_columns

    def _fit(self, X, y=None, **fit_params_steps):
        """Record the input column labels, then defer to ``Pipeline._fit``."""
        self.feature_names_in = X.columns
        # Forward the caller's target instead of hard-coding None, so a
        # target-aware step added to this pipeline would still receive it.
        return super()._fit(X, y, **fit_params_steps)

    def transform(self, X):
        """Transform ``X`` with the parent pipeline and return a DataFrame."""
        fromparent = super().transform(X)
        return self.convert_to_df(fromparent)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` (and ``y``), transform ``X`` and return a DataFrame."""
        fromparent = super().fit_transform(X, y, **fit_params)
        return self.convert_to_df(fromparent)

    def convert_to_df(self, array):
        """Wrap the parent's output in a float64 DataFrame with usable names."""
        df = pd.DataFrame(array, columns=self.get_feature_names_out()).astype("float64")

        # Names beginning with numbers are awkward to work with
        # (they cannot be used with attribute access such as ``df.FirstFlrSF``).
        df = df.rename(
            columns={
                "1stFlrSF": "FirstFlrSF",
                "2ndFlrSF": "SecondFlrSF",
                "3SsnPorch": "Threeseasonporch",
            }
        )
        return df


def score_dataset(X, y, estimator):
    """Return the 5-fold cross-validated RMSLE of ``estimator`` on ``(X, y)``.

    The Housing competition metric is RMSLE (Root Mean Squared Log Error), so
    the estimator is scored with mean squared error against ``log(y)``. The
    per-fold errors are averaged before the square root is taken. Any failure
    inside a fold is raised instead of being recorded as a score.
    """
    fold_neg_mse = cross_val_score(
        estimator,
        X,
        np.log(y),
        cv=5,
        scoring="neg_mean_squared_error",
        error_score='raise',
    )
    # The scorer reports negated MSE; flip the sign back before the root.
    mean_mse = -1 * fold_neg_mse.mean()
    return np.sqrt(mean_mse)

# Raw Data

In [96]:
# Load the shuffled training features/target and the test features.
X_train, y_train, X_test = load_data()

# Bare expression so the notebook renders the frame as the cell output.
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RL,,32668,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,0,,,,0,3,2007,WD,Alloca
1,50,RL,79.0,9490,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,8,2006,WD,Normal
2,50,RL,,7015,Pave,,IR1,Bnk,AllPub,Corner,...,0,0,,,,0,7,2009,WD,Normal
3,60,RL,83.0,10005,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,3,2008,WD,Normal
4,160,RM,21.0,1680,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,3,2010,WD,Family
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,82.0,9430,Pave,,Reg,Lvl,AllPub,Inside,...,180,0,,,,0,7,2009,WD,Normal
1456,20,RL,60.0,9600,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2010,WD,Normal
1457,90,RM,68.0,8930,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal
1458,120,RL,,3196,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,10,2006,WD,Normal


In [97]:
# Bare expression so the notebook renders the target series as the cell output.
y_train

0       200624
1       133000
2       110000
3       192000
4        88000
         ...  
1455    337000
1456    128000
1457    112000
1458    234000
1459    221000
Name: SalePrice, Length: 1460, dtype: int64

# Preprocessed data - cleaned, imputed and ordinal-encoded.

In [98]:
# Build the preprocessing pipeline and fit it on the full training set.
# `pp_pipeline` and `X_preproccessed` are reused by later cells.
pp_pipeline = PreprocessPipeline(preprocess_pipeline_steps())
X_preproccessed = pp_pipeline.fit_transform(X_train, y_train)
# Bare expression so the notebook renders the preprocessed frame.
X_preproccessed

Unnamed: 0,OverallQual,OverallCond,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,KitchenQual,FireplaceQu,GarageQual,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,Threeseasonporch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,6.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,484.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,3.0,2007.0
1,6.0,7.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,...,240.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,8.0,2006.0
2,5.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,...,352.0,0.0,0.0,248.0,0.0,0.0,0.0,0.0,7.0,2009.0
3,7.0,5.0,3.0,3.0,4.0,3.0,5.0,3.0,3.0,3.0,...,505.0,288.0,117.0,0.0,0.0,0.0,0.0,0.0,3.0,2008.0
4,6.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,3.0,...,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2010.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,8.0,5.0,4.0,3.0,4.0,3.0,5.0,4.0,4.0,3.0,...,856.0,0.0,128.0,0.0,0.0,180.0,0.0,0.0,7.0,2009.0
1456,4.0,7.0,3.0,3.0,4.0,3.0,3.0,4.0,0.0,3.0,...,436.0,290.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2010.0
1457,6.0,5.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,3.0,...,539.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
1458,7.0,5.0,4.0,3.0,4.0,3.0,5.0,4.0,3.0,3.0,...,420.0,143.0,20.0,0.0,0.0,0.0,0.0,0.0,10.0,2006.0


# Score for model without any feature engineering

In [99]:
# Establish a baseline: preprocessing followed by an XGBRegressor with its
# default parameters, with no engineered features.

preprocess_and_model = Pipeline(steps=[("pre", pp_pipeline),
                                       ("model", XGBRegressor())])

def print_estimator_score(X, y, es, str_describing_estimator):
    """Score estimator ``es`` on ``(X, y)`` and print the result.

    ``str_describing_estimator`` is a human-readable label inserted into the
    printed line; the score comes from ``score_dataset`` and is shown with
    five decimal places.
    """
    rmsle = score_dataset(X, y, estimator=es)
    report = f"Score for {str_describing_estimator}: {rmsle:.5f} RMSLE"
    print(report)

# Cross-validate the baseline on the raw training frame; preprocessing runs
# inside the scored pipeline.
print_estimator_score(X_train, y_train, preprocess_and_model, "preprocessing steps and model only")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Score for preprocessing steps and model only: 0.13635 RMSLE


# Feature Engineering

## Drop Uninformative

In [100]:


def make_mi_scores(X, y):
    """Return mutual-information scores of each column of ``X`` against ``y``.

    Object and categorical columns are integer-coded on a copy of ``X``.
    Columns with an integer dtype are then passed to
    ``mutual_info_regression`` as discrete, all others as continuous. The
    result is a Series named ``"MI Scores"``, indexed by column name and
    sorted from the highest score to the lowest.
    """
    features = X.copy()
    categorical = features.select_dtypes(["object", "category"])
    for name in categorical:
        features[name], _ = features[name].factorize()

    # All discrete features should now have integer dtypes
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    labelled = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return labelled.sort_values(ascending=False)

# Scores are computed once, on the preprocessed full training set, and are
# reused unchanged by `drop_uninformative` below.
mi_scores = make_mi_scores(X_preproccessed, y_train)

def drop_uninformative(df, col_mi_scores):
    """Keep only the columns of ``df`` whose score is strictly positive.

    ``col_mi_scores`` is a Series of scores indexed by column name. The
    surviving columns keep the order they have in ``df``.
    """
    is_informative = col_mi_scores.gt(0.0)
    return df.loc[:, is_informative]

# Feature-engineering pipeline; later cells append further steps to it in place.
fe_pipeline = Pipeline(steps =[("du", FunctionTransformer(drop_uninformative, kw_args={'col_mi_scores':mi_scores}))])

# Preprocessing + feature engineering + model: the estimator that gets scored.
# It holds the same `fe_pipeline` object, so steps appended later show up here.
pp_fe_pipeline = Pipeline(
    steps=[
        ("pp", pp_pipeline),
        ("fe", fe_pipeline),
        ("model", XGBRegressor())
    ]
)

# Same transformers without the model, used to inspect the engineered features.
transformation_pipeline = Pipeline(
    steps=[
        ("pp", pp_pipeline),
        ("fe", fe_pipeline)
    ]
)

print_estimator_score(X_train, y_train, pp_fe_pipeline, "drop uninformative features")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Score for drop uninformative features: 0.13439 RMSLE


In [101]:

def mathematical_transforms(df):
    """Return ``df`` with two ratio features appended.

    * ``LivLotRatio``  - ``GrLivArea`` divided by ``LotArea``.
    * ``Spaciousness`` - ``FirstFlrSF + SecondFlrSF`` divided by
      ``TotRmsAbvGrd``.

    ``df`` itself is not modified.
    """
    ratios = pd.DataFrame(
        {
            "LivLotRatio": df.GrLivArea / df.LotArea,
            "Spaciousness": (df.FirstFlrSF + df.SecondFlrSF) / df.TotRmsAbvGrd,
        }
    )
    return df.join(ratios)

# Appends to the shared step list in place; running this line again adds a
# second step with the same name.
fe_pipeline.steps.append(("mt", FunctionTransformer(mathematical_transforms)))

def interactions(df):
    """Return ``df`` with one ``Bldg_<value>`` column per building type.

    Each new column holds ``GrLivArea`` on the rows of that building type and
    zero elsewhere. Which columns are produced depends on the values present
    in ``df.BldgType``.
    """
    bldg_indicators = pd.get_dummies(df.BldgType, prefix="Bldg")
    area_by_bldg = bldg_indicators.multiply(df.GrLivArea, axis="index")
    return df.join(area_by_bldg)

# Appends to the shared step list in place; running this line again adds a
# second step with the same name.
fe_pipeline.steps.append(("int", FunctionTransformer(interactions)))

def counts(df):
    """Return ``df`` with a ``PorchTypes`` column appended.

    ``PorchTypes`` is, per row, how many of the four porch/deck area columns
    hold a value greater than zero.
    """
    porch_columns = ["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "ScreenPorch"]
    has_area = df[porch_columns] > 0.0
    porch_count = has_area.sum(axis=1)
    return df.join(porch_count.to_frame("PorchTypes"))

# Appends to the shared step list in place; running this line again adds a
# second step with the same name.
fe_pipeline.steps.append(("ct", FunctionTransformer(counts)))

def group_transforms(df):
    """Return ``df`` with a ``MedNhbdArea`` column appended.

    ``MedNhbdArea`` is the median ``GrLivArea`` of the row's ``Neighborhood``,
    computed from the rows of ``df`` itself.
    """
    neighbourhood_median = df.groupby("Neighborhood")["GrLivArea"].transform("median")
    return df.join(neighbourhood_median.to_frame("MedNhbdArea"))

# Appends to the shared step list in place; running this line again adds a
# second step with the same name.
fe_pipeline.steps.append(("gt", FunctionTransformer(group_transforms)))

def pca_inspired(df):
    """Return ``df`` with two combined features appended.

    * ``Feature1`` - ``GrLivArea + TotalBsmtSF``.
    * ``Feature2`` - ``YearRemodAdd * TotalBsmtSF``.
    """
    combined = pd.DataFrame(
        {
            "Feature1": df.GrLivArea + df.TotalBsmtSF,
            "Feature2": df.YearRemodAdd * df.TotalBsmtSF,
        }
    )
    return df.join(combined)

# Appends to the shared step list in place; running this line again adds a
# second step with the same name.
fe_pipeline.steps.append(("pcai", FunctionTransformer(pca_inspired)))

# `pp_fe_pipeline` holds the same `fe_pipeline` object, so every step appended
# in this cell is part of what gets scored here.
print_estimator_score(X_train, y_train, pp_fe_pipeline, "all features")


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Score for all features: 0.13579 RMSLE


In [102]:
# Convert the frame to a plain array: per the original author, the
# CatBoost encoder (version 2.4) doesn't handle DataFrames well.
def df_to_array(df:pd.DataFrame):
    """Return the contents of ``df`` as a float64 NumPy array."""
    return df.to_numpy(dtype=np.float64)

# Appends to the shared step list in place; running this line again adds a
# second step with the same name.
fe_pipeline.steps.append(("to_numpy", FunctionTransformer(df_to_array)))
# Refit and display the engineered feature matrix as the cell output.
transformation_pipeline.fit_transform(X_train, y_train)

array([[6.000000e+00, 3.000000e+00, 4.000000e+00, ..., 1.717000e+03,
        4.550000e+03, 4.019125e+06],
       [6.000000e+00, 7.000000e+00, 3.000000e+00, ..., 1.200000e+03,
        2.384000e+03, 1.571700e+06],
       [5.000000e+00, 4.000000e+00, 3.000000e+00, ..., 1.210500e+03,
        1.912000e+03, 1.382550e+06],
       ...,
       [6.000000e+00, 5.000000e+00, 3.000000e+00, ..., 1.106000e+03,
        1.902000e+03, 0.000000e+00],
       [7.000000e+00, 5.000000e+00, 4.000000e+00, ..., 1.500000e+03,
        2.931000e+03, 2.753496e+06],
       [7.000000e+00, 5.000000e+00, 4.000000e+00, ..., 2.418000e+03,
        3.034000e+03, 2.387610e+06]])

In [103]:

# Target-encode one column, addressed by position because the preceding step
# outputs a plain array.
# NOTE(review): which feature sits at position 22 depends on the columns kept
# by `drop_uninformative` -- confirm it is the intended one.
cat_boost = ("catbooster", CatBoostEncoder(cols=[22], a=1))
# Appends to the shared step list in place; running this line again adds a
# second step with the same name.
fe_pipeline.steps.append(cat_boost)


In [104]:
#raise SystemExit("Stop right there!")

In [105]:
# Score the pipeline again now that the encoder step has been appended.
print_estimator_score(X_train, y_train, pp_fe_pipeline, "all feature engineering including cat booster")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Score for all feature engineering including cat booster: 0.13517 RMSLE


In [106]:
# Refit the transformers on the full training set and display the final
# feature matrix as the cell output.
transformation_pipeline.fit_transform(X_train, y_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,6.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,279.444444,2515.0,0.0,0.0,0.0,0.0,1.0,1717.0,4550.0,4019125.0
1,6.0,7.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,...,315.600000,1578.0,0.0,0.0,0.0,0.0,1.0,1200.0,2384.0,1571700.0
2,5.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,...,240.600000,1203.0,0.0,0.0,0.0,0.0,1.0,1210.5,1912.0,1382550.0
3,7.0,5.0,3.0,3.0,4.0,3.0,5.0,3.0,3.0,3.0,...,252.750000,2022.0,0.0,0.0,0.0,0.0,2.0,1738.0,3182.0,2293320.0
4,6.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,3.0,...,182.000000,0.0,0.0,0.0,1092.0,0.0,0.0,1155.0,1617.0,1034775.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,8.0,5.0,4.0,3.0,4.0,3.0,5.0,4.0,4.0,3.0,...,295.625000,2365.0,0.0,0.0,0.0,0.0,2.0,2418.0,3617.0,2502748.0
1456,4.0,7.0,3.0,3.0,4.0,3.0,3.0,4.0,0.0,3.0,...,266.750000,1067.0,0.0,0.0,0.0,0.0,1.0,1106.0,2134.0,2128665.0
1457,6.0,5.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,3.0,...,237.750000,0.0,0.0,1902.0,0.0,0.0,0.0,1106.0,1902.0,0.0
1458,7.0,5.0,4.0,3.0,4.0,3.0,5.0,4.0,3.0,3.0,...,222.428571,0.0,0.0,0.0,0.0,1557.0,2.0,1500.0,2931.0,2753496.0


In [107]:
# Same call as two cells above, with the pipeline unchanged in between.
print_estimator_score(X_train, y_train, pp_fe_pipeline, "all feature engineering including cat booster")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Score for all feature engineering including cat booster: 0.13517 RMSLE


# Hyperparameter Tuning #
## Better XGB Params


In [108]:

# Hand-picked XGBoost settings; the trailing comments give the range worth
# exploring for each one.
xgb_params = dict(
    max_depth=6,           # maximum depth of each tree - try 2 to 10
    learning_rate=0.01,    # effect of each tree - try 0.0001 to 0.1
    n_estimators=1000,     # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=1,    # minimum sum of instance weights (hessian) in a leaf; for squared-error loss this is the number of houses - try 1 to 10
    colsample_bytree=0.7,  # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    reg_alpha=0.5,         # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=1.0,        # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,   # set > 1 for boosted random forests
)
# Prefix each name with "model__" so `set_params` routes it to the "model" step.
# After this line `pl_params` holds pipeline-qualified names.
pl_params = {'model__'+xgb_params_var_name: val for xgb_params_var_name, val in xgb_params.items()}


# Updates the shared pipeline in place; later cells see these parameters.
pp_fe_pipeline.set_params(**pl_params)
print_estimator_score(X_train, y_train, pp_fe_pipeline, "everything including better XGB params")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Score for everything including better XGB params: 0.12335 RMSLE


## Scan for best params with optuna

In [109]:
import optuna

def objective(trial):
    """Optuna objective: cross-validated RMSLE for one sampled XGB setting.

    Eight XGBoost hyperparameters are drawn from ``trial``, applied to the
    ``model`` step of the shared ``pp_fe_pipeline`` (which is updated in
    place), and the pipeline is scored on the training data.
    """
    search_space = {
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 8000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    }
    # "model__" routes each value to the pipeline's "model" step.
    routed = {'model__' + name: value for name, value in search_space.items()}

    pp_fe_pipeline.set_params(**routed)
    return score_dataset(X_train, y_train, pp_fe_pipeline)

# Minimise the cross-validated RMSLE over 20 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
# `best_params` uses the names passed to `trial.suggest_*`, i.e. without the
# "model__" prefix. This line only runs if the optimisation above completes.
pl_params = study.best_params
pl_params

[32m[I 2022-04-14 20:03:36,371][0m A new study created in memory with name: no-name-7420b4f7-8baf-400f-a458-3a57c2a799bc[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[32m[I 2022-04-14 20:04:15,241][0m Trial 0 finished with value: 3.6490924717272 and parameters: {'max_depth': 7, 'learning_rate': 0.00021505955131427277, 'n_estimators': 5544, 'min_child_weight': 2, 'colsample_bytree': 0.25099323214150515, 'subsample': 0.5577485908092215, 'reg_alpha': 0.0004524176513934834, 'reg_lambda': 20.48022682103387}. Best is trial 0 with value: 3.6490924717272.[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.c

KeyboardInterrupt: 

In [None]:
# `pl_params` holds bare XGBoost names when it comes from `study.best_params`,
# but is already "model__"-qualified if the Optuna cell did not run to
# completion or this cell is executed twice. Qualify only the bare names so a
# key can never end up as "model__model__...", which `set_params` rejects.
pl_params = {
    (name if name.startswith('model__') else 'model__' + name): val
    for name, val in pl_params.items()
}
pp_fe_pipeline.set_params(**pl_params)
# XGB minimizes MSE, but competition loss is RMSLE
# So, we need to log-transform y to train and exp-transform the predictions
pp_fe_pipeline.fit(X_train, np.log(y_train))
predictions = np.exp(pp_fe_pipeline.predict(X_test))

# X_test is still indexed by the original "Id" column, which the submission needs.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")