# My entry for the [House Prices - Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques) competition! #


In [282]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import FunctionTransformer

np.random.seed(0)

from category_encoders import CatBoostEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split


# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings('ignore')


def clean(df:pd.DataFrame):
    df[["Exterior1st", "Exterior2nd"]] = df[["Exterior1st", "Exterior2nd"]].replace(
        {"Brk Cmn": "BrkComm",
         "BrkCmn" : "BrkComm",
         "Wd Sdng": "WdSdng",
         "Wd Shng": "WdShng"}
    )
    # Some values of GarageYrBlt are corrupt, so we'll replace them
    # with the year the house was built
    df["GarageYrBlt"] = df["GarageYrBlt"].where(df.GarageYrBlt <= 2010, df.YearBuilt)
    return df

def load_data():
    # Read data
    data_dir = Path("../input/house-prices-advanced-regression-techniques/")
    df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
    df_test = pd.read_csv(data_dir / "test.csv", index_col="Id")

    X_train = df_train.copy()
    y_train = X_train.pop("SalePrice")
    X_test = df_test.copy()

    perm = np.random.permutation(len(X_train))
    X_train = X_train.iloc[perm].reset_index(drop=True)
    y_train = y_train.iloc[perm].reset_index(drop=True)

    return X_train, y_train, X_test

# The nominative (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]

# The ordinal (ordered) categorical features
# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1,11))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

ordered_levels = {key: ["None"] + value for key, value in ordered_levels.items()}


def preprocess_pipeline_steps():
    imputer = ColumnTransformer(
        transformers=[
            ("ordered_cats", SimpleImputer(strategy="constant", fill_value="None", copy=False), list(ordered_levels.keys())),
            ("nominal_cats", SimpleImputer(strategy="constant", fill_value="None", copy=False), features_nom)
        ],
        remainder=SimpleImputer(strategy="constant", fill_value=0.0, copy=False),
    )

    steps= [("clean", FunctionTransformer(clean, check_inverse=False)),
            ("impute", imputer),
            ('encode', ColumnTransformer(
                transformers=[
                    ("ordered_cats", OrdinalEncoder(categories=list(ordered_levels.values())),
                     list(range(len(ordered_levels.keys())))),
                    ("nominal_cats", OrdinalEncoder(categories='auto', handle_unknown='use_encoded_value', unknown_value=-1),
                     list(range(len(ordered_levels.keys()), len(ordered_levels.keys())+len(features_nom))))
                ],
                remainder='passthrough',
           ))
        ]
    return steps

class PreprocessPipeline(Pipeline):

    feature_names_in = None

    def get_feature_names_out(self, input_features=None):
        remainder_columns = list(input_features if input_features is not None else self.feature_names_in)
        for col in ordered_levels.keys(): remainder_columns.remove(col)
        for col in features_nom: remainder_columns.remove(col)

        return list(ordered_levels.keys())+features_nom+remainder_columns

    def _fit(self, X, y=None, **fit_params_steps):
        self.feature_names_in = X.columns
        return super()._fit(X, y=None, **fit_params_steps)

    def transform(self, X):
        fromparent = super().transform(X)
        return self.convert_to_df(fromparent)

    def fit_transform(self, X, y=None, **fit_params):
        fromparent = super().fit_transform(X, y=None, **fit_params)
        return self.convert_to_df(fromparent)

    def convert_to_df(self, array):
        df= pd.DataFrame(array, columns=self.get_feature_names_out())
        for col in df.columns:
            df[col] = df[col].astype("float64")

        # Names beginning with numbers are awkward to work with
        df = df.rename(columns={
                "1stFlrSF": "FirstFlrSF",
                "2ndFlrSF": "SecondFlrSF",
                "3SsnPorch": "Threeseasonporch",
            }
        )
        return df


def score_dataset(X, y, estimator):
    #Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        estimator, X, log_y, cv=5, scoring="neg_mean_squared_error"
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

X_train, y_train, X_test = load_data()
# # Establish Baseline
# # Finally, let’s establish a baseline score to judge our feature engineering against.
es = Pipeline(steps=[("pre", PreprocessPipeline(preprocess_pipeline_steps())),
                     ("model", XGBRegressor())])
baseline_score = score_dataset(X_train, y_train, es)
print(f"Baseline score: {baseline_score:.5f} RMSLE")

pp_pipeline = PreprocessPipeline(preprocess_pipeline_steps())

def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

X_preproccessed = pp_pipeline.fit_transform(X_train)

mi_scores = make_mi_scores(X_preproccessed, y_train)

def drop_uninformative(df, col_mi_scores):
    return df.loc[:, col_mi_scores > 0.0]

fe_pipeline = Pipeline(steps =[("du", FunctionTransformer(drop_uninformative, kw_args={'col_mi_scores':mi_scores}))])

pp_fe_pipeline = Pipeline(
    steps=[
        ("pp", pp_pipeline),
        ("fe", fe_pipeline),
        ("model", XGBRegressor())
    ]
)

transformation_pipeline = Pipeline(
    steps=[
        ("pp", pp_pipeline),
        ("fe", fe_pipeline)
    ]
)

score_dataset(X_train, y_train, pp_fe_pipeline)

def mathematical_transforms(df):
    X = pd.DataFrame()  # dataframe to hold new features
    X["LivLotRatio"] = df.GrLivArea / df.LotArea
    X["Spaciousness"] = (df.FirstFlrSF + df.SecondFlrSF) / df.TotRmsAbvGrd
    return df.join(X)

fe_pipeline.steps.append(("mt", FunctionTransformer(mathematical_transforms)))

def interactions(df):
    X = pd.get_dummies(df.BldgType, prefix="Bldg")
    X = X.mul(df.GrLivArea, axis=0)
    return df.join(X)

fe_pipeline.steps.append(("int", FunctionTransformer(interactions)))

def counts(df):
    X = pd.DataFrame()
    X["PorchTypes"] = df[[
        "WoodDeckSF",
        "OpenPorchSF",
        "EnclosedPorch",
        "ScreenPorch",
    ]].gt(0.0).sum(axis=1)
    return df.join(X)

fe_pipeline.steps.append(("ct", FunctionTransformer(counts)))

def group_transforms(df):
    X = pd.DataFrame()
    X["MedNhbdArea"] = df.groupby("Neighborhood")["GrLivArea"].transform("median")
    return df.join(X)

fe_pipeline.steps.append(("gt", FunctionTransformer(group_transforms)))

def pca_inspired(df):
    X = pd.DataFrame()
    X["Feature1"] = df.GrLivArea + df.TotalBsmtSF
    X["Feature2"] = df.YearRemodAdd * df.TotalBsmtSF
    return df.join(X)

fe_pipeline.steps.append(("pcai", FunctionTransformer(pca_inspired)))

score_dataset(X_train, y_train, pp_fe_pipeline)

In [286]:

class CrossFoldEncoder:
    def __init__(self, encoder, cols, **kwargs):
        self.encoder_ = encoder
        self.cols_ = cols
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=5)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit(self, X, y):
        self.fitted_encoders_ = []
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=self.cols_, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[self.cols_])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

cat_boost = ("catbooster", CatBoostEncoder(cols=['MSSubClass'], a=1))
fe_pipeline.steps.append(cat_boost)

In [287]:

score_dataset(X_train, y_train, pp_fe_pipeline)

0.1351326790054262

# Hyperparameter Tuning #


In [300]:

xgb_params = dict(
    max_depth=6,           # maximum depth of each tree - try 2 to 10
    learning_rate=0.01,    # effect of each tree - try 0.0001 to 0.1
    n_estimators=1000,     # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=1,    # minimum number of houses in a leaf - try 1 to 10
    colsample_bytree=0.7,  # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    reg_alpha=0.5,         # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=1.0,        # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,   # set > 1 for boosted random forests
)
pl_params = {'model__'+xgb_params_var_name: val for xgb_params_var_name, val in xgb_params.items()}


pp_fe_pipeline.set_params(**pl_params)
score_dataset(X_train, y_train, pp_fe_pipeline)

0.1251627782781732

In [302]:
import optuna

def objective(trial):
    xgb_params = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 1000, 8000),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample=trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    )
    pl_params = {'model__'+xgb_params_var_name: val for xgb_params_var_name, val in xgb_params.items()}

    pp_fe_pipeline.set_params(**pl_params)
    return score_dataset(X_train, y_train, pp_fe_pipeline)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
pl_params = study.best_params
pl_params

[32m[I 2022-04-13 12:57:45,740][0m A new study created in memory with name: no-name-ca0d5124-d1b9-415c-902e-21a92f05b0a4[0m
[32m[I 2022-04-13 12:58:17,210][0m Trial 0 finished with value: 2.211172657368156 and parameters: {'max_depth': 10, 'learning_rate': 0.0003945168191265213, 'n_estimators': 4284, 'min_child_weight': 2, 'colsample_bytree': 0.9097327197416958, 'subsample': 0.49877758519981197, 'reg_alpha': 0.014255102852680068, 'reg_lambda': 7.985012472593777}. Best is trial 0 with value: 2.211172657368156.[0m
[32m[I 2022-04-13 12:59:53,709][0m Trial 1 finished with value: 0.1262903673814937 and parameters: {'max_depth': 9, 'learning_rate': 0.02068447038417672, 'n_estimators': 3349, 'min_child_weight': 6, 'colsample_bytree': 0.48781061548323934, 'subsample': 0.9885455326050718, 'reg_alpha': 0.00026527781028817896, 'reg_lambda': 0.12960081505619328}. Best is trial 1 with value: 0.1262903673814937.[0m
[32m[I 2022-04-13 13:00:53,237][0m Trial 2 finished with value: 6.30975580

{'max_depth': 4,
 'learning_rate': 0.004259286656797621,
 'n_estimators': 6927,
 'min_child_weight': 1,
 'colsample_bytree': 0.8017598422158565,
 'subsample': 0.9037555821331148,
 'reg_alpha': 0.0064442044964354685,
 'reg_lambda': 0.0024178339209708194}

In [305]:
pl_params = {'model__'+xgb_params_var_name: val for xgb_params_var_name, val in pl_params.items()}
pp_fe_pipeline.set_params(**pl_params)
# XGB minimizes MSE, but competition loss is RMSLE
# So, we need to log-transform y to train and exp-transform the predictions
pp_fe_pipeline.fit(X_train, np.log(y_train))
predictions = np.exp(pp_fe_pipeline.predict(X_test))

output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Parameters: { "model__colsample_bytree", "model__learning_rate", "model__max_depth", "model__min_child_weight", "model__n_estimators", "model__reg_alpha", "model__reg_lambda", "model__subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "model__colsample_bytree", "model__learning_rate", "model__max_depth", "model__min_child_weight", "model__n_estimators", "model__reg_alpha", "model__reg_lambda", "model__subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Your submission was successful

To submit these predictions to the competition, follow these steps:

1. Begin by clicking on the blue **Save Version** button in the top right corner of the window.  This will generate a pop-up window.
2. Ensure that the **Save and Run All** option is selected, and then click on the blue **Save** button.
3. This generates a window in the bottom left corner of the notebook.  After it has finished running, click on the number to the right of the **Save Version** button.  This pulls up a list of versions on the right of the screen.  Click on the ellipsis **(...)** to the right of the most recent version, and select **Open in Viewer**.  This brings you into view mode of the same page. You will need to scroll down to get back to these instructions.
4. Click on the **Output** tab on the right of the screen.  Then, click on the file you would like to submit, and click on the blue **Submit** button to submit your results to the leaderboard.

You have now successfully submitted to the competition!

# Next Steps #

If you want to keep working to improve your performance, select the blue **Edit** button in the top right of the screen. Then you can change your code and repeat the process. There's a lot of room to improve, and you will climb up the leaderboard as you work.

Be sure to check out [other users' notebooks](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/notebooks) in this competition. You'll find lots of great ideas for new features and as well as other ways to discover more things about the dataset or make better predictions. There's also the [discussion forum](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/discussion), where you can share ideas with other Kagglers.

Have fun Kaggling!