In [7]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer

np.random.seed(0)

from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

# Mute warnings
warnings.filterwarnings('ignore')


def clean(df:pd.DataFrame):
    df[["Exterior1st", "Exterior2nd"]] = df[["Exterior1st", "Exterior2nd"]].replace(
        {"Brk Cmn": "BrkComm",
         "BrkCmn" : "BrkComm",
         "Wd Sdng": "WdSdng",
         "Wd Shng": "WdShng"}
    )
    # Some values of GarageYrBlt are corrupt, so we'll replace them
    # with the year the house was built
    df["GarageYrBlt"] = df["GarageYrBlt"].where(df.GarageYrBlt <= 2010, df.YearBuilt)
    # Names beginning with numbers are awkward to work with
    df = df.rename(columns={
            "1stFlrSF": "FirstFlrSF",
            "2ndFlrSF": "SecondFlrSF",
            "3SsnPorch": "Threeseasonporch",
        }
    )
    return df


# The numeric features are already encoded correctly (`float` for
# continuous, `int` for discrete), but the categoricals we'll need to
# do ourselves. Note in particular, that the `MSSubClass` feature is
# read as an `int` type, but is actually a (nominative) categorical.

# The nominative (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]


# The ordinal (ordered) categorical features
# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1,11))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

for col in ordered_levels.keys(): ordered_levels[col].insert(0, "None")

def load_data():
    # Read data
    data_dir = Path("../../input/house-prices-advanced-regression-techniques/")
    df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
    df_test = pd.read_csv(data_dir / "test.csv", index_col="Id")

    X_train = df_train.copy()
    y_train = X_train.pop("SalePrice")
    X_test = df_test.copy()

    return X_train, y_train, X_test


def array_to_df(array, *, columns):
    return pd.DataFrame(array, columns=columns)

def pipeline() :
    imputer = ColumnTransformer(
        transformers=[
            ("ordered_cats", SimpleImputer(strategy="constant", fill_value="None", copy=False), list(ordered_levels.keys())),
            ("nominal_cats", SimpleImputer(strategy="constant", fill_value="None", copy=False), features_nom)
        ],
        remainder=SimpleImputer(strategy="constant", fill_value=0, copy=False)
    )

    clean_encode_impute_model = Pipeline(
        steps= [
            ("clean", FunctionTransformer(clean, check_inverse=False)),
            ("impute", imputer),
            ('label_encoder', ColumnTransformer(
                transformers=[
                    ("ordered_cats", OrdinalEncoder(categories=list(ordered_levels.values())),
                     list(range(len(ordered_levels.keys())))),

                    ("nominal_cats", OrdinalEncoder(categories='auto', handle_unknown='use_encoded_value', unknown_value=-1),
                     list(range(len(ordered_levels.keys()), len(ordered_levels.keys())+len(features_nom))))
                ],
                remainder='passthrough'))
        ]
    )
    return clean_encode_impute_model


def score_dataset(X, y, model=XGBRegressor()):
    datapipeline = pipeline()
    datapipeline.steps.append(("model", model))
    #Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        datapipeline, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

X_train, y_train, X_test = load_data()
# Establish Baseline
# Finally, let’s establish a baseline score to judge our feature engineering against.
baseline_score = score_dataset(X_train, y_train)
print(f"Baseline score: {baseline_score:.5f} RMSLE")

Baseline score: 0.13803 RMSLE


# Let's take a look at the transformed data.

In [8]:
transformed = pipeline().fit_transform(X_train)
transformed

array([[8.0, 6.0, 13.0, ..., 0.0, 2.0, 2008.0],
       [7.0, 9.0, 12.0, ..., 0.0, 5.0, 2007.0],
       [8.0, 6.0, 13.0, ..., 0.0, 9.0, 2008.0],
       ...,
       [8.0, 10.0, 14.0, ..., 2500.0, 5.0, 2010.0],
       [6.0, 7.0, 12.0, ..., 0.0, 4.0, 2010.0],
       [6.0, 7.0, 13.0, ..., 0.0, 6.0, 2008.0]], dtype=object)