In [7]:
import pandas as pd

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv("data/housing/train.csv", index_col="Id")

In [3]:
# Separate variables

X = df.drop('SalePrice', axis=1)
y = df.SalePrice

In [5]:
# Separate columns
categorical_object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

categorical_num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64'] and 
                        X_train[col].nunique() <= 10]

numerical_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64'] and 
                        X_train[col].nunique() > 10]


numerical_transformer = SimpleImputer(strategy="median")

categorical_obj_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')),
           ('onehot', OneHotEncoder(handle_unknown="ignore"))
    ])

categorical_num_transformer = SimpleImputer(strategy="most_frequent")

preprocessor = ColumnTransformer(
    transformers=[
      ('num', numerical_transformer, numerical_cols),
      ('cat_obj', categorical_obj_transformer, categorical_object_cols),
      ('cat_num', categorical_num_transformer, categorical_num_cols)
])

model = XGBRegressor()

pipeline = Pipeline(
    steps=[
       ('preprocessor', preprocessor),
       ('model', model)
    ])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='median'),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'TotRmsAbvGrd',
                                                   'GarageYrBlt', 'GarageArea',
                                                   'WoodDeckSF', 'OpenPorchSF...
                              gamma=0, gpu_id=-1,

In [10]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X_train)):
    
    print("-"*15, "Fold", fold, "-"*15)
    X_train, X_valid = X.iloc[train_indicies], X.iloc[valid_indicies]
    y_train, y_valid = y.iloc[train_indicies].values, y.iloc[valid_indicies].values 

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_valid)
    score = mean_squared_error(y_valid, y_pred, squared=False)
    print("Score: ", score)
    cv_scores.append(score)

--------------- Fold 0 ---------------
Score:  39741.44948432244
--------------- Fold 1 ---------------
Score:  31421.92662987563
--------------- Fold 2 ---------------
Score:  48578.97841047301
--------------- Fold 3 ---------------
Score:  33027.96401406169
--------------- Fold 4 ---------------
Score:  27908.62848880376
