In [1]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

In [2]:
# Load the training data, using the "Id" column as the row index.
X = pd.read_csv("train.csv", index_col="Id")
# pop() removes "SalePrice" from X and returns it, so X keeps only the
# feature columns while y holds the regression target.
y = X.pop("SalePrice")
# Load the test data with the same "Id" index. No target is split off here
# (presumably test.csv has no "SalePrice" column - confirm against the file).
test = pd.read_csv("test.csv", index_col="Id")

In [3]:
def create_encoders(df):
    """Fit a separate LabelEncoder on each column of ``df``.

    Returns a dict mapping every column name of ``df`` to the encoder that
    was fitted on that column's values.
    """
    return {column: LabelEncoder().fit(df[column]) for column in df.columns}

def apply_encoders(df1, df2, encoder_dict):
    """Label-encode the columns of ``df1`` and ``df2`` with fitted encoders.

    For every column of ``df1`` the encoder stored under that column name in
    ``encoder_dict`` is applied to both frames. If the encoder is missing or
    a transform fails (KeyError / ValueError), a message is printed and the
    column is removed from both frames after the loop.

    Both frames are modified in place and also returned as ``(df1, df2)``.
    """
    failed_columns = []
    for name in tqdm(df1.columns):
        try:
            encoder = encoder_dict[name]
            df1[name] = encoder.transform(df1[name])
            df2[name] = encoder.transform(df2[name])
        except (KeyError, ValueError):
            print(f"Unseen value in column {name} - dropping column")
            failed_columns.append(name)
    # Remove every column that could not be encoded from both frames so they
    # keep an identical set of columns.
    for frame in (df1, df2):
        frame.drop(failed_columns, axis="columns", inplace=True)
    return df1, df2

In [4]:
# Treat every column whose dtype is neither int64 nor float64 as categorical.
columns_to_encode = [
    name for name in X.columns if X[name].dtype not in ["int64", "float64"]
]
# Pull those columns out of train and test, casting every value to str so the
# label encoders see a single value type per column.
categorical_data_train = X[columns_to_encode].astype("str")
categorical_data_test = test[columns_to_encode].astype("str")
# Encoders are fitted on the training values only ...
encoder_dict = create_encoders(categorical_data_train)
# ... and then applied to both frames; columns that fail to encode are dropped.
categorical_data_train, categorical_data_test = apply_encoders(
    categorical_data_train, categorical_data_test, encoder_dict
)

100%|██████████| 43/43 [00:00<00:00, 795.58it/s]Unseen value in column MSZoning - dropping column
Unseen value in column Utilities - dropping column
Unseen value in column Exterior1st - dropping column
Unseen value in column Exterior2nd - dropping column
Unseen value in column KitchenQual - dropping column
Unseen value in column Functional - dropping column
Unseen value in column SaleType - dropping column



In [5]:
# Replace the raw categorical columns of the training features with their
# label-encoded versions and fill the remaining missing values with 0.
# Rebinding X (instead of inplace=True) together with errors="ignore" keeps
# this cell re-runnable: on a second run the raw columns that apply_encoders
# dropped are no longer present, which made the original drop raise KeyError.
X = pd.concat(
    [X.drop(columns=columns_to_encode, errors="ignore"), categorical_data_train],
    axis="columns",
).fillna(0)
# Same treatment for the test features.
test = pd.concat(
    [test.drop(columns=columns_to_encode, errors="ignore"), categorical_data_test],
    axis="columns",
).fillna(0)

# The model is trained on X.values and later applied to test.values, i.e. on
# bare arrays without column names, so both frames must list the same columns
# in the same order.
assert list(X.columns) == list(test.columns), "train/test feature columns differ"

# Hold out 20% of the training rows for validation / early stopping. The split
# is seeded (same value as the model's seed) so results are reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.2, shuffle=True, random_state=42
)

In [6]:
def rmsle(predt, dtrain):
    '''Root mean squared log error metric.

    Parameters
    ----------
    predt : numpy.ndarray
        Predicted values. The array is left unmodified.
    dtrain : object exposing ``get_label()``
        Container whose ``get_label()`` returns the true target values.

    Returns
    -------
    tuple of (str, float)
        The metric name ``'PyRMSLE'`` and the metric value.
    '''
    y = dtrain.get_label()
    # log1p(x) is -inf at x == -1 and NaN below it, so every prediction at or
    # below -1 is evaluated as -1 + 1e-6. `<=` (not `<`) covers the boundary
    # value itself, and np.where builds a new array so the caller's
    # predictions are not overwritten.
    clipped = np.where(predt <= -1, -1 + 1e-6, predt)
    elements = np.power(np.log1p(y) - np.log1p(clipped), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))

In [7]:
# Gradient-boosted regressor: at most 1000 trees of depth 5, step size 0.1.
model = xgb.XGBRegressor(
    n_estimators=1000, max_depth=5, min_child_weight=0.5, eta=0.1, seed=42
)

# Score each boosting round on the held-out split with the custom RMSLE metric
# and stop early once 5 consecutive rounds bring no improvement.
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=rmsle,
    early_stopping_rounds=5,
    verbose=True,
)
print("Training finished")

[0]	validation_0-rmse:169040.59375	validation_0-PyRMSLE:2.30161
[1]	validation_0-rmse:152717.89062	validation_0-PyRMSLE:1.65999
[2]	validation_0-rmse:138128.51562	validation_0-PyRMSLE:1.30909
[3]	validation_0-rmse:124945.98438	validation_0-PyRMSLE:1.07102
[4]	validation_0-rmse:113251.04688	validation_0-PyRMSLE:0.90050
[5]	validation_0-rmse:102326.39062	validation_0-PyRMSLE:0.76435
[6]	validation_0-rmse:92602.57812	validation_0-PyRMSLE:0.65803
[7]	validation_0-rmse:84084.70312	validation_0-PyRMSLE:0.57198
[8]	validation_0-rmse:76572.90625	validation_0-PyRMSLE:0.50252
[9]	validation_0-rmse:69695.18750	validation_0-PyRMSLE:0.44404
[10]	validation_0-rmse:63679.85156	validation_0-PyRMSLE:0.39511
[11]	validation_0-rmse:58301.75781	validation_0-PyRMSLE:0.35454
[12]	validation_0-rmse:53419.26172	validation_0-PyRMSLE:0.31880
[13]	validation_0-rmse:49253.82031	validation_0-PyRMSLE:0.28948
[14]	validation_0-rmse:45439.06641	validation_0-PyRMSLE:0.26391
[15]	validation_0-rmse:42078.66016	validatio

In [8]:
# Predict sale prices for the prepared test features with the fitted model.
predictions = model.predict(test.values)
# Use the sample submission file, indexed by "Id", as the output template.
submission = pd.read_csv("sample_submission.csv", index_col="Id")
# The predictions are written into the column as a plain sequence of values,
# so nothing here matches them to rows by Id.
# NOTE(review): this assumes sample_submission.csv lists the same Ids in the
# same order as test.csv - confirm.
submission["SalePrice"] = predictions
# Save the filled-in template as the submission file.
submission.to_csv("submission.csv")