In [1]:
# Planning Steps for this project:
# 1. import all relevant libraries and functions
# 2. read in data and then decide numerical_columns and categorical_columns
# 3. train test split X and y
# 4. define pipeline with preprocessors and XGBRegressor model
# 5. fit the model and make predictions
# 6. make improvements and then use all training data
# 7. submit (woohoo)

In [2]:
# 1.1 import general libraries

In [3]:
import numpy as np
import pandas as pd
import sklearn as sk
import xgboost as xg
import joblib

In [4]:
# 1.2 import specific libraries

In [5]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# pipeline / preprocessing 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# xgboost
from xgboost import XGBRegressor

In [6]:
# 2.1 read in the data

In [7]:
train_df = pd.read_csv(r"C:\Users\miria\OneDrive\Documents\DS Learn\practice datasets\kagglehousingpricesdata\train.csv")
test_df = pd.read_csv(r"C:\Users\miria\OneDrive\Documents\DS Learn\practice datasets\kagglehousingpricesdata\test.csv")

In [8]:
# 2.2 determine numerical_columns and categorical_columns features

In [9]:
numerical_columns = ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'PoolArea', 'YrSold', 'MoSold']
categorical_columns = ['Street', 'Alley', 'LotShape', 'Utilities', 'LotConfig', 'Neighborhood', 'ExterQual', 'Foundation', 'BsmtCond', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'PoolQC', 'Fence', 'SaleCondition']
features = numerical_columns + categorical_columns

In [10]:
# 3.1 set X and y

In [11]:
X = train_df[features]
y = train_df[['SalePrice']]

In [12]:
# 3.2 train test split X and y

In [13]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [14]:
# 4.1 Set preprocessor for numerical data

In [15]:
numerical_transformer = SimpleImputer(strategy = 'constant').set_output(transform="pandas")

In [16]:
# 4.2 Set preprocessor for categorical data

In [17]:
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')), 
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
]).set_output(transform="pandas")

In [18]:
# 4.3 bundle the preprocessors together

In [19]:
preprocessor = ColumnTransformer(transformers = [
    ('num', numerical_transformer, numerical_columns), 
    ('cat', categorical_transformer, categorical_columns)
]).set_output(transform="pandas")

In [20]:
# 4.4 set XGBRegressor model

In [21]:
my_model = XGBRegressor(n_estimators = 500, 
                        learning_rate = 0.05, 
                        random_state = 0,
                        early_stopping_rounds = 10,
                        enable_categorical = True
                        )

In [22]:
# use preprocessor on training data

In [23]:
encoder = preprocessor.fit(train_X)
train_X = encoder.transform(train_X)
val_X = encoder.transform(val_X)

In [24]:
# 5.1 fit the model, using early stopping rounds on a pipeline

In [25]:
my_model.fit(X = train_X, 
             y = train_y,
             eval_set = [(val_X, val_y)])

[0]	validation_0-rmse:77929.03440
[1]	validation_0-rmse:74778.50371
[2]	validation_0-rmse:71761.23248
[3]	validation_0-rmse:69111.46696
[4]	validation_0-rmse:66599.76096
[5]	validation_0-rmse:64111.89112
[6]	validation_0-rmse:61971.93466
[7]	validation_0-rmse:59919.47904
[8]	validation_0-rmse:57998.88474
[9]	validation_0-rmse:56173.98456
[10]	validation_0-rmse:54432.55296
[11]	validation_0-rmse:52827.70599
[12]	validation_0-rmse:51262.84428
[13]	validation_0-rmse:49831.18695
[14]	validation_0-rmse:48377.98695
[15]	validation_0-rmse:47001.35468
[16]	validation_0-rmse:45653.58627
[17]	validation_0-rmse:44626.70119
[18]	validation_0-rmse:43410.57262
[19]	validation_0-rmse:42285.86983
[20]	validation_0-rmse:41416.00345
[21]	validation_0-rmse:40487.56258
[22]	validation_0-rmse:39736.46765
[23]	validation_0-rmse:38964.34223
[24]	validation_0-rmse:38286.55196
[25]	validation_0-rmse:37655.93722
[26]	validation_0-rmse:37044.49299
[27]	validation_0-rmse:36508.26317
[28]	validation_0-rmse:36020.2

In [26]:
# 5.2 make predictions using the model

In [27]:
predictions = my_model.predict(val_X)

In [28]:
# 5.3 get mean absolute error

In [29]:
initial_mae = mean_absolute_error(predictions, val_y)
print(initial_mae)

17833.93486729452


In [30]:
# 6.1 use preprocessor on all training data

In [31]:
all_encoder = preprocessor.fit(X)
X = all_encoder.transform(X)
X = X.drop(['cat__Utilities_NoSeWa', 'cat__Heating_Floor', 'cat__Heating_OthW', 'cat__Electrical_Mix', 'cat__PoolQC_Fa'], axis=1)


In [32]:
# 6.2 fit the model on all training data

In [33]:
my_model.fit(X = X,
             y = y,
             eval_set = [(X, y)])

[0]	validation_0-rmse:76109.64422
[1]	validation_0-rmse:72996.42853
[2]	validation_0-rmse:70046.57316
[3]	validation_0-rmse:67245.04094
[4]	validation_0-rmse:64572.78212
[5]	validation_0-rmse:62036.06241
[6]	validation_0-rmse:59615.59204
[7]	validation_0-rmse:57337.40322
[8]	validation_0-rmse:55168.07293
[9]	validation_0-rmse:53081.02254
[10]	validation_0-rmse:51113.04684
[11]	validation_0-rmse:49222.58261
[12]	validation_0-rmse:47376.11394
[13]	validation_0-rmse:45613.75194
[14]	validation_0-rmse:44010.84106
[15]	validation_0-rmse:42429.95087
[16]	validation_0-rmse:40931.64599
[17]	validation_0-rmse:39545.72632
[18]	validation_0-rmse:38243.89913
[19]	validation_0-rmse:36947.03624
[20]	validation_0-rmse:35741.35637
[21]	validation_0-rmse:34556.36780
[22]	validation_0-rmse:33471.74139
[23]	validation_0-rmse:32449.91874
[24]	validation_0-rmse:31487.94348
[25]	validation_0-rmse:30566.57826
[26]	validation_0-rmse:29674.31090
[27]	validation_0-rmse:28779.53064
[28]	validation_0-rmse:27962.7

In [34]:
# 6.3 make predictions on test data

In [35]:
test_X = test_df[features]

test_encoder = preprocessor.fit(test_X)
test_X = test_encoder.transform(test_X)

test_preds = my_model.predict(test_X, validate_features = False)

In [36]:
# 7 submit

In [37]:
output = pd.DataFrame({'Id': test_df.Id, 'SalePrice': test_preds})
output.to_csv(r"C:\Users\miria\OneDrive\Documents\DS Learn\output datasets\kagglehousepricessubmission2.csv", index=False)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object