In [1]:
import pandas as pd
import numpy as np
import missingno
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin, clone
from sklearn.pipeline import Pipeline

In [20]:
class preprocessor():
  def __init__(self):
    pass
  def fit(self, X, test=False):
    y = None
    if not test:
      X = X.dropna(axis=0, subset=['SalePrice'])
      y = X.SalePrice
      X = X.drop(columns=['SalePrice'])
    X = X.drop(columns=['Id'])

    self.na_means_none = ['Street', 'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
      'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 
      'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType']

    X[self.na_means_none] = X[self.na_means_none].fillna(0)
    X.MiscVal.fillna(0, inplace=True)
    X.LotFrontage.fillna(0, inplace=True)
    X.MasVnrArea.fillna(0, inplace=True)
    X.GarageYrBlt.fillna(X.GarageYrBlt.median(), inplace=True)
    X.Electrical.fillna(0, inplace=True)

    self.numeric_columns = {col for col in X.columns if X[col].dtype in ['int64', 'float64']}
    self.categorical_columns = {col for col in X.columns if X[col].dtype == 'object'}
    self.categorical_columns.add('MSSubClass')
    self.numeric_columns.remove('MSSubClass')

    self.ordinal_columns = {}
    '''{'ExterQual', 
      'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
      'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir',
      'KitchenQual', 'FireplaceQu', 'GarageFinish','GarageQual',
      'GarageCond', 'PavedDrive', 'PoolQC', 'Fence'}'''
    self.ohe_columns = self.categorical_columns #- ordinal_columns

    self.imp_cat = SimpleImputer(strategy='most_frequent')
    self.imp_cat.fit(X[list(self.ohe_columns)])

    self.imp_num = SimpleImputer(strategy='median')
    self.imp_num.fit(X[list(self.numeric_columns)])

    X[list(self.ohe_columns)] = X[list(self.ohe_columns)].astype(str)


    self.cols = self.numeric_columns | self.categorical_columns 
    self.ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    self.ohe_encoder.fit(X[self.ohe_columns])
    self.ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=0)
    self.ord_encoder.fit(X[self.ordinal_columns])

  def transform(self, X, test=False):
    y = None
    if not test:
      X = X.dropna(axis=0, subset=['SalePrice'])
      y = X.SalePrice
      X = X.drop(columns=['SalePrice'])
    X = X.drop(columns=['Id'])

    X[self.na_means_none] = X[self.na_means_none].fillna(0)
    X.MiscVal.fillna(0, inplace=True)
    X.LotFrontage.fillna(0, inplace=True)
    X.MasVnrArea.fillna(0, inplace=True)
    X.GarageYrBlt.fillna(X.GarageYrBlt.median(), inplace=True)
    X.Electrical.fillna(0, inplace=True)
    
    X[list(self.ohe_columns)] = self.imp_cat.transform(X[list(self.ohe_columns)])
    X[list(self.numeric_columns)] = self.imp_num.transform(X[list(self.numeric_columns)])

    X[list(self.ohe_columns)] = X[list(self.ohe_columns)].astype(str)
    self.ohe = self.ohe_encoder.transform(X[self.ohe_columns])
    self.ohe = pd.DataFrame(self.ohe, index=X.index, columns=[str(i) + '_ohe' for i in range(self.ohe.shape[1])])
    self.orde = self.ord_encoder.transform(X[self.ordinal_columns])
    self.orde = pd.DataFrame(self.orde, index=X.index, columns=[str(i) + '_ord' for i in range(self.orde.shape[1])])

    return pd.concat([X.drop(columns=self.ohe_columns), self.ohe], axis=1), y

In [34]:
from xgboost import XGBRegressor, train
from sklearn.ensemble import RandomForestRegressor

In [4]:
test_data_for_submission = pd.read_csv('./house-prices-advanced-regression-techniques/test.csv')
train = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')

In [21]:
P = preprocessor()
P.fit(train)
X, y = P.transform(train)

In [7]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [35]:
model = XGBRegressor(random_state=0, tree_method='gpu_hist', n_jobs=  )

In [40]:
param_dict = {'n_estimators':[i for i in range(100, 1500, 700)],#[i for i in range(100, 1000, 200)],
              'learning_rate':[i/1000 for i in range(10, 100, 50)],
              #'max_depth':[i for i in range(3, 12, 6)],
              #'min_child_weight':[i for i in range(1, 10, 5)],
              #'eta':[i for i in range(3, 12, 6)],
              #'subsample':[i/10 for i in range(5, 10, 3)],
              'colsample_bytree':[i/10 for i in range(1, 10, 5)],
              'objective':['reg:squarederror'],
              'eval_metric':['mae'],
              'early_stopping_rounds':[10],
              'gamma':[i/10 for i in range(1, 10, 5)]
              }#[i / 100.0 for i in range(1, 10, 2)]}

clf = GridSearchCV(estimator=model, param_grid=param_dict, scoring='neg_mean_absolute_error', n_jobs=8)

clf.fit(X=X_train_full, y=y_train)

pred = clf.predict(X_valid_full)

mean_absolute_error(pred, y_valid)

In [23]:
P.transform(test_data_for_submission, test=True)[0]

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,273_ohe,274_ohe,275_ohe,276_ohe,277_ohe,278_ohe,279_ohe,280_ohe,281_ohe,282_ohe
0,80.0,11622.0,5.0,6.0,1961.0,1961.0,0.0,468.0,144.0,270.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,81.0,14267.0,6.0,6.0,1958.0,1958.0,108.0,923.0,0.0,406.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,74.0,13830.0,5.0,5.0,1997.0,1998.0,0.0,791.0,0.0,137.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,78.0,9978.0,6.0,6.0,1998.0,1998.0,20.0,602.0,0.0,324.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,43.0,5005.0,8.0,5.0,1992.0,1992.0,0.0,263.0,0.0,1017.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,21.0,1936.0,4.0,7.0,1970.0,1970.0,0.0,0.0,0.0,546.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1455,21.0,1894.0,4.0,5.0,1970.0,1970.0,0.0,252.0,0.0,294.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1456,160.0,20000.0,5.0,7.0,1960.0,1996.0,0.0,1224.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,62.0,10441.0,5.0,5.0,1992.0,1992.0,0.0,337.0,0.0,575.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
pd.Series(clf.predict(P.transform(test_data_for_submission, test=True)[0]), index=test_data_for_submission.Id, name='SalePrice').to_csv('submission.csv')

In [29]:
pd.read_csv('./house-prices-advanced-regression-techniques/sample_submission.csv')

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [51]:
forest.fit(X=X_train_enc, y=y_train)

RandomForestRegressor(n_estimators=350, random_state=0)

In [53]:
f_predictions = forest.predict(X_valid_enc)
mean_absolute_error(f_predictions, y_valid)

17246.714999999997

In [28]:
predictions = clf.predict(X_valid_enc)

In [29]:
mean_absolute_error(predictions, y_valid)

16476.368792808218