# XGBoost Regression

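XGBoost with the following configuration reached a root-mean-squared error (RMSE) of 0.1345. Note that this figure is not computed by any cell shown below and presumably comes from an externally scored submission; the hold-out evaluation in this notebook prints an RMSE of about 21,284 and an R² of about 0.937.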

In [15]:
import pandas as pd
import numpy as np

In [16]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [17]:
# Inputs: the training table plus two test files. `testset.csv` supplies the
# features that get predicted; `test.csv` supplies the `Id` column that is
# written next to the predictions.
train, testset, test_all = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset.csv', 'testset.csv', 'test.csv')
)
# Optional row filter, currently disabled:
# train = train[train['SalePrice'] < 500000]

In [18]:
# The identifier column is not a feature: remove it from both tables.
train = train.drop(columns=['Id'])
test = testset.drop(columns=['Id'])

In [19]:
# Every column but the last is a feature; the last column is the target.
X, y = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()
# The test table has no target column, so all of it is feature data.
test_data = test.to_numpy()

In [20]:
def transform_columns(X, test_data, columns):
    """One-hot encode the selected columns of the training and test features.

    The encoder is fitted on ``X`` only and then applied to ``test_data``, so
    both results share the same column layout. Columns not listed in
    ``columns`` are passed through unchanged.

    Args:
        X: Training feature array used to fit the encoder.
        test_data: Feature array encoded with the categories learned from ``X``.
        columns: Column selector forwarded to ``ColumnTransformer`` (this
            notebook passes a list of integer column positions).

    Returns:
        Tuple ``(X, test_data)`` with the encoded training and test features,
        exactly as returned by ``ColumnTransformer`` (this may be a SciPy
        sparse matrix).
    """
    # handle_unknown='ignore': a category that occurs only in test_data is
    # encoded as all zeros for that feature instead of making transform() raise.
    encoder = OneHotEncoder(handle_unknown='ignore')
    ct = ColumnTransformer(
        transformers=[('encoder', encoder, columns)],
        remainder='passthrough',
    )
    X = ct.fit_transform(X)
    test_data = ct.transform(test_data)

    return X, test_data

def standardize(X_train, X_test, test_data):
    """Scale all three feature matrices with a scaler fitted on ``X_train``.

    The scaler is created with ``with_mean=False``, i.e. features are scaled
    but not centered. Only ``X_train`` is used for fitting; ``X_test`` and
    ``test_data`` are transformed with the same fitted scaler.

    Returns:
        Tuple ``(X_train, X_test, test_data)`` of the scaled matrices.
    """
    scaler = StandardScaler(with_mean=False).fit(X_train)
    return tuple(
        scaler.transform(features)
        for features in (X_train, X_test, test_data)
    )

def get_model():
    """Return a new, unfitted ``XGBRegressor`` with this notebook's settings."""
    hyperparams = {
        'objective': 'reg:squarederror',
        'colsample_bytree': 0.75,
        'learning_rate': 0.08,
        'max_depth': 2,
        # The repr of the fitted model recorded in this notebook lists this
        # value both as alpha=10 and as reg_alpha=10.
        'alpha': 10,
        'n_estimators': 1230,
    }
    return xgb.XGBRegressor(**hyperparams)
    
def preds_to_file(test_all, preds, filename):
    """Write predictions to a two-column CSV file and report the file name.

    Args:
        test_all: Table whose ``Id`` column labels the predictions.
        preds: Predicted values, one per row of ``test_all``.
        filename: Path of the CSV file to create.
    """
    submission_columns = {'Id': test_all.Id, 'SalePrice': preds}
    # index=False: only the Id and SalePrice columns are written.
    pd.DataFrame(submission_columns).to_csv(filename, index=False)
    print(f"File {filename} created!")

In [21]:
 # Positions (within the feature array) of the columns to one-hot encode.
 categorical_cols = [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 36, 37, 38, 39, 50, 52, 54, 56, 59, 60, 61, 71, 72]
 X, test_data = transform_columns(X, test_data, categorical_cols)
 # ColumnTransformer only returns a SciPy sparse matrix while the stacked
 # output's density stays below its sparse_threshold; otherwise the result is
 # already a dense ndarray, which has no .toarray(). Densify only when needed.
 X = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
 test_data = test_data.toarray() if hasattr(test_data, "toarray") else np.asarray(test_data)

In [22]:
# Hold out 30% of the rows for evaluation (fixed seed), then scale all feature
# matrices with a scaler fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
X_train, X_test, test_data = standardize(
    X_train=X_train, X_test=X_test, test_data=test_data
)

In [23]:
# Build the configured regressor and train it on the scaled training split.
# The bare fit() expression stays last so the cell displays the fitted model.
regressor = get_model()
regressor.fit(X=X_train, y=y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.75, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.08, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1230, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
# R^2 of the predictions on the held-out split.
y_pred = regressor.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.9365535174320302


In [25]:
# RMSE on the held-out split, computed directly on y_test / y_pred as given.
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_pred)
)
print("RMSE: %f" % rmse)

RMSE: 21284.433245


In [14]:
# Predict on the encoded, scaled test features and write them next to the ids
# taken from the original test file.
preds = regressor.predict(test_data)
preds_to_file(test_all=test_all, preds=preds, filename="xboost_reg_02.csv")

File xboost_reg_02.csv created!
