In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [3]:
train = train.iloc[:, 1:]
test = test.iloc[:, 1:]

In [4]:
train = train[train['Garage Yr Blt'] < 2050]

In [5]:
train['new1'] = 2022 - train['Year Remod/Add']
test['new1'] = 2022 - test['Year Remod/Add']

In [6]:
train['new2'] = np.log1p(train['1st Flr SF'] / train['Total Bsmt SF'])
test['new2'] = np.log1p(test['1st Flr SF'] / test['Total Bsmt SF'])

In [7]:
train['new3'] = train['Gr Liv Area'] / train['1st Flr SF']
test['new3'] = test['Gr Liv Area'] / test['1st Flr SF']

In [8]:
train['EQ'] = train['Exter Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})
test['EQ'] = test['Exter Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})

In [9]:
train['KQ'] = train['Kitchen Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})
test['KQ'] = test['Kitchen Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})

In [10]:
train['BQ'] = train['Bsmt Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})
test['BQ'] = test['Bsmt Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})

In [11]:
train['TQ'] = train['Overall Qual'] * 0.5 + train['EQ'] * 0.3 + train['KQ'] * 0.15 + train['BQ'] * 0.05
test['TQ'] = test['Overall Qual'] * 0.5 + test['EQ'] * 0.3 + test['KQ'] * 0.15 + test['BQ'] * 0.05

In [12]:
cat_cols = ['Exter Qual', 'Kitchen Qual', 'Bsmt Qual']

In [13]:
for c in cat_cols :
    ord_df = train.groupby(c).target.mean().reset_index(name = f'ord_{c}')
    train = pd.merge(train, ord_df, how = 'left')
    test = pd.merge(test, ord_df, how = 'left')

In [14]:
train.drop(cat_cols, axis = 1, inplace = True)
test.drop(cat_cols, axis = 1, inplace = True)

In [15]:
X = train.drop(['KQ', 'EQ', 'BQ', 'Overall Qual', 'Garage Area', '1st Flr SF', 'target'], axis = 1)
y = np.log1p(train.target)

target = test[X.columns]

In [16]:
target.fillna(train.mean(), inplace = True)

In [17]:
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [18]:
n_models = 6

In [19]:
def NMAE(true, pred) -> float:
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [20]:
val_nmae = []
rf_nmae = []
ngb_nmae = []
xgb_nmae = []
cb_nmae = []
gbr_nmae = []
sr_nmae = []
fold_pred = np.zeros(target.shape[0])
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'##### {n + 1} FOLD Training..... #####')
    
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    tr_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    target_data = Pool(data = target, label = None)
    
    ### Setting the 6models
    rf = RandomForestRegressor(random_state = 2022, criterion = 'mae')
    cb = CatBoostRegressor(depth = 4, random_state = 2, loss_function = 'MAE', n_estimators = 2000, learning_rate = 0.03, verbose = 0)
    ngb = NGBRegressor(random_state = 1, n_estimators = 3000, verbose = 0, learning_rate = 0.03)
    xgb = XGBRegressor(random_state = 28, max_depth = 4, n_estimators = 3000, learning_rate = 0.03, objective = 'reg:squarederror')
    gbr = GradientBoostingRegressor(random_state = 67, max_depth = 4, learning_rate = 0.03, n_estimators = 1500)
    lgbm = LGBMRegressor(random_state = 42, max_depth = 4, n_estimators = 2000, learning_rate = 0.03, objective = 'l1')
    rg = Ridge(random_state = 203)
    
    ### RandomForest
    rf.fit(tr_x, tr_y)
    rf_val = np.expm1(rf.predict(val_x))
    rf_score = NMAE(val_y, rf_val)
    rf_nmae.append(rf_score)
    print(f'{rf.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, rf_val)}')
    
    ### GradientBoosting
    gbr.fit(tr_x, tr_y)
    gbr_val = np.expm1(gbr.predict(val_x))
    gbr_score = NMAE(val_y, gbr_val)
    gbr_nmae.append(gbr_score)
    print(f'{gbr.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, gbr_val)}')
    
    ### CatBoost
    cb.fit(tr_data)#, eval_set = val_data, early_stopping_rounds = 750, verbose = 0)
    cb_val = np.expm1(cb.predict(val_data))
    cb_score = NMAE(val_y, cb_val)
    cb_nmae.append(cb_score)
    print(f'{cb.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, cb_val)}')
    
    ### NGBoost
    ngb.fit(tr_x, tr_y, val_x, val_y, early_stopping_rounds = 500)
    ngb_val = np.expm1(ngb.predict(val_x))
    ngb_score = NMAE(val_y, ngb_val)
    ngb_nmae.append(ngb_score)
    print(f'{ngb.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, ngb_val)}')
    
    ### XGBoost
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 500, verbose = 0, eval_metric = 'mae')
    xgb_val = np.expm1(xgb.predict(val_x))
    xgb_score = NMAE(val_y, xgb_val)
    xgb_nmae.append(xgb_score)
    print(f'{xgb.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, xgb_val)}')
    
    ### Stacking
    sr = StackingRegressor(estimators = [('RF', rf), ('CB', cb), ('XGB', xgb), ('GBR', gbr), ('LGBM', lgbm)], final_estimator = rg)
    sr.fit(tr_x, tr_y)
    sr_val = np.expm1(sr.predict(val_x))
    sr_score = NMAE(val_y, sr_val)
    sr_nmae.append(sr_score)
    print(f'{sr.__class__.__name__} Training Completed...NMAE = {sr_score}\n')
    
    ### Make ensemble for validation
    val_pred = (sr_val * 0.5 + gbr_val * 0.06 + rf_val * 0.1  + cb_val * 0.2 + ngb_val * 0.1 + xgb_val * 0.04)
    fold_nmae = NMAE(val_y, val_pred)
    val_nmae.append(fold_nmae)
    print(f'********** When Ensemble, {n + 1} FOLD Validation NMAE = {fold_nmae} **********\n')
    
    ### Prediction
    rf_fold = rf.predict(target) / kf.n_splits
    cb_fold = cb.predict(target_data) / kf.n_splits
    ngb_fold = ngb.predict(target) / kf.n_splits
    xgb_fold = xgb.predict(target) / kf.n_splits
    gbr_fold = gbr.predict(target) / kf.n_splits
    sr_fold = sr.predict(target) / kf.n_splits
    
    ### Make ensemble for prediction using test data
    fold_pred += (sr_fold * 0.5 + gbr_fold * 0.06 + rf_fold * 0.1  + cb_fold * 0.2 + ngb_fold * 0.1 + xgb_fold * 0.04)
    
print(f'*-*-*-*-*-*-*-*-*-Training & Prediction Done.....-*-*-*-*-*-*-*-*-*')
print(f'Result : {kf.n_splits} FOLD Mean of NMAE = {np.mean(val_nmae)} & std = {np.std(val_nmae)}')
print(f'##### Average NMAE of Each Model #####')
print(f'{rf.__class__.__name__} = {np.mean(rf_nmae)}')
print(f'{cb.__class__.__name__} = {np.mean(cb_nmae)}')
print(f'{ngb.__class__.__name__} = {np.mean(ngb_nmae)}')
print(f'{gbr.__class__.__name__} = {np.mean(gbr_nmae)}')
print(f'{xgb.__class__.__name__} = {np.mean(xgb_nmae)}')
print(f'{sr.__class__.__name__} = {np.mean(sr_nmae)}')

##### 1 FOLD Training..... #####
RandomForestRegressor Training Completed...NMAE = 0.08597747959431033
GradientBoostingRegressor Training Completed...NMAE = 0.0816727872223358
CatBoostRegressor Training Completed...NMAE = 0.08647790572898846
NGBRegressor Training Completed...NMAE = 0.08775975983950367
XGBRegressor Training Completed...NMAE = 0.092200131779733
StackingRegressor Training Completed...NMAE = 0.08230470315960663

********** When Ensemble, 1 FOLD Validation NMAE = 0.08267089459987931 **********

##### 2 FOLD Training..... #####
RandomForestRegressor Training Completed...NMAE = 0.10898326889883439
GradientBoostingRegressor Training Completed...NMAE = 0.10155448019532137
CatBoostRegressor Training Completed...NMAE = 0.10142747433648916
NGBRegressor Training Completed...NMAE = 0.10252180606999126
XGBRegressor Training Completed...NMAE = 0.11461251601968098
StackingRegressor Training Completed...NMAE = 0.10018480583912069

********** When Ensemble, 2 FOLD Validation NMAE = 0.100

In [24]:
submission['target'] = np.expm1(fold_pred)

In [22]:
submission.to_csv('20220204.csv', index = False)

In [25]:
submission

Unnamed: 0,id,target
0,1,327868.490767
1,2,128105.089789
2,3,180720.513751
3,4,243449.384776
4,5,132904.980494
...,...,...
1345,1346,342573.258506
1346,1347,125931.246181
1347,1348,68666.944478
1348,1349,186039.292738
