In [68]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

- id : 데이터 고유 id
- OverallQual : 전반적 재료와 마감 품질
- YearBuilt : 완공 연도
- YearRemodAdd : 리모델링 연도
- ExterQual : 외관 재료 품질
- BsmtQual : 지하실 높이
- TotalBsmtSF : 지하실 면적 
- 1stFlrSF : 1층 면적 
- GrLivArea : 지상층 생활 면적
- FullBath : 지상층 화장실 개수 
- KitchenQual : 부억 품질 
- GarageYrBlt : 차고 완공 연도
- GarageCars: 차고 자리 개수
- GarageArea: 차고 면적 
- target : 집값(달러 단위)

In [69]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [70]:
train = train.iloc[:, 1:]
test = test.iloc[:, 1:]

In [71]:
train.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Exter Qual,Garage Cars,Garage Area,Kitchen Qual,Total Bsmt SF,1st Flr SF,Bsmt Qual,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt,target
0,10,2392,Ex,3,968,Ex,2392,2392,Ex,2,2003,2003,2003,386250
1,7,1352,Gd,2,466,Gd,1352,1352,Ex,2,2006,2007,2006,194000
2,5,900,TA,1,288,TA,864,900,TA,1,1967,1967,1967,123000
3,5,1174,TA,2,576,Gd,680,680,TA,1,1900,2006,2000,135000
4,7,1958,Gd,3,936,Gd,1026,1026,Gd,2,2005,2005,2005,250000


In [72]:
train = train[train['Garage Yr Blt'] < 2050]

In [73]:
train['new1'] = 2022 - train['Year Remod/Add']
test['new1'] = 2022 - test['Year Remod/Add']

In [74]:
train['new2'] = train['Total Bsmt SF'] / train['1st Flr SF']
test['new2'] = test['Total Bsmt SF'] / test['1st Flr SF']

In [75]:
train['new3'] = train['Gr Liv Area'] / train['1st Flr SF']
test['new3'] = test['Gr Liv Area'] / test['1st Flr SF']

In [76]:
train['EQ'] = train['Exter Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})
test['EQ'] = test['Exter Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})

In [77]:
train['KQ'] = train['Kitchen Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})
test['KQ'] = test['Kitchen Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})

In [78]:
train['BQ'] = train['Bsmt Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})
test['BQ'] = test['Bsmt Qual'].map({'Ex' : 5, 'Gd' : 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1})

In [79]:
train['TQ'] = train['Overall Qual'] * 0.5 + train['EQ'] * 0.3 + train['KQ'] * 0.13 + train['BQ'] * 0.07
test['TQ'] = test['Overall Qual'] * 0.5 + test['EQ'] * 0.3 + test['KQ'] * 0.13 + test['BQ'] * 0.07

In [80]:
cat_cols = ['Exter Qual', 'Kitchen Qual', 'Bsmt Qual']

In [81]:
for c in cat_cols :
    ord_df = train.groupby(c).target.median().reset_index(name = f'ord_{c}')
    train = pd.merge(train, ord_df, how = 'left')
    test = pd.merge(test, ord_df, how = 'left')

In [82]:
train.drop(cat_cols, axis = 1, inplace = True)
test.drop(cat_cols, axis = 1, inplace = True)

In [83]:
X = train.drop(['KQ', 'EQ', 'BQ', 'Overall Qual', 'Garage Area', '1st Flr SF', 'target'], axis = 1)
y = np.log1p(train.target)

target = test[X.columns]

In [84]:
target.fillna(target.mean(), inplace = True)

In [86]:
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [91]:
n_models = 5

In [87]:
def NMAE(true, pred) -> float:
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [93]:
val_nmae = []
rf_nmae = []
ngb_nmae = []
xgb_nmae = []
cb_nmae = []
gbr_nmae = []
fold_pred = np.zeros(target.shape[0])
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'##### {n + 1} FOLD Training..... #####')
    
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    tr_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    target_data = Pool(data = target, label = None)
    
    rf = RandomForestRegressor(random_state = 42, criterion = 'mae')
    cb = CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0)
    ngb = NGBRegressor(random_state = 42, n_estimators = 2000, verbose = 0, learning_rate = 0.03)
    xgb = XGBRegressor(random_state = 42, max_depth = 4, n_estimators = 2000, learning_rate = 0.03, objective = 'reg:squarederror')
    gbr = GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate = 0.05, n_estimators = 1500)
    
    rf.fit(tr_x, tr_y)
    rf_val = np.expm1(rf.predict(val_x))
    rf_score = NMAE(val_y, rf_val)
    rf_nmae.append(rf_score)
    print(f'{gbr.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, rf_val)}')
    gbr.fit(tr_x, tr_y)
    gbr_val = np.expm1(gbr.predict(val_x))
    gbr_score = NMAE(val_y, gbr_val)
    gbr_nmae.append(gbr_score)
    print(f'{rf.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, gbr_val)}')
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 750, verbose = 0)
    cb_val = np.expm1(cb.predict(val_data))
    cb_score = NMAE(val_y, cb_val)
    cb_nmae.append(cb_score)
    print(f'{cb.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, cb_val)}')
    ngb.fit(tr_x, tr_y, val_x, val_y, early_stopping_rounds = 500)
    ngb_val = np.expm1(ngb.predict(val_x))
    ngb_score = NMAE(val_y, ngb_val)
    ngb_nmae.append(ngb_score)
    print(f'{ngb.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, ngb_val)}')
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 500, verbose = 0, eval_metric = 'mae')
    xgb_val = np.expm1(xgb.predict(val_x))
    xgb_score = NMAE(val_y, xgb_val)
    xgb_nmae.append(xgb_score)
    print(f'{xgb.__class__.__name__} Training Completed...NMAE = {NMAE(val_y, xgb_val)}\n')
    val_pred = (rf_val + ngb_val + cb_val + xgb_val + gbr_val) / n_models
    
    fold_nmae = NMAE(val_y, val_pred)
    val_nmae.append(fold_nmae)
    print(f'{n + 1} FOLD Validation NMAE = {fold_nmae}\n')
    
    rf_fold = rf.predict(target) / (n_models * kf.n_splits)
    cb_fold = cb.predict(target_data) / (n_models * kf.n_splits)
    ngb_fold = ngb.predict(target) / (n_models * kf.n_splits)
    xgb_fold = xgb.predict(target) / (n_models * kf.n_splits)
    gbr_fold = gbr.predict(target) / (n_models * kf.n_splits)
    
    fold_pred += (gbr_fold + rf_fold  + cb_fold + ngb_fold + xgb_fold)
    
print(f'{"#" * 30}')
print(f'{kf.n_splits}FOLD Mean of NMAE = {np.mean(val_nmae)} & std = {np.std(val_nmae)}')
print(f'##### 각 모델별 NMAE #####')
print(f'{rf.__class__.__name__} {kf.n_splits} FOLD Mean of NMAE = {np.mean(rf_nmae)}')
print(f'{cb.__class__.__name__} {kf.n_splits} FOLD Mean of NMAE = {np.mean(cb_nmae)}')
print(f'{ngb.__class__.__name__} {kf.n_splits} FOLD Mean of NMAE = {np.mean(ngb_nmae)}')
print(f'{gbr.__class__.__name__} {kf.n_splits} FOLD Mean of NMAE = {np.mean(gbr_nmae)}') 

##### 1 FOLD Training..... #####
GradientBoostingRegressor Training Completed...NMAE = 0.08838915843249007
RandomForestRegressor Training Completed...NMAE = 0.0842268053467435
CatBoostRegressor Training Completed...NMAE = 0.09118989676521634
NGBRegressor Training Completed...NMAE = 0.08706374447283997
XGBRegressor Training Completed...NMAE = 0.08794786559552827

1 FOLD Validation NMAE = 0.08360459935022721

##### 2 FOLD Training..... #####
GradientBoostingRegressor Training Completed...NMAE = 0.10717508791112941
RandomForestRegressor Training Completed...NMAE = 0.10367550210293036
CatBoostRegressor Training Completed...NMAE = 0.10100829929028095
NGBRegressor Training Completed...NMAE = 0.10016189496716514
XGBRegressor Training Completed...NMAE = 0.10707083894109641

2 FOLD Validation NMAE = 0.10116735953971168

##### 3 FOLD Training..... #####
GradientBoostingRegressor Training Completed...NMAE = 0.09010065472551637
RandomForestRegressor Training Completed...NMAE = 0.09152176438696287


In [94]:
final = (rf_fold + xgb_fold + ngb_fold + cb_fold + gbr_fold) / 5

In [95]:
submission['target'] = np.expm1(fold_pred)

In [96]:
submission.to_csv('0201.csv', index = False)

In [97]:
submission

Unnamed: 0,id,target
0,1,322050.409282
1,2,129110.238998
2,3,177486.116158
3,4,239205.253992
4,5,132734.547109
...,...,...
1345,1346,328635.134851
1346,1347,127731.296279
1347,1348,73987.790914
1348,1349,184172.578725
