In [164]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

- id : 데이터 고유 id
- OverallQual : 전반적 재료와 마감 품질
- YearBuilt : 완공 연도
- YearRemodAdd : 리모델링 연도
- ExterQual : 외관 재료 품질
- BsmtQual : 지하실 높이
- TotalBsmtSF : 지하실 면적 
- 1stFlrSF : 1층 면적 
- GrLivArea : 지상층 생활 면적
- FullBath : 지상층 화장실 개수 
- KitchenQual : 부억 품질 
- GarageYrBlt : 차고 완공 연도
- GarageCars: 차고 자리 개수
- GarageArea: 차고 면적 
- target : 집값(달러 단위)

In [207]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [208]:
train = train.iloc[:, 1:]
test = test.iloc[:, 1:]

In [209]:
train = train[train['Garage Yr Blt'] < 2050]

In [210]:
train['new1'] = 2022 - train['Year Remod/Add']
test['new1'] = 2022 - test['Year Remod/Add']

In [211]:
train['new2'] = train['Total Bsmt SF'] / train['1st Flr SF']
test['new2'] = test['Total Bsmt SF'] / test['1st Flr SF']

In [212]:
train['new3'] = train['Gr Liv Area'] / train['1st Flr SF']
test['new3'] = test['Gr Liv Area'] / test['1st Flr SF']

In [216]:
cat_cols = ['Exter Qual', 'Kitchen Qual', 'Bsmt Qual']

In [217]:
for c in cat_cols :
    ord_df = train.groupby(c).target.median().reset_index(name = f'ord_{c}')
    train = pd.merge(train, ord_df, how = 'left')
    test = pd.merge(test, ord_df, how = 'left')

In [219]:
train.drop(cat_cols, axis = 1, inplace = True)
test.drop(cat_cols, axis = 1, inplace = True)

In [220]:
X = train.drop(['Garage Area', '1st Flr SF', 'target'], axis = 1)
y = np.log1p(train.target)

target = test[X.columns]

In [221]:
target.fillna(target.mean(), inplace = True)

In [222]:
fold_pred = np.zeros(target.shape[0])
val_nmae = []

In [177]:
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    tr_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    target_data = Pool(data = target, label = None)
    
    rf = RandomForestRegressor(random_state = 42, criterion = 'mae')
    gbr = GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate = 0.05, n_estimators = 1000)
    cb = CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0)
    ngb = NGBRegressor(random_state = 42, n_estimators = 1000, verbose = 0, learning_rate = 0.03)
    xgb = XGBRegressor(random_state = 42, max_depth = 4, n_estimators = 2000, learning_rate = 0.03, objective = 'reg:squarederror')
    
    print(f'{rf.__class__.__name__} Training...')
    rf.fit(tr_x, tr_y)
    print(f'{gbr.__class__.__name__} Training...')
    gbr.fit(tr_x, tr_y)
    print(f'{cb.__class__.__name__} Training...')
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 750, verbose = 0)
    print(f'{ngb.__class__.__name__} Training...')
    ngb.fit(tr_x, tr_y, val_x, val_y, early_stopping_rounds = 300)
    print(f'{xgb.__class__.__name__} Training...')
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 300, verbose = 0, eval_metric = 'mae')
    
    rf_val = np.expm1(rf.predict(val_x)) / 5
    gbr_val = np.expm1(gbr.predict(val_x)) / 5
    cb_val = np.expm1(cb.predict(val_data)) / 5
    ngb_val = np.expm1(ngb.predict(val_x)) / 5
    xgb_val = np.expm1(xgb.predict(val_x)) / 5
    val_pred = (rf_val + gbr_val + cb_val + ngb_val + xgb_val)
    
    fold_nmae = NMAE(val_y, val_pred)
    val_nmae.append(fold_nmae)
    print(f'{n + 1} FOLD Training Done...Then Validation NMAE = {fold_nmae}\n')
    
    rf_fold = rf.predict(target) / 50
    gbr_fold = gbr.predict(target) / 50
    cb_fold = cb.predict(target_data) / 50
    ngb_fold = ngb.predict(target) / 50
    xgb_fold = xgb.predict(target) / 50
    fold_pred += (rf_fold + gbr_fold + cb_fold + ngb_fold + xgb_fold)
    
print(f'10FOLD Mean of NMAE = {np.mean(val_nmae)} & std = {np.std(val_nmae)}')

1 FOLD Training.....
RandomForestRegressor Training...
GradientBoostingRegressor Training...
CatBoostRegressor Training...
NGBRegressor Training...
XGBRegressor Training...
1 FOLD Training Done...Then Validation NMAE = 0.08542399105821648

2 FOLD Training.....
RandomForestRegressor Training...
GradientBoostingRegressor Training...
CatBoostRegressor Training...
NGBRegressor Training...
XGBRegressor Training...
2 FOLD Training Done...Then Validation NMAE = 0.1032520237395069

3 FOLD Training.....
RandomForestRegressor Training...
GradientBoostingRegressor Training...
CatBoostRegressor Training...
NGBRegressor Training...
XGBRegressor Training...
3 FOLD Training Done...Then Validation NMAE = 0.08798005132210708

4 FOLD Training.....
RandomForestRegressor Training...
GradientBoostingRegressor Training...
CatBoostRegressor Training...
NGBRegressor Training...
XGBRegressor Training...
4 FOLD Training Done...Then Validation NMAE = 0.10444093250764225

5 FOLD Training.....
RandomForestRegresso

In [178]:
final = (rf_pred + xgb_pred + ngb_pred + cb_pred + gbr_pred) / 5

In [179]:
submission['target'] = np.expm1(fold_pred)

In [180]:
submission.to_csv('1st.csv', index = False)

In [181]:
submission

Unnamed: 0,id,target
0,1,325811.740877
1,2,127029.983100
2,3,177040.326194
3,4,239331.746179
4,5,133204.736743
...,...,...
1345,1346,329106.241305
1346,1347,127072.747177
1347,1348,70728.175651
1348,1349,191231.181824
