In [213]:
import pandas as pd
import numpy as np
import pandas_profiling

import seaborn as sns
import matplotlib.pyplot as plt

In [214]:
# Load the training split, discard the 'id' column (not a feature) and
# remove duplicated rows.
train_df = (
    pd.read_csv('../input/daconstudy/train.csv')
    .drop(columns='id')
    .drop_duplicates()
)

# Load the test split; its ids are kept aside for the submission file.
test_df = pd.read_csv('../input/daconstudy/test.csv')
sub_id = test_df['id']
test_df = test_df.drop(columns='id')

# Build a profiling report to inspect the distribution of the training data.
profile = train_df.profile_report()

# profile

# 알 수 있는 것들
- Gr Liv Area, Garage Area, 1st Flr SF, target: 로그 변환(log1p)으로 정규화
- Data 범주화
- Garage Yr Blt 값이 2207인 행 삭제

In [215]:
# Remove the rows whose 'Garage Yr Blt' equals 2207 (an implausible build year).
is_bad_garage_year = train_df['Garage Yr Blt'] == 2207
train_df = train_df.drop(index=train_df.loc[is_bad_garage_year].index)
print('train_df Shape :', train_df.shape)

In [216]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

In [217]:
def preprocess_data(data, features):
    """Return a feature-engineered copy of ``data``.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. ``np.log1p`` is applied to every column named in ``features``.
      2. The quality grades in 'Exter Qual', 'Kitchen Qual' and 'Bsmt Qual'
         are mapped to ordinal numbers (Po=1 ... Ex=5); any value outside
         that mapping becomes NaN.
      3. Derived columns are added. They are computed from the columns as
         they stand after step 1, i.e. from log-transformed values for every
         column listed in ``features``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns referenced below.
    features : iterable of str
        Names of the columns to log1p-transform.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformed and derived columns.
    """
    category = {'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5}

    # Work on a copy so the caller's frame is never modified as a side effect.
    data = data.copy()

    # Log transform
    for feature in features:
        data[feature] = np.log1p(data[feature])

    # Ordinal encoding of the quality grades
    for quality_column in ('Exter Qual', 'Kitchen Qual', 'Bsmt Qual'):
        data[quality_column] = data[quality_column].map(category)

    # Derived features
    data['Remodeling'] = data['Year Remod/Add'] - data['Year Built']
    data['2nd SF'] = data['Gr Liv Area'] - data['1st Flr SF']
    data['2nd flr'] = (data['2nd SF'] > 0).astype(int)
    data['Total_Qual'] = data['Exter Qual'] + data['Kitchen Qual'] + data['Overall Qual'] + data['Bsmt Qual']
    data['Total_Area'] = data['Gr Liv Area'] + data['Garage Area'] + data['Total Bsmt SF']

    # Features based on Donghwa's idea.
    # A garage with zero car capacity would divide by zero (NaN/inf), which most
    # regressors reject; use 0 for those rows. Missing values stay NaN.
    car_area = data['Garage Area'] / data['Garage Cars']
    data['Car Area'] = car_area.mask(data['Garage Cars'] == 0, 0.0)
    # 1 when 'Gr Liv Area' differs from '1st Flr SF', else 0.
    data['Garage InOut'] = (data['Gr Liv Area'] != data['1st Flr SF']).astype(int)

    return data

# Columns that get the log1p transform; the training frame additionally
# transforms the 'target' column.
features_test = ['Total Bsmt SF', '1st Flr SF', 'Gr Liv Area', 'Garage Area']
features_train = features_test + ['target']

train_df = preprocess_data(train_df, features_train)
test_df = preprocess_data(test_df, features_test)

# Separate the (log1p-transformed) label from the training features.
train_label = train_df['target']
train_df = train_df.drop(columns='target')

In [218]:
# Metric
def NMAE(true, pred):
    """Normalized mean absolute error.

    Computes mean(|true - pred|) divided by mean(|true|).
    """
    absolute_error = np.abs(true - pred)
    normalizer = np.mean(np.abs(true))
    return np.mean(absolute_error) / normalizer

In [189]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [190]:
# Scorer built from NMAE; greater_is_better=False marks it as a loss.
nmae_score = make_scorer(NMAE, greater_is_better=False)

# Shuffled K-fold splitter shared by the cross-validation cells.
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)
n_folds = kf.get_n_splits()

# LinearRegression
# Test predictions are averaged over the folds in log1p space
# (the label was log1p-transformed during preprocessing).
lr_pred = np.zeros(test_df.shape[0])

lr_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n+1} Fold Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    lr = LinearRegression()
    lr.fit(train_x,train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(lr.predict(valid_x))
    val_NMAE = NMAE(np.expm1(valid_y), val_pred)

    lr_val.append(val_NMAE)
    print(f'{n+1} FOLD NMAE = {val_NMAE}')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = lr.predict(test_df) / n_folds

    lr_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(lr_val)} & std = {np.std(lr_val)}')

In [202]:
# Ridge
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
rg_pred = np.zeros(test_df.shape[0])

rg_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n+1} Fold Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    rg = Ridge()
    rg.fit(train_x,train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(rg.predict(valid_x))
    val_NMAE = NMAE(np.expm1(valid_y), val_pred)

    rg_val.append(val_NMAE)
    print(f'{n+1} FOLD NMAE = {val_NMAE}')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = rg.predict(test_df) / n_folds

    rg_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(rg_val)} & std = {np.std(rg_val)}')

In [203]:
# Lasso
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
lasso_pred = np.zeros(test_df.shape[0])

lasso_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n+1} Fold Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    ls = Lasso()
    ls.fit(train_x,train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(ls.predict(valid_x))
    val_NMAE = NMAE(np.expm1(valid_y), val_pred)

    lasso_val.append(val_NMAE)
    print(f'{n+1} FOLD NMAE = {val_NMAE}')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = ls.predict(test_df) / n_folds

    lasso_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(lasso_val)} & std = {np.std(lasso_val)}')

In [204]:
# ElasticNet
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
els_pred = np.zeros(test_df.shape[0])

els_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n+1} Fold Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    els = ElasticNet()
    els.fit(train_x,train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(els.predict(valid_x))
    val_NMAE = NMAE(np.expm1(valid_y), val_pred)

    els_val.append(val_NMAE)
    print(f'{n+1} FOLD NMAE = {val_NMAE}')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = els.predict(test_df) / n_folds

    els_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(els_val)} & std = {np.std(els_val)}')

In [191]:
# GBR
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
gbr_pred = np.zeros(test_df.shape[0])

gbr_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n+1} Fold Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    gbr = GradientBoostingRegressor(min_samples_leaf=15)
    gbr.fit(train_x,train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(gbr.predict(valid_x))
    val_NMAE = NMAE(np.expm1(valid_y), val_pred)

    gbr_val.append(val_NMAE)
    print(f'{n+1} FOLD NMAE = {val_NMAE}')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = gbr.predict(test_df) / n_folds

    gbr_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(gbr_val)} & std = {np.std(gbr_val)}')

In [195]:
# RF
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
rf_pred = np.zeros(test_df.shape[0])
rf_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n+1} Fold Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    # NOTE(review): criterion 'mae' was renamed to 'absolute_error' in
    # scikit-learn 1.0 and the old spelling was removed in 1.2; switch the
    # value when the environment's scikit-learn is upgraded.
    rf = RandomForestRegressor(random_state= 32, criterion= 'mae')
    rf.fit(train_x,train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(rf.predict(valid_x))
    val_NMAE = NMAE(np.expm1(valid_y), val_pred)

    rf_val.append(val_NMAE)
    print(f'{n+1} FOLD NMAE = {val_NMAE}')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = rf.predict(test_df) / n_folds

    rf_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(rf_val)} & std = {np.std(rf_val)}')
# 10FOLD Mean of NMAE = 0.11824903167369834 & std = 0.006347554553427555

In [197]:
# NGBRegressor
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
ngb_pred = np.zeros(test_df.shape[0])

ngb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n+1} Fold Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    # The validation fold is passed to fit() together with early_stopping_rounds.
    ngb = NGBRegressor(random_state = 42, n_estimators = 1000, verbose = 0, learning_rate = 0.03)
    ngb.fit(train_x, train_y, valid_x, valid_y, early_stopping_rounds = 300)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(ngb.predict(valid_x))
    val_NMAE = NMAE(np.expm1(valid_y), val_pred)

    ngb_val.append(val_NMAE)
    print(f'{n+1} FOLD NMAE = {val_NMAE}')

    # Predict the test set with this fold's NGBoost model. (The previous code
    # called rf.predict here, so ngb_pred silently held RandomForest output.)
    fold_pred = ngb.predict(test_df) / n_folds

    ngb_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(ngb_val)} & std = {np.std(ngb_val)}')
# 0.09461596073922843 & std = 0.009333805300220727

In [205]:
from sklearn.linear_model import BayesianRidge

# BayesianRidge
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
by_pred = np.zeros(test_df.shape[0])
by_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n + 1} FOLD Training.....')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    by = BayesianRidge()
    by.fit(train_x, train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(by.predict(valid_x))
    val_nmae = NMAE(np.expm1(valid_y), val_pred)
    by_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # Predict the test set with this fold's BayesianRidge model. (The previous
    # code called cb.predict here, i.e. a CatBoost model from another cell.)
    fold_pred = by.predict(test_df) / n_folds
    by_pred += fold_pred
print(f'{n_folds}FOLD Mean of NMAE = {np.mean(by_val)} & std = {np.std(by_val)}')

In [199]:
# Catboost
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
cb_pred = np.zeros(test_df.shape[0])
cb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n + 1} FOLD Training.....')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    tr_data = Pool(data = train_x, label = train_y)
    val_data = Pool(data = valid_x, label = valid_y)

    # The validation fold is used as the eval set for early stopping.
    cb = CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0)
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 750, verbose = 1000)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(cb.predict(valid_x))
    val_nmae = NMAE(np.expm1(valid_y), val_pred)
    cb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = cb.predict(test_df) / n_folds
    cb_pred += fold_pred
print(f'{n_folds}FOLD Mean of NMAE = {np.mean(cb_val)} & std = {np.std(cb_val)}')




In [200]:
# Blend the four models with equal weights in log1p space, then map the
# average back to the original target scale.
log_space_sum = gbr_pred + rf_pred + ngb_pred + cb_pred
target = np.expm1(log_space_sum / 4)

In [206]:
# Check validation performance: print the mean fold NMAE of every model, in the
# order LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, GBR,
# RandomForest, NGBoost, CatBoost.
# The Lasso and ElasticNet score lists are named lasso_val / els_val in their
# cells (the previous ls_val / el_val names were never defined).
val_list = [lr_val, rg_val, lasso_val, els_val, by_val, gbr_val, rf_val, ngb_val, cb_val]
for val in val_list :
    print("{:.8f}".format(np.mean(val)))

In [201]:
# Assemble the submission frame from the saved test ids and the blended
# predictions, then write it without the index column.
submission_columns = {'id' : sub_id, 'target': target}
sub_park = pd.DataFrame(submission_columns)

sub_park.to_csv('sz1.csv', index = False)

In [208]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [219]:
# XGBoost
# Test predictions are averaged over the folds in log1p space.
n_folds = kf.get_n_splits()
xgb_pred = np.zeros(test_df.shape[0])
xgb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(train_df, train_label)) :
    print(f'{n + 1} FOLD Training')
    train_x, train_y = train_df.iloc[tr_idx], train_label.iloc[tr_idx]
    valid_x, valid_y = train_df.iloc[val_idx], train_label.iloc[val_idx]

    # XGBoost's tree-depth parameter is `max_depth`; `depth` is a CatBoost
    # name that XGBoost does not recognise, so the limit was never applied.
    xgb = XGBRegressor(max_depth = 4, random_state = 42, learning_rate = 0.03)
    xgb.fit(train_x, train_y)

    # Undo the log1p transform so the score is on the original target scale.
    val_pred = np.expm1(xgb.predict(valid_x))
    val_nmae = NMAE(np.expm1(valid_y), val_pred)
    xgb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = xgb.predict(test_df) / n_folds
    xgb_pred += fold_pred

print(f'{n_folds}FOLD Mean of NMAE = {np.mean(xgb_val)} & std = {np.std(xgb_val)}')

In [176]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from catboost import CatBoostRegressor
from ngboost import NGBRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Base estimators with default settings; their order in `estimators` must
# match the order of the grids in `params`.
gbr = GradientBoostingRegressor()
rf = RandomForestRegressor()
ngb = NGBRegressor()
cb = CatBoostRegressor()

estimators = [gbr, rf, ngb, cb]

# Search grid for GradientBoostingRegressor.
params_gbr = dict(
    n_estimators=[100],
    learning_rate=[0.01, 0.02, 0.05, 0.1],
    max_depth=[1, 2, 3, 4],
    min_samples_leaf=[5, 10, 15],
    min_samples_split=[2, 5, 10],
)

# Search grid for RandomForestRegressor.
params_rf = dict(
    n_estimators=[100],
    max_leaf_nodes=[2, 4, 6, 8],
    max_depth=[6, 9, 12, 15, 18, 21, 24, 27],
)

# Search grid for NGBRegressor.
params_ngb = dict(
    n_estimators=[100],
    learning_rate=[0.01, 0.02, 0.05, 0.3, 0.5, 0.7, 0.1],
)

# Search grid for CatBoostRegressor.
params_cb = dict(
    depth=[6, 8, 10],
    learning_rate=[0.01, 0.02, 0.05, 0.3, 0.5, 0.7, 0.1],
    iterations=[30, 50, 100],
)

params = [params_gbr, params_rf, params_ngb, params_cb]

In [178]:
from tqdm.auto import tqdm

def gridSearhCV(models, params):
    """Grid-search every model against the parameter grid at the same position.

    Each search uses ``GridSearchCV`` with ``cv = 10`` and ``n_jobs = -1`` and
    is fitted on the notebook-level ``train_df`` / ``train_label``.

    Parameters
    ----------
    models : list
        Estimators to tune.
    params : list
        Parameter grids; ``params[i]`` is used for ``models[i]``.

    Returns
    -------
    list
        The ``best_estimator_`` of each search, in the order of ``models``.
    """
    tuned = []
    for position, estimator in enumerate(tqdm(models)):
        search = GridSearchCV(estimator, params[position], n_jobs = -1, cv = 10)
        search.fit(train_df, train_label)
        tuned.append(search.best_estimator_)

    return tuned

# Tune each estimator with its matching parameter grid and keep the best fitted models.
best_model_list = gridSearhCV(estimators, params)

In [182]:
# Display the tuned estimators returned by the grid search.
best_model_list