In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

from flaml import AutoML
from flaml.model import LGBMEstimator, XGBoostEstimator, RandomForestEstimator, ExtraTreesEstimator

import pickle

# EDA

## 변수 설명
  - **int_rate** : 대출자에 부여된 이자율 (Interest rate of the loan the applicant received)
  - **annual_inc** : 연 소득 (annual income)
  - **dti** : 소득 대비 부채 비율 (Debt-to-income ratio)
  - **delinq_2yrs** : 지난 2년 간 체납 발생 횟수 (Delinquencies on lines of credit in the last 2 years)
  - **inq_last_6mths** : 지난 6개월 간 신용 조회 수 (Inquiries into the applicant's credit during the last 6 months)
  - **pub_rec** : 파산 횟수 (Number of bankruptcies listed in the public record)
  - **revol_bal** : 리볼빙 잔액 (Total credit revolving balance)
  - **total_acc** : 지금까지 소유했던 신용카드 개수 (num_total_cc_accounts : Total number of credit card accounts in the applicant's history)
  - **collections_12_mths_ex_med** : 의료부문을 제외한 지난 12개월 간 추심 발생 횟수 (num_collections_last_12m : Number of collections in the last 12 months. This excludes medical collections)
  - **acc_now_delinq** : 대출자가 체납 상태에 있지 않은 계좌의 수 (The number of accounts on which the borrower is now delinquent)
  - **tot_coll_amt** : 대출자에 대한 현재까지의 총 추심액 (total_collection_amount_ever : The total amount that the applicant has had against them in collections)
  - **tot_cur_bal** : 전 계좌의 현재 통합 잔고 (Total current balance of all accounts)
  - **chargeoff_within_12_mths** : 대출 부 신청인의 대출 신청 직전 12개월 간 세금 공제 횟수 (Number of charge-offs within last 12 months at time of application for the secondary applicant)
  - **delinq_amnt** : 체납 금액 (delinquency amount)
  - **tax_liens** : 세금 저당권의 수 (Number of tax liens)
  - **emp_length1 ~ 12** : 고용 연수 (Number of years in the job)
  - **home_ownership1 ~ 6** : 대출 신청자의 주거 소유 형태 (The ownership status of the applicant's residence)
  - **verification_status1 ~ 3** : 공동 소득 발생 여부 및 형태 (verification_income_joint : Type of verification of the joint income)
  - **purpose1 ~ 14** : 대출 목적 (The purpose of the loan)
  - **initial_list_status1 ~ 2** : 최초 대출 상태 (Initial listing status of the loan)
  - **mths_since_last_delinq1 ~ 11** : 마지막 체납이 지금으로부터 몇개월 전에 있었는지를 나타내는 변수 (Months since the last delinquency)
  
  - **funded_amnt** : 대출액 (Funded amount)
  - **funded_amnt_inv** : 사채 대출액 (Funded amount by investors)
  - **total_rec_late_fee** : 총 연체료 중 납부액 (Late fees received to date)
  - **term1** : 상환 기간 (The number of payments on the loan. Values are in months and can be either 36 or 60)
  - **open_acc** : 개설 개좌 수 (The number of open credit lines in the borrower's credit file)
  - **installment** : 대출 발생 시 월 상환액 (The monthly payment owed by the borrower if the loan originates)
  - **revol_util** : 리볼빙 한도 대비 리볼빙 사용 비율 (Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit)
  - **out_prncp** : 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded)
  - **out_prncp_inv** : 사채 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded by investors)
  - **total_rec_int** : 이자 상환액 (Interest received to date)
  - **fico_range_low** : FICO(일종의 신용점수) 최저값 (The lower boundary range the borrower’s FICO at loan origination belongs to)
  - **fico_range_high** : FICO(일종의 신용점수) 최고값 (The upper boundary range the borrower’s FICO at loan origination belongs to)
  
  - **depvar** : 고객의 부도 여부 (dependent variable)

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,pub_rec,revol_bal,total_acc,collections_12_mths_ex_med,acc_now_delinq,...,term1,open_acc,installment,revol_util,out_prncp,out_prncp_inv,total_rec_int,fico_range_low,fico_range_high,depvar
0,0.0824,21000.0,29.19,0,1,0,3016,26,0,0,...,1,18,37.74,0.076,0.0,0.0,157.94,765,769,0
1,0.1299,80000.0,4.82,0,1,1,5722,24,0,0,...,1,8,269.52,0.447,0.0,0.0,1702.42,665,669,0
2,0.1299,38000.0,23.66,0,3,0,6511,18,0,0,...,1,7,168.45,0.88,0.0,0.0,1066.64,670,674,0
3,0.1367,100000.0,16.27,4,2,0,6849,30,0,0,...,1,12,510.27,0.457,0.0,0.0,1256.24,680,684,1
4,0.1269,30000.0,25.28,0,1,2,8197,12,0,0,...,1,8,335.45,0.416,0.0,0.0,871.04,660,664,1


In [38]:
df.shape

(100000, 82)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 76 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   int_rate                    100000 non-null  float64
 1   annual_inc                  100000 non-null  float64
 2   dti                         100000 non-null  float64
 3   delinq_2yrs                 100000 non-null  int64  
 4   inq_last_6mths              100000 non-null  int64  
 5   pub_rec                     100000 non-null  int64  
 6   revol_bal                   100000 non-null  int64  
 7   total_acc                   100000 non-null  int64  
 8   collections_12_mths_ex_med  100000 non-null  int64  
 9   acc_now_delinq              100000 non-null  int64  
 10  tot_coll_amt                100000 non-null  int64  
 11  tot_cur_bal                 100000 non-null  int64  
 12  chargeoff_within_12_mths    100000 non-null  int64  
 13  delinq_amnt    

In [4]:
df.describe()

Unnamed: 0,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,pub_rec,revol_bal,total_acc,collections_12_mths_ex_med,acc_now_delinq,...,term1,open_acc,installment,revol_util,out_prncp,out_prncp_inv,total_rec_int,fico_range_low,fico_range_high,depvar
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.130833,74360.61,18.514508,0.3437,0.66888,0.23572,16090.20282,24.67591,0.01881,0.0058,...,0.85592,11.6207,434.077648,0.537233,0.253327,0.253259,2491.282802,692.63055,696.63066,0.32569
std,0.044773,74674.09,8.413049,0.905007,0.952044,0.661468,21569.939271,11.883834,0.150321,0.083585,...,0.351173,5.458774,265.921746,0.239373,18.05329,18.051746,2706.2622,29.668017,29.668584,0.468634
min,0.0532,5360.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,23.36,0.0,0.0,0.0,0.0,660.0,664.0,0.0
25%,0.0975,45000.0,12.2,0.0,0.0,0.0,6009.0,16.0,0.0,0.0,...,1.0,8.0,240.2925,0.361,0.0,0.0,857.2925,670.0,674.0,0.0
50%,0.1274,62000.0,18.06,0.0,0.0,0.0,11030.5,23.0,0.0,0.0,...,1.0,11.0,366.37,0.541,0.0,0.0,1615.16,685.0,689.0,0.0
75%,0.158,90000.0,24.53,0.0,1.0,0.0,19540.0,31.0,0.0,0.0,...,1.0,14.0,575.86,0.72,0.0,0.0,3039.115,705.0,709.0,1.0
max,0.3099,8300000.0,49.93,20.0,6.0,63.0,971736.0,176.0,5.0,6.0,...,1.0,82.0,1584.9,8.923,2330.97,2330.97,28005.96,845.0,850.0,1.0


In [5]:
# 결측치 확인
pd.DataFrame(df.isnull().sum()).rename(columns={0:'Null Count'}).T

Unnamed: 0,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,pub_rec,revol_bal,total_acc,collections_12_mths_ex_med,acc_now_delinq,...,term1,open_acc,installment,revol_util,out_prncp,out_prncp_inv,total_rec_int,fico_range_low,fico_range_high,depvar
Null Count,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# 라벨(depvar) 비율 확인
print('y=1 ratio :', df.depvar.sum()/len(df))

y=1 ratio : 0.32569


In [3]:
X = df.drop('depvar', axis=1)
y = df['depvar']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2) # train-test split 대신 cross validation만 사용

# Single Model

## RandomForest


In [37]:
# 하이퍼 파라미터 튜닝
rf_clf = RandomForestClassifier()

param_grid = {'max_depth': list(range(3, 11)),
              'n_estimators': [50, 100, 150, 200]}

grid_search = GridSearchCV(rf_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

rf_results = grid_search.fit(X, y)

In [38]:
print(rf_results.best_params_)
print(rf_results.best_score_)
'''
{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}
0.5770945117943277
'''

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}
0.5770945117943277


성능이 매우 안 좋아서 튜닝 종료

In [39]:
# best parameter를 사용하여 prediction 수행
rf_clf = RandomForestClassifier(**rf_results.best_params_)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv')
rf_clf.fit(X, y)
submit['answer'] = rf_clf.predict(df_test.drop(columns=['ID']))

# 제출 파일 저장
submit.to_csv('prediction_rf.csv', index=False)

## XGBoost

### Hyperparameter Tuning

In [43]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_jobs=1, eval_metric='mlogloss', use_label_encoder=False)

param_grid = {'n_estimators': range(100, 301, 50)}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [44]:
print(results.best_params_)
print(results.best_score_)
'''
{'n_estimators': 200}
0.6952149410831229
'''

{'n_estimators': 200}
0.6952149410831229


In [45]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, n_jobs=1, eval_metric='mlogloss', use_label_encoder=False)

param_grid = {'max_depth': range(3, 10, 2),
              'min_child_weight': range(1, 6, 2)}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [52]:
print(results.best_params_)
print(results.best_score_)
'''
{'max_depth': 5, 'min_child_weight': 3}
0.6960451365353214
'''

{'max_depth': 5, 'min_child_weight': 3}
0.6960451365353214


In [53]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, n_jobs=1, 
                        eval_metric='mlogloss', use_label_encoder=False)

param_grid = {'max_depth': [4, 5, 6],
              'min_child_weight': [2, 3, 4]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [54]:
print(results.best_params_)
print(results.best_score_)
'''
{'max_depth': 5, 'min_child_weight': 2}
0.6974090575488129
'''

{'max_depth': 5, 'min_child_weight': 2}
0.6974090575488129


In [55]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2,
                        n_jobs=1, eval_metric='mlogloss', use_label_encoder=False)

param_grid = {'gamma': [i/10.0 for i in range(0, 6)]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [58]:
print(results.best_params_)
print(results.best_score_)
'''
{'gamma': 0.5}
0.6970477588190709
'''

{'gamma': 0.5}
0.6970477588190709


In [59]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2,
                        n_jobs=1, eval_metric='mlogloss', use_label_encoder=False)

param_grid = {'gamma': [i/10.0 for i in range(6, 11)]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [60]:
print(results.best_params_)
print(results.best_score_)
'''
{'gamma': 0.9}
0.696890430008179
'''

{'gamma': 0.9}
0.696890430008179


In [61]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2, gamma=0.9,
                        n_jobs=1, eval_metric='mlogloss', use_label_encoder=False)

param_grid = {'subsample':[i/10.0 for i in range(5, 10)],
              'colsample_bytree':[i/10.0 for i in range(5, 10)]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [62]:
print(results.best_params_)
print(results.best_score_)
'''
{'colsample_bytree': 0.8, 'subsample': 0.9}
0.6964680525506814
'''

{'colsample_bytree': 0.8, 'subsample': 0.9}
0.6964680525506814


In [64]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2, gamma=0.9,
                        n_jobs=1, eval_metric='mlogloss', use_label_encoder=False)

param_grid = {'subsample':[i/100.0 for i in range(75, 90, 5)],
              'colsample_bytree':[i/100.0 for i in range(85, 100, 5)]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [65]:
print(results.best_params_)
print(results.best_score_)
'''
{'colsample_bytree': 0.95, 'subsample': 0.85}
0.6962682471473779
'''

{'colsample_bytree': 0.95, 'subsample': 0.85}
0.6962682471473779


In [67]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2, colsample_bytree=0.95,
                        subsample=0.85, gamma=0.9, n_jobs=1, eval_metric='mlogloss',
                        use_label_encoder=False)

param_grid = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [68]:
print(results.best_params_)
print(results.best_score_)
'''
{'reg_alpha': 0.1}
0.6965145164518078
'''

{'reg_alpha': 0.1}
0.6965145164518078


In [69]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2, colsample_bytree=0.95,
                        subsample=0.85, gamma=0.9, n_jobs=1, eval_metric='mlogloss',
                        use_label_encoder=False)

param_grid = {'reg_alpha': [0.05, 0.1, 0.5]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [70]:
print(results.best_params_)
print(results.best_score_)
'''
{'reg_alpha': 0.5}
0.69634659938252
'''

{'reg_alpha': 0.5}
0.69634659938252


### Model Validation & Submission

In [46]:
# learning_rate를 줄인 만큼 n_estimators를 늘리면 좀 더 정교한 모델이 됨
xgbm_clf = XGBClassifier(learning_rate=0.1, n_estimators=600, max_depth=5, min_child_weight=2, 
                        colsample_bytree=0.95, subsample=0.85, gamma=0.9, reg_alpha=0.5,
                        eval_metric='mlogloss', use_label_encoder=False)

xgbm_cv_score = cross_val_score(xgbm_clf, X, y, n_jobs=1, cv=cv, scoring='f1_macro', error_score=0).mean()
xgbm_cv_score # 0.6990640481314775

0.6990640481314775

In [86]:
# best parameter를 사용하여 prediction 수행
xgbm_clf = XGBClassifier(learning_rate=0.01, n_estimators=6000, max_depth=5, min_child_weight=2, 
                        colsample_bytree=0.95, subsample=0.85, gamma=0.9, reg_alpha=0.5,
                        eval_metric='mlogloss', use_label_encoder=False)
xgbm_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv')
submit['answer'] = xgbm_clf.predict(df_test.drop(columns=['ID']))

# 제출 파일 저장
submit.to_csv('prediction_xgb_v3.csv', index=False)

In [14]:
# best parameter를 사용하여 prediction 수행
xgbm_clf = XGBClassifier(learning_rate=0.1, n_estimators=600, max_depth=5, min_child_weight=2, 
                        colsample_bytree=0.95, subsample=0.85, gamma=0.9, reg_alpha=0.5,
                        eval_metric='mlogloss', use_label_encoder=False)
xgbm_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv', index_col='ID')
submit['answer'] = (xgbm_clf.predict_proba(df_test)[:, 1] >= 0.4).astype(int)


# 제출 파일 저장
submit.to_csv('prediction_xgb_v4.csv', index=False)

### DART

In [87]:
# 하이퍼 파라미터 튜닝
xgb_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2, colsample_bytree=0.95,
                        subsample=0.85, gamma=0.9, reg_alpha=0.5, n_jobs=1, eval_metric='mlogloss',
                        use_label_encoder=False)

param_grid = {'booster': ['dart'],
              'rate_drop': [0.1, 0.05, 0.01],
              'skip_drop': [0.0, 0.5]}

grid_search = GridSearchCV(xgb_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [88]:
print(results.best_params_)
print(results.best_score_)
'''
{'booster': 'dart', 'rate_drop': 0.01, 'skip_drop': 0.5}
0.6975878165409405
'''

{'booster': 'dart', 'rate_drop': 0.01, 'skip_drop': 0.5}
0.6975878165409405


학습속도에 비해 얻는 것이 없는 것 같음

## AutoML with FLAML
LightGBM, XGBoost(no depth limit), Random Forest, Extra Trees hyperparameter tunning

### Hyperparameter Tuning

In [19]:
with open('flaml_7h.pkl', 'rb') as f:
    starting_points = pickle.load(f)

In [20]:
starting_points

{'xgboost': {'n_estimators': 95,
  'max_leaves': 524,
  'min_child_weight': 47.04680130589495,
  'learning_rate': 0.26706677295450554,
  'subsample': 0.9856353341088818,
  'colsample_bylevel': 0.8639198204961607,
  'colsample_bytree': 0.5872216427188794,
  'reg_alpha': 0.25835954921221194,
  'reg_lambda': 52.408554660469626}}

In [79]:
automl = AutoML()

automl_settings = {
    'metric': 'macro_f1',
    'task': 'classification',
    'log_file_name': 'flaml_8h.log',
    'eval_method': 'cv',
    'split_type': cv,
    'time_budget': 3600,
    'early_stop': True,
    'starting_points': starting_points,
    "estimator_list": [
        # 'lgbm',
        'xgboost',
        # 'rf',
        # 'extra_tree',
        # 'kneighbor'
    ]
}

automl.fit(X, y, **automl_settings)


[flaml.automl: 01-27 22:24:00] {2007} INFO - task = classification
[flaml.automl: 01-27 22:24:00] {2009} INFO - Data split method: RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=None)
[flaml.automl: 01-27 22:24:00] {2013} INFO - Evaluation method: cv
[flaml.automl: 01-27 22:24:00] {2113} INFO - Minimizing error metric: 1-macro_f1
[flaml.automl: 01-27 22:24:00] {2170} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl: 01-27 22:24:00] {2437} INFO - iteration 0, current learner xgboost
[flaml.automl: 01-27 22:28:01] {2550} INFO - Estimated sufficient time budget=2415294s. Estimated necessary time budget=2415s.
[flaml.automl: 01-27 22:28:01] {2597} INFO -  at 243.8s,	estimator xgboost's best error=0.3035,	best estimator xgboost's best error=0.3035
[flaml.automl: 01-27 22:28:01] {2437} INFO - iteration 1, current learner xgboost
[flaml.automl: 01-27 22:29:47] {2597} INFO -  at 349.7s,	estimator xgboost's best error=0.3035,	best estimator xgboost's best error

In [81]:
automl.best_loss

0.6977359127473777

In [82]:
with open('flaml_7h.pkl', 'wb') as f:
    pickle.dump(automl.best_config_per_estimator, f)

### Save Hyperparamters

In [27]:
with open('flaml_7h.pkl', 'rb') as f:
    starting_points = pickle.load(f)
xgb_conf = starting_points['xgboost']

In [28]:
with open('flaml_6h.pkl', 'rb') as f:
    starting_points = pickle.load(f)
rf_conf = starting_points['rf']
et_conf = starting_points['extra_tree']

In [29]:
with open('flaml_5h.pkl', 'rb') as f:
    starting_points = pickle.load(f)
lgbm_conf = starting_points['lgbm']

In [30]:
lgbm_params = LGBMEstimator().config2params(lgbm_conf)
xgb_params = XGBoostEstimator().config2params(xgb_conf)
rf_params = RandomForestEstimator().config2params(rf_conf)
et_params = ExtraTreesEstimator().config2params(et_conf)
lgbm_params, xgb_params, rf_params, et_params

({'n_estimators': 182,
  'num_leaves': 202,
  'min_child_samples': 5,
  'learning_rate': 0.04155948284334181,
  'colsample_bytree': 0.8050328020487125,
  'reg_alpha': 0.0034807476736494047,
  'reg_lambda': 0.24123607382484474,
  'max_bin': 1023},
 {'n_estimators': 95,
  'max_leaves': 524,
  'min_child_weight': 47.04680130589495,
  'learning_rate': 0.26706677295450554,
  'subsample': 0.9856353341088818,
  'colsample_bylevel': 0.8639198204961607,
  'colsample_bytree': 0.5872216427188794,
  'reg_alpha': 0.25835954921221194,
  'reg_lambda': 52.408554660469626,
  'max_depth': 0,
  'grow_policy': 'lossguide',
  'tree_method': 'hist',
  'use_label_encoder': False},
 {'n_estimators': 31,
  'max_features': 1.0,
  'criterion': 'entropy',
  'max_leaf_nodes': 1522},
 {'n_estimators': 53,
  'max_features': 0.9969126323328497,
  'criterion': 'entropy',
  'max_leaf_nodes': 4401})

In [None]:
with open('flaml_params.pkl', 'wb') as f:
    pickle.dump(
        {
            'lgbm': lgbm_params,
            'xgb': xgb_params,
            'rf': rf_params,
            'et': et_params
        },
        f
    )

### Model Validation & Submission

In [5]:
with open('flaml_params.pkl', 'rb') as f:
    params = pickle.load(f)
params

{'lgbm': {'n_estimators': 182,
  'num_leaves': 202,
  'min_child_samples': 5,
  'learning_rate': 0.04155948284334181,
  'colsample_bytree': 0.8050328020487125,
  'reg_alpha': 0.0034807476736494047,
  'reg_lambda': 0.24123607382484474,
  'max_bin': 1023},
 'xgb': {'n_estimators': 95,
  'max_leaves': 524,
  'min_child_weight': 47.04680130589495,
  'learning_rate': 0.26706677295450554,
  'subsample': 0.9856353341088818,
  'colsample_bylevel': 0.8639198204961607,
  'colsample_bytree': 0.5872216427188794,
  'reg_alpha': 0.25835954921221194,
  'reg_lambda': 52.408554660469626,
  'max_depth': 0,
  'grow_policy': 'lossguide',
  'tree_method': 'hist',
  'use_label_encoder': False},
 'rf': {'n_estimators': 31,
  'max_features': 1.0,
  'criterion': 'entropy',
  'max_leaf_nodes': 1522},
 'et': {'n_estimators': 53,
  'max_features': 0.9969126323328497,
  'criterion': 'entropy',
  'max_leaf_nodes': 4401}}

In [None]:
params = {
    'lgbm': {'n_estimators': 182,
             'num_leaves': 202,
             'min_child_samples': 5,
             'learning_rate': 0.04155948284334181,
             'colsample_bytree': 0.8050328020487125,
             'reg_alpha': 0.0034807476736494047,
             'reg_lambda': 0.24123607382484474,
             'max_bin': 1023},
    'xgb': {'n_estimators': 95,
            'max_leaves': 524,
            'min_child_weight': 47.04680130589495,
            'learning_rate': 0.26706677295450554,
            'subsample': 0.9856353341088818,
            'colsample_bylevel': 0.8639198204961607,
            'colsample_bytree': 0.5872216427188794,
            'reg_alpha': 0.25835954921221194,
            'reg_lambda': 52.408554660469626,
            'max_depth': 0,
            'grow_policy': 'lossguide',
            'tree_method': 'hist',
            'use_label_encoder': False},
    'rf': {'n_estimators': 31,
           'max_features': 1.0,
           'criterion': 'entropy',
           'max_leaf_nodes': 1522},
    'et': {'n_estimators': 53,
           'max_features': 0.9969126323328497,
           'criterion': 'entropy',
           'max_leaf_nodes': 4401}
}

#### LightGBM

In [61]:
lgbm_clf = LGBMClassifier()
cross_val_score(lgbm_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.6948411023458668
# 0.6949499054596988, 9.5s

0.6949499054596988

In [6]:
lgbm_clf = LGBMClassifier(**params['lgbm'])
cross_val_score(lgbm_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.7021724509738523
# 0.7023368203537553, 2m 34.8s

0.7023368203537553

In [21]:
# best parameter를 사용하여 prediction 수행
lgbm_clf = LGBMClassifier(**params['lgbm'])
lgbm_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv')
submit['answer'] = lgbm_clf.predict(df_test.drop(columns=['ID']))

# 제출 파일 저장
submit.to_csv('prediction_lgbm_v3.csv', index=False)

In [9]:
lgbm_clf = LGBMClassifier(**params['lgbm'])
lgbm_clf.fit(X_train, y_train)
y_pred_proba = lgbm_clf.predict_proba(X_val)[:, 1]

In [10]:
f1_score(y_val, y_pred_proba >= 0.4, average='macro')

0.7216360978262679

In [19]:
threshold = 0.401
f1_score(y_val, y_pred_proba >= threshold, average='macro')

0.7218986515469497

In [None]:
# best parameter를 사용하여 prediction 수행
lgbm_clf = LGBMClassifier(**params['lgbm'])
lgbm_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv', index_col='ID')
submit['answer'] = (lgbm_clf.predict_proba(df_test)[:, 1] >= threshold).astype(int)


# 제출 파일 저장
submit.to_csv('prediction_lgbm_v3.csv', index=False)

#### XGBoost

In [54]:
xgb_clf = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
cross_val_score(xgb_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.6959932093621708

0.6959932093621708

In [39]:
xgb_clf = XGBClassifier(**params['xgb'], eval_metric='logloss')
xgb_cv_score = cross_val_score(xgb_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
xgb_cv_score # 0.6962022857368901

0.6962022857368901

In [52]:
# best parameter를 사용하여 prediction 수행
xgb_clf = XGBClassifier(**params['xgb'])
xgb_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv')
submit['answer'] = xgb_clf.predict(df_test.drop(columns=['ID']))

# 제출 파일 저장
submit.to_csv('prediction_xgb_v4.csv', index=False)

#### Random Forest

In [40]:
rf_clf = RandomForestClassifier(**params['rf'], n_jobs=-1)
rf_cv_score = cross_val_score(rf_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
rf_cv_score # 0.6956405171140276

0.6956405171140276

In [43]:
# best parameter를 사용하여 prediction 수행
rf_clf = RandomForestClassifier(**params['rf'], n_jobs=-1)
rf_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv')
submit['answer'] = rf_clf.predict(df_test.drop(columns=['ID']))

# 제출 파일 저장
submit.to_csv('prediction_rf_v1.csv', index=False)

#### Extra Trees

In [47]:
et_clf = ExtraTreesClassifier(**params['et'], n_jobs=-1)
et_cv_score = cross_val_score(et_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
et_cv_score # 0.6853554333706992

0.6853554333706992

In [48]:
# best parameter를 사용하여 prediction 수행
et_clf = ExtraTreesClassifier(**params['et'], n_jobs=-1)
et_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv')
submit['answer'] = et_clf.predict(df_test.drop(columns=['ID']))

# 제출 파일 저장
submit.to_csv('prediction_et_v1.csv', index=False)

## SVC

In [7]:
svc_clf = SVC()
# cross_val_score(svc_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.4027336876494349, 191m 22.4s
# 너무 오래 걸리고 성능도 안 나옴, cv 안 하는게 좋았을 듯, 전체 train dataset fit할 때 25분 가량 걸릴 듯

In [189]:
lsvc_clf = LinearSVC(max_iter=5000)
# cross_val_score(lsvc_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
lsvc_clf.fit(X_train, y_train)
f1_score(y_val, lsvc_clf.predict(X_val), average='macro')



0.4176140508857313

In [190]:
lsvc_clf.score(X_val, y_val)
# ConvergenceWarning가 max_iter=20000까지도 뜨고 f1_score가 들쭉날쭉하게 나타나는 것을 보면 제대로 된 값을 얻기 힘든 듯 -> SGD로

0.67675

## KNN

In [194]:
knn_clf = KNeighborsClassifier(n_jobs=-1)
knn_clf.fit(X_train, y_train)
f1_score(y_val, knn_clf.predict(X_val), average='macro') 

0.5140810275194755

In [196]:
knn_clf.score(X_val, y_val) # model 특성상 score를 구하는 것도 시간이 오래걸림

0.6201

1. K = 5: 0.5137412251410697, 40.1s
2. K = 3: 0.5152032347803761, 35.8s

빠르지만 성능은 안 나옴, feature 개수, 형식 상 KNN에 어울리지 않는 듯

## Linear Models

In [210]:
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, y_train)
f1_score(y_val, lr_clf.predict(X_val), average='macro') 
# 0.47888286729716784

0.4793078353049412

In [166]:
lr_clf.score(X_val, y_val)

0.69275

In [173]:
ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train, y_train)
f1_score(y_val, ridge_clf.predict(X_val), average='macro') 
# 0.6113113913143298

0.6020031012208719

In [174]:
ridge_clf.score(X_val, y_val)

0.7149

In [197]:
sgd_clf = SGDClassifier() # default: hinge(linear SVM)
sgd_clf.fit(X_train, y_train)
f1_score(y_val, sgd_clf.predict(X_val), average='macro') 
# 0.4797063896182382

0.46605994036668275

In [198]:
sgd_clf.score(X_val, y_val)

0.66575

## Naive Bayes

In [200]:
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)
f1_score(y_val, gnb_clf.predict(X_val), average='macro')
# 0.42759860319702614

0.42753366406803683

In [201]:
gnb_clf.score(X_val, y_val)

0.6769

## Decision Tree

In [6]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
f1_score(y_val, dt_clf.predict(X_val), average='macro')
# 0.6275213678707021

0.6275213678707021

## Adaboost


In [202]:
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train, y_train)
f1_score(y_val, ada_clf.predict(X_val), average='macro') # 0.6488350147578595

0.6476242409034628

In [203]:
ada_clf.score(X_val, y_val)

0.734

In [204]:
ada_clf = AdaBoostClassifier(n_estimators=1000, learning_rate=0.1)
ada_clf.fit(X_train, y_train)
f1_score(y_val, ada_clf.predict(X_val), average='macro') # 0.6467879508900809

0.6447249272662048

In [205]:
ada_clf.score(X_val, y_val)

0.73585

base estimator를 바꾸지 않는 이상 좋은 성능이 나오지는 않을 듯

## Gradient Boosting (scikit-learn)

In [76]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)
f1_score(y_val, gb_clf.predict(X_val), average='macro') # 0.6530430353959766

0.6530430353959766

In [111]:
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.3, max_depth=5, subsample=0.85) # XGBoost에서의 best parameter를 일부 빌림
gb_clf.fit(X_train, y_train)
f1_score(y_val, gb_clf.predict(X_val), average='macro') 
# 0.6944904480430252
# 0.6950938618647354

0.6950938618647354

In [112]:
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.3, max_depth=5) # XGBoost에서의 best parameter를 일부 빌림
gb_clf.fit(X_train, y_train)
f1_score(y_val, gb_clf.predict(X_val), average='macro') 
# 0.6972284268197992
# 0.6976019207679309

0.6976019207679309

In [114]:
gb_clf = GradientBoostingClassifier(n_estimators=600, max_depth=5) # XGBoost에서의 best parameter를 일부 빌림
gb_clf.fit(X_train, y_train)
f1_score(y_val, gb_clf.predict(X_val), average='macro')
# 0.6996567334199564, 5m 18.9s

0.6996567334199564

In [115]:
gb_clf = GradientBoostingClassifier(n_estimators=600, max_depth=5)
cross_val_score(gb_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.6975864820330628, 57m 32.3s


0.6975864820330628

## HistGradient

In [62]:
hg_clf = HistGradientBoostingClassifier()
cross_val_score(hg_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.6931460059940971
# 0.6930662478445825, 39.8s

0.6930662478445825

In [100]:
# 하이퍼 파라미터 튜닝
hg_clf = HistGradientBoostingClassifier(max_leaf_nodes=None, max_iter=500)

param_grid = {'max_depth': [8 ,9 ,10]}

grid_search = GridSearchCV(hg_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [101]:
print(results.best_params_)
print(results.best_score_)
'''
{'max_depth': 8}
0.6966512074927655
'''

{'max_depth': 8}
0.6966512074927655


In [102]:
# 하이퍼 파라미터 튜닝
hg_clf = HistGradientBoostingClassifier(max_iter=500)

param_grid = {'max_leaf_nodes': [31, 63, 127]}

grid_search = GridSearchCV(hg_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [103]:
print(results.best_params_)
print(results.best_score_)
'''
{'max_leaf_nodes': 127}
0.6991283250059304
'''

{'max_leaf_nodes': 127}
0.6991283250059304


"\n{'max_depth': 8}\n0.6966512074927655\n"

In [104]:
# 하이퍼 파라미터 튜닝
hg_clf = HistGradientBoostingClassifier(max_iter=500)

param_grid = {'max_leaf_nodes': [127, 255, 511]}

grid_search = GridSearchCV(hg_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [106]:
print(results.best_params_)
print(results.best_score_)
# {'max_leaf_nodes': 127}; 0.6997668882066014

{'max_leaf_nodes': 127}
0.6997668882066014


In [107]:
# 하이퍼 파라미터 튜닝
hg_clf = HistGradientBoostingClassifier(max_leaf_nodes=127, max_iter=500)

param_grid = {'max_depth': [8 ,9 ,10]}

grid_search = GridSearchCV(hg_clf, param_grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0) 

results = grid_search.fit(X, y)

In [108]:
print(results.best_params_)
print(results.best_score_)
# {'max_depth': 9}; 0.6985986286385122

{'max_depth': 9}
0.6985986286385122


In [109]:
hg_clf = HistGradientBoostingClassifier(max_leaf_nodes=127, max_iter=500)
cross_val_score(hg_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.699548832291287

0.699548832291287

In [18]:
# best parameter를 사용하여 prediction 수행
hg_clf = HistGradientBoostingClassifier(max_leaf_nodes=127, max_iter=500)
hg_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv', index_col='ID')
submit['answer'] = (hg_clf.predict_proba(df_test)[:, 1] >= 0.4).astype(int)


# 제출 파일 저장
submit.to_csv('prediction_hg_v1.csv', index=False)

## CatBoost

In [59]:
cat_clf = CatBoostClassifier(verbose=False)
cross_val_score(cat_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.6998443592755684
# 0.7000925469105203, 1m 56.1s

0.7000925469105203

In [19]:
# best parameter를 사용하여 prediction 수행
cat_clf = CatBoostClassifier(verbose=False)
cat_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv', index_col='ID')
submit['answer'] = (cat_clf.predict_proba(df_test)[:, 1] >= 0.4).astype(int)


# 제출 파일 저장
submit.to_csv('prediction_cat_v1.csv', index=False)

## Model With Categorical Feature Support

In [46]:
categorical_features = {'emp_length': 12, 'home_ownership': 6, 'verification_status': 3, 'purpose': 14, 'initial_list_status': 2, 'mths_since_last_delinq': 11}

X_c = X.copy()

for feature, length in categorical_features.items():
    enc = OneHotEncoder()
    enc.fit([[i] for i in range(length)])
    onehot_features = [feature + str(i) for i in range(1, length+1)]
    X_c[feature] = enc.inverse_transform(X_c[onehot_features])
    X_c = X_c.drop(columns=onehot_features)
X_c.shape

(100000, 33)

### HistGradient

In [57]:
hgc_clf = HistGradientBoostingClassifier(categorical_features=[X_c.columns.get_loc(feature) for feature in categorical_features.keys()])
cross_val_score(hgc_clf, X_c, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# 0.6930570468515302, 47.8s

0.6930570468515302

### CatBoost

In [58]:
catc_clf = CatBoostClassifier(cat_features=list(categorical_features.keys()), verbose=False)
cross_val_score(catc_clf, X_c, y, cv=cv, scoring='f1_macro', error_score=0).mean() 
# 0.6993330831068032, 7m 21.2s

0.6993330831068032

### LightGBM

In [65]:
lgbmc_clf = LGBMClassifier()
cross_val_score(lgbmc_clf, X_c, y, cv=cv, scoring='f1_macro', error_score=0, fit_params={'categorical_feature': list(categorical_features.keys())}).mean()
# 0.6947542145393216, 10.1s 



0.6948484793958484

### Notes
- default parameter에서는 score에 의미있는 차이가 없는 것 같음
- dimension이 줄어드는 만큼 속도에 이득이 있지 않을까 싶지만 categorical feature를 다루는 과정에서 시간이 더 걸려서 그런지 큰 차이가 없었거나 CatBoost의 경우 크게 더 오래 걸렸음
- parameter tuning 후에는 의미있는 차이가 나올지?

# Ensemble

## Voting

In [61]:
rf_clf = RandomForestClassifier(**RandomForestEstimator().config2params(starting_points['rf']), n_jobs=-1)
et_clf = ExtraTreesClassifier(**ExtraTreesEstimator().config2params(starting_points['extra_tree']), n_jobs=-1)
xgb_clf = XGBClassifier(**XGBoostEstimator().config2params(starting_points['xgboost']))
lgbm_clf = LGBMClassifier(**LGBMEstimator().config2params(starting_points['lgbm']))
# xgbld_clf = XGBClassifier(**XGBoostLimitDepthEstimator().config2params(starting_points['xgb_limitdepth']))
xgbm_clf = XGBClassifier(n_estimators=200, max_depth=5, min_child_weight=2, colsample_bytree=0.95,
                        subsample=0.85, gamma=0.9, reg_alpha=0.5, eval_metric='mlogloss',
                        use_label_encoder=False)

# ridge_clf = RidgeClassifier()
# svc_clf = SVC()
# ada_clf = AdaBoostClassifier()
# gbc_clf = GradientBoostingClassifier()
# bag_clf = BaggingClassifier()
# knn_clf = KNeighborsClassifier()
# svc_clf = SVC()

estimators = [
    ('rf', rf_clf),
    ('lgbm', lgbm_clf),
    ('xgb', xgb_clf),
    # ('xgbld', xgb_clf),
    ('et', et_clf),
    ('xgbm', xgbm_clf)
]

voting_clf = VotingClassifier(estimators=estimators, voting='soft')

for _, clf in estimators:
    clf.fit(X, y)
voting_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv')
submit['answer'] = voting_clf.predict(df_test.drop(columns=['ID']))

# 제출 파일 저장
submit.to_csv('prediction_voting_v3.csv', index=False)

Stacking에 비해 성능이 떨어지는 것 같아서 v3로 종료

## Stacking


In [20]:
estimators = [
    ('knn', KNeighborsClassifier(n_jobs=-1)),
    ('lr', LogisticRegression(max_iter=1000)),
    ('ridge', RidgeClassifier()),
    ('sgdh', SGDClassifier(n_jobs=-1)), # hinge loss (linear SVM)
    ('gnb', GaussianNB()),
    # ('rf', RandomForestClassifier(**params['rf'], n_jobs=-1)),
    # ('et', ExtraTreesClassifier(**params['et'], n_jobs=-1)),
    # ('xgb', XGBClassifier(**params['xgb'], eval_metric='logloss')), # auto tuning, no depth limit
    ('lgbm', LGBMClassifier(**params['lgbm'])),
    ('xgbm', XGBClassifier(learning_rate=0.1, n_estimators=600, max_depth=5, min_child_weight=2,
                           colsample_bytree=0.95, subsample=0.85, gamma=0.9, reg_alpha=0.5,
                           eval_metric='mlogloss', use_label_encoder=False)), # manual tuning
    # ('ada', AdaBoostClassifier()),
    # ('gb', GradientBoostingClassifier(n_estimators=600, max_depth=5)),
    ('hg', HistGradientBoostingClassifier(max_leaf_nodes=127, max_iter=500)),
    ('cat', CatBoostClassifier(verbose=False)),
]
stacking_clf = StackingClassifier(estimators=estimators, 
                                  final_estimator=LGBMClassifier(), 
                                  passthrough=False)

# stacking_cv_score = cross_val_score(stacking_clf, X, y, cv=cv, scoring='f1_macro', error_score=0).mean()
# stacking_cv_score
# rf, et, xgb, lgbm, xgbm 모두 사용: 0.70592025868523, 1시간 걸려서 cv 말고 train-test split으로 stacking clf 간 비교 해야 할 듯

stacking_clf.fit(X_train, y_train)
f1_score(y_val, stacking_clf.predict(X_val), average='macro')

0.7113111035113191

**F1-macro**  
passthrough 사용시 feature수가 많아서 final_estimator의 부담이 크고 tuning도 힘든 것 같아서 passthrough 없이 default lgbm을 final_estimator로 사용
1. rf, et, xgb, lgbm, xgbm 사용: 0.7061841189924176 (v7)
2. xgb, lgbm, xgbm: 0.7051372940745185 (v8)
3. lgbm, xgbm: 0.7033553713730225 (v9)
4. rf, et, xgb, lgbm, xgbm, gb, hg: 0.7125830702848788 (v10)
5. rf, et, xgb, lgbm, xgbm, gb, hg, ada: 0.7100702404299758 (v11)
---
6. rf, et, xgb, lgbm, xgbm, gb, hg, cat: 0.707046769855159 (v12)
7. rf, xgb, lgbm, xgbm, gb, hg, cat: 0.7085827879697644 (v13)
8. xgb, lgbm, xgbm, cat: 0.7062648096813076 (v14)
---
9. rf, xgb, lgbm, xgbm, gb, cat: 0.7091370222502784 (v15)
10. rf, et, xgb, lgbm, xgbm, gb, cat: 0.7074188983449927 (v16)
11. rf, et, lgbm, xgbm, gb, hg, cat: 0.7098949539288282 (v18)
---
default catboost를 final_estimator로 사용
1. rf, et, xgb, lgbm, xgbm, gb, hg, cat: 0.7074634748287714 (v17) - 성능 향상 없음
---
gb, hg hyperparameter tuning 후 (이전에는 default paramater 사용)
1. rf, et, xgb, lgbm, xgbm, gb, hg, cat: 0.709916553826978 (v19)
2. rf, lgbm, xgbm, gb, hg, cat: 0.7075401652138642 (v20)
---
3. knn, lr, ridge, sgdh, gnb, rf, et, xgb, lgbm, xgbm, ada, gb, hg, cat: 0.7084409394177986 (v21)
4. knn, lr, ridge, sgdh, gnb, lgbm, xgbm, hg, cat (selecting tree based models): 0.7113111035113191 (v22) 

### Threshold 조정

In [21]:
y_pred_proba = stacking_clf.predict_proba(X_val)[:, 1]

In [22]:
f1_score(y_val, y_pred_proba >= 0.4, average='macro')

0.7196625818951845

In [41]:
threshold = 0.446
f1_score(y_val, y_pred_proba >= threshold, average='macro')

0.7214527056354827

**F1-macro with threshold moving**  
1. rf, et, xgb, lgbm, xgbm, gb, hg: threshold 0.401, 0.7198882617966558 (v10)
2. rf, et, xgb, lgbm, xgbm, gb, hg, ada: threshold 0.423, 0.7206856053471306 (v11)
---
3. rf, et, xgb, lgbm, xgbm, gb, hg, cat: threshold 0.415, 0.7228804690386343 (v12)
4. rf, xgb, lgbm, xgbm, gb, hg, cat: threshold 0.4, 0.7222799232585366 (v13)
5. xgb, lgbm, xgbm, cat: threshold 0.411, 0.7208854962522806 (v14)
6. rf, xgb, lgbm, xgbm, gb, cat: 0.7237874268036681 (v15)
7. rf, et, xgb, lgbm, xgbm, gb, cat: 0.724076910952762 (v16)
8. rf, et, lgbm, xgbm, gb, hg, cat: 0.401, 0.7244480004843666 (v18)
---
default catboost를 final_estimator로 사용
1. rf, et, xgb, lgbm, xgbm, gb, hg, cat: 0.429, 0.7218860929140762 (v17) - 성능 향상 딱히 없음
---
gb, hg hyperparameter tuning 후 (threshold 0.4로 고정)
1. rf, et, xgb, lgbm, xgbm, gb, hg, cat: 0.7231338741612161 (v19)
2. rf, lgbm, xgbm, gb, hg, cat: 0.7218720125944591 (v20)
3. knn, lr, ridge, sgdh, gnb, rf, et, xgb, lgbm, xgbm, ada, gb, hg, cat: 0.7221416221984438 (v21)
4. knn, lr, ridge, sgdh, gnb, lgbm, xgbm, hg, cat: 0.7196625818951845; threshold 0.446, 0.7214527056354827 (v22)


### Submission

In [16]:
estimators = [
    ('knn', KNeighborsClassifier(n_jobs=-1)),
    ('lr', LogisticRegression(max_iter=1000)),
    ('ridge', RidgeClassifier()),
    ('sgdh', SGDClassifier(n_jobs=-1)),
    ('gnb', GaussianNB()),
    # ('rf', RandomForestClassifier(**params['rf'], n_jobs=-1)),
    # ('et', ExtraTreesClassifier(**params['et'], n_jobs=-1)),
    # ('xgb', XGBClassifier(**params['xgb'], eval_metric='logloss')),
    ('lgbm', LGBMClassifier(**params['lgbm'])),
    ('xgbm', XGBClassifier(learning_rate=0.1, n_estimators=600, max_depth=5, min_child_weight=2, 
                           colsample_bytree=0.95, subsample=0.85, gamma=0.9, reg_alpha=0.5,
                           eval_metric='mlogloss', use_label_encoder=False)),
    # ('ada', AdaBoostClassifier()),
    # ('gb', GradientBoostingClassifier(n_estimators=600, max_depth=5)),
    ('hg', HistGradientBoostingClassifier(max_leaf_nodes=127, max_iter=500)),
    ('cat', CatBoostClassifier(verbose=False)),
]
stacking_clf = StackingClassifier(estimators=estimators, 
                                  final_estimator=LGBMClassifier(), 
                                  passthrough=False)
stacking_clf.fit(X, y)

# 제출 양식 다운로드
submit = pd.read_csv('sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('test.csv', index_col='ID')
submit['answer'] = (stacking_clf.predict_proba(df_test)[:, 1] >= threshold).astype(int)

# 제출 파일 저장
submit.to_csv('prediction_stacking_v22.csv', index=False)