In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and unknown-parameter
# warnings emitted by the model libraries — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Read the training table from the working directory.
train = pd.read_csv(filepath_or_buffer = 'train.csv')

In [4]:
# Read the test table from the working directory.
test = pd.read_csv(filepath_or_buffer = 'test.csv')

In [10]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [15]:
# Target column, and the feature matrix (everything except 'ID' and the target).
y = train['전화해지여부']
X = train.drop(columns = ['ID', '전화해지여부'])

In [16]:
# 8-fold stratified splitter; shuffled with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(
    n_splits = 8,
    shuffle = True,
    random_state = 42,
)

In [22]:
# Test-set features, restricted to (and ordered like) the training feature columns.
target = test.loc[:, X.columns]

### CatBoost

In [27]:
# Stratified K-fold training of CatBoost.
# For each fold: fit on the training part, report macro-F1 on the held-out part
# for a grid of probability cut-offs, and add this fold's share of the test-set
# positive-class probability to `cb_pred` (so `cb_pred` ends up as the fold mean).
cb_pred = np.zeros(target.shape[0])

# Cut-offs 0.30, 0.35, ..., 0.65. Built with linspace + round instead of
# np.arange(0.3, 0.7, 0.05): arange accumulates float error (0.39999999999999997,
# 0.49999999999999994, ...), which garbles the log and shifts the cut-offs
# slightly below their nominal values.
thresholds = np.round(np.linspace(0.30, 0.65, 8), 2)

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    print(f'{i + 1} Fold....')

    # NOTE(review): the training fold is passed as an eval set alongside the
    # validation fold; confirm against the CatBoost docs which eval set drives
    # use_best_model / early stopping before removing it.
    cb = CatBoostClassifier(random_state = 42, max_depth = 6, learning_rate = 0.02, iterations = 10000, use_best_model = True, eval_metric = 'F1')
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1000, verbose = 0)

    # Positive-class probability on the held-out fold.
    val_pred = cb.predict_proba(val_x)[:, 1]

    for threshold in thresholds :
        # Vectorised hard labels at this cut-off.
        val_cls = (val_pred >= threshold).astype(int)
        val_f1 = f1_score(val_y, val_cls, average = 'macro')
        print(f'When threshold is {threshold}, F1 Score : {val_f1}')

    # Each fold contributes 1 / n_splits of the final test-set probability.
    fold_pred = cb.predict_proba(target)[:, 1] / skf.n_splits
    cb_pred += fold_pred

1 Fold....
When threshold is 0.3, F1 Score : 0.8137182334073525
When threshold is 0.35, F1 Score : 0.8057626499066156
When threshold is 0.39999999999999997, F1 Score : 0.7922498211422302
When threshold is 0.44999999999999996, F1 Score : 0.7744203649111625
When threshold is 0.49999999999999994, F1 Score : 0.7587367674875629
When threshold is 0.5499999999999999, F1 Score : 0.7188474607829447
When threshold is 0.5999999999999999, F1 Score : 0.6929548996688096
When threshold is 0.6499999999999999, F1 Score : 0.6727556199905382
2 Fold....
When threshold is 0.3, F1 Score : 0.8114748533069827
When threshold is 0.35, F1 Score : 0.8058144407357637
When threshold is 0.39999999999999997, F1 Score : 0.7897345291748394
When threshold is 0.44999999999999996, F1 Score : 0.765505591530051
When threshold is 0.49999999999999994, F1 Score : 0.7513383517160391
When threshold is 0.5499999999999999, F1 Score : 0.7349034213213947
When threshold is 0.5999999999999999, F1 Score : 0.7115436251454613
When thresh

In [31]:
# Read the sample submission file; its target column is overwritten below.
submission = pd.read_csv(filepath_or_buffer = 'sample_submission.csv')

In [36]:
# Hard labels from the fold-averaged CatBoost probability: 1 when >= 0.3, else 0.
submission['전화해지여부'] = (cb_pred >= 0.3).astype('int64')

In [38]:
# Class balance of the predicted labels.
submission.loc[:, '전화해지여부'].value_counts()

0    11639
1     1304
Name: 전화해지여부, dtype: int64

In [39]:
# Write the CatBoost-only submission, without the index column.
submission.to_csv(path_or_buf = '1st.csv', index = False)

### LGBM

In [41]:
# Replace the feature names with positional ones: col_1 ... col_N.
# NOTE(review): presumably done because LightGBM rejects the original column names — confirm.
X.columns = ['col_' + str(pos + 1) for pos in range(X.shape[1])]

In [43]:
# Give the test features the same positional names (col_1 ... col_N, N taken from X).
target.columns = ['col_' + str(pos + 1) for pos in range(X.shape[1])]

In [46]:
# Stratified K-fold training of LightGBM.
# For each fold: fit on the training part, report macro-F1 on the held-out part
# at a 0.3 probability cut-off, and add this fold's share of the test-set
# positive-class probability to `lgbm_pred` (so it ends up as the fold mean).
lgbm_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # The former `eval_metric = 'F1'` constructor argument was removed: in
    # LightGBM's scikit-learn API `eval_metric` is an argument of fit(), and
    # 'F1' is not a built-in metric name, so the value was never used. Early
    # stopping therefore runs on the objective's default metric, as before.
    # To early-stop on F1, pass a custom metric callable to fit(eval_metric = ...).
    lgbm = LGBMClassifier(random_state = 42, max_depth = 6, learning_rate = 0.02, n_estimators = 10000)
    # NOTE(review): `early_stopping_rounds` / `verbose` in fit() were removed in
    # LightGBM 4.0; switch to the early_stopping / log_evaluation callbacks when upgrading.
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1000, verbose = 0)

    # Positive-class probability on the held-out fold, cut at 0.3.
    val_pred = lgbm.predict_proba(val_x)[:, 1]
    val_cls = (val_pred >= 0.3).astype(int)
    val_f1 = f1_score(val_y, val_cls, average = 'macro')
    print(f'{i + 1} Fold....F1 Score : {val_f1}')

    # Each fold contributes 1 / n_splits of the final test-set probability.
    fold_pred = lgbm.predict_proba(target)[:, 1] / skf.n_splits
    lgbm_pred += fold_pred

1 Fold....F1 Score : 0.8116984999365149
2 Fold....F1 Score : 0.8016310916332986
3 Fold....F1 Score : 0.7964197438394127
4 Fold....F1 Score : 0.8049022545127706
5 Fold....F1 Score : 0.7824806489407335
6 Fold....F1 Score : 0.8188399057770344
7 Fold....F1 Score : 0.8183889138627518
8 Fold....F1 Score : 0.8102023811205106


In [54]:
# Equal-weight blend of the CatBoost and LightGBM probabilities, cut at 0.3.
submission['전화해지여부'] = ((cb_pred * 0.5 + lgbm_pred * 0.5) >= 0.3).astype('int64')

In [55]:
# Class balance of the blended labels.
submission.loc[:, '전화해지여부'].value_counts()

0    11654
1     1289
Name: 전화해지여부, dtype: int64

In [49]:
# Write the current submission frame, without the index column.
submission.to_csv(path_or_buf = 'catboost_lgbm.csv', index = False)

### XGBoost

In [58]:
def f1_eval(y_pred, dtrain):
    """Return ('f1_err', 1 - F1) for rounded predictions against the labels of `dtrain`.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores; rounded with np.round before scoring.
    dtrain : object exposing get_label()
        Source of the true labels.

    Returns
    -------
    tuple
        The metric name 'f1_err' and the error value 1 - f1_score(labels, rounded predictions).
    """
    labels = dtrain.get_label()
    hard_pred = np.round(y_pred)
    f1_error = 1 - f1_score(labels, hard_pred)
    return 'f1_err', f1_error

In [63]:
# Stratified K-fold training of XGBoost.
# For each fold: fit on the training part, report macro-F1 on the held-out part
# at a 0.3 probability cut-off, and add this fold's share of the test-set
# positive-class probability to `xgb_pred`.
xgb_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    # Fold data: training rows first, then held-out rows.
    tr_x = X.iloc[tr_idx]
    tr_y = y.iloc[tr_idx]
    val_x = X.iloc[val_idx]
    val_y = y.iloc[val_idx]

    xgb = XGBClassifier(
        random_state = 42,
        max_depth = 6,
        learning_rate = 0.02,
        n_estimators = 10000,
    )
    xgb.fit(
        tr_x, tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    # Positive-class probability on the held-out fold, turned into hard labels at 0.3.
    val_pred = xgb.predict_proba(val_x)[:, 1]
    val_cls = np.where(val_pred >= 0.3, 1, 0)
    val_f1 = f1_score(val_y, val_cls, average = 'macro')
    print(f'{i + 1} Fold....F1 Score : {val_f1}')

    # Each fold contributes 1 / n_splits of the final test-set probability.
    fold_pred = xgb.predict_proba(target)[:, 1] / skf.n_splits
    xgb_pred += fold_pred

1 Fold....F1 Score : 0.8216485894083982
2 Fold....F1 Score : 0.8077917046580345
3 Fold....F1 Score : 0.8243671465728835
4 Fold....F1 Score : 0.8179118898783317
5 Fold....F1 Score : 0.7839454257155435
6 Fold....F1 Score : 0.8155334466339696
7 Fold....F1 Score : 0.8246065719306006
8 Fold....F1 Score : 0.8050586115168024


In [64]:
# Mean of the CatBoost and XGBoost probabilities, cut at 0.3.
submission['전화해지여부'] = (((cb_pred + xgb_pred) / 2) >= 0.3).astype('int64')

In [65]:
# Class balance of the blended labels.
submission.loc[:, '전화해지여부'].value_counts()

0    11645
1     1298
Name: 전화해지여부, dtype: int64

In [66]:
# Write the CatBoost + XGBoost submission, without the index column.
submission.to_csv(path_or_buf = 'catboost_xgboost.csv', index = False)