In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [53]:
# Load the labelled training table from the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [54]:
# Load the table that predictions will be produced for.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [55]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,ID,가입일,음성사서함이용,주간통화시간,주간통화횟수,주간통화요금,저녁통화시간,저녁통화횟수,저녁통화요금,밤통화시간,밤통화횟수,밤통화요금,상담전화건수,전화해지여부
0,TRAIN_00000,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,32.89,2,0
1,TRAIN_00001,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,32.32,2,0
2,TRAIN_00002,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,8.28,0,0
3,TRAIN_00003,223,1,221.4,223,25.1,233.0,61,23.9,203.8,234,9.36,0,0
4,TRAIN_00004,222,0,96.3,222,28.7,223.9,69,28.08,263.1,223,2.8,8,0


In [56]:
# new_feature1: customer-service call count (상담전화건수) divided by the summed
# day / evening / night call counts.
# The count columns are selected by name instead of by position ([4, 7, 10]) so
# the feature cannot silently shift if the column layout of either file changes.
# NOTE(review): the names assume the column order shown in the train.head()
# preview (ID at position 0) — confirm against the raw CSV.
# NOTE(review): a row whose three counts sum to 0 yields inf (or NaN for 0 / 0);
# confirm the model is meant to receive those values as-is.
call_count_cols = ['주간통화횟수', '저녁통화횟수', '밤통화횟수']

# Same transformation for both frames, written once; each frame is updated in place.
for frame in (train, test):
    frame['new_feature1'] = frame['상담전화건수'] / frame[call_count_cols].sum(axis=1)

In [57]:
# max_time_idx: which period has the largest call time in each row, encoded as
# 0 = day, 1 = evening, 2 = night (the order of `call_time_cols`; ties resolve
# to the first maximum, as argmax does).
# Columns are selected by name instead of by position ([3, 6, 9]) so the
# encoding stays tied to the intended columns.
# NOTE(review): the names assume the column order shown in the train.head()
# preview (ID at position 0) — confirm against the raw CSV.
call_time_cols = ['주간통화시간', '저녁통화시간', '밤통화시간']

# Same transformation for both frames, written once; each frame is updated in place.
for frame in (train, test):
    frame['max_time_idx'] = frame[call_time_cols].to_numpy().argmax(axis=1)

In [58]:
# max_cnt_idx: which period has the largest call count in each row, encoded as
# 0 = day, 1 = evening, 2 = night (the order of `call_count_cols`; ties resolve
# to the first maximum, as argmax does).
# Columns are selected by name instead of by position ([4, 7, 10]) so the
# encoding stays tied to the intended columns.
# NOTE(review): the names assume the column order shown in the train.head()
# preview (ID at position 0) — confirm against the raw CSV.
call_count_cols = ['주간통화횟수', '저녁통화횟수', '밤통화횟수']

# Same transformation for both frames, written once; each frame is updated in place.
for frame in (train, test):
    frame['max_cnt_idx'] = frame[call_count_cols].to_numpy().argmax(axis=1)

In [59]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [60]:
# Separate the label from the predictors; the ID column and the label itself
# are excluded from the feature matrix.
y = train['전화해지여부']
X = train.drop(columns=['ID', '전화해지여부'])

In [61]:
# Stratified 8-way splitter; shuffling is seeded so fold membership is repeatable.
skf = StratifiedKFold(
    n_splits=8,
    shuffle=True,
    random_state=42,
)

In [62]:
# Test-set features: exactly the training feature columns, in the same order.
target = test.loc[:, X.columns]

In [65]:
# Cross-validated CatBoost run.
# For every split produced by `skf`: fit a fresh classifier, score it on the
# held-out rows with macro F1, and add that fold's share of the positive-class
# probabilities for `target` into `cb_pred`. After the loop `cb_pred` holds the
# fold-averaged probability and `cb_score` the fold-averaged F1.
cb_params = dict(
    random_state=42,
    max_depth=7,
    learning_rate=0.02,
    iterations=10000,
    use_best_model=True,
    eval_metric='F1',
)
n_folds = skf.n_splits

cb_pred = np.zeros(len(target))
cb_score = 0

for fold_no, (fit_rows, holdout_rows) in enumerate(skf.split(X, y), start=1):
    fit_x, fit_y = X.iloc[fit_rows], y.iloc[fit_rows]
    holdout_x, holdout_y = X.iloc[holdout_rows], y.iloc[holdout_rows]

    cb = CatBoostClassifier(**cb_params)
    cb.fit(
        fit_x,
        fit_y,
        eval_set=[(fit_x, fit_y), (holdout_x, holdout_y)],
        early_stopping_rounds=1000,
        verbose=0,
    )

    # Positive-class probability on the held-out rows, turned into labels with a 0.3 cut-off.
    holdout_proba = cb.predict_proba(holdout_x)[:, 1]
    holdout_cls = (holdout_proba >= 0.3).astype(int)

    val_f1 = f1_score(holdout_y, holdout_cls, average='macro')
    cb_score += val_f1 / n_folds
    print(f'{fold_no} Fold F1 Score : {val_f1}')

    # Each fold contributes 1 / n_folds of the final test-set probability.
    cb_pred += cb.predict_proba(target)[:, 1] / n_folds

print(f'{cb.__class__.__name__} avg of f1 : {cb_score}')

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

1 Fold F1 Score : 0.8204535240805932
2 Fold F1 Score : 0.8174355639613526
3 Fold F1 Score : 0.8106287274675099
4 Fold F1 Score : 0.8179359337830352
5 Fold F1 Score : 0.7922498489515599
6 Fold F1 Score : 0.8178480407423816
7 Fold F1 Score : 0.8171746916461504
8 Fold F1 Score : 0.8030938560688445
CatBoostClassifier avg of f1 : 0.8121025233376785


0.82574 : 0.8112228450886133, 0.8250 : 0.8097721252504557

In [66]:
# Load the submission template; its label column is overwritten below.
submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [67]:
# Label a row 1 when its fold-averaged probability reaches 0.3, otherwise 0.
submission['전화해지여부'] = (cb_pred >= 0.3).astype('int64')

In [68]:
# Class balance of the predicted labels.
submission.loc[:, '전화해지여부'].value_counts()

0    11631
1     1312
Name: 전화해지여부, dtype: int64

In [69]:
# Write the predictions to disk without the row index.
submission.to_csv(path_or_buf='1st.csv', index=False)