In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import catboost
from catboost import CatBoostClassifier, Pool

In [90]:
# Single random seed shared by the train/validation split and the CatBoost model
# so that results are reproducible across runs.
SEED = 42

In [2]:
# Locations of the raw competition files, relative to the notebook.
train_path = './data/train.csv'
test_path = './data/test.csv'

# Load the labelled training set and the unlabelled test set.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Column glossary (original Korean header -> meaning)
# ID : unique identifier of each sample
# 가입일 : number of days since the customer subscribed to the service
# 음성사서함이용 : number of voicemail uses
# 주간통화시간 : call time between 08:00 and 16:00
# 주간통화횟수 : number of calls between 08:00 and 16:00
# 주간통화요금 : call charges between 08:00 and 16:00
# 저녁통화시간 : call time between 16:00 and 00:00
# 저녁통화횟수 : number of calls between 16:00 and 00:00
# 저녁통화요금 : call charges between 16:00 and 00:00
# 밤통화시간 : call time between 00:00 and 08:00
# 밤통화횟수 : number of calls between 00:00 and 08:00
# 밤통화요금 : call charges between 00:00 and 08:00
# 상담전화건수 : number of calls made to the customer service centre
# 전화해지여부 : 0 (service kept) / 1 (service cancelled)

In [4]:
# Original Korean headers -> short English column names (training set layout).
tr_rename_dict = {
    'ID':             'id',
    '가입일':          'days_subscription',
    '음성사서함이용':    'n_voicemail',
    '주간통화시간':     'time_call_morning',
    '주간통화횟수':     'n_call_morning',
    '주간통화요금':     'pay_call_morning',
    '저녁통화시간':     'time_call_evening',
    '저녁통화횟수':     'n_call_evening',
    '저녁통화요금':     'pay_call_evening',
    '밤통화시간':       'time_call_night',
    '밤통화횟수':       'n_call_night',
    '밤통화요금':       'pay_call_night',
    '상담전화건수':     'n_call',
    '전화해지여부':     'target',
}

# The test mapping is the same minus the label column.
te_rename_dict = {
    korean: english
    for korean, english in tr_rename_dict.items()
    if korean != '전화해지여부'
}

train_df = train_df.rename(tr_rename_dict, axis='columns')
test_df = test_df.rename(te_rename_dict, axis='columns')

In [5]:
# train_df.describe()
# train_df.isnull().sum()

In [6]:
# (rows, columns) of the training and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((30200, 14), (12943, 13))

In [7]:
# Preview the first five renamed training rows.
train_df.head(n=5)

Unnamed: 0,id,days_subscription,n_voicemail,time_call_morning,n_call_morning,pay_call_morning,time_call_evening,n_call_evening,pay_call_evening,time_call_night,n_call_night,pay_call_night,n_call,target
0,TRAIN_00000,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,32.89,2,0
1,TRAIN_00001,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,32.32,2,0
2,TRAIN_00002,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,8.28,0,0
3,TRAIN_00003,223,1,221.4,223,25.1,233.0,61,23.9,203.8,234,9.36,0,0
4,TRAIN_00004,222,0,96.3,222,28.7,223.9,69,28.08,263.1,223,2.8,8,0


In [8]:
# Column roles used throughout the notebook.
target_feature = 'target'   # label column
unuse_features = ['id']     # identifier, never fed to the model
cat_features = []           # candidate categorical column: ['n_call']

# Everything that is not the label, an unused column or a categorical column
# is treated as a numeric feature; the original column order is kept.
non_numeric_cols = [target_feature, *unuse_features, *cat_features]
num_features = [
    column
    for column in train_df.columns
    if column not in non_numeric_cols
]

In [103]:
# Number of distinct values in every training column.
train_df.nunique()

id                   30200
days_subscription      345
n_voicemail             76
time_call_morning     2771
n_call_morning         239
pay_call_morning       735
time_call_evening     2606
n_call_evening         241
pay_call_evening      1840
time_call_night       2545
n_call_night           233
pay_call_night        1216
n_call                  11
target                   2
dtype: int64

In [13]:
# Summary statistics, rounded to one decimal, shown with one row per column.
(
    train_df
    .describe()
    .round(1)
    .transpose()
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
days_subscription,30200.0,159.7,123.8,1.0,63.0,118.0,228.0,2212.0
n_voicemail,30200.0,12.0,24.9,0.0,0.0,1.0,22.0,1112.0
time_call_morning,30200.0,250.6,84.4,0.0,222.8,234.0,289.6,481.9
n_call_morning,30200.0,159.2,101.6,0.0,88.0,117.5,223.0,489.0
pay_call_morning,30200.0,41.1,23.2,0.0,24.4,30.8,45.9,118.6
time_call_evening,30200.0,263.2,74.1,0.1,223.7,243.0,290.3,481.6
n_call_evening,30200.0,158.5,102.2,0.0,87.0,112.0,223.0,489.0
pay_call_evening,30200.0,25.4,8.1,0.0,22.2,23.3,28.8,50.0
time_call_night,30200.0,263.4,73.8,20.8,223.7,242.8,290.5,481.8
n_call_night,30200.0,157.0,101.7,20.0,87.0,108.0,222.0,490.0


In [42]:
# Class balance of the label. Read it straight from train_df so this cell does
# not depend on `y`, which is only created later in the train/validation split cell.
train_df[target_feature].value_counts()

0    26882
1     3318
Name: target, dtype: int64

In [14]:
# i=0
# for col in num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.boxplot(x=train_df[target_feature],y=train_df[col])
#     plt.grid()
#     plt.show()

In [65]:
# https://stackoverflow.com/questions/65462220/how-to-create-custom-eval-metric-for-catboost
class CustomEvalMetric:
    """Macro-averaged F1 as a user-defined CatBoost evaluation metric.

    Intended for binary classification: `evaluate` expects exactly one
    approximation dimension. Pass an instance as `eval_metric=` to use it.
    """

    def is_max_optimal(self):
        """Return True: a larger metric value means a better model."""
        return True # greater is better

    def get_final_error(self, error, weight):
        """Return the score produced by `evaluate` unchanged (weight is ignored)."""
        return error

    @staticmethod
    def _approx_to_labels(approx):
        """Turn raw model scores into hard 0/1 class labels.

        For a binary classifier the raw score is a log-odds value, so
        score > 0 is equivalent to predicted probability > 0.5.
        """
        return (np.array(approx, dtype=float) > 0).astype(int)

    def evaluate(self, approxes, target, weight):
        """Compute macro F1 for one evaluation pass.

        Args:
            approxes: sequence holding one container of raw scores.
            target: true labels, same length as the score container.
            weight: per-object weights; not used by this metric.

        Returns:
            Tuple `(score, output_weight)` where `output_weight` is always 1.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        # Raw scores are continuous; f1_score needs discrete class labels.
        y_pred = self._approx_to_labels(approxes[0])
        y_true = np.array(target)

        output_weight = 1 # weight is not used

        score = f1_score(y_true,y_pred,average='macro')

        return score, output_weight

In [126]:
# Share of each class in the training and validation labels.
# NOTE(review): y_train / y_valid come from the train/validation split cell
# further down, so that cell has to run before this one.
train_class_ratio = y_train.value_counts() / len(y_train)
valid_class_ratio = y_valid.value_counts() / len(y_valid)

display(train_class_ratio)
print('')
display(valid_class_ratio)
# The ratios should match closely when the split is stratified on the label.

0    0.888535
1    0.111465
Name: target, dtype: float64




0    0.896523
1    0.103477
Name: target, dtype: float64

In [127]:
# Feature matrix (numeric columns followed by categorical columns) and label vector.
X = train_df[num_features+cat_features]
y = train_df[target_feature]

# 80/20 hold-out split, stratified on the label so both parts keep the class ratio.
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=SEED,stratify=y)

# Declare the categorical columns to CatBoost explicitly. With the current empty
# list this changes nothing, but once columns are moved into `cat_features`
# they are handled as categorical instead of silently being treated as numeric.
train_dataset = Pool(X_train,y_train,cat_features=cat_features)
valid_dataset = Pool(X_valid,y_valid,cat_features=cat_features)

model = CatBoostClassifier(
    iterations=5000,
    metric_period=500,  # log every 500 iterations
    #learning_rate=0.2,
    #eval_metric=CustomEvalMetric(),
    #eval_metric='TotalF1',
    use_best_model=True,  # keep the iteration that scored best on eval_set
    random_state=SEED,
)
model.fit(train_dataset,eval_set=valid_dataset)

Learning rate set to 0.034553
0:	learn: 0.6640878	test: 0.6643365	best: 0.6643365 (0)	total: 8.88ms	remaining: 44.4s
500:	learn: 0.2424610	test: 0.2772312	best: 0.2772312 (500)	total: 2.62s	remaining: 23.5s
1000:	learn: 0.1907160	test: 0.2498529	best: 0.2498529 (1000)	total: 5.11s	remaining: 20.4s
1500:	learn: 0.1559802	test: 0.2311058	best: 0.2311058 (1500)	total: 7.66s	remaining: 17.9s
2000:	learn: 0.1330452	test: 0.2196852	best: 0.2196852 (2000)	total: 10.1s	remaining: 15.2s
2500:	learn: 0.1148975	test: 0.2125384	best: 0.2125384 (2500)	total: 12.6s	remaining: 12.6s
3000:	learn: 0.1008474	test: 0.2067260	best: 0.2067260 (3000)	total: 15.1s	remaining: 10.1s
3500:	learn: 0.0896437	test: 0.2029976	best: 0.2029976 (3500)	total: 17.6s	remaining: 7.55s
4000:	learn: 0.0802318	test: 0.2000155	best: 0.2000155 (4000)	total: 20.1s	remaining: 5.02s
4500:	learn: 0.0722930	test: 0.1991293	best: 0.1991293 (4500)	total: 22.6s	remaining: 2.51s
4999:	learn: 0.0655785	test: 0.1980701	best: 0.1980701 (4

<catboost.core.CatBoostClassifier at 0x15fd62b50>

In [128]:
# Hard class predictions for the validation split and the matching true labels.
y_pred = model.predict(X_valid)
y_true = y_valid.copy()

# scikit-learn metrics take (y_true, y_pred) in that order. Macro F1 happens to
# be symmetric, but support-dependent averages such as 'weighted' are not.
print('> Macro F1 Score: {:.4f}'.format(f1_score(y_true,y_pred,average='macro')))
print('> Cross Table:')
# Rows are the true labels, columns are the predicted labels.
pd.crosstab(y_true,y_pred)

> Macro F1 Score: 0.7322
> Cross Table:


col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5290,86
1,408,256
