In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)

    accuracy = accuracy_score(y_test, pred)

    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    f1 = f1_score(y_test, pred)

    # ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(confusion)

    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):

    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출.
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)

    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')

    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))

    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings('ignore')

cust_df = pd.read_csv("../../../data/santander-customer-satisfaction/train.csv", encoding='latin-1')
predict_df = pd.read_csv("../../../data/santander-customer-satisfaction/test.csv", encoding='latin-1')
santander_submission_df = pd.read_csv("../../../data/santander-customer-satisfaction/sample_submission.csv", encoding='latin-1')

print('dataset shape:', cust_df.shape)
cust_df.head(3)

dataset shape: (76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0


In [None]:
predict_df.head()

In [3]:
from sklearn.model_selection import train_test_split

# var3 피처 값 대체(2가 많으니 2로 대체) 및 ID 피처 드롭 -> 고민이 필요함
cust_df['var3'].replace(-999999, 2, inplace=True)
cust_df.drop('ID', axis=1, inplace=True)

# predict_df에서의 ID 드롭
predict_df.drop('ID', axis=1, inplace=True)

cust_df['var3'].value_counts()

var3
2      74281
8        138
9        110
3        108
1        105
       ...  
231        1
188        1
168        1
135        1
87         1
Name: count, Length: 207, dtype: int64

In [4]:
from sklearn.preprocessing import StandardScaler

# 피처 세트와 레이블 세트분리. 레이블 컬럼은 DataFrame의 맨 마지막에 위치해 컬럼 위치 -1로 분리
X = cust_df.iloc[:, :-1]
y = cust_df.iloc[:, -1]
print('피처 데이터 shape:{0}'.format(X.shape))
print('피처 데이터 shape:{0}'.format(y.shape))

# 표준화 => 2차원 데이터
sc = StandardScaler()
X = sc.fit_transform(X)
print('피처 데이터 shape:{0}'.format(X.shape))

print('피처 데이터 shape:{0}'.format(predict_df.shape))
predict_df = sc.transform(predict_df)
print('피처 데이터 shape:{0}'.format(predict_df.shape))

피처 데이터 shape:(76020, 369)
피처 데이터 shape:(76020,)
학습 세트 Shape:(60816, 369), 테스트 세트 Shape:(15204, 369) 

학습 세트 레이블 값 분포 비율
TARGET
0    96.096422
1     3.903578
Name: count, dtype: float64

 테스트 세트 레이블 값 분포 비율
TARGET
0    95.830045
1     4.169955
Name: count, dtype: float64 

학습 세트 Shape:(42571, 369), 검증 세트 Shape:(18245, 369)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape , X_test.shape), '\n')

print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt * 100)

print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt * 100, '\n')

# X_train, y_train을 다시 학습과 검증 데이터 세트로 분리.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
print('학습 세트 Shape:{0}, 검증 세트 Shape:{1}'.format(X_tr.shape , X_val.shape))

## ADASYN

In [5]:
from imblearn.over_sampling import ADASYN

adasyn = ADASYN(random_state=1)
X_2, y_2 = adasyn.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape , X_test.shape), '\n')

print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt * 100)

print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt * 100, '\n')

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
print('학습 세트 Shape:{0}, 검증 세트 Shape:{1}'.format(X_tr.shape , X_val.shape))

resampled_Y value count: 
TARGET
1    59028
0    58442
Name: count, dtype: int64 



In [8]:
from lightgbm import early_stopping
from lightgbm import LGBMClassifier

lgbm_wrapper = LGBMClassifier(n_estimators=400, learning_rate=0.05)

evals = [(X_tr, y_tr), (X_val, y_val)]
lgbm_wrapper.fit(X_tr, y_tr, callbacks=[early_stopping(stopping_rounds=50)], eval_metric="logloss", eval_set=evals)

preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

get_clf_eval(y_test , preds, pred_proba)

[LightGBM] [Info] Number of positive: 1658, number of negative: 40913
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13308
[LightGBM] [Info] Number of data points in the train set: 42571, number of used features: 242
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038947 -> initscore=-3.205836
[LightGBM] [Info] Start training from score -3.205836
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[97]	training's binary_logloss: 0.110494	valid_1's binary_logloss: 0.134677
오차 행렬
[[14565     5]
 [  630     4]]
정확도: 0.9582, 정밀도: 0.4444, 재현율: 0.0063,    F1: 0.0124, AUC:0.8404


In [10]:
predict_santander_pred_xgb = lgbm_wrapper.predict(predict_df)
print(f"predict_santander 예측: {predict_santander_pred_xgb[:10]}")

santander_submission_df['TARGET'] = predict_santander_pred_xgb
santander_submission_df.to_csv('santander_submission_lgbm_ADASYN_sc.csv',index=False)
santander_submission_df

predict_santander 예측: [0 0 0 0 0 1 0 1 0 0]


Unnamed: 0,ID,TARGET
0,2,0
1,5,0
2,6,0
3,7,0
4,9,0
...,...,...
75813,151831,0
75814,151832,0
75815,151833,0
75816,151834,1


## SMOTETomek

In [11]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

smoteto = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=1)
X_3, y_3 = smoteto.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape , X_test.shape), '\n')

print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt * 100)

print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt * 100, '\n')

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
print('학습 세트 Shape:{0}, 검증 세트 Shape:{1}'.format(X_tr.shape , X_val.shape))

resampled_Y value count: 
TARGET
1    58442
0    56734
Name: count, dtype: int64 



In [12]:
from lightgbm import early_stopping
from lightgbm import LGBMClassifier

lgbm_wrapper = LGBMClassifier(n_estimators=400, learning_rate=0.05)

evals = [(X_tr, y_tr), (X_val, y_val)]
lgbm_wrapper.fit(X_tr, y_tr, callbacks=[early_stopping(stopping_rounds=50)], eval_metric="logloss", eval_set=evals)

preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

get_clf_eval(y_test , preds, pred_proba)

[LightGBM] [Info] Number of positive: 58442, number of negative: 56734
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033677 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18499
[LightGBM] [Info] Number of data points in the train set: 115176, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.507415 -> initscore=0.029661
[LightGBM] [Info] Start training from score 0.029661
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[400]	training's binary_logloss: 0.144623	valid_1's binary_logloss: 0.196599
오차 행렬
[[13462  1108]
 [  378   256]]
정확도: 0.9023, 정밀도: 0.1877, 재현율: 0.4038,    F1: 0.2563, AUC:0.8104


In [13]:
predict_santander_pred_xgb = lgbm_wrapper.predict(predict_df)

santander_submission_df['TARGET'] = predict_santander_pred_xgb
santander_submission_df.to_csv('santander_submission_lgbm_SMOTETomek_sc.csv',index=False)
santander_submission_df

Unnamed: 0,ID,TARGET
0,2,0
1,5,0
2,6,0
3,7,0
4,9,0
...,...,...
75813,151831,0
75814,151832,0
75815,151833,0
75816,151834,1


## SMOTE

In [14]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=1)
X_1, y_1 = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape , X_test.shape), '\n')

print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt * 100)

print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt * 100, '\n')

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
print('학습 세트 Shape:{0}, 검증 세트 Shape:{1}'.format(X_tr.shape , X_val.shape))

완


In [15]:
from lightgbm import early_stopping
from lightgbm import LGBMClassifier

lgbm_wrapper = LGBMClassifier(n_estimators=400, learning_rate=0.05)

evals = [(X_tr, y_tr), (X_val, y_val)]
lgbm_wrapper.fit(X_tr, y_tr, callbacks=[early_stopping(stopping_rounds=50)], eval_metric="logloss", eval_set=evals)

preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

get_clf_eval(y_test , preds, pred_proba)

[LightGBM] [Info] Number of positive: 58442, number of negative: 58442
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032338 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18500
[LightGBM] [Info] Number of data points in the train set: 116884, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[400]	training's binary_logloss: 0.148906	valid_1's binary_logloss: 0.188062
오차 행렬
[[13504  1066]
 [  375   259]]
정확도: 0.9052, 정밀도: 0.1955, 재현율: 0.4085,    F1: 0.2644, AUC:0.8107


In [16]:
predict_santander_pred_xgb = lgbm_wrapper.predict(predict_df)

santander_submission_df['TARGET'] = predict_santander_pred_xgb
santander_submission_df.to_csv('santander_submission_lgbm_SMOTE_sc.csv',index=False)
santander_submission_df

Unnamed: 0,ID,TARGET
0,2,0
1,5,0
2,6,0
3,7,0
4,9,0
...,...,...
75813,151831,0
75814,151832,0
75815,151833,0
75816,151834,1
