In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels, compared against ``y_test`` for the confusion
        matrix, accuracy, precision, recall and F1.
    pred_proba : array-like, optional
        Scores passed to ``roc_auc_score`` together with ``y_test``. When it
        is omitted (``None``) the AUC figure is left out of the report instead
        of failing inside ``roc_auc_score``.

    Returns
    -------
    None
        The report is written to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # ROC-AUC needs scores rather than hard labels, so it is only computed
    # when the caller actually supplied them. It is evaluated before anything
    # is printed so that a failure here does not leave a half-printed report.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)

    # The four spaces before "F1" reproduce the original line-continued
    # literal, keeping the printed report unchanged when AUC is available.
    report = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        report += ', AUC:{0:.4f}'.format(roc_auc)
    print(report)

def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    """Plot precision and recall against the decision threshold.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred_proba_c1 : array-like
        Scores handed to ``precision_recall_curve`` together with ``y_test``.
    """
    # Precision, recall and the thresholds they were evaluated at.
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)

    # Only the first len(thresholds) precision/recall entries are plotted so
    # that the x and y arrays have matching lengths.
    n_thresholds = len(thresholds)

    plt.figure(figsize=(8, 6))
    # Precision is drawn dashed, recall solid.
    plt.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[:n_thresholds], label='recall')

    # Re-tick the threshold axis in steps of 0.1 across the current x-limits.
    x_lo, x_hi = plt.xlim()
    plt.xticks(np.round(np.arange(x_lo, x_hi, 0.1), 2))

    # Axis labels, legend and grid.
    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation and convergence warnings raised by later cells).
warnings.filterwarnings('ignore')

# Read the training CSV (latin-1 encoded), report its shape and preview the
# first three rows.
cust_df = pd.read_csv(
    "../../../data/santander-customer-satisfaction/train.csv",
    encoding='latin-1',
)
print(f'dataset shape: {cust_df.shape}')
cust_df.head(3)

dataset shape: (76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0


# TARGET 0 => 만족 (satisfied)
# TARGET 1 => 불만족 (unsatisfied)

In [29]:
# Summarise the loaded frame: index range, column count, dtype breakdown and memory usage.
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


In [30]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
cust_df.describe( )

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [3]:
# Absolute counts of each TARGET class.
print(cust_df['TARGET'].value_counts())

# Rows labelled 1 (unsatisfied) as a percentage of all rows.
total_cnt = cust_df['TARGET'].count()
unsatisfied_cnt = cust_df.loc[cust_df['TARGET'] == 1, 'TARGET'].count()
print(f'unsatisfied 비율은 {unsatisfied_cnt / total_cnt * 100:.2f}')

TARGET
0    73012
1     3008
Name: count, dtype: int64
unsatisfied 비율은 3.96


In [4]:
# Total number of missing values: per-column null counts summed over all columns.
print('santander customer satisfaction: 데이터 세트 Null 값 갯수 ',cust_df.isnull().sum().sum())

santander customer satisfaction: 데이터 세트 Null 값 갯수  0


In [5]:
# Frequency of each distinct var3 value; used to spot the -999999 sentinel.
cust_df['var3'].value_counts()

var3
 2         74165
 8           138
-999999      116
 9           110
 3           108
           ...  
 231           1
 188           1
 168           1
 135           1
 87            1
Name: count, Length: 208, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# var3 carries -999999 as a sentinel value; replace it with 2, the dominant
# value of the column (this choice deserves further thought).
# The result is assigned back to the column: an in-place replace() on the
# `cust_df['var3']` selection is chained assignment, which leaves cust_df
# untouched under pandas Copy-on-Write.
cust_df['var3'] = cust_df['var3'].replace(-999999, 2)

# Drop the ID column. errors='ignore' keeps the cell re-runnable once ID has
# already been removed.
cust_df = cust_df.drop(columns='ID', errors='ignore')

# Split features from the label. The label column is the last column of the
# frame, so it is selected by position -1.
X = cust_df.iloc[:, :-1]
y = cust_df.iloc[:, -1]
print('피처 데이터 shape:{0}'.format(X.shape))
# This line reports the label vector, so it is labelled as such.
print('레이블 데이터 shape:{0}'.format(y.shape))


# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape , X_test.shape), '\n')

# Class proportions in each split, to check how well the imbalance is preserved.
print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt)

print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt, '\n')

# Split X_train / y_train once more into training and validation sets (70/30).
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
print('학습 세트 Shape:{0}, 검증 세트 Shape:{1}'.format(X_tr.shape , X_val.shape))

피처 데이터 shape:(76020, 369)
피처 데이터 shape:(76020,)
학습 세트 Shape:(60816, 369), 테스트 세트 Shape:(15204, 369) 

학습 세트 레이블 값 분포 비율
TARGET
0    0.960964
1    0.039036
Name: count, dtype: float64

 테스트 세트 레이블 값 분포 비율
TARGET
0    0.9583
1    0.0417
Name: count, dtype: float64 

학습 세트 Shape:(42571, 369), 검증 세트 Shape:(18245, 369)


In [7]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters, fitted on the
# full training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Compare accuracy on the training and test data. A high training score with
# a clearly lower test score would point to overfitting, so the pair gives a
# first read on how well the model generalises.
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test), "\n")

# lr_pred holds the labels predicted for the test data; the accuracy computed
# from it is the same quantity as lr.score(X_test, y_test) above.
lr_pred = lr.predict(X_test)
print(f'LogisticRegression 정확도: {accuracy_score(y_test, lr_pred):.4f}')

# predict_proba returns one row per sample, e.g. [[0.3, 0.7], [0.6, 0.4], ...]:
# the first column is the probability of class 0, the second that of class 1.
# Only the class-1 column is kept. A sample is normally assigned to a class
# when its probability is 0.5 or higher.
pred_proba = lr.predict_proba(X_test)[:, 1]
print(pred_proba[:4], "\n")

get_clf_eval(y_test, lr_pred, pred_proba)

0.9609642199421204
0.9583004472507235 

LogisticRegression 정확도: 0.9583
[0.04354073 0.01972306 0.00458048 0.15948392] 

오차 행렬
[[14570     0]
 [  634     0]]
정확도: 0.9583, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.6090


### 데이터의 불균형 
> - 클래스 간 데이터의 분포가 균형을 이루지 않아 모델의 성능이 제대로 나타나지 않는 것
> - 위 결과처럼 모든 샘플을 다수 클래스(0)로 예측해도 정확도는 0.9583이지만, 정밀도·재현율·F1은 모두 0이 된다.

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid. C is searched on a coarse, roughly logarithmic grid
# over the same 0.01-5 range. The previous np.arange(0.01, 5, 0.01) produced
# about 499 values, i.e. roughly 2,000 candidates and 10,000 fits with cv=5,
# which is not a feasible search on this data set.
param_grid = {
    'C': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],  # optimisation algorithm
    'max_iter': [1000],  # maximum number of iterations
}

# Build the grid-search object and fit it.
# - random_state=0 matches the seed used for the splits and makes the
#   liblinear / saga fits repeatable.
# - n_jobs=-1 runs the independent fits on all available cores.
# - Scoring is left at the estimator default (accuracy).
#   NOTE(review): with ~4% positives accuracy barely separates candidates;
#   consider scoring='roc_auc' and relabel the reporting cell accordingly.
lr_cv = GridSearchCV(LogisticRegression(random_state=0), param_grid, cv=5, n_jobs=-1)
lr_cv.fit(X_train, y_train)

In [None]:
# Report the best hyper-parameters found and the corresponding best score.
print('Best Parameters:', lr_cv.best_params_)
print(f'GridSearchCV 최고 정확도: {lr_cv.best_score_:.4f}')
print("---------------------------------------------------")

# Estimator trained with the best hyper-parameters found by GridSearchCV.
best_lr_cv = lr_cv.best_estimator_
print(f"best_lr_cv: {best_lr_cv}")
print("---------------------------------------------------")

# Predict on the test split with that estimator and report its accuracy.
y_pred = best_lr_cv.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', accuracy)
print("---------------------------------------------------")

# Training vs. test accuracy, side by side.
print(best_lr_cv.score(X_train, y_train))
print(best_lr_cv.score(X_test, y_test), "\n")

# Class-1 probabilities feed the ROC-AUC figure in the full report.
pred_proba = best_lr_cv.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, y_pred, pred_proba)

In [None]:
# NOTE(review): `predict_titanic` and `gender_submission_df` are not defined in
# any visible cell of this notebook, so this cell raises NameError on a fresh
# kernel. It looks like a leftover from a Titanic notebook: confirm, then
# remove it or replace it with a submission built from this data set.
# Predict with the tuned estimator, store the labels in the 'Survived' column
# and write the frame to CSV without the index.
predict_titanic_pred_hyper = best_lr_cv.predict(predict_titanic)
gender_submission_df['Survived'] = predict_titanic_pred_hyper
gender_submission_df.to_csv('titanic_submission_hyper.csv',index=False)