# 필요한 라이브러리 Import하기

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import optuna 
import lightgbm as lgb 
from sklearn.metrics import accuracy_score, roc_auc_score,precision_score, confusion_matrix,recall_score 
from sklearn.model_selection import cross_val_score, train_test_split
import os 

In [2]:
# Train data Test data 로드해주기

train_data=pd.read_csv('/dshome/WoongLab/heo/construction_oil/preprocessed_data/breastcancer_train_data.csv')
test_data=pd.read_csv('/dshome/WoongLab/heo/construction_oil/preprocessed_data/breastcancer_test_data.csv')


In [3]:
train_stage_features=['A Stage','differentiate','Race','T Stage ','Grade','Estrogen Status','Progesterone Status','Marital Status','N Stage',
                      '6th Stage','Regional Node Examined','Tumor Size','Reginol Node Positive','Age','Survival Months']

In [4]:
train_X=train_data[train_stage_features]
train_y=train_data['Status']

new_test_data_X=test_data[train_stage_features]
new_test_data_y=test_data['Status']

In [5]:
train_y.value_counts()

0    2726
1     493
Name: Status, dtype: int64

In [6]:
train_X

Unnamed: 0,A Stage,differentiate,Race,T Stage,Grade,Estrogen Status,Progesterone Status,Marital Status,N Stage,6th Stage,Regional Node Examined,Tumor Size,Reginol Node Positive,Age,Survival Months
0,1,3,2,1,1,1,1,1,0,1,13,40,1,50,50
1,1,1,2,0,3,1,1,1,0,0,11,12,2,59,99
2,1,0,2,0,2,1,1,1,0,0,8,11,2,44,77
3,1,0,2,1,2,1,0,0,1,2,32,25,5,59,107
4,1,3,2,0,1,1,1,1,0,0,4,18,1,69,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3214,0,1,2,1,3,1,1,3,2,4,18,35,18,37,77
3215,1,0,2,1,2,1,1,1,1,2,12,50,7,47,71
3216,1,3,2,1,1,1,1,3,0,1,2,32,1,65,80
3217,1,1,2,1,3,1,1,1,0,1,17,40,1,53,8


In [12]:
# class weight 적용하기 위해 y label의 비율을 구하기

ratio=train_y.value_counts()[0]/train_y.value_counts()[1]

## 베이지안 최적화해주기 - LightgbmClassifier

**1. 5-fold 교차검증 이용해서 Train data로 Validation set을 Auc가 최고였을 때의 하이퍼파라미터 구하기**

**2. Learning rate 0.01~0.1, max_depth 3~9, n_estimators 100~1000이었을 때에서 가장 최적의 하이퍼파라미터 구하기**

In [13]:
def objective(trial):
    # Define hyperparameters to optimize 
    params={
        'boosting_type':'gbdt',
        'objective':'binary',
        'metric':'binary_logloss',
        'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
        'max_depth':trial.suggest_int('max_depth',3,9),
        'n_estimators':trial.suggest_int("n_estimators",100,1000)
    }
    # Train and evaluate model 
    lgb_cv=lgb.LGBMClassifier(**params, random_state=42,scale_pos_weight=ratio)
    scores=cross_val_score(lgb_cv,train_X,train_y,cv=5,scoring='roc_auc')
    auc=scores.mean()
    return auc 

# Define study object and optimize 

study=optuna.create_study(direction='maximize',study_name='lgb_boost_opt',load_if_exists=True)
study.optimize(objective, n_trials=50)

# Print best hyperparameters and auc
print(f'Best hyperparameters: {study.best_params}')
print(f'Best AUC: {study.best_value:.4f}')

[32m[I 2023-04-15 01:36:18,487][0m A new study created in memory with name: lgb_boost_opt[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-15 01:36:23,331][0m Trial 0 finished with value: 0.8220789485327963 and parameters: {'learning_rate': 0.06167605327070991, 'max_depth': 6, 'n_estimators': 623}. Best is trial 0 with value: 0.8220789485327963.[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-15 01:36:25,050][0m Trial 1 finished with value: 0.8346720036085322 and parameters: {'learning_rate': 0.07532416556304412, 'max_depth': 4, 'n_estimators': 420}. Best is trial 1 with value: 0.8346720036085322.[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-15 01:36:28,888][0m Trial 2 finished with value: 0.8303069482474088 and parameters: {'learning_rate': 0.04298682549232013, 'max_depth': 9, 'n_estimators': 378}. Best is trial 1 with value: 0.8346720036085322.[0m
  'l

Best hyperparameters: {'learning_rate': 0.025961944894828903, 'max_depth': 3, 'n_estimators': 202}
Best AUC: 0.8711


In [14]:
# AUC값이 가장 높은 하이퍼파라미터로 파인튜닝해주기

lgb=lgb.LGBMClassifier(learning_rate=0.025961944894828903,max_depth=3,n_estimators=202,scale_pos_weight=ratio,random_state=42)

In [15]:
# 모델 학습시키기

lgb.fit(train_X,train_y)

In [16]:
# AUC를 구하기 위해 클래스의 확률 값 구해주기

prob=lgb.predict_proba(new_test_data_X)[:,1]

In [17]:
# 모델 성능을 평가하기 위해 성능지표 함수 만들어주기

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_prob(y_test, probability):
  pred=np.where(probability > 0.5,1,0)
  confusion=confusion_matrix(y_test, pred)
  accuracy=accuracy_score(y_test,pred)
  precision=precision_score(y_test,pred) 
  recall=recall_score(y_test,pred) 
  # F1 스코어 추가 
  f1=f1_score(y_test,pred,average='macro')
  Roc_score=roc_auc_score(y_test,probability)
  print('임계값: ', 0.5) 
  print('오차행렬')
  print(confusion) 
  # f1 score print 추가 
  print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}, AUC:{4: .4f}'.format(accuracy,precision,recall,f1,Roc_score))

In [18]:
get_clf_prob(new_test_data_y,prob)

임계값:  0.5
오차행렬
[[596  86]
 [ 39  84]]
정확도: 0.8447, 정밀도: 0.4941, 재현율: 0.6829, F1:0.7392, AUC: 0.8510
