In [1]:
# 필요한 라이브러리 import하기

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import optuna 
import lightgbm as lgb 
from sklearn.metrics import accuracy_score, roc_auc_score,precision_score, confusion_matrix,recall_score 
from sklearn.model_selection import cross_val_score, train_test_split
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed Pima train and test splits from CSV.
# NOTE(review): these are absolute, machine-specific paths; consider a configurable data directory.

train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/dshome/WoongLab/heo/construction_oil/preprocessed_data/pima_train_data.csv',
        '/dshome/WoongLab/heo/construction_oil/preprocessed_data/pima_test_data.csv',
    )
)

In [3]:
# Summarise the column dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               614 non-null    int64  
 1   Glucose                   614 non-null    int64  
 2   BloodPressure             614 non-null    int64  
 3   SkinThickness             614 non-null    int64  
 4   Insulin                   614 non-null    int64  
 5   BMI                       614 non-null    float64
 6   DiabetesPedigreeFunction  614 non-null    float64
 7   Age                       614 non-null    int64  
 8   Outcome                   614 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 43.3 KB


In [4]:
# Separate the feature columns from the 'Outcome' label for the train and test frames.
train_X, train_y = train_data.drop(columns='Outcome'), train_data['Outcome']

new_test_data_X, new_test_data_y = test_data.drop(columns='Outcome'), test_data['Outcome']

In [5]:
# Show how many training rows fall into each label class.
train_y.value_counts()

0    400
1    214
Name: Outcome, dtype: int64

In [6]:
# Ratio of label-0 rows to label-1 rows; passed to LightGBM as scale_pos_weight below.

class_counts=train_y.value_counts()
ratio=class_counts.loc[0]/class_counts.loc[1]

## 베이지안 최적화 - LGBMClassifier

**1. Train data에 5-fold 교차검증을 적용해, 검증 fold의 평균 AUC가 가장 높은 하이퍼파라미터 구하기**

**2. 탐색 범위: learning_rate 0.01–0.1(로그 스케일), max_depth 3–9, n_estimators 100–1000 안에서 최적의 하이퍼파라미터 구하기**

In [7]:
def objective(trial):
    """Score one Optuna trial by its mean 5-fold cross-validated ROC AUC.

    Samples learning_rate (log scale, 0.01-0.1), max_depth (3-9) and
    n_estimators (100-1000), builds an LGBMClassifier with those values and
    evaluates it on the training data with `cross_val_score`.

    Args:
        trial: the Optuna trial object that supplies hyperparameter suggestions.

    Returns:
        The mean ROC AUC over the 5 folds; the study maximises this value.
    """
    # Define hyperparameters to optimize
    params={
        'boosting_type':'gbdt',
        'objective':'binary',
        'metric':'binary_logloss',
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'learning_rate':trial.suggest_float('learning_rate',0.01,0.1,log=True),
        'max_depth':trial.suggest_int('max_depth',3,9),
        'n_estimators':trial.suggest_int("n_estimators",100,1000)
    }
    # Train and evaluate model
    lgb_cv=lgb.LGBMClassifier(**params, random_state=42,scale_pos_weight=ratio)
    scores=cross_val_score(lgb_cv,train_X,train_y,cv=5,scoring='roc_auc')
    return scores.mean()

# Define study object and optimize
# The sampler is seeded so repeated runs propose the same hyperparameters.
# `load_if_exists` is dropped: it only applies when a study name already exists in a
# storage, and this study is created in memory.

study=optuna.create_study(direction='maximize',study_name='lgb_boost_opt',sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Print best hyperparameters and auc
print(f'Best hyperparameters: {study.best_params}')
print(f'Best AUC: {study.best_value:.4f}')

[32m[I 2023-04-28 15:29:35,029][0m A new study created in memory with name: lgb_boost_opt[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-28 15:29:37,660][0m Trial 0 finished with value: 0.7891971207087487 and parameters: {'learning_rate': 0.012852177105517506, 'max_depth': 7, 'n_estimators': 603}. Best is trial 0 with value: 0.7891971207087487.[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-28 15:29:40,429][0m Trial 1 finished with value: 0.7564797895902549 and parameters: {'learning_rate': 0.08674094387313179, 'max_depth': 4, 'n_estimators': 653}. Best is trial 0 with value: 0.7891971207087487.[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-28 15:29:41,452][0m Trial 2 finished with value: 0.7699570874861573 and parameters: {'learning_rate': 0.08301657046246268, 'max_depth': 4, 'n_estimators': 241}. Best is trial 0 with value: 0.7891971207087487.[0m
  '

Best hyperparameters: {'learning_rate': 0.011354550288050548, 'max_depth': 8, 'n_estimators': 149}
Best AUC: 0.8152


In [7]:
lgb=lgb.LGBMClassifier(learning_rate= 0.011354550288050548,max_depth=8,n_estimators=149,scale_pos_weight=ratio,random_state=42)

In [8]:
# 모델 학습하기

lgb.fit(train_X,train_y)

In [9]:
# 모델의 성능(AUC)을 평가하기 위해 예측된 클래스가 당뇨병환자일 확률 구하기
train_prob=pd.DataFrame()
train_prob['prob']=lgb.predict_proba(train_X)[:,1]

In [10]:
# Compute the odds p / (1 - p) of the predicted positive-class probability.

positive_prob=train_prob['prob']
train_prob['1_prob']=1-positive_prob
train_prob['odds']=positive_prob/train_prob['1_prob']

In [11]:
# Convert the odds into logits by taking their natural log.
# The column key 'loogits' is kept as spelled because the next cell reads it.

train_prob['loogits']=np.log(train_prob['odds'])

In [28]:
# Sharpen the probabilities: divide each logit by 0.1 (i.e. scale it by 10) and map it
# back through the sigmoid, which pushes the values toward 0 or 1.
# NOTE(review): this operates on train_prob (training rows), not on test data; the result
# is later used as the fit target for model2. The 0.1 divisor looks like a temperature
# setting; confirm the intended value.

train_prob['new_prob']=1/(1+np.exp(-train_prob['loogits']/0.1))

In [29]:
# Second-stage model: a LightGBM regressor, fit later on the sharpened probabilities.
# NOTE(review): these hyperparameters match Trial 1 in the logged study output
# (value 0.7565), not the best trial; confirm this choice is intentional.
import optuna
from lightgbm import LGBMRegressor

model2 = LGBMRegressor(n_estimators=653,max_depth=4,learning_rate=0.08674094387313179)

In [30]:
# Sanity check: number of sharpened-probability targets.
len(train_prob['new_prob'])

614

In [31]:
# Sanity check: number of training rows, to compare against the number of targets.
len(train_X)

614

In [32]:
# Fit the regressor on the training features with the sharpened probabilities as targets.
model2.fit(train_X,train_prob['new_prob'])

In [33]:
# model로 new_test_data_X로 predict하기

test_data=pd.DataFrame()
test_data['prob']=model2.predict(new_test_data_X)

In [34]:
# 모델 성능 평가함수 만들기

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_prob(y_test, probability):
  pred=np.where(probability > 0.5,1,0)
  confusion=confusion_matrix(y_test, pred)
  accuracy=accuracy_score(y_test,pred)
  precision=precision_score(y_test,pred) 
  recall=recall_score(y_test,pred) 
  # F1 스코어 추가 
  f1=f1_score(y_test,pred,average='macro')
  Roc_score=roc_auc_score(y_test,probability)
  print('임계값: ', 0.5) 
  print('오차행렬')
  print(confusion) 
  # f1 score print 추가 
  print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}, AUC:{4: .4f}'.format(accuracy,precision,recall,f1,Roc_score))

In [35]:
# 모델 평가하기

get_clf_prob(new_test_data_y,test_data['prob'])

임계값:  0.5
오차행렬
[[81 19]
 [19 35]]
정확도: 0.7532, 정밀도: 0.6481, 재현율: 0.6481, F1:0.7291, AUC: 0.8143
