# 필요한 라이브러리 Import하기

In [None]:
import pandas as pd 
import numpy as np 
import os 
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score,roc_auc_score,mean_squared_error
import warnings
warnings.filterwarnings(action='ignore')
import random
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

In [17]:
## Train data와 Test data 로드하기

train_data=pd.read_csv('/dshome/WoongLab/heo/construction_oil/preprocessed_data/heart_train_data.csv')
test_data=pd.read_csv('/dshome/WoongLab/heo/construction_oil/preprocessed_data/heart_test_data.csv')

In [18]:
train_stage_features=['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'Diabetes', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
       'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth',
       'DiffWalk', 'Sex', 'Age', 'Education', 'Income']

test_stage_features=['CholCheck','HvyAlcoholConsump','AnyHealthcare','Veggies','Fruits',
                     'NoDocbcCost','PhysActivity','Education','MentHlth','Smoker']

In [19]:
train_data

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,HeartDiseaseorAttack
0,1.0,1.0,1.0,27.0,0.0,0.0,0,1.0,1.0,1.0,...,0.0,2,2.0,1.0,0.0,0.0,13.0,4,1,0.0
1,1.0,1.0,1.0,25.0,0.0,0.0,0,1.0,1.0,1.0,...,0.0,0,0.0,0.0,0.0,0.0,10.0,5,6,0.0
2,0.0,0.0,1.0,24.0,1.0,0.0,0,1.0,1.0,1.0,...,0.0,0,0.0,0.0,0.0,1.0,7.0,3,6,0.0
3,1.0,1.0,1.0,24.0,1.0,0.0,0,0.0,1.0,1.0,...,0.0,2,0.0,30.0,1.0,0.0,11.0,3,3,0.0
4,0.0,0.0,1.0,20.0,0.0,0.0,0,1.0,1.0,1.0,...,0.0,1,15.0,0.0,0.0,0.0,2.0,5,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202939,1.0,0.0,1.0,41.0,1.0,0.0,0,1.0,1.0,1.0,...,0.0,2,0.0,0.0,0.0,1.0,9.0,3,4,0.0
202940,1.0,1.0,1.0,35.0,1.0,1.0,0,1.0,1.0,1.0,...,0.0,3,30.0,30.0,1.0,1.0,9.0,5,4,1.0
202941,1.0,1.0,1.0,25.0,0.0,0.0,0,1.0,1.0,1.0,...,0.0,3,0.0,20.0,0.0,0.0,11.0,1,0,0.0
202942,0.0,0.0,1.0,21.0,0.0,0.0,0,1.0,1.0,1.0,...,0.0,0,0.0,0.0,0.0,0.0,13.0,5,7,0.0


In [20]:
train_X=train_data.loc[:,train_stage_features]
train_y=train_data['HeartDiseaseorAttack']
new_test_data_X=test_data.loc[:,test_stage_features]
new_test_data_y=test_data['HeartDiseaseorAttack']

In [21]:
ratio=train_y.value_counts()[0]/train_y.value_counts()[1]

In [22]:
# train에는 있는데 test data에는 없는 컬럼 찾기

train_data_columns=np.array(train_X.columns)
test_data_columns=np.array(new_test_data_X.columns)
np.setdiff1d(train_data_columns,test_data_columns)

array(['Age', 'BMI', 'Diabetes', 'DiffWalk', 'GenHlth', 'HighBP',
       'HighChol', 'Income', 'PhysHlth', 'Sex', 'Stroke'], dtype=object)

## 누락된 피처들 생성해주기

## Test data에 없는 변수들을 다 생성했으므로 LightgbmRegressor 분류모델을 만듬

### 베이지안 최적화해주기 - LightgbmRegressor

**1. 5-fold 교차검증 이용해서 Train data로 Validation set을 RMSE가 최저였을 때의 하이퍼파라미터 구하기**

**2. Learning rate 0.01~0.1, max_depth 3~9, n_estimators 100~1000이었을 때에서 가장 최적의 하이퍼파라미터 구하기**


In [23]:
# 변수 Age을 예측하는 모형 만들어 주기
import optuna
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

test_data_columns=test_stage_features
y='Age'

train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]



# Objective 함수 정의
def objective(trial):
    
    # 하이퍼파라미터 탐색할 공간 정의
    params = {
        'n_estimators': trial.suggest_int('n_estimators',100,1000),
        'max_depth': trial.suggest_int('max_depth',3,9),
        'learning_rate': trial.suggest_loguniform('learning_rate',0.01,0.1),
    }
    
    # LGBMRegressor 모델 객체 생성
    model = LGBMRegressor(**params, random_state=42)
    
    # 교차검증 수행하여 모델 성능 측정
    scores = -1 * cross_val_score(model, train_X_new, train_y_new,
                                  cv=5, scoring='neg_mean_squared_error')
    
    # 교차검증 평균 점수 리턴
    return np.mean(scores)


# Optuna study 생성
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)



[32m[I 2023-04-27 00:29:54,457][0m A new study created in memory with name: no-name-e368d52b-7bc3-4ae4-8ccb-4b99ae691759[0m
[32m[I 2023-04-27 00:30:08,228][0m Trial 0 finished with value: 8.35596655480609 and parameters: {'n_estimators': 895, 'max_depth': 7, 'learning_rate': 0.09952093499073741}. Best is trial 0 with value: 8.35596655480609.[0m
[32m[I 2023-04-27 00:30:15,791][0m Trial 1 finished with value: 8.310763597493995 and parameters: {'n_estimators': 472, 'max_depth': 7, 'learning_rate': 0.03806288198144968}. Best is trial 1 with value: 8.310763597493995.[0m
[32m[I 2023-04-27 00:30:20,290][0m Trial 2 finished with value: 8.30676830257934 and parameters: {'n_estimators': 251, 'max_depth': 9, 'learning_rate': 0.048680371106398855}. Best is trial 2 with value: 8.30676830257934.[0m
[32m[I 2023-04-27 00:30:28,164][0m Trial 3 finished with value: 8.299919083982399 and parameters: {'n_estimators': 778, 'max_depth': 4, 'learning_rate': 0.01507871270051649}. Best is trial 3

{'n_estimators': 778, 'max_depth': 4, 'learning_rate': 0.01507871270051649}


In [24]:
# Print best hyperparameters and auc

print(f'Best hyperparameters: {study.best_params}')
print(f'Best RMSE: {study.best_value:.4f}')

Best hyperparameters: {'n_estimators': 778, 'max_depth': 4, 'learning_rate': 0.01507871270051649}
Best RMSE: 8.2999


In [25]:
(est,depth,rate)=study.best_params.values()

In [26]:
def bestreg_parametertuning(rate,depth,est,test_data_columns,y):
    best_lgbmreg=LGBMRegressor(learning_rate=rate,max_depth=depth,n_estimators=est,random_state=42)
    best_lgbmreg.fit(train_X[test_data_columns], train_X[y])
    new_test_data_X[y]=best_lgbmreg.predict(new_test_data_X[test_data_columns])
    

In [27]:
bestreg_parametertuning(rate,depth,est,test_data_columns,'Age')

In [28]:
new_test_data_X

Unnamed: 0,CholCheck,HvyAlcoholConsump,AnyHealthcare,Veggies,Fruits,NoDocbcCost,PhysActivity,Education,MentHlth,Smoker,Age
0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,5,0.0,1.0,9.033004
1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,5,4.0,0.0,7.139105
2,1.0,0.0,1.0,1.0,0.0,0.0,0.0,5,3.0,0.0,7.044871
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,5,0.0,1.0,8.696760
4,1.0,0.0,1.0,1.0,1.0,0.0,0.0,4,0.0,1.0,9.621035
...,...,...,...,...,...,...,...,...,...,...,...
50731,1.0,0.0,1.0,1.0,1.0,0.0,0.0,5,1.0,0.0,7.929674
50732,1.0,0.0,0.0,1.0,1.0,1.0,0.0,3,10.0,0.0,6.148429
50733,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3,10.0,0.0,8.383784
50734,1.0,0.0,1.0,1.0,1.0,0.0,0.0,5,0.0,0.0,8.596866


In [29]:
y='BMI'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)

[32m[I 2023-04-27 00:31:33,497][0m A new study created in memory with name: no-name-5adb0500-aba6-4006-b1a4-a6e1d25a5ae4[0m
[32m[I 2023-04-27 00:31:46,556][0m Trial 0 finished with value: 41.800324544652696 and parameters: {'n_estimators': 817, 'max_depth': 8, 'learning_rate': 0.01596049923134578}. Best is trial 0 with value: 41.800324544652696.[0m
[32m[I 2023-04-27 00:31:59,951][0m Trial 1 finished with value: 42.16701982819702 and parameters: {'n_estimators': 801, 'max_depth': 9, 'learning_rate': 0.0964240840937961}. Best is trial 0 with value: 41.800324544652696.[0m
[32m[I 2023-04-27 00:32:01,565][0m Trial 2 finished with value: 42.01757713946688 and parameters: {'n_estimators': 118, 'max_depth': 4, 'learning_rate': 0.011863591527918552}. Best is trial 0 with value: 41.800324544652696.[0m
[32m[I 2023-04-27 00:32:14,075][0m Trial 3 finished with value: 42.049802713786946 and parameters: {'n_estimators': 771, 'max_depth': 9, 'learning_rate': 0.0693187999342597}. Best is 

{'n_estimators': 546, 'max_depth': 5, 'learning_rate': 0.011142522459068235}
0.011142522459068235 5 546


In [30]:
bestreg_parametertuning(rate,depth,est,test_data_columns,'BMI')

# BMI 열 추가하기
new_test_data_X

Unnamed: 0,CholCheck,HvyAlcoholConsump,AnyHealthcare,Veggies,Fruits,NoDocbcCost,PhysActivity,Education,MentHlth,Smoker,Age,BMI
0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,5,0.0,1.0,9.033004,29.620222
1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,5,4.0,0.0,7.139105,30.964247
2,1.0,0.0,1.0,1.0,0.0,0.0,0.0,5,3.0,0.0,7.044871,30.643909
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,5,0.0,1.0,8.696760,28.160780
4,1.0,0.0,1.0,1.0,1.0,0.0,0.0,4,0.0,1.0,9.621035,29.623515
...,...,...,...,...,...,...,...,...,...,...,...,...
50731,1.0,0.0,1.0,1.0,1.0,0.0,0.0,5,1.0,0.0,7.929674,29.534726
50732,1.0,0.0,0.0,1.0,1.0,1.0,0.0,3,10.0,0.0,6.148429,31.697114
50733,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3,10.0,0.0,8.383784,32.189805
50734,1.0,0.0,1.0,1.0,1.0,0.0,0.0,5,0.0,0.0,8.596866,29.024121


In [31]:
y='Diabetes'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)


[32m[I 2023-04-27 00:33:06,943][0m A new study created in memory with name: no-name-81d13703-23bf-4635-b7e9-0b54fabeeeac[0m
[32m[I 2023-04-27 00:33:21,745][0m Trial 0 finished with value: 0.4687333656466352 and parameters: {'n_estimators': 985, 'max_depth': 6, 'learning_rate': 0.04304040528252487}. Best is trial 0 with value: 0.4687333656466352.[0m
[32m[I 2023-04-27 00:33:36,846][0m Trial 1 finished with value: 0.4712903374797685 and parameters: {'n_estimators': 972, 'max_depth': 8, 'learning_rate': 0.09818917401212222}. Best is trial 0 with value: 0.4687333656466352.[0m
[32m[I 2023-04-27 00:33:48,960][0m Trial 2 finished with value: 0.47042051875352503 and parameters: {'n_estimators': 787, 'max_depth': 7, 'learning_rate': 0.09738084452838408}. Best is trial 0 with value: 0.4687333656466352.[0m
[32m[I 2023-04-27 00:33:51,818][0m Trial 3 finished with value: 0.4672365740679162 and parameters: {'n_estimators': 176, 'max_depth': 5, 'learning_rate': 0.0733392487560017}. Best 

{'n_estimators': 808, 'max_depth': 4, 'learning_rate': 0.018848045707723787}
0.018848045707723787 4 808


In [32]:
bestreg_parametertuning(rate,depth,est,test_data_columns,'Diabetes')

In [33]:
y='DiffWalk'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'DiffWalk')

[32m[I 2023-04-27 00:34:49,338][0m A new study created in memory with name: no-name-dc204895-082b-44fc-8ffb-28d51bdad451[0m
[32m[I 2023-04-27 00:34:55,971][0m Trial 0 finished with value: 0.1200026223147477 and parameters: {'n_estimators': 420, 'max_depth': 6, 'learning_rate': 0.06209141587203206}. Best is trial 0 with value: 0.1200026223147477.[0m
[32m[I 2023-04-27 00:35:07,820][0m Trial 1 finished with value: 0.12068781100284279 and parameters: {'n_estimators': 747, 'max_depth': 9, 'learning_rate': 0.07744203330278472}. Best is trial 0 with value: 0.1200026223147477.[0m
[32m[I 2023-04-27 00:35:10,486][0m Trial 2 finished with value: 0.11960270146319255 and parameters: {'n_estimators': 142, 'max_depth': 6, 'learning_rate': 0.029170467942792908}. Best is trial 2 with value: 0.11960270146319255.[0m
[32m[I 2023-04-27 00:35:18,140][0m Trial 3 finished with value: 0.12012979819656314 and parameters: {'n_estimators': 568, 'max_depth': 5, 'learning_rate': 0.0998302142793063}. B

{'n_estimators': 660, 'max_depth': 3, 'learning_rate': 0.06742787816636775}
0.06742787816636775 3 660


In [34]:

y='GenHlth'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=5)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'GenHlth')

[32m[I 2023-04-27 00:36:06,626][0m A new study created in memory with name: no-name-7ee20bfc-00b8-4681-a346-789aaf423807[0m
[32m[I 2023-04-27 00:36:18,530][0m Trial 0 finished with value: 0.8926778114450024 and parameters: {'n_estimators': 896, 'max_depth': 5, 'learning_rate': 0.036633818446817346}. Best is trial 0 with value: 0.8926778114450024.[0m
[32m[I 2023-04-27 00:36:29,673][0m Trial 1 finished with value: 0.8919955899030253 and parameters: {'n_estimators': 663, 'max_depth': 9, 'learning_rate': 0.018654343233590498}. Best is trial 1 with value: 0.8919955899030253.[0m
[32m[I 2023-04-27 00:36:36,911][0m Trial 2 finished with value: 0.8915820646439508 and parameters: {'n_estimators': 718, 'max_depth': 4, 'learning_rate': 0.045858624511359886}. Best is trial 2 with value: 0.8915820646439508.[0m
[32m[I 2023-04-27 00:36:40,618][0m Trial 3 finished with value: 0.8916539299655802 and parameters: {'n_estimators': 208, 'max_depth': 7, 'learning_rate': 0.04741569157746367}. Be

{'n_estimators': 718, 'max_depth': 4, 'learning_rate': 0.045858624511359886}
0.045858624511359886 4 718


In [35]:
y='HighBP'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'HighBP')

[32m[I 2023-04-27 00:36:48,931][0m A new study created in memory with name: no-name-14faeeb7-feac-4631-91a3-b038dba24cbc[0m
[32m[I 2023-04-27 00:37:00,230][0m Trial 0 finished with value: 0.2321150589533491 and parameters: {'n_estimators': 675, 'max_depth': 9, 'learning_rate': 0.012625468442624266}. Best is trial 0 with value: 0.2321150589533491.[0m
[32m[I 2023-04-27 00:37:07,399][0m Trial 1 finished with value: 0.23195217633141155 and parameters: {'n_estimators': 962, 'max_depth': 3, 'learning_rate': 0.01573633483082614}. Best is trial 1 with value: 0.23195217633141155.[0m
[32m[I 2023-04-27 00:37:19,143][0m Trial 2 finished with value: 0.23289841103818412 and parameters: {'n_estimators': 729, 'max_depth': 9, 'learning_rate': 0.051470300637347664}. Best is trial 1 with value: 0.23195217633141155.[0m
[32m[I 2023-04-27 00:37:22,916][0m Trial 3 finished with value: 0.23205426810761165 and parameters: {'n_estimators': 216, 'max_depth': 6, 'learning_rate': 0.018413065950644934

{'n_estimators': 962, 'max_depth': 3, 'learning_rate': 0.01573633483082614}
0.01573633483082614 3 962


In [36]:
y='HighChol'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'HighChol')

[32m[I 2023-04-27 00:38:15,555][0m A new study created in memory with name: no-name-dd9fd530-548c-44e9-b0aa-1e9fcd9c57de[0m
[32m[I 2023-04-27 00:38:24,821][0m Trial 0 finished with value: 0.23794070619779992 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.07012948556393786}. Best is trial 0 with value: 0.23794070619779992.[0m
[32m[I 2023-04-27 00:38:34,278][0m Trial 1 finished with value: 0.23761873861888141 and parameters: {'n_estimators': 580, 'max_depth': 9, 'learning_rate': 0.030300833393382836}. Best is trial 1 with value: 0.23761873861888141.[0m
[32m[I 2023-04-27 00:38:40,471][0m Trial 2 finished with value: 0.2371067113408561 and parameters: {'n_estimators': 830, 'max_depth': 3, 'learning_rate': 0.015431096490448493}. Best is trial 2 with value: 0.2371067113408561.[0m
[32m[I 2023-04-27 00:38:42,696][0m Trial 3 finished with value: 0.2374770710143868 and parameters: {'n_estimators': 108, 'max_depth': 8, 'learning_rate': 0.016674555027644124}

{'n_estimators': 830, 'max_depth': 3, 'learning_rate': 0.015431096490448493}
0.015431096490448493 3 830


In [37]:
y='Income'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'Income')

[32m[I 2023-04-27 00:39:22,672][0m A new study created in memory with name: no-name-c83090a9-f94d-4484-b2ca-e75c78c4a9ef[0m
[32m[I 2023-04-27 00:39:24,788][0m Trial 0 finished with value: 3.1100784031859656 and parameters: {'n_estimators': 237, 'max_depth': 3, 'learning_rate': 0.050186130194675056}. Best is trial 0 with value: 3.1100784031859656.[0m
[32m[I 2023-04-27 00:39:35,675][0m Trial 1 finished with value: 3.1235000102534007 and parameters: {'n_estimators': 694, 'max_depth': 8, 'learning_rate': 0.048489700370042764}. Best is trial 0 with value: 3.1100784031859656.[0m
[32m[I 2023-04-27 00:39:45,891][0m Trial 2 finished with value: 3.111650266201802 and parameters: {'n_estimators': 583, 'max_depth': 8, 'learning_rate': 0.010411678370555125}. Best is trial 0 with value: 3.1100784031859656.[0m
[32m[I 2023-04-27 00:39:51,611][0m Trial 3 finished with value: 3.1134605739504844 and parameters: {'n_estimators': 305, 'max_depth': 9, 'learning_rate': 0.012909673653994189}. Be

{'n_estimators': 237, 'max_depth': 3, 'learning_rate': 0.050186130194675056}
0.050186130194675056 3 237


In [38]:
y='PhysHlth'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'PhysHlth')

[32m[I 2023-04-27 00:40:32,509][0m A new study created in memory with name: no-name-bbc7fab4-8cc8-4244-a124-cf304647fa69[0m
[32m[I 2023-04-27 00:40:37,004][0m Trial 0 finished with value: 61.78433223644205 and parameters: {'n_estimators': 255, 'max_depth': 7, 'learning_rate': 0.022373607318996507}. Best is trial 0 with value: 61.78433223644205.[0m
[32m[I 2023-04-27 00:40:43,683][0m Trial 1 finished with value: 61.79953212068367 and parameters: {'n_estimators': 407, 'max_depth': 6, 'learning_rate': 0.013953221734594451}. Best is trial 0 with value: 61.78433223644205.[0m
[32m[I 2023-04-27 00:40:58,196][0m Trial 2 finished with value: 61.99871392746927 and parameters: {'n_estimators': 937, 'max_depth': 8, 'learning_rate': 0.029420616955494545}. Best is trial 0 with value: 61.78433223644205.[0m
[32m[I 2023-04-27 00:40:59,679][0m Trial 3 finished with value: 61.81297027832204 and parameters: {'n_estimators': 144, 'max_depth': 3, 'learning_rate': 0.05859057266624568}. Best is t

{'n_estimators': 255, 'max_depth': 7, 'learning_rate': 0.022373607318996507}
0.022373607318996507 7 255


In [39]:
y='Sex'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'Sex')

[32m[I 2023-04-27 00:41:47,157][0m A new study created in memory with name: no-name-9bf4ed8a-f437-413c-b871-576fcf603fec[0m
[32m[I 2023-04-27 00:41:55,821][0m Trial 0 finished with value: 0.23431474836302574 and parameters: {'n_estimators': 841, 'max_depth': 4, 'learning_rate': 0.019550346200793146}. Best is trial 0 with value: 0.23431474836302574.[0m
[32m[I 2023-04-27 00:41:57,940][0m Trial 1 finished with value: 0.23513317999781683 and parameters: {'n_estimators': 166, 'max_depth': 4, 'learning_rate': 0.018098119883696376}. Best is trial 0 with value: 0.23431474836302574.[0m
[32m[I 2023-04-27 00:42:07,704][0m Trial 2 finished with value: 0.23448318896173198 and parameters: {'n_estimators': 658, 'max_depth': 5, 'learning_rate': 0.02181129018026924}. Best is trial 0 with value: 0.23431474836302574.[0m
[32m[I 2023-04-27 00:42:10,247][0m Trial 3 finished with value: 0.2353732664581948 and parameters: {'n_estimators': 128, 'max_depth': 6, 'learning_rate': 0.01474718702096261

{'n_estimators': 841, 'max_depth': 4, 'learning_rate': 0.019550346200793146}
0.019550346200793146 4 841


In [40]:
y='Stroke'
train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)

(est,depth,rate)=study.best_params.values()
print(rate,depth,est)
bestreg_parametertuning(rate,depth,est,test_data_columns,'Stroke')

[32m[I 2023-04-27 00:42:54,750][0m A new study created in memory with name: no-name-f99d1d50-6164-4000-8586-e4e074e6b0b1[0m
[32m[I 2023-04-27 00:42:59,663][0m Trial 0 finished with value: 0.03811570398524013 and parameters: {'n_estimators': 275, 'max_depth': 8, 'learning_rate': 0.03332157794840853}. Best is trial 0 with value: 0.03811570398524013.[0m
[32m[I 2023-04-27 00:43:05,607][0m Trial 1 finished with value: 0.03807806288488745 and parameters: {'n_estimators': 333, 'max_depth': 8, 'learning_rate': 0.011590286112903856}. Best is trial 1 with value: 0.03807806288488745.[0m
[32m[I 2023-04-27 00:43:08,324][0m Trial 2 finished with value: 0.03813423259987839 and parameters: {'n_estimators': 141, 'max_depth': 8, 'learning_rate': 0.07834847240572329}. Best is trial 1 with value: 0.03807806288488745.[0m
[32m[I 2023-04-27 00:43:19,206][0m Trial 3 finished with value: 0.03809534812853856 and parameters: {'n_estimators': 770, 'max_depth': 5, 'learning_rate': 0.01288840143917183

{'n_estimators': 520, 'max_depth': 4, 'learning_rate': 0.012423259157251077}
0.012423259157251077 4 520


In [41]:
import lightgbm as lgb

In [42]:
def objective(trial):
    # Define hyperparameters to optimize 
    params={
        'boosting_type':'gbdt',
        'objective':'binary',
        'metric':'binary_logloss',
        'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
        'max_depth':trial.suggest_int('max_depth',3,9),
        'n_estimators':trial.suggest_int("n_estimators",100,1000)
    }
    # Train and evaluate model 
    lgb_cv=lgb.LGBMClassifier(**params, random_state=42,scale_pos_weight=ratio)
    scores=cross_val_score(lgb_cv,train_X,train_y,cv=5,scoring='roc_auc')
    auc=scores.mean()
    return auc 

# Define study object and optimize 

study=optuna.create_study(direction='maximize',study_name='lgb_boost_opt',load_if_exists=True)
study.optimize(objective, n_trials=50)

# Print best hyperparameters and auc
print(f'Best hyperparameters: {study.best_params}')
print(f'Best AUC: {study.best_value:.4f}')

[32m[I 2023-04-27 00:44:04,438][0m A new study created in memory with name: lgb_boost_opt[0m
[32m[I 2023-04-27 00:44:12,497][0m Trial 0 finished with value: 0.8498533524981641 and parameters: {'learning_rate': 0.019357281877591417, 'max_depth': 4, 'n_estimators': 596}. Best is trial 0 with value: 0.8498533524981641.[0m
[32m[I 2023-04-27 00:44:19,863][0m Trial 1 finished with value: 0.8495065163268751 and parameters: {'learning_rate': 0.012436240302912219, 'max_depth': 7, 'n_estimators': 312}. Best is trial 0 with value: 0.8498533524981641.[0m
[32m[I 2023-04-27 00:44:37,335][0m Trial 2 finished with value: 0.8476675242653979 and parameters: {'learning_rate': 0.020408483246054838, 'max_depth': 7, 'n_estimators': 988}. Best is trial 0 with value: 0.8498533524981641.[0m
[32m[I 2023-04-27 00:44:50,975][0m Trial 3 finished with value: 0.8411509509412142 and parameters: {'learning_rate': 0.07652914595594465, 'max_depth': 5, 'n_estimators': 828}. Best is trial 0 with value: 0.849

Best hyperparameters: {'learning_rate': 0.03851343740983625, 'max_depth': 4, 'n_estimators': 240}
Best AUC: 0.8500


In [43]:
lgb=lgb.LGBMClassifier(learning_rate=  0.01586784169962525,max_depth= 3,n_estimators=992,scale_pos_weight=ratio,random_state=42)

In [44]:
lgb.fit(train_X,train_y)

In [45]:
pred=lgb.predict_proba(new_test_data_X)[:,1]

In [46]:
new_test_data_X.shape

(50736, 21)

In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_prob(y_test, probability):
  pred=np.where(probability > 0.50,1,0)
  confusion=confusion_matrix(y_test, pred)
  accuracy=accuracy_score(y_test,pred)
  precision=precision_score(y_test,pred) 
  recall=recall_score(y_test,pred) 
  # F1 스코어 추가 
  f1=f1_score(y_test,pred,average='macro')
  Roc_score=roc_auc_score(y_test,probability)
  print('임계값: ', 0.5) 
  print('오차행렬')
  print(confusion) 
  # f1 score print 추가 
  print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}, AUC:{4: .4f}'.format(accuracy,precision,recall,f1,Roc_score))

In [48]:
get_clf_prob(new_test_data_y,pred) 

임계값:  0.5
오차행렬
[[34714 11243]
 [ 3312  1467]]
정확도: 0.7131, 정밀도: 0.1154, 재현율: 0.3070, F1:0.4972, AUC: 0.5236
