# 필요한 라이브러리 Import하기

In [None]:
import pandas as pd 
import numpy as np 
import os 
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score,roc_auc_score,mean_squared_error
import warnings
warnings.filterwarnings(action='ignore')
import random
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

In [5]:
## Train data와 Test data를 로드해준다.

train_data=pd.read_csv('/dshome/WoongLab/heo/construction_oil/preprocessed_data/breastcancer_train_data.csv')
test_data=pd.read_csv('/dshome/WoongLab/heo/construction_oil/preprocessed_data/breastcancer_test_data.csv')

In [6]:
train_stage_features=['A Stage','differentiate','Race','T Stage ','Grade','Estrogen Status','Progesterone Status','Marital Status','N Stage','Survival Months']
test_stage_features=['A Stage','differentiate','Race','T Stage ','Grade','Estrogen Status','Progesterone Status','Marital Status','N Stage']

In [7]:
## Train data와 Test data의 독립변수와 종속변수 설정하기

train_X=train_data.loc[:,train_stage_features]
train_y=train_data['Status']
new_test_data_X=test_data.loc[:,test_stage_features]
new_test_data_y=test_data['Status']

In [8]:
# Class weight을 구하기 위해 y label의 비율을 뜻하는 ratio 변수 생성해주기

ratio=train_y.value_counts()[0]/train_y.value_counts()[1]

In [9]:
# train에는 있는데 test data에는 없는 컬럼 찾기

train_data_columns=np.array(train_X.columns)
test_data_columns=np.array(new_test_data_X.columns)
np.setdiff1d(train_data_columns,test_data_columns)

array(['Survival Months'], dtype=object)

## Test data에 없는 변수들을 다 생성했으므로 LightgbmRegressor 분류모델을 만듬

### 베이지안 최적화해주기 - LightgbmRegressor

**1. 5-fold 교차검증 이용해서 Train data로 Validation set을 RMSE가 최저였을 때의 하이퍼파라미터 구하기**

**2. Learning rate 0.01~0.1, max_depth 3~9, n_estimators 100~1000이었을 때에서 가장 최적의 하이퍼파라미터 구하기**

**3. Test data에 Survival Months 피처를 하나를 생성해준다.**

In [10]:
# 변수중요도가 가장 높은 Survival Months을 예측하는 모형 만들어 주기

import optuna
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

test_data_columns=test_stage_features
y='Survival Months'

train_X_new=train_X.loc[:,test_data_columns]
train_y_new=train_X.loc[:,y]



# Objective 함수 정의
def objective(trial):
    
    # 하이퍼파라미터 탐색할 공간 정의
    params = {
        'n_estimators': trial.suggest_int('n_estimators',100,1000),
        'max_depth': trial.suggest_int('max_depth',3,9),
        'learning_rate': trial.suggest_loguniform('learning_rate',0.01,0.1),
    }
    
    # LGBMRegressor 모델 객체 생성
    model = LGBMRegressor(**params, random_state=42)
    
    # 교차검증 수행하여 모델 성능 측정
    scores = -1 * cross_val_score(model, train_X_new, train_y_new,
                                  cv=5, scoring='neg_mean_squared_error')
    
    # 교차검증 평균 점수 리턴
    return np.mean(scores)


# Optuna study 생성
study = optuna.create_study(direction='minimize')

# study 실행 (n_trials는 시도 횟수)
study.optimize(objective, n_trials=10)

# 최적화된 하이퍼파라미터 값 출력
print(study.best_params)



[32m[I 2023-04-19 12:12:03,019][0m A new study created in memory with name: no-name-0cd332f6-57ca-4906-81fd-0b855b524147[0m
[32m[I 2023-04-19 12:12:06,281][0m Trial 0 finished with value: 536.3548642148135 and parameters: {'n_estimators': 716, 'max_depth': 5, 'learning_rate': 0.0633900438740771}. Best is trial 0 with value: 536.3548642148135.[0m
[32m[I 2023-04-19 12:12:15,198][0m Trial 1 finished with value: 544.3246950510107 and parameters: {'n_estimators': 986, 'max_depth': 9, 'learning_rate': 0.027833255509034382}. Best is trial 0 with value: 536.3548642148135.[0m
[32m[I 2023-04-19 12:12:24,030][0m Trial 2 finished with value: 567.1977679902386 and parameters: {'n_estimators': 989, 'max_depth': 9, 'learning_rate': 0.0719126387927958}. Best is trial 0 with value: 536.3548642148135.[0m
[32m[I 2023-04-19 12:12:24,643][0m Trial 3 finished with value: 511.0212399947296 and parameters: {'n_estimators': 267, 'max_depth': 3, 'learning_rate': 0.061639523506063464}. Best is tria

{'n_estimators': 267, 'max_depth': 3, 'learning_rate': 0.061639523506063464}


In [11]:
# Print best hyperparameters and auc

print(f'Best hyperparameters: {study.best_params}')
print(f'Best RMSE: {study.best_value:.4f}')

Best hyperparameters: {'n_estimators': 267, 'max_depth': 3, 'learning_rate': 0.061639523506063464}
Best RMSE: 511.0212


In [12]:
(est,depth,rate)=study.best_params.values()

In [13]:
# 앞서 구한 최적의 파라미터들로 파인튜닝하여 Test data에 Survival Months 피처 만들어주기

def bestreg_parametertuning(rate,depth,est,test_data_columns,y):
    best_lgbmreg=LGBMRegressor(learning_rate=rate,max_depth=depth,n_estimators=est,random_state=42)
    best_lgbmreg.fit(train_X[test_data_columns], train_X[y])
    new_test_data_X[y]=best_lgbmreg.predict(new_test_data_X[test_data_columns])
    

In [14]:
bestreg_parametertuning(rate,depth,est,test_data_columns,'Survival Months')

In [15]:
new_test_data_X

Unnamed: 0,A Stage,differentiate,Race,T Stage,Grade,Estrogen Status,Progesterone Status,Marital Status,N Stage,Survival Months
0,1,3,2,1,1,1,1,1,1,73.214154
1,1,0,2,2,2,1,0,1,0,71.470363
2,1,0,2,1,2,1,1,1,1,71.925537
3,1,1,2,1,3,1,1,1,0,72.913847
4,1,0,2,1,2,1,1,0,0,70.073051
...,...,...,...,...,...,...,...,...,...,...
800,1,1,2,1,3,1,1,1,0,72.913847
801,1,0,2,2,2,1,1,1,2,67.543090
802,1,0,2,0,2,1,1,0,0,74.143556
803,1,3,2,0,1,1,1,1,0,74.228422




### 베이지안 최적화해주기 - LightgbmClassifier

**1. 5-fold 교차검증 이용해서 Train data로 Validation set을 Auc가 최고였을 때의 하이퍼파라미터 구하기**

**2. Learning rate 0.01~0.1, max_depth 3~9, n_estimators 100~1000이었을 때에서 가장 최적의 하이퍼파라미터 구하기**


In [16]:
import lightgbm as lgb

In [17]:
def objective(trial):
    # Define hyperparameters to optimize 
    params={
        'boosting_type':'gbdt',
        'objective':'binary',
        'metric':'binary_logloss',
        'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
        'max_depth':trial.suggest_int('max_depth',3,9),
        'n_estimators':trial.suggest_int("n_estimators",100,1000)
    }
    # Train and evaluate model 
    lgb_cv=lgb.LGBMClassifier(**params, random_state=42,scale_pos_weight=ratio)
    scores=cross_val_score(lgb_cv,train_X,train_y,cv=5,scoring='roc_auc')
    auc=scores.mean()
    return auc 

# Define study object and optimize 

study=optuna.create_study(direction='maximize',study_name='lgb_boost_opt',load_if_exists=True)
study.optimize(objective, n_trials=10)

# Print best hyperparameters and auc
print(f'Best hyperparameters: {study.best_params}')
print(f'Best AUC: {study.best_value:.4f}')

[32m[I 2023-04-19 12:14:09,510][0m A new study created in memory with name: lgb_boost_opt[0m
[32m[I 2023-04-19 12:14:11,658][0m Trial 0 finished with value: 0.8509068928325622 and parameters: {'learning_rate': 0.07993393703676338, 'max_depth': 4, 'n_estimators': 505}. Best is trial 0 with value: 0.8509068928325622.[0m
[32m[I 2023-04-19 12:14:14,580][0m Trial 1 finished with value: 0.843737084962511 and parameters: {'learning_rate': 0.04722512813320992, 'max_depth': 8, 'n_estimators': 313}. Best is trial 0 with value: 0.8509068928325622.[0m
[32m[I 2023-04-19 12:14:19,028][0m Trial 2 finished with value: 0.8428905094988226 and parameters: {'learning_rate': 0.03065838907224806, 'max_depth': 6, 'n_estimators': 753}. Best is trial 0 with value: 0.8509068928325622.[0m
[32m[I 2023-04-19 12:14:20,383][0m Trial 3 finished with value: 0.869105342638389 and parameters: {'learning_rate': 0.011531144266024003, 'max_depth': 4, 'n_estimators': 335}. Best is trial 3 with value: 0.8691053

Best hyperparameters: {'learning_rate': 0.011531144266024003, 'max_depth': 4, 'n_estimators': 335}
Best AUC: 0.8691


In [18]:
lgb=lgb.LGBMClassifier(learning_rate= 0.011531144266024003,max_depth=4,n_estimators=335,scale_pos_weight=ratio,random_state=42)

In [19]:
lgb.fit(train_X,train_y)

In [20]:
pred=lgb.predict_proba(new_test_data_X)[:,1]

In [21]:
# 모델 성능을 평가할 수 있는 함수 만들어주기

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_prob(y_test, probability):
  pred=np.where(probability > 0.50,1,0)
  confusion=confusion_matrix(y_test, pred)
  accuracy=accuracy_score(y_test,pred)
  precision=precision_score(y_test,pred) 
  recall=recall_score(y_test,pred) 
  # F1 스코어 추가 
  f1=f1_score(y_test,pred,average='macro')
  Roc_score=roc_auc_score(y_test,probability)
  print('임계값: ', 0.5) 
  print('오차행렬')
  print(confusion) 
  # f1 score print 추가 
  print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}, AUC:{4: .4f}'.format(accuracy,precision,recall,f1,Roc_score))

In [22]:
get_clf_prob(new_test_data_y,pred) 

임계값:  0.5
오차행렬
[[612  70]
 [ 82  41]]
정확도: 0.8112, 정밀도: 0.3694, 재현율: 0.3333, F1:0.6200, AUC: 0.6822
