In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import optuna 
import lightgbm as lgb 
from sklearn.metrics import accuracy_score, roc_auc_score,precision_score, confusion_matrix,recall_score 
from sklearn.model_selection import cross_val_score, train_test_split
import os 

In [2]:
train_data=pd.read_csv('/home/iai/son/lastcheck/spambase_train_data.csv')
test_data=pd.read_csv('/home/iai/son/lastcheck/spambase_test_data.csv')


In [3]:
train_stage_features=['0.7', '278', '61', '0', '0.34', '0.30', '0.5', '0.44', '0.40', '0.778',
                        '0.24', '1.93', '0.39', '1.29', '0.43', '0.2', '0.36', '0.17', '0.20', 
                        '0.28', '0.21', '0.32.1', '0.41', '0.29', '3.756', '0.35', '0.25', '0.15', 
                        '0.64.1', '0.38', '0.45', '0.6', '0.33', '0.42', '0.14', '0.9', '0.10', 
                        '0.3', '0.13', '0.8', '0.19', '0.23', '0.18', '0.22', '0.16', '0.64.2', 
                        '0.1', '0.31', '0.27', '0.11', '0.4', '0.12', '0.26', '0.64', '0.37', '0.32', '0.96']

In [4]:
train_X=train_data[train_stage_features]
train_y=train_data['1']

new_test_data_X=test_data[train_stage_features]
new_test_data_y=test_data['1']

In [5]:
ratio=train_y.value_counts()[0]/train_y.value_counts()[1]

In [6]:
def objective(trial):
    # Define hyperparameters to optimize 
    params={
        'boosting_type':'gbdt',
        'objective':'binary',
        'metric':'binary_logloss',
        'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
        'max_depth':trial.suggest_int('max_depth',3,9),
        'n_estimators':trial.suggest_int("n_estimators",100,1000)
    }
    # Train and evaluate model 
    lgb_cv=lgb.LGBMClassifier(**params, random_state=42,scale_pos_weight=ratio)
    scores=cross_val_score(lgb_cv,train_X,train_y,cv=5,scoring='roc_auc')
    auc=scores.mean()
    return auc 

# Define study object and optimize 

study=optuna.create_study(direction='maximize',study_name='lgb_boost_opt',load_if_exists=True)
study.optimize(objective, n_trials=50)

# Print best hyperparameters and auc
print(f'Best hyperparameters: {study.best_params}')
print(f'Best AUC: {study.best_value:.4f}')

[32m[I 2023-04-25 20:46:00,990][0m A new study created in memory with name: lgb_boost_opt[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-25 20:46:01,512][0m Trial 0 finished with value: 0.9885943816364184 and parameters: {'learning_rate': 0.05127846891344373, 'max_depth': 6, 'n_estimators': 128}. Best is trial 0 with value: 0.9885943816364184.[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-25 20:46:02,675][0m Trial 1 finished with value: 0.9902125011735308 and parameters: {'learning_rate': 0.028155676279664976, 'max_depth': 5, 'n_estimators': 576}. Best is trial 1 with value: 0.9902125011735308.[0m
  'learning_rate':trial.suggest_loguniform('learning_rate',0.01,0.1),
[32m[I 2023-04-25 20:46:03,109][0m Trial 2 finished with value: 0.9874451838174958 and parameters: {'learning_rate': 0.040268382446243235, 'max_depth': 7, 'n_estimators': 112}. Best is trial 1 with value: 0.9902125011735308.[0m
  

Best hyperparameters: {'learning_rate': 0.024654640500232194, 'max_depth': 6, 'n_estimators': 686}
Best AUC: 0.9907


In [7]:
lgb=lgb.LGBMClassifier(learning_rate=0.024654640500232194,max_depth=6,n_estimators=686,scale_pos_weight=ratio,random_state=42)

In [8]:
lgb.fit(train_X,train_y)

In [9]:
prob=lgb.predict_proba(new_test_data_X)[:,1]

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_prob(y_test, probability):
  pred=np.where(probability > 0.5,1,0)
  confusion=confusion_matrix(y_test, pred)
  accuracy=accuracy_score(y_test,pred)
  precision=precision_score(y_test,pred) 
  recall=recall_score(y_test,pred) 
  # F1 스코어 추가 
  f1=f1_score(y_test,pred,average='macro')
  Roc_score=roc_auc_score(y_test,probability)
  print('임계값: ', 0.5) 
  print('오차행렬')
  print(confusion) 
  # f1 score print 추가 
  print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}, AUC:{4: .4f}'.format(accuracy,precision,recall,f1,Roc_score))

In [11]:
get_clf_prob(new_test_data_y,prob)

임계값:  0.5
오차행렬
[[506  24]
 [ 32 358]]
정확도: 0.9391, 정밀도: 0.9372, 재현율: 0.9179, F1:0.9375, AUC: 0.9829
