https://dacon.io/competitions/official/235848/codeshare/4040?page=1&dtype=recent

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import mode


from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.utils import to_categorical

from matplotlib import ticker
import time
import warnings

# Notebook-wide display settings: never truncate rows or columns, and render
# floats in fixed-point notation rather than scientific notation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.float_format = '{:f}'.format

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [3]:
# Read the labelled training set and the unlabelled test set with the same
# reader settings, then the submission template that is filled in at the end.
train, test = (
    pd.read_csv(csv_path, encoding='utf8')
    for csv_path in ('./_data/train.csv', './_data/test.csv')
)
submission = pd.read_csv('./_data/sample_submission.csv')

In [5]:
# Remove the row identifier and the four feature columns that are not used
# downstream. Both frames are mutated in place, so re-running this cell
# without reloading the CSVs raises KeyError.
unused_columns = ['id', 'trestbps', 'chol', 'fbs', 'restecg']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

### catboost 기본 학습 결과

In [6]:
# Baseline: CatBoost with default hyper-parameters, scored on a 35% hold-out.
accuracy = []

X = train.drop('target', axis=1)
y = train['target']

# Every column whose dtype is not float is passed to CatBoost as categorical.
# The builtin `float` replaces `np.float`, which was only an alias for it and
# was removed in NumPy 1.24 (AttributeError on current NumPy).
# NOTE(review): this also marks any integer-valued numeric column as
# categorical -- confirm that is intended for this dataset.
categorical_features_indices = np.where(X.dtypes != float)[0]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=1111
)

model = CatBoostClassifier(verbose=False, random_state=1111)
# The hold-out split is supplied as eval_set during fitting and is also the
# split the accuracy below is computed on.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_test, y_test),
)
y_pred = model.predict(X_test)
accuracy.append(round(accuracy_score(y_test, y_pred), 4))

model_names = ['Catboost_default']
result_df5 = pd.DataFrame({'Accuracy': accuracy}, index=model_names)
result_df5

Unnamed: 0,Accuracy
Catboost_default,0.8113


### optuna를 활용한 파라미터 최적화

In [7]:
def objective(trial):
    """Optuna objective: fit one CatBoost model and return hold-out accuracy.

    The hyper-parameters are sampled from ``trial``. The model is fitted on
    the training part of a fixed 65/35 split of the global ``train`` frame
    and scored on the remaining part.

    Note: the same hold-out split is passed as ``eval_set`` (driving early
    stopping) and used for the returned score, so the value is an optimistic
    estimate of generalisation accuracy.

    Args:
        trial: the ``optuna`` trial object used to sample hyper-parameters.

    Returns:
        Accuracy of the fitted classifier on the hold-out split.
    """
    X = train.drop('target', axis=1)
    y = train['target']
    # Builtin `float` replaces `np.float`, an alias removed in NumPy 1.24.
    # Every non-float column is treated as categorical by CatBoost.
    categorical_features_indices = np.where(X.dtypes != float)[0]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.35, random_state=1111
    )

    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    # Conditional parameters: each is sampled only for the bootstrap type
    # that it is paired with here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    cat_cls = CatBoostClassifier(**param)

    cat_cls.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        cat_features=categorical_features_indices,
        verbose=0,
        early_stopping_rounds=100,
    )

    preds = cat_cls.predict(X_test)
    # Round defensively so the score is computed on integer-valued labels.
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy

In [12]:
import optuna

# Seed the sampler so that the sequence of suggested hyper-parameters is
# repeatable across kernel restarts. The search still stops at whichever
# comes first: 200 trials or 600 seconds, so a slow machine may finish
# fewer trials.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1111),
)
study.optimize(objective, n_trials=200, timeout=600)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2021-12-26 23:21:55,706][0m A new study created in memory with name: no-name-8d2f996a-99dc-4bb9-90ed-10789860a31e[0m
[32m[I 2021-12-26 23:21:57,620][0m Trial 0 finished with value: 0.9245283018867925 and parameters: {'learning_rate': 0.2277484690903912, 'objective': 'Logloss', 'colsample_bylevel': 0.059369211604834216, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9245283018867925.[0m
[32m[I 2021-12-26 23:21:58,544][0m Trial 1 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.03524111260349894, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.013533524691805607, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9245283018867925.[0m
[32m[I 2021-12-26 23:22:00,128][0m Trial 2 finished with value: 0.8490566037735849 and parameters: {'learning_rate': 0.49215306616355187, 'objective': 'Logloss', 'colsample_bylevel': 0.07567758118822693, 'depth': 7, 'boo

[32m[I 2021-12-26 23:22:40,644][0m Trial 23 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.08923468560408226, 'objective': 'Logloss', 'colsample_bylevel': 0.018350432135411977, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6974638754888547}. Best is trial 0 with value: 0.9245283018867925.[0m
[32m[I 2021-12-26 23:22:41,713][0m Trial 24 finished with value: 0.8679245283018868 and parameters: {'learning_rate': 0.2738051583388331, 'objective': 'Logloss', 'colsample_bylevel': 0.03796667673164427, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6639223742250515}. Best is trial 0 with value: 0.9245283018867925.[0m
[32m[I 2021-12-26 23:22:43,591][0m Trial 25 finished with value: 0.8867924528301887 and parameters: {'learning_rate': 0.17450930343671373, 'objective': 'Logloss', 'colsample_bylevel': 0.054202675855417486, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'Ber

[32m[I 2021-12-26 23:24:02,347][0m Trial 46 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.25534253027303705, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.0426431121667819, 'depth': 2, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.075384889494537}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:24:05,900][0m Trial 47 finished with value: 0.8867924528301887 and parameters: {'learning_rate': 0.12779681347749594, 'objective': 'Logloss', 'colsample_bylevel': 0.07668451764571411, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.1919731442351454}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:24:10,026][0m Trial 48 finished with value: 0.8679245283018868 and parameters: {'learning_rate': 0.1962502889697539, 'objective': 'Logloss', 'colsample_bylevel': 0.08571440918523922, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_typ

[32m[I 2021-12-26 23:24:50,587][0m Trial 69 finished with value: 0.9245283018867925 and parameters: {'learning_rate': 0.1455693738572711, 'objective': 'Logloss', 'colsample_bylevel': 0.07690411332155944, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5256357833974614}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:24:53,955][0m Trial 70 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.013134686243300886, 'objective': 'Logloss', 'colsample_bylevel': 0.06426071985729483, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:24:55,407][0m Trial 71 finished with value: 0.8867924528301887 and parameters: {'learning_rate': 0.1530323211429375, 'objective': 'Logloss', 'colsample_bylevel': 0.04693818931749934, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.308614121433

[32m[I 2021-12-26 23:25:45,318][0m Trial 92 finished with value: 0.9245283018867925 and parameters: {'learning_rate': 0.0063327348240957565, 'objective': 'Logloss', 'colsample_bylevel': 0.09081896997861981, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7310555264746352}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:25:46,275][0m Trial 93 finished with value: 0.8867924528301887 and parameters: {'learning_rate': 0.19765976979293168, 'objective': 'Logloss', 'colsample_bylevel': 0.018606341923458086, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.31470793083208026}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:25:50,843][0m Trial 94 finished with value: 0.8867924528301887 and parameters: {'learning_rate': 0.16564393868167696, 'objective': 'Logloss', 'colsample_bylevel': 0.06559360201594944, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'

[32m[I 2021-12-26 23:26:39,322][0m Trial 115 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.219605823432156, 'objective': 'Logloss', 'colsample_bylevel': 0.08040742822145239, 'depth': 2, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.698231898665797}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:26:40,974][0m Trial 116 finished with value: 0.9245283018867925 and parameters: {'learning_rate': 0.10337422516147085, 'objective': 'Logloss', 'colsample_bylevel': 0.06271476064631004, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2389149867481121}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:26:41,642][0m Trial 117 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.1770292865758541, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.01934847193658054, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_ty

[32m[I 2021-12-26 23:27:51,351][0m Trial 138 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.2071825454871219, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.02457451337549485, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.24145645697773652}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:27:52,477][0m Trial 139 finished with value: 0.8867924528301887 and parameters: {'learning_rate': 0.24077242609999713, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.02136119649999589, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.38751101123049647}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:27:53,479][0m Trial 140 finished with value: 0.8679245283018868 and parameters: {'learning_rate': 0.2730428466861668, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.03535840198024719, 'depth': 12, 'boosting_type': 'Ordered', 'boot

[32m[I 2021-12-26 23:28:54,440][0m Trial 161 finished with value: 0.9245283018867925 and parameters: {'learning_rate': 0.07373770182997455, 'objective': 'Logloss', 'colsample_bylevel': 0.08923917857049272, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.17497228024547}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:28:55,520][0m Trial 162 finished with value: 0.9433962264150944 and parameters: {'learning_rate': 0.036013810530542784, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.022207239891797743, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.58232050118844}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:28:56,575][0m Trial 163 finished with value: 0.9056603773584906 and parameters: {'learning_rate': 0.07846513266430982, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.017195619066124605, 'depth': 9, 'boosting_type': 'Plain'

[32m[I 2021-12-26 23:29:51,524][0m Trial 184 finished with value: 0.8679245283018868 and parameters: {'learning_rate': 0.09928085785629705, 'objective': 'Logloss', 'colsample_bylevel': 0.027201412348953347, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.48852419808305025}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:29:51,961][0m Trial 185 finished with value: 0.8867924528301887 and parameters: {'learning_rate': 0.23984737189052838, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.024989240551878716, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.12719146971437456}. Best is trial 42 with value: 0.9622641509433962.[0m
[32m[I 2021-12-26 23:29:52,334][0m Trial 186 finished with value: 0.8679245283018868 and parameters: {'learning_rate': 0.42463510391466797, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.026704986590276154, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap

Number of finished trials: 200
Best trial:
  Value: 0.9622641509433962
  Params: 
    learning_rate: 0.17535595403535204
    objective: Logloss
    colsample_bylevel: 0.0774526052408897
    depth: 10
    boosting_type: Plain
    bootstrap_type: Bernoulli
    subsample: 0.4097550363316602


### optuna를 통해 나온 파라미터를 사용하여 학습

In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Refit CatBoost with tuned hyper-parameters and score it on the same 35%
# hold-out split as the baseline.
accuracy = []

X = train.drop('target', axis=1)
y = train['target']
# Builtin `float` replaces `np.float`, an alias removed in NumPy 1.24.
# Every non-float column is treated as categorical by CatBoost.
categorical_features_indices = np.where(X.dtypes != float)[0]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=1111
)

# NOTE(review): these values are hard-coded and do not match the best trial
# printed by the recorded study run in this notebook (learning_rate ~0.175,
# depth 10, Plain boosting, Bernoulli bootstrap). They presumably come from
# an earlier study run -- confirm, or pass `**study.best_params` instead.
model = CatBoostClassifier(
    verbose=False,
    learning_rate=0.11318298060181874,
    objective='Logloss',
    colsample_bylevel=0.04636567857984003,
    depth=3,
    boosting_type='Ordered',
    bootstrap_type='Bayesian',
    bagging_temperature=0.39557095371902345,
)

model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_test, y_test),
)

y_pred = model.predict(X_test)
accuracy.append(round(accuracy_score(y_test, y_pred), 4))
print(classification_report(y_test, y_pred))

model_names = ['Catboost_tuned']
result_df6 = pd.DataFrame({'Accuracy': accuracy}, index=model_names)
result_df6

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        24
           1       0.94      1.00      0.97        29

    accuracy                           0.96        53
   macro avg       0.97      0.96      0.96        53
weighted avg       0.96      0.96      0.96        53



Unnamed: 0,Accuracy
Catboost_tuned,0.9623


### 최종 결과 확인

In [16]:
# Predict labels for the competition test frame with the most recently fitted
# `model`. This rebinds `y_pred`, replacing the hold-out predictions from the
# previous cell.
y_pred = model.predict(test)

In [17]:
# Build the submission from the template without mutating the frame that was
# read from disk: `assign` returns a new frame with `target` replaced by the
# test-set predictions.
submission = submission.assign(target=y_pred)
# `index` is documented as a boolean; pass False explicitly rather than
# relying on None being falsy.
submission.to_csv("catboost_optuna.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,1,0
1,2,1
2,3,0
3,4,0
4,5,1
