<a href="https://colab.research.google.com/github/kg4-ken1ro/mypandas_tutorial_5/blob/main/study_tutorial_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
!pip install optuna # ライブラリーのインストールコマンド
import optuna
import numpy as np
import pandas as pd
import xgboost as xgb

from tqdm import tqdm_notebook as tqdm
from IPython.display import display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold


# optunaの出力をsupressする
# https://optuna.readthedocs.io/en/stable/faq.html#how-to-suppress-log-messages-of-optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
def objective(trial):
    params = {
        'seed': 0,
        'learning_rate': 0.1,
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_child_weight': trial.suggest_int('min_child_weight', 3, 10),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
        'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
    }

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    accuracies = []
    for train_idx, test_idx in cv.split(train_x, train_y):
        trn_x = train_x.iloc[train_idx, :]
        val_x = train_x.iloc[test_idx, :]

        trn_y = train_y.iloc[train_idx]
        val_y = train_y.iloc[test_idx]

        # main - Predict
        clf = xgb.XGBClassifier(**params)
        clf.fit(trn_x, trn_y)

        pred_y = clf.predict(val_x)
        accuracies.append(accuracy_score(val_y, pred_y))

    return 1.0 - np.mean(accuracies)

In [11]:
def preprocess_df(df):
    # Cabinは後でDropするので、削除
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode())
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
   
    # 列の削除
    df.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)

    # Sexの置換とEmbarkedのダミー化 
    df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
    df = pd.get_dummies(df)

    return df

In [12]:
# main
df_train = pd.read_csv("drive/MyDrive/train.csv")
train_y = df_train["Survived"]
train_x = df_train.drop("Survived", axis=1)
train_x = preprocess_df(train_x)

# random_stateを固定
# 実際は要らないが、今回はチュートリアルのため導入。
# https://optuna.readthedocs.io/en/stable/faq.html#how-can-i-obtain-reproducible-optimization-results
sampler = optuna.samplers.TPESampler(seed=100) # Make the sampler behave in a deterministic way.
study = optuna.create_study(sampler=sampler)
study.optimize(objective, n_trials=100, n_jobs=1)
print(study.best_trial.value)
print(study.best_trial.params)

  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
  'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
  'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
  'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
  'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
  'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
  'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
  'colsample_bytree': trial.suggest_loguniform('cols

0.16722783389450058
{'max_depth': 5, 'min_child_weight': 4, 'colsample_bytree': 0.8389592897949022, 'colsample_bylevel': 0.9946590759043116}
