<a href="https://colab.research.google.com/github/kg4-ken1ro/mypandas_tutorial_4/blob/main/study_tutorial_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive inside the Colab runtime; later cells read CSV files
# from a drive/MyDrive path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pylab as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# 組み合わせが多いので、進捗を可視化するツールを入れる。
from tqdm import tqdm_notebook as tqdm

warnings.filterwarnings('ignore')

from sklearn.model_selection import ParameterGrid

# Hyper-parameter grid explored by the grid search. Kept at module level so
# the search function can read it as a global.
all_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1],
    'min_child_weight': [3, 5, 10],
    # Previously misspelled as 'n_estimetors', which XGBClassifier does not
    # recognise, so the intended number of trees was never applied.
    # NOTE(review): 10000 boosting rounds per fit with no early stopping makes
    # every grid point expensive -- confirm this value is really intended.
    'n_estimators': [10000],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'colsample_bylevel': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1],
    'random_state': [0],
    'n_jobs': [1],
    }

# Show only the first combination of the grid as a sanity check of its shape.
params = next(iter(ParameterGrid(all_params)), None)
if params is not None:
  print(params)

{'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimetors': 10000, 'n_jobs': 1, 'random_state': 0, 'reg_alpha': 0}


In [3]:
def validate(train_x, train_y, params):
  """Score one XGBoost parameter set with 3-fold stratified cross-validation.

  Args:
    train_x: feature table, sliced by position with ``.iloc``.
    train_y: labels, sliced by position with ``.iloc``.
    params: keyword arguments forwarded to ``xgb.XGBClassifier``.

  Returns:
    A tuple ``(accuracies, feature_importances)`` holding one accuracy score
    and one ``feature_importances_`` array per fold, in fold order.
  """
  splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

  fold_scores = []
  fold_importances = []
  for fit_rows, holdout_rows in splitter.split(train_x, train_y):
    # A fresh model per fold so nothing learned leaks between folds.
    model = xgb.XGBClassifier(**params)
    model.fit(train_x.iloc[fit_rows, :], train_y.iloc[fit_rows])

    holdout_pred = model.predict(train_x.iloc[holdout_rows, :])
    fold_importances.append(model.feature_importances_)
    fold_scores.append(accuracy_score(train_y.iloc[holdout_rows], holdout_pred))
  return fold_scores, fold_importances

In [4]:
def plot_feature_importances(feature_importances, cols):
  """Draw a box plot of per-fold feature importances, one box per feature.

  Args:
    feature_importances: sequence of importance vectors (one row per fold).
    cols: column labels for the importance vectors.
  """
  importance_table = pd.DataFrame(feature_importances, columns=cols)
  # Labels are rotated 90 degrees.
  importance_table.plot(kind='box', rot=90)

In [5]:
def preprocess_df(df):
  """Turn a raw passenger table into a model-ready feature table.

  Steps: fill missing ``Age`` with the column mean, fill missing ``Embarked``
  with the most frequent value, add ``FamilySize = SibSp + Parch + 1``, drop
  ``Name``/``Ticket``/``Cabin``/``PassengerId``, encode ``Sex`` as 0/1 and
  one-hot encode the remaining non-numeric columns.

  Args:
    df: raw DataFrame containing the columns listed above. It is not
      modified; the work is done on a copy.

  Returns:
    A new DataFrame with the engineered features.

  Raises:
    KeyError: if one of the required columns is missing.
  """
  # Work on a copy so the caller's frame keeps PassengerId and friends.
  df = df.copy()

  # Cabin is dropped below, so it needs no imputation here.
  df["Age"] = df["Age"].fillna(df["Age"].mean())

  # mode() returns a Series labelled 0..k-1. Handing that Series to fillna
  # aligns on the index and only fills the row labelled 0, so take the scalar.
  embarked_modes = df["Embarked"].mode()
  if not embarked_modes.empty:  # empty when every Embarked value is missing
    df["Embarked"] = df["Embarked"].fillna(embarked_modes.iloc[0])

  df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

  # Drop identifier / free-text columns.
  df = df.drop(columns=["Name", "Ticket", "Cabin", "PassengerId"])

  # Encode Sex numerically; map() yields a numeric column regardless of how
  # the installed pandas infers dtypes after a string->int replace. Values
  # other than "male"/"female" become NaN.
  df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

  # One-hot encode what is still non-numeric (Embarked).
  df = pd.get_dummies(df)

  return df

In [6]:
def predict_df(train_x,train_y,test_x, df_test_raw, path_output="result.csv"):
  """Fit the final classifier and write predictions for the test set to CSV.

  Args:
    train_x: training features.
    train_y: training labels.
    test_x: test features to predict on.
    df_test_raw: un-preprocessed test table; only its ``PassengerId`` column
      is used, to label the output rows.
    path_output: destination CSV path.

  The file has two columns, ``PassengerId`` and ``Survived``, and no index.
  """
  # Fixed hyper-parameters for the final model.
  params = {
      'learning_rate': 0.008306052798923729,
      'max_depth': 7,
      'min_child_weight': 3,
      'colsample_bytree': 0.8210307463506532,
      'colsample_bylevel': 0.8061816543590015
  }

  clf = xgb.XGBClassifier(**params)
  clf.fit(train_x, train_y)
  preds = clf.predict(test_x)

  # Was `pd.DateFrame()`, which raised AttributeError before any file was
  # written.
  submission = pd.DataFrame()
  submission["PassengerId"] = df_test_raw["PassengerId"]
  submission["Survived"] = preds
  submission.to_csv(path_output, index=False)

In [7]:
def main():
  """Cross-validate on the training CSV and write test-set predictions.

  Reads ``drive/MyDrive/train.csv``, preprocesses it, runs ``validate`` with
  default classifier parameters and plots the per-fold feature importances.
  While ``flag_product`` is true it also reads ``drive/MyDrive/test.csv`` and
  writes predictions to ``result.csv`` through ``predict_df``.
  """
  df_train = pd.read_csv("drive/MyDrive/train.csv")

  # Preprocessing
  train_y = df_train["Survived"]
  train_x = df_train.drop("Survived", axis=1)

  train_x = preprocess_df(train_x)
  accuracies, feature_importances = validate(train_x, train_y, {})
  plot_feature_importances(feature_importances, train_x.columns)

  flag_product = True
  if flag_product:
    df_test = pd.read_csv("drive/MyDrive/test.csv")
    # Keep an untouched copy: the preprocessed features no longer contain
    # PassengerId, which predict_df writes to the output file.
    # (Was `df.test.copy()`, a NameError -- `df` is not defined here.)
    df_test_raw = df_test.copy()
    test_x = preprocess_df(df_test)
    predict_df(train_x, train_y, test_x, df_test_raw, "result.csv")

In [8]:
# Grid-search variant of main, defined as a separate function.
def main_parametersearch():
  """Grid-search XGBoost hyper-parameters and print the best combination.

  Loads and preprocesses the training CSV the same way ``main`` does, scores
  every combination in the module-level ``all_params`` grid with
  ``validate``, and prints the highest mean accuracy together with the
  parameters that produced it.
  """
  # Was "/drive/MyDrive/train.csv": the stray leading slash pointed at the
  # filesystem root instead of the location main() reads from.
  df_train = pd.read_csv("drive/MyDrive/train.csv")
  train_y = df_train["Survived"]
  train_x = df_train.drop("Survived", axis=1)
  train_x = preprocess_df(train_x)

  # Same as main up to here.
  # Wrapping the grid in tqdm shows a progress bar over the combinations.
  best_score = 0
  best_params = {}
  for params in tqdm(ParameterGrid(all_params)):
    accuracies, _ = validate(train_x, train_y, params)

    # Remember the combination with the highest mean accuracy seen so far.
    mean_accuracy = np.mean(accuracies)
    if mean_accuracy > best_score:
      best_score = mean_accuracy
      best_params = params
  print(best_score, best_params)

# Entry point: runs the grid search instead of main(). The guard previously
# compared against "__main_" (missing underscore), so it never fired.
if __name__ == "__main__":
  main_parametersearch()