In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb
import itertools
import time

In [2]:
# Directory that holds the Titanic train.csv / test.csv files.
path = "/content/drive/MyDrive/study/dataset/titanic/"

df_train = pd.read_csv(path+"train.csv")
df_test = pd.read_csv(path+"test.csv")

# Stack train and test so the same feature engineering can be applied to both.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the test rows reuse the labels 0.. already used by the train rows, which
# makes any label-based selection (.loc) on `df` ambiguous.
df = pd.concat([df_train, df_test], sort=False, ignore_index=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Feature engineering on the combined train+test frame `df`.
#
# Every column is reassigned explicitly instead of being modified through a
# chained `df[col].method(..., inplace=True)` call: the chained form acts on an
# intermediate Series, raises a FutureWarning in pandas 2.x and has no effect
# on `df` once copy-on-write is enabled.
# Note: the fill values below (mean Fare, median Age) are computed over the
# train and test rows together, because `df` contains both.

# Encode sex as 0/1. Any unexpected value becomes NaN and makes astype(int)
# fail loudly instead of leaving strings in the column.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
# Missing embarkation ports are filled with 'S' before integer encoding.
df['Embarked'] = df['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
df['Age'] = df['Age'].fillna(df['Age'].median())
# Family size counts the passenger plus parents/children and siblings/spouses.
df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
# 1 when the passenger travels without any family member, otherwise 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Drop the columns that are not used as features.
delete_columns = ['Name', 'PassengerId','Ticket', 'Cabin']
df = df.drop(columns=delete_columns)

# Split back by position: the first len(df_train) rows are the training rows.
train = df.iloc[:len(df_train)]
test = df.iloc[len(df_train):]

# 'Survived' is the target; in the test rows it is NaN and is simply dropped.
y_train = train['Survived']
x_train = train.drop(columns='Survived')
x_test = test.drop(columns='Survived')

In [4]:
# Search configuration.
#   cv_params: candidate values per hyper-parameter; the grid is their full
#              Cartesian product. "num_leaves_rate" is an entry of this grid
#              only; the search loop converts it into LightGBM's num_leaves.
#   params:    settings shared by every run.
conf = dict(
    cv_params=dict(
        max_bin=[255, 300, 400, 500],
        num_leaves_rate=[0.7, 0.8, 0.9, 1.0],
        boosting=["gbdt", "dart"],
        max_depth=[3, 4, 5, 6, 7, 8],
    ),
    params=dict(
        objective="binary",
        learning_rate=0.01,
    ),
)

# One dict per grid point, keyed by parameter name, in product order
# (the last parameter varies fastest).
all_params = [
    dict(zip(conf["cv_params"], combination))
    for combination in itertools.product(*conf["cv_params"].values())
]
print(len(all_params))

192


In [None]:
# Grid search: for every parameter combination in `all_params`, run 5-fold
# cross-validation and keep the combination with the lowest mean validation
# log-loss, together with its fold-averaged test-set prediction.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
categorical_features = ['Embarked', 'Pclass', 'Sex']

best_score = float("inf")
best_params = {}
best_y_preds = []

start = time.time()
for params in all_params:
    # The LightGBM parameters depend only on the grid point, so build them
    # once per combination rather than once per fold.
    # num_leaves is derived as a fraction (num_leaves_rate) of the 2**max_depth
    # leaves a tree of that depth can have; the helper key is then removed
    # so that it is not passed on to LightGBM.
    val_params = {**conf["params"], **params}
    val_params["num_leaves"] = int((2**val_params["max_depth"]) * val_params["num_leaves_rate"])
    del val_params["num_leaves_rate"]

    models = []
    y_preds = []  # test-set prediction of each fold's model
    for fold_id, (train_index, valid_index) in enumerate(cv.split(x_train)):
        # KFold yields positional indices, so select rows with .iloc; .loc
        # would only work while the index labels happen to equal positions.
        x_tr = x_train.iloc[train_index]
        x_val = x_train.iloc[valid_index]
        y_tr = y_train.iloc[train_index]
        y_val = y_train.iloc[valid_index]

        # Build the LightGBM datasets for this fold.
        lgb_train = lgb.Dataset(x_tr, y_tr, categorical_feature=categorical_features)
        lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

        # Early stopping and log silencing are passed as callbacks: the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments are no
        # longer accepted by lgb.train in LightGBM 4.
        # NOTE(review): LightGBM reports early stopping as unavailable for
        # boosting="dart"; confirm those grid points behave as intended.
        model = lgb.train(
            val_params, lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            num_boost_round=1000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=10),
                lgb.log_evaluation(period=0),  # non-positive period: no per-iteration log
            ],
        )
        y_pred = model.predict(x_test, num_iteration=model.best_iteration)
        y_preds.append(y_pred)
        models.append(model)

    # Mean validation log-loss of this combination across the folds.
    # 'valid_1' is the name LightGBM gives the second entry of valid_sets.
    scores = [m.best_score['valid_1']['binary_logloss'] for m in models]
    score = sum(scores) / len(scores)
    if score < best_score:
        best_score = score
        best_params = params
        # Average the fold models' test predictions and threshold at 0.5.
        y_sub = sum(y_preds) / len(y_preds)
        # Keep only the prediction of the current best combination. Appending
        # on every improvement would leave the *first* improvement at index 0,
        # which is not necessarily the one that belongs to best_params.
        best_y_preds = [(y_sub > 0.5).astype(int)]

In [6]:
# Wall-clock seconds that have passed since `start` was recorded.
elapsed_time = time.time() - start
print(f"elapsed_time:{elapsed_time}[sec]")

elapsed_time:1556.7048478126526[sec]


In [7]:
# Show the best score found by the search and, on the next line, the
# parameter combination that produced it.
print(best_score, best_params, sep="\n")

0.41323454288774714
{'max_bin': 255, 'num_leaves_rate': 0.7, 'boosting': 'gbdt', 'max_depth': 3}


In [17]:
# Load the sample submission file and overwrite its Survived column.
df_sub = pd.read_csv(path+"gender_submission.csv")
# Use the most recently stored prediction: that is the one recorded for the
# best-scoring parameter combination (best_params). Index 0 is only the first
# prediction that was ever stored.
df_sub["Survived"] = best_y_preds[-1]

In [18]:
# Write the submission file into the data directory, without the index column.
df_sub.to_csv(f"{path}submission.csv", index=False)