# FamilySizeによってカテゴリわけ
# トレーニングデータを増やす

In [0]:
# Mount Google Drive at /content/drive so the CSV files referenced by the
# later cells can be read from (and written to) Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [0]:
# Read the Titanic train and test tables from the mounted Drive folder.
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/titanic/train.csv")
test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/titanic/test.csv")

# Training-data expansion: take the labels from the earlier submission file
# that scored 80.86% and attach them to the test rows as pseudo-labels.
# The second column of expand.csv is used; rows are matched by index, not by
# PassengerId.
test_survive = pd.read_csv("/content/drive/My Drive/Colab Notebooks/titanic/expand.csv")

# Put the columns in the same order as the training table.
column_order = [
    "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
    "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
]
test = test.assign(Survived=test_survive.iloc[:, 1])[column_order]

# 前処理

In [0]:
# Extract the title from each name (the text after ", " and before the first
# ".") and bucket it into Mr, Mrs, Miss, Master, Other_pre or Other_post.
data["Middle"] = (
    data["Name"].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
)
common_titles = ["Mr", "Mrs", "Miss", "Master"]
data.loc[~data["Middle"].isin(common_titles), "Middle"] = "Other"

# Split the remaining titles by age: rows younger than the median age of the
# "Other" group become Other_pre; everything else (including rows whose Age is
# missing, because the comparison is False for NaN) becomes Other_post.
is_other = data["Middle"] == "Other"
other_median_age = data.loc[is_other, "Age"].median()
data.loc[is_other & (data["Age"] < other_median_age), "Middle"] = "Other_pre"
data.loc[data["Middle"] == "Other", "Middle"] = "Other_post"

# Fill missing ages with the median age of the row's title group.
for title in common_titles + ["Other_pre", "Other_post"]:
    in_group = data["Middle"] == title
    group_median_age = data.loc[in_group, "Age"].median()
    data.loc[in_group & data["Age"].isnull(), "Age"] = group_median_age

# Re-label Sex for males: "Mr" when the title is Mr, "notMr" otherwise.
# Rows whose Sex is not "male" are left untouched.
is_male = data["Sex"] == "male"
has_mr_title = data["Middle"] == "Mr"
data.loc[is_male & has_mr_title, "Sex"] = "Mr"
data.loc[is_male & ~has_mr_title, "Sex"] = "notMr"

In [0]:
# Apply the same preprocessing to the test table. All medians below are
# computed from the test table itself.
test["Middle"] = (
    test["Name"].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
)
common_titles = ["Mr", "Mrs", "Miss", "Master"]
test.loc[~test["Middle"].isin(common_titles), "Middle"] = "Other"

# Rare titles: younger than the group's median age -> Other_pre, rest -> Other_post.
is_other = test["Middle"] == "Other"
other_median_age = test.loc[is_other, "Age"].median()
test.loc[is_other & (test["Age"] < other_median_age), "Middle"] = "Other_pre"
test.loc[test["Middle"] == "Other", "Middle"] = "Other_post"

# Fill missing ages with the median age of the row's title group.
for title in common_titles + ["Other_pre", "Other_post"]:
    in_group = test["Middle"] == title
    group_median_age = test.loc[in_group, "Age"].median()
    test.loc[in_group & test["Age"].isnull(), "Age"] = group_median_age

# Fill missing fares with the median fare of the "Mr" rows.
mr_median_fare = test.loc[test["Middle"] == "Mr", "Fare"].median()
test["Fare"] = test["Fare"].fillna(mr_median_fare)

# Re-label Sex for males: "Mr" when the title is Mr, "notMr" otherwise.
is_male = test["Sex"] == "male"
has_mr_title = test["Middle"] == "Mr"
test.loc[is_male & has_mr_title, "Sex"] = "Mr"
test.loc[is_male & ~has_mr_title, "Sex"] = "notMr"

In [0]:
# Add FamilySize (SibSp + Parch + 1) to both tables and turn it into three
# 0/1 indicator columns: exactly 1, between 2 and 4, and 5 or more.
for frame in (data, test):
    frame["FamilySize"] = frame["SibSp"] + frame["Parch"] + 1
    size = frame["FamilySize"]
    frame["FamilySize_1"] = (size == 1).astype("int64")
    frame["FamilySize_mid"] = ((size > 1) & (size < 5)).astype("int64")
    frame["FamilySize_big"] = (size >= 5).astype("int64")

In [0]:
# Replace Cabin with a presence flag: 0 where the value was missing,
# 1 where a cabin was recorded.
for frame in (data, test):
    frame["Cabin"] = frame["Cabin"].fillna(0)
    has_cabin = frame["Cabin"].ne(0)
    frame.loc[has_cabin, "Cabin"] = 1

In [0]:
# Fill missing Embarked values in the training table with "S", which the
# original author notes is the most common value.
missing_embarked = data["Embarked"].isnull()
data.loc[missing_embarked, "Embarked"] = "S"

# トレーニングデータの拡張

In [0]:
# Append the pseudo-labelled test rows to the training table.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both inputs are kept and the result has duplicate index
# labels, which makes any label-based lookup (.loc) return several rows.
# NOTE: this cell rebinds `data`, so running it twice appends `test` twice.
data = pd.concat([data, test], ignore_index=True)

# X, yに分ける

In [0]:
# Columns that are not fed to the models: identifiers, the target, raw text
# fields, and the intermediate columns that were only used to derive features.
non_feature_cols = [
    "PassengerId", "Survived", "Name", "SibSp", "Parch",
    "Ticket", "Middle", "FamilySize",
]

# Target and feature matrix for the (expanded) training table, plus the
# feature matrix for the test table.
y = data["Survived"]
X = data.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

# ワンホットエンコーディング

In [0]:
# One-hot encode Pclass, Sex and Embarked in both feature matrices.
categorical_cols = ["Pclass", "Sex", "Embarked"]
X = pd.get_dummies(X, columns=categorical_cols)
X_test = pd.get_dummies(X_test, columns=categorical_cols)

# The two frames are encoded independently, so nothing guarantees that they
# end up with the same dummy columns in the same order. Align X_test to X:
# any column missing from X_test is added as all zeros. Since X is built from
# the training rows plus the test rows, X_test cannot contain a category that
# X lacks, so no information is dropped here.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# 標準化

In [0]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted once, on the matrix the
# models are trained on, and the SAME fitted mean/scale is then applied to
# X_test. Calling fit_transform on X_test would refit the scaler on the test
# rows alone and put the two matrices on different scales.
# Both results are wrapped in new DataFrames with default integer row and
# column labels.
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X))
X_test = pd.DataFrame(sc.transform(X_test))

# Stacking

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

# Number of CV folds, shared by the grid searches and the stacking loop.
n_folds = 4
# Set to True to tune each base model with GridSearchCV; False uses defaults.
gridSearch = False

if gridSearch:
  # GridSearch(LogisticRegression)
  params = {
      "C":[0.001, 0.01, 0.1, 1, 10, 100]
  }
  gscv = GridSearchCV(LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000, random_state=0), params, cv=n_folds)
  gscv.fit(X, y)
  print('Best parameters(LogisticRegression): {}'.format(gscv.best_params_))
  lr = LogisticRegression(C=gscv.best_params_["C"], solver="lbfgs", max_iter=10000)

  # GridSearch(SVM)
  params = {
      "C":[0.001, 0.01, 0.1, 1, 10, 100],
      "gamma":[0.001, 0.01, 0.1, 1, 10, 100]
  }
  gscv = GridSearchCV(SVC(random_state=0), params, cv=n_folds)
  gscv.fit(X, y)
  print('Best parameters(SVM): {}'.format(gscv.best_params_))
  svm = SVC(C=gscv.best_params_["C"], gamma=gscv.best_params_["gamma"])

  # GridSearch(RandomForest)
  params = {
      "max_depth":[2, 4, 6, 8, 10],
      "n_estimators":[100, 200, 300]
  }
  gscv = GridSearchCV(RandomForestClassifier(random_state=0), params, cv=n_folds)
  gscv.fit(X, y)
  print('Best parameters(RandomForest): {}'.format(gscv.best_params_))
  rf = RandomForestClassifier(max_depth=gscv.best_params_["max_depth"], n_estimators=gscv.best_params_["n_estimators"])

  # GridSearch(Xgboost)
  params = {
      "max_depth":[2, 4, 6, 8, 10],
      "n_estimators":[100, 200, 300]
  }
  gscv = GridSearchCV(XGBClassifier(random_state=0), params, cv=n_folds)
  gscv.fit(X, y)
  print('Best parameters(Xgboost): {}'.format(gscv.best_params_))
  # Build the tuned model with the estimator that was actually searched.
  # (Using RandomForestClassifier here would silently drop XGBoost from the
  # ensemble and apply XGBoost-tuned parameters to a random forest.)
  xgb = XGBClassifier(max_depth=gscv.best_params_["max_depth"], n_estimators=gscv.best_params_["n_estimators"])

  # GridSearch(LightGBM)
  params = {
      "max_depth":[2, 4, 6, 8, 10],
      "n_estimators":[100, 200, 300]
  }
  gscv = GridSearchCV(LGBMClassifier(random_state=0), params, cv=n_folds)
  gscv.fit(X, y)
  print('Best parameters(LightGBM): {}'.format(gscv.best_params_))
  # Same as above: the tuned model must be an LGBMClassifier.
  lgb = LGBMClassifier(max_depth=gscv.best_params_["max_depth"], n_estimators=gscv.best_params_["n_estimators"])

else:
  # Untuned base models with (mostly) library-default hyperparameters.
  lr = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000)
  svm = SVC(gamma="scale")
  rf = RandomForestClassifier()
  xgb = XGBClassifier()
  lgb = LGBMClassifier()

In [0]:
# Stacking: each base model produces out-of-fold predictions for the training
# rows (one column of blend_train per model) and, for the test rows, the mean
# of its per-fold predictions (one column of blend_test per model).
skf = StratifiedKFold(n_splits=n_folds)

clfs = [lr, svm, rf, xgb, lgb]
n_models = len(clfs)

blend_train = np.zeros((X.shape[0], n_models))
blend_test = np.zeros((X_test.shape[0], n_models))

for model_idx, base_model in enumerate(clfs):
  # One column of test predictions per fold; averaged after the fold loop.
  fold_test_preds = np.zeros((X_test.shape[0], n_folds))
  for fold_idx, (fit_rows, holdout_rows) in enumerate(skf.split(X, y)):
    # Fit on the in-fold rows, then predict the held-out rows so that every
    # training row gets a prediction from a model that never saw it.
    base_model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    blend_train[holdout_rows, model_idx] = base_model.predict(X.iloc[holdout_rows])
    fold_test_preds[:, fold_idx] = base_model.predict(X_test)
  blend_test[:, model_idx] = fold_test_preds.mean(axis=1)

# Meta-learner: logistic regression on the blended predictions, optionally
# with C chosen by grid search.
meta_kwargs = dict(solver='lbfgs', multi_class='auto', max_iter=10000)

if gridSearch:
  params = {
      'C':[0.001, 0.01, 0.1, 1, 10, 100],
  }
  gscv = GridSearchCV(LogisticRegression(**meta_kwargs), params, cv=n_folds)
  gscv.fit(blend_train, y)
  print('Best parameters: {}'.format(gscv.best_params_))
  clf = LogisticRegression(C=gscv.best_params_["C"], **meta_kwargs)

else:
  clf = LogisticRegression(**meta_kwargs)



# 提出用ファイルを作成

In [0]:
# Train the meta-learner on the out-of-fold predictions and predict the test
# rows from their blended features.
pred = clf.fit(blend_train, y).predict(blend_test)

# Submission table: PassengerId from the test table plus the predicted label,
# written to Drive without the index column.
submit = test[["PassengerId"]].assign(Survived=pred)
submit.to_csv("/content/drive/My Drive/Colab Notebooks/titanic/gender_submission.csv", index=False)

In [0]:
# Trigger a browser download of the submission file written by the previous cell.
from google.colab import files
files.download("/content/drive/My Drive/Colab Notebooks/titanic/gender_submission.csv")

# 結果

- 80.86%