# 複数のアンサンブル学習器を利用

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("train.csv")

# 欠損値の処理

In [3]:
data["Age"] = data["Age"].fillna(data["Age"].median())
data["Embarked"] = data["Embarked"].fillna("S")

In [4]:
data = pd.get_dummies(data, columns=["Sex"])
data = pd.get_dummies(data, columns=["Embarked"])

In [5]:
X = data.drop(["PassengerId", "Survived", "Name", "Ticket", "Cabin"], axis=1)
y = data["Survived"]

### 4-fold cross validation, Grid Search

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

### グリッドサーチ

In [8]:
def param():
  ret = {
      "max_depth": [2,4,6,8,10],
      "max_features": ['log2', 'sqrt','auto'],
      "min_samples_split": [2, 3, 5],
      "min_samples_leaf": [1,5,8],
      "criterion": ["gini", "entropy"],
  }
  return ret
gscv = GridSearchCV(DecisionTreeClassifier(random_state=0), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.8181818181818182
Best parameters: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5}


In [9]:
dt = DecisionTreeClassifier(criterion="entropy", max_depth=8, max_features="log2", min_samples_split=5)
scores = cross_val_score(dt, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.74107143 0.8206278  0.81081081 0.82882883]
Average score: 0.8003347177254129


# Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

### グリッドサーチ

In [11]:
def param():
  ret = {
      "n_estimators":[50,100,200],
      "max_features": [1, 3, 7],
      "min_samples_split": [2, 10, 20],
  }
  return ret
gscv = GridSearchCV(RandomForestClassifier(random_state=0), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.8338945005611672
Best parameters: {'max_features': 3, 'min_samples_split': 10, 'n_estimators': 100}


In [12]:
rf = RandomForestClassifier(n_estimators=100, max_features=3, min_samples_split=10)
scores = cross_val_score(rf, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.78125    0.86098655 0.83333333 0.83783784]
Average score: 0.8283519295640932


# AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier

### グリッドサーチ

In [14]:
def param():
  ret = {
      "n_estimators":[50,100,200]
  }
  return ret
gscv = GridSearchCV(RandomForestClassifier(random_state=0), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.8035914702581369
Best parameters: {'n_estimators': 100}


In [15]:
ada = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(ada, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.77232143 0.83856502 0.79279279 0.82882883]
Average score: 0.8081270181536437


# Extra Trees Classifier

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

### グリッドサーチ

In [17]:
def param():
  ret = {
      "n_estimators":[50,100,200],
      "max_features": [5, 7, 10],
      "min_samples_split": [2, 10, 20]
  }
  return ret
gscv = GridSearchCV(ExtraTreesClassifier(random_state=0), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.8271604938271605
Best parameters: {'max_features': 10, 'min_samples_split': 10, 'n_estimators': 100}


In [18]:
et = ExtraTreesClassifier(max_features=10, min_samples_split=10, n_estimators=100)
scores = cross_val_score(et, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.77232143 0.84304933 0.84684685 0.83783784]
Average score: 0.8250138601525934


# 勾配ブースティング木

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

### グリッドサーチ

In [20]:
def param():
  ret = {
      "n_estimators":[100, 200, 300]
  }
  return ret
gscv = GridSearchCV(GradientBoostingClassifier(random_state=0), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.8338945005611672
Best parameters: {'n_estimators': 200}


In [21]:
gb = GradientBoostingClassifier(n_estimators=200)
scores = cross_val_score(gb, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.8125     0.86098655 0.83333333 0.82882883]
Average score: 0.833912177311841


# アンサンブル学習(Voting)

In [22]:
from sklearn.ensemble import VotingClassifier

### 最適な重みを探索

In [23]:
max_score = 0
for w_ada in range(5):
    for w_et in range(5):
        for w_gb in range(5):
            for w_rf in range(1, 6):
                vote_clf = VotingClassifier([('ada', ada), ('et', et), ('gb', gb), ('rf', rf)], weights=[w_ada, w_et, w_gb, w_rf])
                scores = cross_val_score(vote_clf, X, y, cv=4)
                tmp_score = np.mean(scores)
                if max_score < tmp_score:
                    max_score = tmp_score
                    print("ada:et:gb:rf = {}:{}:{}:{}".format(w_ada, w_et, w_gb, w_rf))
                    print("tmp_score = {}".format(tmp_score))
                    print()
print("Test set score: {:.3f}".format(max_score))
print()

ada:et:gb:rf = 0:0:0:1
tmp_score = 0.832816215278379

ada:et:gb:rf = 0:0:0:2
tmp_score = 0.8350433533309094

ada:et:gb:rf = 0:1:1:1
tmp_score = 0.839547812747084

ada:et:gb:rf = 0:3:2:5
tmp_score = 0.8418251341107282



KeyboardInterrupt: 

In [36]:
vote_clf =  VotingClassifier([('ada', ada), ('et', et), ('gb', gb), ('rf', rf)], weights=[0, 3, 2, 5])
scores = cross_val_score(vote_clf, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.80357143 0.87892377 0.82882883 0.84684685]
Average score: 0.8395427177658119


# Votingをとりあえず採用

In [37]:
vote_clf.fit(X, y)
print()




# 提出用ファイルを作成

In [38]:
test = pd.read_csv("test.csv")

In [39]:
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test = pd.get_dummies(test, columns=["Sex"])
test = pd.get_dummies(test, columns=["Embarked"])

In [40]:
X_test = test.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

### 予測結果

In [41]:
pred = vote_clf.predict(X_test)

In [42]:
submit = pd.DataFrame({"PassengerId":test["PassengerId"], "Survived":pred})

In [43]:
submit.to_csv("gender_submission.csv",index=False)

# 結果

- 77.5%