# ワンホットエンコーディング

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("train.csv")

# 欠損値の処理

In [3]:
data["Age"] = data["Age"].fillna(data["Age"].median())
data["Embarked"] = data["Embarked"].fillna("S")

# "Sex"、"Embarked" → ワンホットエンコーディング

In [4]:
data = pd.get_dummies(data, columns=["Sex"])
data = pd.get_dummies(data, columns=["Embarked"])

In [5]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,0,0,1


In [6]:
X = data.drop(["PassengerId", "Survived", "Name", "Ticket", "Cabin"], axis=1)
y = data["Survived"]

In [7]:
X.shape

(891, 10)

In [8]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,0,1
4,3,35.0,0,0,8.05,0,1,0,0,1


### 4-fold cross validation, Grid Search

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

### グリッドサーチ

In [11]:
def param():
  ret = {
      'n_neighbors':[10, 20, 30],
  }
  return ret
gscv = GridSearchCV(KNeighborsClassifier(), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.7070707070707071
Best parameters: {'n_neighbors': 20}




In [12]:
knn = KNeighborsClassifier(n_neighbors=20)
scores = cross_val_score(knn, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.60267857 0.73991031 0.70720721 0.77927928]
Average score: 0.7072688429541008


# Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
def param():
  ret = {
      'C':[0.001, 0.01, 0.1, 1, 10, 100],
  }
  return ret
gscv = GridSearchCV(LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.7957351290684624
Best parameters: {'C': 0.1}


In [15]:
lr = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000, C=0.1)
scores = cross_val_score(lr, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.77678571 0.79820628 0.77477477 0.83333333]
Average score: 0.795775025105182


# SVM

In [16]:
from sklearn.svm import SVC

### グリッドサーチ

In [17]:
def param():
  ret = {
      'C':[0.001, 0.01, 0.1, 1, 10, 100],
      'gamma':[0.001, 0.01, 0.1, 1, 10, 100]
  }
  return ret
gscv = GridSearchCV(SVC(), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.7957351290684624
Best parameters: {'C': 100, 'gamma': 0.001}


In [18]:
svm = SVC(C=100, gamma=0.001)
scores = cross_val_score(svm, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.77678571 0.80269058 0.77927928 0.82432432]
Average score: 0.7957699752122398


# Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

### グリッドサーチ

In [20]:
def param():
  ret = {
      "max_depth": [2,4,6,8,10],
      "max_features": ['log2', 'sqrt','auto'],
      "min_samples_split": [2, 3, 5],
      "min_samples_leaf": [1,5,8],
      "criterion": ["gini", "entropy"],
  }
  return ret
gscv = GridSearchCV(DecisionTreeClassifier(random_state=0), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.8181818181818182
Best parameters: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5}


In [21]:
dt = DecisionTreeClassifier(criterion="entropy", max_depth=8, max_features="log2", min_samples_split=5)
scores = cross_val_score(dt, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.79017857 0.82511211 0.80630631 0.81981982]
Average score: 0.810354201294504


# Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

### グリッドサーチ

In [23]:
def param():
  ret = {
      "n_estimators":[50,100,200],
      "max_features": [1, 3, 7],
      "min_samples_split": [2, 10, 20],
  }
  return ret
gscv = GridSearchCV(RandomForestClassifier(random_state=0), param(), cv=4)
gscv.fit(X, y)

print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.8338945005611672
Best parameters: {'max_features': 3, 'min_samples_split': 10, 'n_estimators': 100}


In [24]:
rf = RandomForestClassifier(n_estimators=100, max_features=3, min_samples_split=10)
scores = cross_val_score(rf, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.78125    0.86547085 0.82432432 0.83783784]
Average score: 0.8272207535450249


# アンサンブル学習(Voting)

In [25]:
from sklearn.ensemble import VotingClassifier

### 最適な重みを探索

In [26]:
max_score = 0
for w_lr in range(5):
    for w_svm in range(5):
        for w_dt in range(5):
            for w_rf in range(1, 6):
                vote_clf = VotingClassifier([('lr', lr), ('svm', svm), ('dt', dt), ('rf', rf)], weights=[w_lr, w_svm, w_dt, w_rf])
                scores = cross_val_score(vote_clf, X, y, cv=4)
                tmp_score = np.mean(scores)
                if max_score < tmp_score:
                    max_score = tmp_score
                    print("lr:svm:dt:rf = {}:{}:{}:{}".format(w_lr, w_svm, w_dt, w_rf))
                    print("tmp_score = {}".format(tmp_score))
                    print()
print("Test set score: {:.3f}".format(max_score))
print()

lr:svm:dt:rf = 0:0:0:1
tmp_score = 0.8294629961880522

lr:svm:dt:rf = 0:0:0:2
tmp_score = 0.8339523059254002

lr:svm:dt:rf = 0:0:0:4
tmp_score = 0.8361945485684275

lr:svm:dt:rf = 0:0:2:5
tmp_score = 0.8395528175516965



KeyboardInterrupt: 

In [27]:
vote_clf =  VotingClassifier([('lr', lr), ('svm', svm), ('dt', dt), ('rf', rf)], weights=[2, 1, 1, 5])
scores = cross_val_score(vote_clf, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.80357143 0.86547085 0.83783784 0.83783784]
Average score: 0.8361794890662603


# Votingをとりあえず採用

In [28]:
vote_clf.fit(X, y)
print()




# 提出用ファイルを作成

In [29]:
test = pd.read_csv("test.csv")

In [30]:
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test = pd.get_dummies(test, columns=["Sex"])
test = pd.get_dummies(test, columns=["Embarked"])

In [31]:
X_test = test.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

### 予測結果

In [32]:
pred = vote_clf.predict(X_test)

In [33]:
submit = pd.DataFrame({"PassengerId":test["PassengerId"], "Survived":pred})

In [34]:
submit.to_csv("gender_submission.csv",index=False)

# 結果

- 77.99%