# グリッドサーチでチューニング

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the training data from train.csv (path is relative to the working directory).
data = pd.read_csv("train.csv")

# 欠損値の処理

In [3]:
# Impute the two columns handled here:
#   Embarked -> the constant "S"
#   Age      -> the median of the observed ages
data["Embarked"] = data["Embarked"].fillna(value="S")
data["Age"] = data["Age"].fillna(value=data["Age"].median())

# 文字列を数値に変換

In [4]:
# Turn the string categories into integer codes, one lookup table per column.
# Values that are already codes are left untouched, so re-running the cell is harmless.
data["Sex"] = data["Sex"].replace({"male": 0, "female": 1})
data["Embarked"] = data["Embarked"].replace({"S": 0, "C": 1, "Q": 2})

In [5]:
# Preview the first rows to check the result of the imputation and encoding above.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


# 学習

### "Cabin"を抜いてやってみる

In [6]:
# Labels: the Survived column.
y = data["Survived"]
# Features: every column except the label and PassengerId / Name / Ticket / Cabin.
X = data.drop(columns=["PassengerId", "Survived", "Name", "Ticket", "Cabin"])

### 4-fold cross validation

In [7]:
from sklearn.model_selection import cross_val_score

# KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier

### グリッドサーチ

In [9]:
from sklearn.model_selection import GridSearchCV
def param():
  """Return the KNN search grid: the candidate values for n_neighbors.

  A fresh dict is built on every call.
  NOTE: later cells define their own param() for other models; whichever
  definition was executed most recently is the one in effect.
  """
  return dict(n_neighbors=[10, 20, 30])
# 4-fold grid search over the KNN grid returned by param().
gscv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param(), cv=4).fit(X, y)

# Report the best mean cross-validated score and the setting that achieved it.
print('Best score: {}'.format(gscv.best_score_),
      'Best parameters: {}'.format(gscv.best_params_),
      sep='\n')

Best score: 0.7014590347923682
Best parameters: {'n_neighbors': 20}




In [10]:
# Score KNN on its own with the n_neighbors value reported by the grid search.
knn = KNeighborsClassifier(n_neighbors=20)
scores = cross_val_score(estimator=knn, X=X, y=y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(scores.mean()))

Cross-Validation scores: [0.60267857 0.7264574  0.70720721 0.77027027]
Average score: 0.701653362002297


# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.model_selection import GridSearchCV
def param():
  """Return the logistic regression search grid: six values of C, 0.001 to 100.

  NOTE: this replaces any param() defined by an earlier cell.
  """
  return dict(C=[0.001, 0.01, 0.1, 1, 10, 100])
# 4-fold grid search over the regularisation strength C returned by param().
# `multi_class` is intentionally not passed: 'auto' only restated what the
# estimator does by itself, and newer scikit-learn releases deprecate the
# argument (a FutureWarning now, an error once it is removed).
# max_iter is raised to 10000 so lbfgs has room to converge.
gscv = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=10000), param(), cv=4)
gscv.fit(X, y)

# Best mean cross-validated score and the C value that produced it.
print('Best score: {}'.format(gscv.best_score_))
print('Best parameters: {}'.format(gscv.best_params_))

Best score: 0.797979797979798
Best parameters: {'C': 0.1}


In [13]:
# Logistic regression with C=0.1, scored on its own with 4-fold CV.
# `multi_class` is not passed: 'auto' only restated the estimator's own
# behaviour, and the argument is deprecated in newer scikit-learn releases.
# `lr` is reused by the voting ensemble cells further down.
lr = LogisticRegression(solver='lbfgs', max_iter=10000, C=0.1)
scores = cross_val_score(lr, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.77232143 0.8161435  0.77477477 0.82882883]
Average score: 0.7980171324832199


# SVM

In [14]:
from sklearn.svm import SVC

### グリッドサーチ

In [15]:
from sklearn.model_selection import GridSearchCV
def param():
  """Return the SVC search grid: the same six values for both C and gamma.

  The two lists are separate objects, giving a 6 x 6 grid of combinations.
  NOTE: this replaces any param() defined by an earlier cell.
  """
  candidates = [0.001, 0.01, 0.1, 1, 10, 100]
  return {'C': list(candidates), 'gamma': list(candidates)}
  return ret
# 4-fold grid search over every (C, gamma) pair from param().
gscv = GridSearchCV(estimator=SVC(), param_grid=param(), cv=4).fit(X, y)

# Report the best mean cross-validated score and the pair that achieved it.
print('Best score: {}'.format(gscv.best_score_),
      'Best parameters: {}'.format(gscv.best_params_),
      sep='\n')

Best score: 0.7890011223344556
Best parameters: {'C': 100, 'gamma': 0.001}


In [16]:
# Score the SVC on its own with the C / gamma pair reported by the grid search.
# `svm` is reused by the voting ensemble cells further down.
svm = SVC(gamma=0.001, C=100)
scores = cross_val_score(estimator=svm, X=X, y=y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(scores.mean()))

Cross-Validation scores: [0.76339286 0.80269058 0.77027027 0.81981982]
Average score: 0.7890433825481471


# Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters, scored with 4-fold CV.
# random_state is pinned (0, the seed this notebook already uses for the
# random forest grid search) so that the scores printed here, and those of
# the voting ensemble cells that reuse `dt`, are the same on every run.
dt = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(dt, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.72767857 0.78026906 0.7972973  0.74774775]
Average score: 0.7632481686923951


# Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

### グリッドサーチ

In [19]:
from sklearn.model_selection import GridSearchCV
def param():
  """Return the random forest search grid (3 x 3 x 3 = 27 combinations).

  NOTE: this replaces any param() defined by an earlier cell.
  """
  return dict(
      n_estimators=[50, 100, 200],
      max_features=[1, 3, 7],
      min_samples_split=[2, 10, 20],
  )
  return ret
# 4-fold grid search over the random forest grid from param(); the forest is
# seeded (random_state=0) so repeated searches give the same ranking.
gscv = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=param(), cv=4).fit(X, y)

# Report the best mean cross-validated score and the setting that achieved it.
print('Best score: {}'.format(gscv.best_score_),
      'Best parameters: {}'.format(gscv.best_params_),
      sep='\n')

Best score: 0.835016835016835
Best parameters: {'max_features': 3, 'min_samples_split': 10, 'n_estimators': 100}


In [20]:
# Random forest with the hyper-parameters reported by the grid search.
# random_state=0 matches the forest that was actually grid-searched. Without
# it the forest is re-randomised on every fit, so this score, and the score
# of every voting ensemble built on `rf`, changes from run to run.
rf = RandomForestClassifier(n_estimators=100, max_features=3, min_samples_split=10, random_state=0)
scores = cross_val_score(rf, X, y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(np.mean(scores)))

Cross-Validation scores: [0.79910714 0.86547085 0.84234234 0.83783784]
Average score: 0.836189543763815


# アンサンブル学習(Voting)

### 最適な重みを探索

In [21]:
from sklearn.ensemble import VotingClassifier

In [22]:
# Brute-force search over integer voting weights for the four models.
# The lr / svm / dt weights run over 0-4 and the rf weight over 1-5, so every
# combination has at least one non-zero weight.
# Each candidate ensemble is scored with 4-fold cross-validation. The best
# combination found so far is kept in `best_weights`, so it survives an
# interrupted run and does not have to be copied from the printed log.
# NOTE: 5 * 5 * 5 * 5 = 625 ensembles are evaluated; this can take a long time.
estimators = [('lr', lr), ('svm', svm), ('dt', dt), ('rf', rf)]  # same list for every candidate
max_score = 0
best_weights = None
for w_lr in range(5):
    for w_svm in range(5):
        for w_dt in range(5):
            for w_rf in range(1, 6):
                weights = [w_lr, w_svm, w_dt, w_rf]
                vote_clf = VotingClassifier(estimators, weights=weights)
                scores = cross_val_score(vote_clf, X, y, cv=4)
                tmp_score = np.mean(scores)
                if max_score < tmp_score:
                    # New best: remember it and log the improvement.
                    max_score = tmp_score
                    best_weights = weights
                    print("lr:svm:dt:rf = {}:{}:{}:{}".format(w_lr, w_svm, w_dt, w_rf))
                    print("tmp_score = {}".format(tmp_score))
                    print()
# This is the best mean cross-validation score, not a score on a held-out test set.
print("Best cross-validation score: {:.3f}".format(max_score))
print("Best weights (lr, svm, dt, rf): {}".format(best_weights))
print()

lr:svm:dt:rf = 0:0:0:1
tmp_score = 0.8328413294781007

lr:svm:dt:rf = 0:0:0:3
tmp_score = 0.8350734272469138

lr:svm:dt:rf = 0:0:0:5
tmp_score = 0.8373106199969989

lr:svm:dt:rf = 0:0:2:3
tmp_score = 0.8373257245874959

lr:svm:dt:rf = 0:1:1:2
tmp_score = 0.8384166367280157

lr:svm:dt:rf = 0:1:1:3
tmp_score = 0.838436746123125



KeyboardInterrupt: 

In [23]:
# Rebuild the ensemble with weights lr:svm:dt:rf = 0:1:1:3 and score it with 4-fold CV.
vote_clf = VotingClassifier(
    estimators=[('lr', lr), ('svm', svm), ('dt', dt), ('rf', rf)],
    weights=[0, 1, 1, 3],
)
scores = cross_val_score(estimator=vote_clf, X=X, y=y, cv=4)
print("Cross-Validation scores: {}".format(scores))
print("Average score: {}".format(scores.mean()))

Cross-Validation scores: [0.78125    0.86995516 0.83333333 0.83783784]
Average score: 0.8305940820304609


# Votingをとりあえず採用

In [24]:
# Train the ensemble on the complete training set; fit() hands back the
# estimator itself, so rebinding the name changes nothing.
vote_clf = vote_clf.fit(X, y)
# Emit an empty line as the cell's only output.
print()




# 提出用ファイルを作成

In [25]:
# Load the data to predict on from test.csv (path is relative to the working directory).
test = pd.read_csv("test.csv")

In [26]:
# Prepare the test frame the same way the training frame was prepared.
# Missing numeric values are imputed with medians taken from the TRAINING
# frame (`data`), not from the test frame, so the model sees test features
# built from the same statistics as its training features. The training Age
# column was itself filled with its own median, which leaves that median unchanged.
test["Age"] = test["Age"].fillna(data["Age"].median())
test["Fare"] = test["Fare"].fillna(data["Fare"].median())
# Same default port as used for the training data; a no-op when nothing is missing.
test["Embarked"] = test["Embarked"].fillna("S")
# Same integer codes as used for the training columns.
test["Sex"] = test["Sex"].replace({"male": 0, "female": 1})
test["Embarked"] = test["Embarked"].replace({"S": 0, "C": 1, "Q": 2})

In [27]:
# Test feature matrix: drop the same non-feature columns that were dropped
# from the training frame (the test frame has no Survived column to remove).
X_test = test.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

### 予測結果

In [28]:
# Predict a Survived label for every row of the test feature matrix.
pred = vote_clf.predict(X_test)

In [29]:
# Two-column submission table: the passenger id paired with its predicted label.
submit = pd.DataFrame({"PassengerId":test["PassengerId"], "Survived":pred})

In [30]:
# Write the table to gender_submission.csv without the DataFrame index column.
submit.to_csv("gender_submission.csv",index=False)

# 結果

- スコア: 77.5%(上の交差検証の平均スコア 約83% よりも低い)