In [103]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [104]:
# Load the labelled training split. A comma is already read_csv's default
# delimiter, so no explicit separator is passed.
# (Use dataset.head() to inspect the frame.)
dataset = pd.read_csv('data/train.csv')

In [105]:
# Full candidate list, kept for reference:
#   ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Following the reference notebook, train on a reduced subset instead.
features = [
    "Pclass",
    "Sex_male",
    "SibSp",
    "Parch",
]

# To check missing values per column: dataset.isnull().sum()
def data_preprocessing(dataset):
    """Clean a passenger frame and build an imputed, standardised feature matrix.

    Steps, in order: drop rows whose ``Embarked`` is missing, label-encode
    ``Embarked``, one-hot encode ``Sex`` (``drop_first=True``), median-impute
    ``Age``, select the columns named in the module-level ``features`` list,
    median-impute whatever is still missing in that selection, and standardise
    every selected column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing at least ``Embarked``, ``Sex``, ``Age`` and every
        column named in ``features`` that is not created here. The caller's
        frame is not modified.

    Returns
    -------
    X : numpy.ndarray
        Imputed and standardised feature matrix, one row per retained row.
    y : numpy.ndarray or float
        The ``Survived`` column when it exists, otherwise ``np.nan``.
    dataset : pandas.DataFrame
        The cleaned copy of the input (rows dropped, columns encoded/imputed).

    Notes
    -----
    Every encoder, imputer and scaler is fitted on the frame passed in, so
    calling this separately on two frames scales each with its own statistics.
    Rows with a missing ``Embarked`` are dropped regardless of whether
    ``Survived`` is present, so the returned frame can be shorter than the
    input.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    # The original author judged the share of missing Embarked values small
    # enough to simply drop those rows. Take an explicit copy: assigning
    # columns to the bare result of dropna() can raise SettingWithCopyWarning.
    dataset = dataset.dropna(subset=['Embarked']).copy()

    # Encode Embarked as integer labels.
    dataset['Embarked'] = LabelEncoder().fit_transform(dataset['Embarked'])

    # Sex was switched from label encoding to one-hot encoding (the original
    # note questions whether label encoding introduces multicollinearity).
    dataset = pd.get_dummies(dataset, columns=['Sex'], drop_first=True)

    # Missing Age values are filled with scikit-learn rather than pandas.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # fit_transform returns an (n, 1) array; flatten it so the column receives
    # a plain 1-D assignment.
    age_column = dataset['Age'].to_numpy().reshape(-1, 1)
    dataset['Age'] = imputer.fit_transform(age_column).ravel()

    # Force a float matrix: mixing integer columns with a boolean dummy column
    # would otherwise yield an object-dtype array.
    X = dataset.loc[:, features].to_numpy(dtype=float)
    if 'Survived' in dataset.columns:
        y = dataset['Survived'].to_numpy()
    else:
        y = np.nan

    # Impute any remaining gaps in the selected features (median, refitted).
    X = imputer.fit_transform(X)

    # Feature scaling.
    X = StandardScaler().fit_transform(X)

    return X, y, dataset

# Keep the raw `dataset` intact: data_preprocessing() consumes the 'Sex'
# column, so rebinding `dataset` to its output made this cell raise a KeyError
# when run a second time.
X, y, dataset_processed = data_preprocessing(dataset)

In [106]:
from sklearn.model_selection import train_test_split

# Fix the seed so the hold-out split, and therefore every score reported
# below, is identical on each Restart & Run All. 0 matches the random_state
# already used by the SVC and random-forest estimators in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [107]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Default hyper-parameters. fit() returns the estimator itself, so
# construction and training are chained into one statement.
logi_classifier = LogisticRegression().fit(X_train, y_train)
y_pred = logi_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
def create_confusion_matrix(y_test, y_pred):
    """Print the confusion matrix and the accuracy for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, compared against ``y_test``.

    Returns
    -------
    None
        Both results are written to stdout only.
    """
    # Matrix first, then the accuracy formatted to five decimals.
    print(confusion_matrix(y_test, y_pred))
    print(f'accuracy_score = {accuracy_score(y_test,y_pred):.5f}')

# Report how the logistic-regression predictions score on the held-out split.
create_confusion_matrix(y_test=y_test, y_pred=y_pred)

[[124  16]
 [ 28  55]]
accuracy_score = 0.80269


In [108]:
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Two neighbours, Minkowski metric with p=1.
# NOTE(review): with an even neighbour count a 1-1 vote is possible; an odd
# value may be worth trying.
k_classifier = KNeighborsClassifier(
    n_neighbors=2,
    metric='minkowski',
    p=1,
).fit(X_train, y_train)
y_pred = k_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[126  14]
 [ 35  48]]
accuracy_score = 0.78027


In [109]:
# Support Vector Machine
from sklearn.svm import SVC

# Linear-kernel SVC with a fixed seed, trained in the same statement that
# builds it (fit() returns the estimator).
svm_classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[120  20]
 [ 26  57]]
accuracy_score = 0.79372


In [110]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, built and fitted in one step.
naive_classifier = GaussianNB().fit(X_train, y_train)
y_pred = naive_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[111  29]
 [ 17  66]]
accuracy_score = 0.79372


In [119]:
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier

# Small, seeded forest; one keyword per line so each setting is easy to tweak.
rndm_classifier = RandomForestClassifier(
    n_estimators=5,
    criterion='gini',
    bootstrap=True,
    max_features=2,
    min_samples_leaf=20,
    min_samples_split=5,
    random_state=0,
).fit(X_train, y_train)
y_pred = rndm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[128  12]
 [ 34  49]]
accuracy_score = 0.79372


In [115]:
# Grid search over random-forest hyper-parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# max_features cannot usefully exceed the number of input columns, so the
# candidates are derived from the training matrix instead of hard-coding
# values such as 5, 10 or 20.
n_features = X_train.shape[1]

grid_parameters = [
    {
        # The previous grid (3000 combinations x 5 folds, with forests of up
        # to 1000 trees) was interrupted before finishing; keep forest sizes
        # moderate so a full run is feasible.
        'n_estimators': [5, 10, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_features': list(range(1, n_features + 1)),
        # An integer min_samples_split must be at least 2; 1 is rejected.
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10, 20],
        'bootstrap': [True, False],
    }
]

grid_search = GridSearchCV(
    # Seed the forest so the selected parameters are reproducible.
    RandomForestClassifier(random_state=0),
    grid_parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
grid_search.best_params_

KeyboardInterrupt: 

In [79]:
# Predict
# Distinct names keep the training-time X / y / y_pred and the raw test frame
# from being overwritten, so earlier cells can still be re-run after this one.
test_dataset = pd.read_csv('data/test.csv')
X_submit, _no_labels, test_processed = data_preprocessing(test_dataset)
# NOTE(review): data_preprocessing() refits its imputer and scaler on whatever
# frame it receives, so these rows are scaled with test-set statistics rather
# than the ones the classifier was trained on.
y_submit = logi_classifier.predict(X_submit)

# data_preprocessing() can drop rows, while a submission needs one prediction
# per row of test.csv; flag a mismatch instead of silently writing a short file.
n_dropped = len(test_dataset) - len(test_processed)
if n_dropped:
    print(f"Warning: {n_dropped} test rows were dropped during preprocessing")

output = pd.DataFrame({'PassengerId': test_processed['PassengerId'], 'Survived': y_submit})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
