In [209]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [210]:
dataset = pd.read_csv('data/train.csv', sep=',')

women = dataset[(dataset['Sex'] == 'female')]
women_survived = women[women['Survived'] == 1]
print(f'% of women who survived: {len(women_survived)/len(women)}')

men = dataset[(dataset['Sex'] == 'male')]
men_survived = men[men['Survived'] == 1]
print(f'% of men who survived: {len(men_survived)/len(men)}')

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924


In [211]:
#features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# notebookに倣ってfeaturesを絞ってみる
features = ["Pclass", "Sex", "SibSp", "Parch"]

# 欠損値の確認
#print(dataset.isnull().sum())
def data_preprocessing(dataset):
    # Embarkedの欠損率は約2%... 消すか
    dataset = dataset.dropna(subset=['Embarked'])

    # Sex column, Embarkedをエンコーディング
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])
    dataset['Embarked'] = label_encoder.fit_transform(dataset['Embarked'])

    # Ageの欠損値にはpandaではなくscikit-learnのライブラリを使用
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
    dataset['Age'] = imputer.fit_transform(dataset['Age'].values.reshape(-1,1))

    X = dataset.loc[:,features].values
    if 'Survived' in dataset.columns:
        y = dataset.loc[:,'Survived'].values
    else:
        y = np.nan

    # それ以外の欠損はimpute
    X = imputer.fit_transform(X)

    # Features Scaling
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y, dataset

X, y, dataset = data_preprocessing(dataset)

In [212]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [213]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logi_classifier = LogisticRegression()
logi_classifier.fit(X_train, y_train)
y_pred = logi_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
def create_confusion_matrix(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(f'accuracy_score = {accuracy_score(y_test,y_pred):.5f}')

create_confusion_matrix(y_test, y_pred)

[[130  13]
 [ 24  56]]
accuracy_score = 0.83408


In [214]:
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
k_classifier = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=1)
k_classifier.fit(X_train, y_train)
y_pred = k_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[142   1]
 [ 46  34]]
accuracy_score = 0.78924


In [215]:
# Support Vector Machine
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear', random_state=0)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[127  16]
 [ 24  56]]
accuracy_score = 0.82063


In [216]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
naive_classifier = GaussianNB()
naive_classifier.fit(X_train, y_train)
y_pred = naive_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[112  31]
 [ 13  67]]
accuracy_score = 0.80269


In [217]:
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
rndm_classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rndm_classifier.fit(X_train, y_train)
y_pred = rndm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[134   9]
 [ 29  51]]
accuracy_score = 0.82960


In [218]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
X, y, test_dataset = data_preprocessing(test_dataset)
y_pred = logi_classifier.predict(X)

output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Survived': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
