In [146]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

%matplotlib inline

In [147]:
# Load the data: the same feature columns from both files, plus the
# 'Survived' label from the training file only.
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

train = pd.read_csv('input/train.csv', usecols=['Survived'] + FEATURE_COLUMNS)
test = pd.read_csv('input/test.csv', usecols=FEATURE_COLUMNS)

# Show the columns that were actually loaded for each frame.
train.columns, test.columns

(Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
        'Embarked'],
       dtype='object'),
 Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'], dtype='object'))

In [148]:
# Preview the first 10 rows of the training data.
train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S
5,0,3,male,,0,0,8.4583,,Q
6,0,1,male,54.0,0,0,51.8625,E46,S
7,0,3,male,2.0,3,1,21.075,,S
8,1,3,female,27.0,0,2,11.1333,,S
9,1,2,female,14.0,1,0,30.0708,,C


In [149]:
# List the distinct values of the Cabin column (missing cabins show up as nan).
train['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

# データの前処理

In [150]:
# Encode sex as an integer: male -> 0, female -> 1.
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1}).astype(int)

In [151]:
# Impute missing ages with the median age of the training data.
train['Age'] = train['Age'].fillna(train['Age'].median())

In [152]:
# Fill missing ports of embarkation with 'S' (the most frequent value),
# then encode the port as an integer: S -> 0, C -> 1, Q -> 2.
train['Embarked'] = (
    train['Embarked']
    .fillna('S')
    .map({'S': 0, 'C': 1, 'Q': 2})
)

In [153]:
# Keep only the first character of each cabin value; missing cabins become 'X'.
train['Cabin'] = [str(cabin)[0] for cabin in train['Cabin'].fillna('X')]

# Encode the leading letter as an integer code (0 is the 'X' / missing bucket).
train['Cabin'] = train['Cabin'].map(
    {'X': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
)

# データの分割

In [154]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data for validation. random_state is fixed
# so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop('Survived', axis=1),
    train['Survived'],
    random_state=0,
)

# Show the shapes of the two feature matrices.
tuple(part.shape for part in (X_train, X_valid))

((668, 8), (223, 8))

In [155]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=1 (C is scikit-learn's inverse regularization
# strength); fit() returns the fitted estimator, so it can be chained.
logreg1 = LogisticRegression(C=1).fit(X_train, y_train)

# Accuracy on the held-out validation split.
logreg1.score(X_valid, y_valid)

0.8116591928251121

In [156]:
# Same model with C=10.
logreg2 = LogisticRegression(C=10).fit(X_train, y_train)

# Accuracy on the held-out validation split.
logreg2.score(X_valid, y_valid)

0.820627802690583

In [157]:
# Same model with C=100.
logreg3 = LogisticRegression(C=100).fit(X_train, y_train)

# Accuracy on the held-out validation split.
logreg3.score(X_valid, y_valid)

0.820627802690583

# 良かったモデルで再学習

In [158]:
# Refit the C=10 model on the full training data (train + validation rows).
final_model = LogisticRegression(C=10)
final_model.fit(
    X=train.drop(columns=['Survived']),
    y=train['Survived'],
)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# テストデータでの予測

In [159]:
# Apply the same preprocessing to the test data as was applied to the
# training data. Imputation statistics come from the training data so that
# both frames are transformed the same way the model was fitted.

# Sex: male -> 0, female -> 1
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Age: impute with the *training* median (previously the test set's own
# median was used, which was inconsistent with how Fare is handled below and
# with the values the model was trained on). train['Age'] has already had its
# gaps filled with its own median, which leaves that median unchanged.
test['Age'] = test['Age'].fillna(train['Age'].median())

# Embarked: fill gaps with 'S', then encode S -> 0, C -> 1, Q -> 2
test['Embarked'] = test['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

# Fare: impute with the training median
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

# Cabin: keep only the leading letter ('X' for missing), then encode it with
# the same integer codes used for the training data.
test['Cabin'] = (
    test['Cabin']
    .fillna('X')
    .str[0]
    .map({'X': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8})
)

In [160]:
# Predict the Survived label for every row of the preprocessed test data.
final_model.predict(test)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [161]:
# Load input/gender_submission.csv and set its 'Survived' column to the
# final model's predictions on the test data.
submit_data = pd.read_csv('input/gender_submission.csv').assign(
    Survived=final_model.predict(test)
)

# Save as CSV (currently disabled)
# submit_data.to_csv('20191007_02_logreg1.csv', index=False)