In [94]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

%matplotlib inline

In [95]:
# Load the data: the label plus seven feature columns for training,
# and the same seven feature columns (no label) for the test set
train = pd.read_csv(
    'input/train.csv',
    usecols=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'],
)
test = pd.read_csv(
    'input/test.csv',
    usecols=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'],
)

# Display the column index of both frames as a quick sanity check
(train.columns, test.columns)

(Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
        'Embarked'],
       dtype='object'),
 Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'))

In [96]:
# Show a bounded preview instead of rendering the whole training frame
train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.0750,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


# データの前処理

In [97]:
# Encode Sex as an integer: male -> 0, female -> 1
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1}).astype(int)

In [98]:
# Impute missing Age values with the median of the observed ages

train['Age'] = train['Age'].fillna(train['Age'].median())

In [99]:
# Fill missing Embarked with 'S' (noted by the author as the most frequent value),
# then encode the port as an integer: S -> 0, C -> 1, Q -> 2
train['Embarked'] = train['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

In [100]:
# # Reduce Cabin to its first letter; missing values become 'X'
# NOTE(review): this cell is disabled. 'Cabin' is not among the columns passed
# to usecols when train.csv is read, so it must be added there before
# re-enabling the code below.
# train['Cabin'] = train['Cabin'].fillna('X')

# for i in range(len(train['Cabin'])):
#     if train['Cabin'][i] != 'X':
#         train['Cabin'][i] = train['Cabin'][i][0]

# データの分割

In [101]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for validation; random_state is fixed
# so the split is reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns='Survived'),
    train['Survived'],
    random_state=0,
)

# Shapes of the training and validation feature matrices
(X_train.shape, X_valid.shape)

((668, 7), (223, 7))

In [102]:
from sklearn.linear_model import LogisticRegression

# Candidate 1: logistic regression with C=1, fitted on the training split
logreg1 = LogisticRegression(C=1).fit(X_train, y_train)

# Score on the held-out validation split
logreg1.score(X_valid, y_valid)

0.7892376681614349

In [103]:
# Candidate 2: logistic regression with C=10, fitted on the training split
logreg2 = LogisticRegression(C=10).fit(X_train, y_train)

# Score on the held-out validation split
logreg2.score(X_valid, y_valid)

0.7982062780269058

In [104]:
# Candidate 3: logistic regression with C=100, fitted on the training split
logreg3 = LogisticRegression(C=100).fit(X_train, y_train)

# Score on the held-out validation split
logreg3.score(X_valid, y_valid)

0.7982062780269058

# 良かったモデルで再学習

In [105]:
# Refit the chosen configuration (C=10) on the full labelled training frame;
# the fit call stays the last expression so the fitted estimator is displayed
final_model = LogisticRegression(C=10)
final_model.fit(X=train.drop(columns='Survived'), y=train['Survived'])

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# テストデータでの予測

In [106]:
# Apply the same preprocessing to the test data.
# Imputation values come from the training data so the test features are
# transformed the same way as the features the model was fitted on.

# Encode Sex as an integer: male -> 0, female -> 1
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Fill missing Age with the training-set median rather than the test set's own
# median. train['Age'] has already been filled with its median at this point,
# which leaves that median unchanged.
test['Age'] = test['Age'].fillna(train['Age'].median())

# Fill missing Embarked with 'S', then encode the port: S -> 0, C -> 1, Q -> 2
test['Embarked'] = test['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

# Fill missing Fare with the training-set median
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

In [107]:
# Predict the Survived label for every row of the preprocessed test frame
final_model.predict(X=test)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [108]:
# Load gender_submission.csv and replace its Survived column with the
# model's predictions for the preprocessed test frame
submit_data = pd.read_csv('input/gender_submission.csv').assign(
    Survived=final_model.predict(test)
)

# Save as CSV without writing the index column
submit_data.to_csv('20191007_01_logreg1.csv', index=False)