In [1]:
import pandas as pd

# read csv

In [2]:
INPUT_DIR = '../input/'

train = pd.read_csv(INPUT_DIR + 'train.csv')
test = pd.read_csv(INPUT_DIR + 'test.csv')
submission = pd.read_csv(INPUT_DIR + 'gender_submission.csv')

# shape

In [3]:
print(train.shape)
print(test.shape)
print(submission.shape)

(891, 12)
(418, 11)
(418, 2)


# head

In [4]:
print(train.columns)
print(test.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [5]:
# columnsの差分をみる
set(train.columns) - set(test.columns)

{'Survived'}

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# 欠損値がどれくらいあるか見る

In [8]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
train.isnull().sum() / train.shape[0] * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [10]:
test.isnull().sum() / test.shape[0] * 100

PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.000000
dtype: float64

## 欠損値をどうするか
- 何もしない
- なんか埋める（平均、中央値、最頻値、予測モデルを作る）
- 欠損してる行、列を消す

In [11]:
# 年齢の欠損値を平均で埋める
# train, testの平均を出す

# trainとtestを縦結合
# あとで分けやすいようにフラグつけておく
train['test_flg'] = 0
test['test_flg'] = 1

# indexを振り直す(元のindexはいらないのでdrop=True で消す)
data = pd.concat([train, test]).reset_index(drop=True)
print(data.shape)

(1309, 13)


In [12]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'test_flg'],
      dtype='object')

In [13]:
# 平均で埋める
age_mean = data['Age'].mean()
data['Age'] = data['Age'].fillna(age_mean)


# まとめて処理する
COLS = ['Fare', 'Age']

for col in COLS:
    age_mean = data[col].mean()
    data[col] = data[col].fillna(age_mean)

In [14]:
# docstring -> shift + tab (jupyter notebook)

In [15]:
# fear of the dark

In [16]:
data['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [17]:
# 一番多い要素を入れる
data['Embarked'] = data['Embarked'].fillna('S')

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1309 non-null   object 
 12  test_flg     1309 non-null   int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 133.1+ KB


In [19]:
# フラグを見てtrain, testに分ける
constraint1 = (data['test_flg']==0)
constraint2 = (data['test_flg']==1)

train = data[constraint1].copy()
test = data[constraint2].copy()

In [20]:
print(train.shape)
print(test.shape)

(891, 13)
(418, 13)


In [21]:
# いらない列を消す
# Cabinは欠損値が多いので使わない
DROP_COLS_TRAIN = ['test_flg', 'Cabin']
DROP_COLS_TEST = ['test_flg', 'Survived', 'Cabin']

train = train.drop(DROP_COLS_TRAIN, axis=1)
test = test.drop(DROP_COLS_TEST, axis=1)

# ラベルエンコーディング
ここで使うロジスティック回帰モデルは数値しか受け付けないので、  
データが文字で入ってる列を数値に置き換える

In [22]:
train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [23]:
SEX_DICT = {'male': 0, 'female': 1}

train['Sex'] = train['Sex'].map(SEX_DICT)
print(train['Sex'].unique())

# TODO: Embarkedはone hot encordingする
emb_uni = train['Embarked'].unique()
EMBARKED_DICT = dict(zip(emb_uni, range(len(emb_uni))))

train['Embarked'] = train['Embarked'].map(EMBARKED_DICT)
print(train['Embarked'].unique())

[0 1]
[0 1 2]


In [24]:
SEX_DICT = {'male': 0, 'female': 1}

test['Sex'] = test['Sex'].map(SEX_DICT)
print(test['Sex'].unique())

# TODO: Embarkedはone hot encordingする

emb_uni = test['Embarked'].unique()
EMBARKED_DICT = dict(zip(emb_uni, range(len(emb_uni))))

test['Embarked'] = test['Embarked'].map(EMBARKED_DICT)
print(test['Embarked'].unique())

[0 1]
[0 1 2]


In [25]:
# logを思い出す
import numpy as np

print(np.log10(100))
print(np.log2(16))

2.0
4.0


# 予測する

In [26]:
DROP_COL = ['Name', 'Ticket', 'Survived']

x_train = train.drop(DROP_COL, axis=1)
y_train = train['Survived']
x_test = test.drop(['Name', 'Ticket'], axis=1)

## ロジスティック回帰とは的な
$$
ln\frac{p}{1-p} = a_1x_1 + a_2x_2 + ... + a_nx_n + b
$$

x: 入力（Age, Sexとか）  
a, b: 未知のパラメータ（学習用データで計算する）  
p: 確率（ここでは生き残る確率）

In [27]:
# ロジスティック回帰

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [28]:
print(x_train.columns)

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')


In [29]:
# 分類
print(logreg.classes_)
# 係数
print(logreg.coef_)
# 切片
print(logreg.intercept_)
# 学習した回数？的な
print(logreg.n_iter_)

[0. 1.]
[[ 4.25731207e-04 -7.02620399e-01  2.69448452e+00 -2.54636715e-02
  -3.12453266e-01 -1.86075398e-01  7.20870339e-03  2.31358601e-01]]
[0.51120142]
[100]


In [30]:
# 0, 1予測
y_pred = logreg.predict(x_test)
# 確率予測
y_pred_proba = logreg.predict_proba(x_test)

In [31]:
y_pred

array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [32]:
y_pred_proba

array([[0.88482792, 0.11517208],
       [0.43768163, 0.56231837],
       [0.88312368, 0.11687632],
       [0.8333205 , 0.1666795 ],
       [0.32289788, 0.67710212],
       [0.78132616, 0.21867384],
       [0.31621572, 0.68378428],
       [0.77407753, 0.22592247],
       [0.17690379, 0.82309621],
       [0.87731767, 0.12268233],
       [0.84359934, 0.15640066],
       [0.63634325, 0.36365675],
       [0.05656144, 0.94343856],
       [0.88149167, 0.11850833],
       [0.11386223, 0.88613777],
       [0.12724393, 0.87275607],
       [0.78745901, 0.21254099],
       [0.7737498 , 0.2262502 ],
       [0.31566732, 0.68433268],
       [0.29846864, 0.70153136],
       [0.65139872, 0.34860128],
       [0.79721849, 0.20278151],
       [0.06971636, 0.93028364],
       [0.40552546, 0.59447454],
       [0.04092609, 0.95907391],
       [0.9210003 , 0.0789997 ],
       [0.04491118, 0.95508882],
       [0.77963581, 0.22036419],
       [0.59689226, 0.40310774],
       [0.8777566 , 0.1222434 ],
       [0.

In [33]:
# 予測結果を提出する
submission['Survived'] = y_pred.astype(int)
OUTPUT_DIR = '../output/'
submission.to_csv(OUTPUT_DIR + 'pred.csv', index=False)