In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the Titanic passenger CSV, given relative to the current working directory.
data_path = '../mydata/data/datafile/titanic.csv'
# Use the PassengerId column as the row index instead of a default integer index.
titanic = pd.read_csv(data_path, index_col='PassengerId')
# Preview the first rows to check that the file was parsed as expected.
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


이름(Name)과 티켓(Ticket) 칼럼은 제거한다. 값이 길고 분석에 쓸모가 없기 때문이다. drop을 이용해 칼럼을 제거한다.

In [2]:
# Remove the free-text Name and Ticket columns, which are not used as features.
# The frame is reassigned rather than mutated with inplace=True, and
# errors='ignore' lets the cell be re-run after the columns are already gone
# (the in-place version raises KeyError on a second run).
titanic = titanic.drop(columns=['Name', 'Ticket'], errors='ignore')
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,22.0,1,0,7.25,,S
2,1,1,female,38.0,1,0,71.2833,C85,C
3,1,3,female,26.0,0,0,7.925,,S
4,1,1,female,35.0,1,0,53.1,C123,S
5,0,3,male,35.0,0,0,8.05,,S


In [3]:
# Print each column's dtype and non-null count; counts below the row total reveal missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 69.6+ KB


Age의 non-null 개수가 전체(891)보다 적은 이유는 나이 정보가 없는(결측) 승객이 있기 때문이다. 이번에는 탑승지(Embarked) 데이터를 dummy variable로 만들어보자.

In [4]:
# Show how many passengers boarded at each port. value_counts() skips NaN by
# default, so the counts add up to the non-null rows only.
print(titanic['Embarked'].value_counts())
# One-hot encode the port of embarkation; columns are named port_<value>.
# NOTE(review): get_dummies ignores NaN unless dummy_na=True, so rows with a
# missing Embarked value get 0 in every port_* column - confirm this is intended.
embark_dummy = pd.get_dummies(titanic['Embarked'], prefix='port')
embark_dummy.head()

S    644
C    168
Q     77
Name: Embarked, dtype: int64


Unnamed: 0_level_0,port_C,port_Q,port_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,1
2,1,0,0
3,0,0,1
4,0,0,1
5,0,0,1


In [5]:
# Boolean mask that is True where Age is missing (display only; nothing is assigned).
titanic['Age'].isnull()

PassengerId
1      False
2      False
3      False
4      False
5      False
       ...  
887    False
888    False
889     True
890    False
891    False
Name: Age, Length: 891, dtype: bool

NaN을 특정 값으로 바꿀 수도 있겠지만 categorical하게 바꿔보자.

In [6]:
# Preview of replacing missing ages with the sentinel -1. With inplace=False and
# no assignment, the result is only displayed; titanic['Age'] keeps its NaN values.
titanic['Age'].fillna(-1, inplace=False)

PassengerId
1      22.0
2      38.0
3      26.0
4      35.0
5      35.0
       ... 
887    27.0
888    19.0
889    -1.0
890    26.0
891    32.0
Name: Age, Length: 891, dtype: float64

In [7]:
# Bucket Age into three labels: 'child' (< 20), 'adult' (>= 20) and 'unknown'.
# Both comparisons are False for NaN, so missing ages fall through to the default.
# The labels are built in one np.select call instead of writing strings into a
# boolean Series, which depends on implicit dtype upcasting that pandas has
# deprecated for item assignment.
age_group = pd.Series(
    np.select(
        [titanic['Age'] < 20, titanic['Age'] >= 20],
        ['child', 'adult'],
        default='unknown',
    ),
    index=titanic.index,  # keep PassengerId alignment with the source frame
    name='AgeGroup',
)
age_group

PassengerId
1        adult
2        adult
3        adult
4        adult
5        adult
        ...   
887      adult
888      child
889    unknown
890      adult
891      adult
Name: AgeGroup, Length: 891, dtype: object

In [8]:
# One-hot encode the age-group labels; each distinct label becomes an Age_<label> column.
age_dummy = pd.get_dummies(age_group, prefix='Age')

In [9]:
# One-hot encode the ticket class so it is treated as a category rather than an
# ordered number; each distinct class becomes a Pclass_<value> column.
pclass_dummy = pd.get_dummies(titanic['Pclass'], prefix='Pclass')

In [10]:
# Encode Sex as a binary feature: female -> 1, male -> 0.
# Series.map turns every value missing from the mapping into NaN, so the
# already-encoded values 1 and 0 are mapped to themselves; this keeps the cell
# safe to re-run instead of wiping the column on a second execution.
titanic['Sex'] = titanic['Sex'].map({'female': 1, 'male': 0, 1: 1, 0: 0})

지금까지 만든 모든 변수를 취합해보자.

In [11]:
# Append the one-hot columns to the passenger table (column-wise, aligned on the index).
# Dummy columns left over from an earlier run of this cell are dropped first, so
# re-running it does not create duplicated column labels; duplicates would make
# titanic[input_names] return extra columns.
dummy_frames = [pclass_dummy, embark_dummy, age_dummy]
stale_columns = [column for frame in dummy_frames for column in frame.columns]
titanic = pd.concat(
    [titanic.drop(columns=stale_columns, errors='ignore'), *dummy_frames],
    axis=1,
)
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,0,22.0,1,0,7.25,,S,0,0,1,0,0,1,1,0,0
2,1,1,1,38.0,1,0,71.2833,C85,C,1,0,0,1,0,0,1,0,0
3,1,3,1,26.0,0,0,7.925,,S,0,0,1,0,0,1,1,0,0
4,1,1,1,35.0,1,0,53.1,C123,S,1,0,0,0,0,1,1,0,0
5,0,3,0,35.0,0,0,8.05,,S,0,0,1,0,0,1,1,0,0


In [12]:
def make_train_data(input_names, output_name, data=None):
    """Extract a feature matrix and a target vector as NumPy arrays.

    Parameters
    ----------
    input_names : list of str
        Column labels to use as features, in the order they should appear in X.
    output_name : str
        Column label of the target variable.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``titanic`` frame, which keeps existing two-argument calls working.

    Returns
    -------
    X : numpy.ndarray
        Array with one row per row of ``data`` and one column per input name.
    y : numpy.ndarray
        One-dimensional array holding the target column.

    Raises
    ------
    KeyError
        If any requested column is missing from the frame.
    """
    # Look up the global only when no frame is passed in, so the function can be
    # reused on other frames (for example a held-out split).
    frame = titanic if data is None else data
    X = frame[input_names].to_numpy()
    y = frame[output_name].to_numpy()
    print(f'shape of X = {X.shape}')
    print(f'shape of y = {y.shape}')
    return X, y

# Feature columns fed to the model, grouped by the variable they come from.
# The order here fixes the column order of X.
input_names = [
    'Pclass_1', 'Pclass_2', 'Pclass_3',        # ticket class (one-hot)
    'Sex', 'SibSp', 'Parch', 'Fare',           # numeric / binary columns
    'port_C', 'port_Q', 'port_S',              # port of embarkation (one-hot)
    'Age_adult', 'Age_child', 'Age_unknown',   # age group (one-hot)
]
# Target column.
output_name = 'Survived'

X, y = make_train_data(input_names, output_name)

shape of X = (891, 13)
shape of y = (891,)


13개의 변수가 생겼다.

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit an L2-regularised logistic regression on the full data set.
model = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=1000)
model.fit(X, y)

# Report the intercept, then one coefficient per feature. coef_[0] has the same
# column order as X, so it is paired with input_names position by position.
print(f'intercept   : {model.intercept_[0]:.4}')
for feature_name, weight in zip(input_names, model.coef_[0]):
    print(f'{feature_name:12}: {weight:.4}')

intercept   : -0.823
Pclass_1    : 0.7611
Pclass_2    : 0.1617
Pclass_3    : -0.9062
Sex         : 2.606
SibSp       : -0.2938
Parch       : -0.1196
Fare        : 0.003338
port_C      : 0.1133
port_Q      : 0.06195
port_S      : -0.3713
Age_adult   : -0.2554
Age_child   : 0.5928
Age_unknown : -0.3208


1등석 생존율이 높고 3등석은 생존율이 낮았다. 또한 여성이 남성보다 높은 생존율을 갖고 있고, 어린이가 어른보다 생존율이 높다.

생존 확률도 확인해볼 수 있다.

In [14]:
# Predicted class probabilities for every row of X: one row per passenger and one
# column per class, ordered as in model.classes_.
y_prob = model.predict_proba(X)
# Show only the first five rows.
y_prob[:5]

array([[0.93248189, 0.06751811],
       [0.08739469, 0.91260531],
       [0.43135573, 0.56864427],
       [0.14178403, 0.85821597],
       [0.91124627, 0.08875373]])

In [15]:
# Display the whole probability array; NumPy abbreviates the middle rows in the repr.
y_prob

array([[0.93248189, 0.06751811],
       [0.08739469, 0.91260531],
       [0.43135573, 0.56864427],
       ...,
       [0.56716174, 0.43283826],
       [0.52592348, 0.47407652],
       [0.86951779, 0.13048221]])

In [16]:
# Display the feature matrix passed to the model; NumPy abbreviates the middle rows and columns.
X

array([[0., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.]])

# Question

Logistic Regression 의 classification 은 확률이 0.5 를 넘으면 1, 그렇지 않으면 0 으로 이뤄집니다. 만약 이 threshold 를 [0.3, 0.4, 0.5, 0.6, 0.7] 로 조절하면 각각 precision, recall, f1-score, accuracy 가 어떻게 변화할까요? `y_prob` 와 `numpy.where()` 을 이용하여 각각의 값을 계산하고, 그 결과를 이해하기 쉬운 plots 으로 그려봅니다.