1. Loading Dataset

In [27]:
# pandas provides the DataFrame used throughout the notebook.
import pandas as pd
# Load the passenger table from train.csv (path is relative to the kernel's working directory).
passengers = pd.read_csv("train.csv")

In [28]:
# Print the (rows, columns) tuple of the loaded frame as a quick sanity check.
print(passengers.shape)

(891, 12)


In [29]:
# Preview the first rows to inspect the available columns and their raw values.
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


2. Data Pre-processing

In [30]:
# Convert the string labels to numbers: female -> 1, male -> 0.
# Series.map yields NaN for any value that is not a key of the mapping, so
# re-running this cell on the already-encoded column would blank it out.
sex_to_code = {'female': 1, 'male': 0}
passengers['Sex'] = passengers['Sex'].map(sex_to_code)

In [31]:
# Fill missing values: replace NaN ages with the mean age of the column.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection. The in-place form operates on an intermediate object:
# pandas 2.x warns about it, and under Copy-on-Write it leaves `passengers`
# unchanged.
passengers['Age'] = passengers['Age'].fillna(passengers['Age'].mean())

In [32]:
# Split Pclass into indicator features: FirstClass is 1 when Pclass == 1,
# SecondClass is 1 when Pclass == 2; rows of any other class get 0 in both.
# A vectorised comparison replaces the per-row apply(lambda ...) call and
# produces the same 0/1 integer columns.
passengers['FirstClass'] = passengers['Pclass'].eq(1).astype('int64')
passengers['SecondClass'] = passengers['Pclass'].eq(2).astype('int64')

In [33]:
# Preview the frame again to confirm the encoded Sex column and the new FirstClass / SecondClass columns.
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0


In [34]:
# Model inputs: encoded sex, age, and the two class indicator columns.
feature_columns = ['Sex', 'Age', 'FirstClass', 'SecondClass']
features = passengers[feature_columns]
# Prediction target: the Survived column.
survival = passengers['Survived']

3. Data Split

In [35]:
from sklearn.model_selection import train_test_split
# Hold out part of the rows as a test set (train_test_split uses 25% when no
# size is given). random_state pins the shuffle so the split, and every score
# computed from it, is identical on each Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, survival, random_state=42
)

4. Data Scaling

In [36]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits so no statistics
# from the test rows are used when fitting the scaler.
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

5. 모델 생성 및 평가하기

In [37]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default hyperparameters.
model = LogisticRegression()
# Fit on the scaled training features; fit() returns the estimator, which the notebook echoes as the cell output.
model.fit(train_features, train_labels)

LogisticRegression()

In [38]:
# Mean accuracy on the training split (the data the model was fitted on).
print(model.score(train_features, train_labels))

0.7844311377245509


* 78% accuracy (train set)

In [39]:
# Mean accuracy on the held-out test split, i.e. on rows the model did not see during fitting.
print(model.score(test_features, test_labels))

0.7847533632286996


* 78% accuracy


In [40]:
# Learned coefficients, one per input column in the order Sex, Age, FirstClass, SecondClass.
# The inputs were standardised before fitting, so the magnitudes are on a comparable scale.
print(model.coef_)

[[ 1.24359621 -0.44450072  1.03277132  0.48311796]]


아까 Sex, Age, FirstClass, SecondClass 순으로 넣었기 때문에 계수도 그 순서대로 확인해주면 된다. 성별(여자 = 1)과 일등석 탑승 여부의 계수가 가장 크므로, 이 두 feature가 생존 예측에 가장 중요하다는 걸 알 수 있다. 반면 나이에 대한 계수는 음수가 나오는데, 나이가 많을수록 생존 확률이 낮아진다는 의미로 이해하면 되겠다.

6. 예측하기

In [41]:
import numpy as np
# Hypothetical passengers, one value per training feature in the order
# Sex (1 = female, 0 = male), Age, FirstClass, SecondClass.
Jack = np.array([0.0, 20.0, 0.0, 0.0])
Rose = np.array([1.0, 17.0, 1.0, 0.0])
ME = np.array([1.0, 10.0, 1.0, 0.0])
# Label the columns with the same names the scaler saw during fit. An
# unlabelled ndarray makes scikit-learn warn that the input has no feature
# names, and a silent column-order mistake would go unnoticed; with names,
# recent scikit-learn versions validate them against the fitted ones.
sample_passengers = pd.DataFrame(
    [Jack, Rose, ME],
    columns=['Sex', 'Age', 'FirstClass', 'SecondClass'],
)

In [42]:
# Apply the scaler fitted on the training split so the samples are on the same scale as the model's inputs.
# NOTE: this overwrites sample_passengers, so running the cell a second time would scale
# already-scaled values; re-run the cell that builds the samples before re-running this one.
sample_passengers = scaler.transform(sample_passengers)

In [43]:
# Predicted Survived label for each sample, in the order Jack, Rose, ME.
print(model.predict(sample_passengers))

[0 1 1]


In [44]:
# Class probabilities for each sample (rows: Jack, Rose, ME); columns follow model.classes_,
# presumably [0, 1] for the Survived labels, so the second column is the survival probability.
print(model.predict_proba(sample_passengers))

[[0.88758983 0.11241017]
 [0.04426475 0.95573525]
 [0.03504355 0.96495645]]


예측 결과 ME가 살아남을 확률은 약 96% 정도 된다 (Rose도 약 96%, Jack은 약 11%).