### 타이타닉 생존자 예측

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')

In [4]:
# Load the Titanic passenger table bundled with seaborn and preview the first rows.
DATASET_NAME = 'titanic'
df = sns.load_dataset(DATASET_NAME)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


##### 1. Preprocessing data
- Feature selection: remove redundant (duplicate) fields

In [5]:
# Keep only the non-redundant columns. `.copy()` makes `df` an independent
# frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning or depend on view/copy semantics.
feature_columns = [
    'survived', 'pclass', 'sex', 'age', 'sibsp',
    'parch', 'fare', 'embarked', 'who', 'deck',
]
df = df[feature_columns].copy()
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck
886,0,2,male,27.0,0,0,13.0,S,man,
887,1,1,female,19.0,0,0,30.0,S,woman,B
888,0,3,female,,1,2,23.45,S,woman,
889,1,1,male,26.0,0,0,30.0,C,man,C
890,0,3,male,32.0,0,0,7.75,Q,man,


- Handle missing values

In [11]:
# Number of missing values in each column.
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [12]:
# How many passengers fall into each `who` category.
df['who'].value_counts()

who
man      537
woman    271
child     83
Name: count, dtype: int64

In [18]:
# Count missing ages separately for the 'man', 'woman' and 'child' groups.
tuple(
    df.loc[df['who'] == group, 'age'].isna().sum()
    for group in ('man', 'woman', 'child')
)

(124, 53, 0)

In [21]:
# Mean age of the 'man' and 'woman' groups (rounded to one decimal place)
# df[df.who.isin(['man', 'woman'])].age.mean().round(1)

In [22]:
# Impute missing ages with the mean age of the adult ('man' / 'woman') rows,
# rounded to one decimal place.
# The result is assigned back to the column instead of calling
# `df.age.fillna(..., inplace=True)`: an in-place fill on the intermediate
# Series is chained assignment, which is deprecated in pandas 2.x and leaves
# `df` unchanged under Copy-on-Write.
adult_mean_age = round(df.loc[df['who'].isin(['man', 'woman']), 'age'].mean(), 1)
df['age'] = df['age'].fillna(adult_mean_age)

In [23]:
# Confirm that no missing ages remain.
df['age'].isna().sum()

0

In [24]:
# Missing `embarked` values will be replaced with the most frequent value,
# so inspect the frequency of each port first.
df['embarked'].value_counts()

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [25]:
# Fill missing ports with the most frequent value ('S' in the counts above).
# The mode is computed from the data rather than hard-coded, and the result
# is assigned back to the column: an in-place fill on `df.embarked` is chained
# assignment, which is deprecated in pandas 2.x and leaves `df` unchanged
# under Copy-on-Write.
most_common_port = df['embarked'].mode().iloc[0]
df['embarked'] = df['embarked'].fillna(most_common_port)

In [26]:
# Drop `deck` (too many missing values) and `who` (redundant with `age`).
columns_to_drop = ['deck', 'who']
df = df.drop(columns=columns_to_drop)

In [28]:
# Final check: total number of missing cells across the whole frame.
df.isna().values.sum()

0

- 카테고리형 데이터를 숫자로 변환

In [32]:
# Show the last five rows of the frame.
df.tail(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
886,0,2,1,27.0,0,0,13.0,2
887,1,1,0,19.0,0,0,30.0,2
888,0,3,0,32.8,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


In [31]:
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column with its OWN LabelEncoder and keep the fitted
# encoders. Refitting one shared encoder overwrites `classes_`, so the mapping
# of the first column could no longer be inverted.
# LabelEncoder assigns integer codes in sorted class order.
encoders = {}
for column in ('sex', 'embarked'):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
886,0,2,1,27.0,0,0,13.0,2
887,1,1,0,19.0,0,0,30.0,2
888,0,3,0,32.8,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


##### 2. 훈련/테스트 데이터셋 분리

In [35]:
from sklearn.model_selection import train_test_split

# Target is `survived` (the first column); every remaining column is a feature.
y = df['survived'].values
X = df.drop(columns='survived').values

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2023
)

##### 3. Training
- Random Forest: an ensemble of 100 decision trees (`n_estimators=100`, the default)
- Grid Search CV


In [36]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with a fixed seed; list its hyperparameters to see what can be tuned.
SEED = 2023
rfc = RandomForestClassifier(random_state=SEED)
rfc.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2023,
 'verbose': 0,
 'warm_start': False}

In [38]:
# GridSearchCV is a technique for improving model performance: it evaluates
# every combination in the parameter grid with cross-validation.
from sklearn.model_selection import GridSearchCV

# Coarse grid over tree depth and the minimum number of samples needed to split a node.
params = dict(
    max_depth=[4, 7, 10],
    min_samples_split=[2, 3, 4],
)
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

grid_rf.best_params_


{'max_depth': 4, 'min_samples_split': 3}

In [39]:
# Second search with a finer grid of shallow `max_depth` values, starting
# again from a freshly constructed forest.
rfc = RandomForestClassifier(random_state=2023)
params = dict(
    max_depth=[3, 4, 5, 6],
    min_samples_split=[2, 3, 4],
)
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)
grid_rf.best_params_

{'max_depth': 6, 'min_samples_split': 4}

##### 4. 예측 및 평가

In [42]:
# Take the best estimator found by the grid search and predict the test split.
best_rf = grid_rf.best_estimator_
pred = best_rf.predict(X_test)

# Put actual and predicted labels side by side for a quick visual check.
rf = pd.DataFrame(dict(real=y_test, predict=pred))
rf.head()

Unnamed: 0,real,predict
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0


In [43]:
# Mean accuracy of the tuned forest on the test split.
best_rf.score(X_test, y_test)

0.8100558659217877

#### 5. Logistic Regression model

로지스틱 회귀(Logistic Regression)는 회귀를 사용하여 데이터가 어떤 범주에 속할 확률을 0과 1 사이의 값으로 예측하고,
그 확률에 따라 가능성이 더 높은 범주에 속하는 것으로 분류해주는 지도 학습 알고리즘이다.

In [44]:
# Display the feature matrix (every column of `df` except the first).
X

array([[ 3.    ,  1.    , 22.    , ...,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    , ...,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    , ...,  0.    ,  7.925 ,  2.    ],
       ...,
       [ 3.    ,  0.    , 32.8   , ...,  2.    , 23.45  ,  2.    ],
       [ 1.    ,  1.    , 26.    , ...,  0.    , 30.    ,  0.    ],
       [ 3.    ,  1.    , 32.    , ...,  0.    ,  7.75  ,  1.    ]])

In [45]:
# First five rows of the feature matrix, shown without truncated columns.
X[:5]

array([[ 3.    ,  1.    , 22.    ,  1.    ,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    ,  1.    ,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    ,  0.    ,  0.    ,  7.925 ,  2.    ],
       [ 1.    ,  0.    , 35.    ,  1.    ,  0.    , 53.1   ,  2.    ],
       [ 3.    ,  1.    , 35.    ,  0.    ,  0.    ,  8.05  ,  2.    ]])

- 표준화 Standardization
    - 표준화는 분포의 모양을 정규분포로 바꾸는 것이 아니라, 각 feature에서 평균을 빼고 표준편차로 나누어 스케일만 조정한다.
    - 즉 데이터의 평균이 0, 표준편차가 1이 되도록 만들어준다.

- 정규화 Normalization
    - dataset의 numerical value 범위의 차이를 왜곡하지 않고 공통 척도로 변경하는 것이다. (예: Min-Max scaling은 각 feature를 [0, 1] 범위로 변환한다.)

#### 표준화

In [47]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean and unit variance.
# NOTE: the statistics are computed over all rows of X, including the rows
# that are later held out for testing.
ss = StandardScaler()
ss.fit(X)
X_std = ss.transform(X)
X_std[:5]

array([[ 0.82737724,  0.73769513, -0.63700389,  0.43279337, -0.47367361,
        -0.50244517,  0.58595414],
       [-1.56610693, -1.35557354,  0.58872284,  0.43279337, -0.47367361,
         0.78684529, -1.9423032 ],
       [ 0.82737724, -1.35557354, -0.3305722 , -0.4745452 , -0.47367361,
        -0.48885426,  0.58595414],
       [-1.56610693, -1.35557354,  0.35889908,  0.43279337, -0.47367361,
         0.42073024,  0.58595414],
       [ 0.82737724,  0.73769513,  0.35889908, -0.4745452 , -0.47367361,
        -0.48633742,  0.58595414]])

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Split the RAW features first, then fit the scaler on the training rows only.
# Fitting the scaler on the full dataset before splitting leaks test-set
# statistics (mean / std) into training and biases the reported score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)
std_scaler = StandardScaler().fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

# Logistic regression on the standardized features; report test accuracy.
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.7486033519553073

#### 정규화

In [49]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range (MinMaxScaler's default).
# NOTE: the min / max are taken over all rows of X, including the rows that
# are later held out for testing.
mm = MinMaxScaler()
X_mm = mm.fit(X).transform(X)
X_mm[:5]

array([[1.        , 1.        , 0.27117366, 0.125     , 0.        ,
        0.01415106, 1.        ],
       [0.        , 0.        , 0.4722292 , 0.125     , 0.        ,
        0.13913574, 0.        ],
       [1.        , 0.        , 0.32143755, 0.        , 0.        ,
        0.01546857, 1.        ],
       [0.        , 0.        , 0.43453129, 0.125     , 0.        ,
        0.1036443 , 1.        ],
       [1.        , 1.        , 0.43453129, 0.        , 0.        ,
        0.01571255, 1.        ]])

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Split the RAW features first, then fit the scaler on the training rows only.
# Fitting the scaler on the full dataset before splitting leaks the test
# set's min / max into training and biases the reported score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)
mm_scaler = MinMaxScaler().fit(X_train)
X_train = mm_scaler.transform(X_train)
X_test = mm_scaler.transform(X_test)  # test values may fall slightly outside [0, 1]

# Logistic regression on the min-max scaled features; report test accuracy.
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.770949720670391