In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# 1. 데이터 준비 

In [18]:
# Load the Titanic passenger dataset through seaborn's load_dataset; the result is a pandas DataFrame.
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


- pclass: 객실 등급 (1, 2, 3) --> class (First, Second, Third)
- sibsp: 함께 탑승한 형제자매/배우자 수
- parch: 함께 탑승한 부모/자녀 수
- embarked: 승선지 (S, C, Q) --> embark_town (Southampton, Cherbourg, Queenstown)
- who: 사람 구분 (man, woman, child)
- adult_male: 성인 남성인지
- alone: 혼자 탔는지 아닌지

# 2. 데이터 탐색 

In [3]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


- age, embarked, deck, embark_town 열 결측치 존재

In [4]:
# Count the missing values in every column.
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

- 1) deck 열 삭제 --> 결측치 너무 많음.
- 2) embark_town, embarked 중 하나 삭제 (두 열은 같은 정보를 담고 있음)
    - embark_town 좀더 복잡해서 삭제

In [19]:
# Discard 'deck' (mostly missing) and 'embark_town' (same information as 'embarked').
rdf = df.drop(columns=['deck', 'embark_town'])
rdf.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'alive', 'alone'],
      dtype='object')

- age 결측치 어떻게 대체할지 조금 애매 (ex. 평균값)
- 생존 여부에 굉장히 중요한 feature일 것이라고 예상 가능. 
    - 그러면 단순히 평균으로 대체하는 것은 꽤나 위험함.
    - age 결측치인 행은 아예 빼버리는 것이 더 안전함. 

In [20]:
# Keep only the passengers whose age is known; rows with a missing age are removed.
rdf = rdf.dropna(subset=['age'])
rdf.shape

(714, 13)

- embarked 결측치를 승선 도시 중 가장 많이 등장한 값으로 치환 --> 최빈값 대체 

In [21]:
# Frequency of each embarkation port code (missing values are not counted).
rdf['embarked'].value_counts()

S    554
C    130
Q     28
Name: embarked, dtype: int64

In [22]:
# The index label with the highest count is the most frequent (mode) port code.
rdf['embarked'].value_counts().idxmax()

'S'

In [23]:
# Another way to check the most frequent value:
rdf.describe(include='all') # include='all' also computes statistics for string columns.
# Look at the 'top' row of the result.

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
count,714.0,714.0,714,714.0,714.0,714.0,714.0,712,714,714,714,714,714
unique,,,2,,,,,3,3,3,2,2,2
top,,,male,,,,,S,Third,man,True,no,True
freq,,,453,,,,,554,355,413,413,424,404
mean,0.406162,2.236695,,29.699118,0.512605,0.431373,34.694514,,,,,,
std,0.49146,0.83825,,14.526497,0.929783,0.853289,52.91893,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,
25%,0.0,1.0,,20.125,0.0,0.0,8.05,,,,,,
50%,0.0,2.0,,28.0,0.0,0.0,15.7417,,,,,,
75%,1.0,3.0,,38.0,1.0,1.0,33.375,,,,,,


In [24]:
# Fill the missing embarkation ports with the most frequent port code and assign
# the result back to `rdf` (the whole frame). Two pitfalls are avoided:
#   * `rdf = rdf.embarked.fillna('S')` would replace the DataFrame with a single Series.
#   * `rdf.embarked.fillna('S', inplace=True)` mutates an intermediate column object;
#     this chained in-place form is deprecated and leaves `rdf` unchanged when
#     pandas Copy-on-Write is active.
# The fill value is computed from the data instead of hard-coding 'S'.
most_frequent_port = rdf['embarked'].value_counts().idxmax()
rdf = rdf.fillna({'embarked': most_frequent_port})

In [25]:
# Confirm that no column has missing values left.
rdf.isna().sum()

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
class         0
who           0
adult_male    0
alive         0
alone         0
dtype: int64

# 3. 분석에 사용할 feature 선택 

In [26]:
# Restrict the frame to the target ('survived') and the features used for modelling.
selected_columns = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']
rdf = rdf[selected_columns]
rdf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


- 문자열 타입 처리 (sex, embarked)
    - 원핫 인코딩 
    - S, C, Q를 그냥 0, 1, 2로 치환해버리면 기계가 값의 크기나 순서에 의미가 있다고 생각해버릴 수도 있음. 
        - 예: S(0)와 Q(2)가 S(0)와 C(1)보다 더 멀다고 인식하게 되어버림. 
    - 판다스 메소드 "get_dummies" 이용 !

In [27]:
# One-hot encode 'sex' into one indicator column per category.
onehot_sex = pd.get_dummies(rdf['sex'])
onehot_sex

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


In [28]:
# Attach the sex indicator columns to the selected features (rows align on the index).
ndf = rdf.join(onehot_sex)
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male
0,0,3,male,22.0,1,0,S,0,1
1,1,1,female,38.0,1,0,C,1,0
2,1,3,female,26.0,0,0,S,1,0
3,1,1,female,35.0,1,0,S,1,0
4,0,3,male,35.0,0,0,S,0,1


In [29]:
# One-hot encode the embarkation port; the 'town' prefix yields town_C, town_Q, town_S.
onehot_embarked = pd.get_dummies(data=ndf['embarked'], prefix='town')
ndf = pd.concat(objs=[ndf, onehot_embarked], axis='columns')
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male,town_C,town_Q,town_S
0,0,3,male,22.0,1,0,S,0,1,0,0,1
1,1,1,female,38.0,1,0,C,1,0,1,0,0
2,1,3,female,26.0,0,0,S,1,0,0,0,1
3,1,1,female,35.0,1,0,S,1,0,0,0,1
4,0,3,male,35.0,0,0,S,0,1,0,0,1


In [30]:
# The original string columns are redundant once their indicator columns exist.
ndf = ndf.drop(columns=['sex', 'embarked'])
ndf.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,female,male,town_C,town_Q,town_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1


# 4. 데이터셋 분할 - 훈련/테스트

In [31]:
# Separate the target from the features.
y = ndf['survived']
x = ndf.drop(columns=['survived'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=4
)

x_train.shape[0], x_test.shape[0]

(499, 215)

# 5. KNN 분류 모델

In [32]:
# Build a k-nearest-neighbours classifier with k=5 and train it on the unscaled features.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn

KNeighborsClassifier()

In [33]:
# Predict survival for the held-out passengers.
y_pred = knn.predict(x_test)

# Show true labels next to predictions, keeping the index of y_test.
y_test.to_frame(name='ans').assign(pred=y_pred)

Unnamed: 0,ans,pred
480,0,0
325,1,1
141,1,0
541,0,0
242,0,0
...,...,...
579,1,1
402,0,0
362,0,1
30,0,0


# 6. 모형 성능 평가

In [36]:
# Confusion matrix of the unscaled model (rows: true class, columns: predicted class).
knn_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(knn_matrix)

[[108  16]
 [ 35  56]]


- 1행 [TN FP] : 실제 사망(0) - 124명 (108 + 16)
- 2행 [FN TP] : 실제 생존(1) - 91명 (35 + 56)
    - 양성(positive) 클래스를 생존(1)으로 볼 때의 표기

In [37]:
# Per-class precision, recall and F1 for the unscaled model.
knn_report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.76      0.87      0.81       124
           1       0.78      0.62      0.69        91

    accuracy                           0.76       215
   macro avg       0.77      0.74      0.75       215
weighted avg       0.76      0.76      0.76       215



- precision (정밀도)
    - 양성으로 예측한 데이터 중 실제로 양성인 데이터의 비율
    
- recall (재현율)
    - 실제 양성인 데이터 중 양성으로 예측한 비율
    
- 코로나 검사 키트의 경우
    - 민감도(재현율) 90% 이상 
        - 실제 양성인 사람이 코로나 검사 결과 양성으로 예측될 확률 90% 이상
    - 특이도 99% 이상
        - 실제 음성인 사람이 코로나 검사를 하면 음성으로 예측될 확률 99% 이상

# 7. 모델 고도화 

In [38]:
# Standardize the features. The scaler is fitted on the training split only,
# and the fitted scaler is then applied to both splits.
ss = preprocessing.StandardScaler()
ss.fit(x_train)

scaled_train = ss.transform(x_train)
scaled_test = ss.transform(x_test)

scaled_test

array([[ 0.90434532, -1.41237825,  4.55136791, ..., -0.46272229,
        -0.21475938,  0.53176719],
       [-1.45831601,  0.43352665, -0.55919147, ...,  2.16112345,
        -0.21475938, -1.88052217],
       [ 0.90434532, -0.52360922, -0.55919147, ..., -0.46272229,
        -0.21475938,  0.53176719],
       ...,
       [ 0.90434532,  1.04882829, -0.55919147, ...,  2.16112345,
        -0.21475938, -1.88052217],
       [-1.45831601,  0.70699405, -0.55919147, ...,  2.16112345,
        -0.21475938, -1.88052217],
       [ 0.90434532, -0.93381031, -0.55919147, ..., -0.46272229,
        -0.21475938,  0.53176719]])

In [41]:
# Retrain the same k=5 classifier, this time on the standardized features.
knn = KNeighborsClassifier(n_neighbors=5).fit(scaled_train, y_train)

# Predictions of the scaled model for the held-out passengers.
y_pre = knn.predict(scaled_test)

# Show true labels next to predictions, keeping the index of y_test.
y_test.to_frame(name='ans').assign(pred=y_pre)

Unnamed: 0,ans,pred
480,0,0
325,1,1
141,1,0
541,0,0
242,0,0
...,...,...
579,1,1
402,0,0
362,0,1
30,0,1


In [43]:
# Confusion matrix of the model trained on standardized features.
knn_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pre)
print(knn_matrix)

[[109  15]
 [ 26  65]]


In [44]:
# Per-class precision, recall and F1 for the model trained on standardized features.
knn_report = metrics.classification_report(y_true=y_test, y_pred=y_pre)
print(knn_report)

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       124
           1       0.81      0.71      0.76        91

    accuracy                           0.81       215
   macro avg       0.81      0.80      0.80       215
weighted avg       0.81      0.81      0.81       215



- preprocessing.StandardScaler를 적용해서 feature들을 스케일링한 모델의 성능이 더 높음 (accuracy 0.76 → 0.81). 

- 스케일링 할 경우 오히려 성능이 낮아지는 특수한 경우도 존재함.

In [47]:
# Rule-of-thumb starting point for k: the square root of the training-set size.
# Computed from x_train (instead of the literal 499) so it stays correct if the split changes.
np.sqrt(len(x_train))

22.338307903688676

- 훈련 데이터 개수의 제곱근(√499 ≈ 22)에 가까운 값으로 k를 정하면 성능이 좋은 경향이 있다고 한다. 