In [1]:
import pandas as pd
# import numpy as np
# import matplotlib as mpl
# import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load seaborn's bundled Titanic passenger dataset.
df = sns.load_dataset('titanic')
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Show the first five rows as the cell's displayed result.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


------------------------------------

## 간단한 전처리 과정

- NaN 값이 많은 deck열 삭제
- embarked 열과 embark_town 열은 의미가 동일하므로 embark_town 열 삭제
- age 변수의 값이 NaN인 행을 삭제

In [3]:
# Drop 'deck' (mostly NaN) and 'embark_town' (treated as carrying the same
# information as 'embarked'); df itself is not modified.
ndf = df.drop(columns=['deck', 'embark_town'])
ndf.info()
ndf.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
survived      891 non-null int64
pclass        891 non-null int64
sex           891 non-null object
age           714 non-null float64
sibsp         891 non-null int64
parch         891 non-null int64
fare          891 non-null float64
embarked      889 non-null object
class         891 non-null category
who           891 non-null object
adult_male    891 non-null bool
alive         891 non-null object
alone         891 non-null bool
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 72.4+ KB


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,no,True


In [4]:
# Keep only the rows that have a recorded age; how='any' and axis=0 are
# dropna's defaults, so they are not spelled out.
ndf = ndf.dropna(subset=['age'])
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
survived      714 non-null int64
pclass        714 non-null int64
sex           714 non-null object
age           714 non-null float64
sibsp         714 non-null int64
parch         714 non-null int64
fare          714 non-null float64
embarked      712 non-null object
class         714 non-null category
who           714 non-null object
adult_male    714 non-null bool
alive         714 non-null object
alone         714 non-null bool
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB


In [5]:
# Fill the missing 'embarked' values with the most frequent port of embarkation.
# value_counts(dropna=True) ignores NaN, so idxmax() returns the most common label.
most_freq = ndf['embarked'].value_counts(dropna=True).idxmax()
print(most_freq)

# Assign the filled column back through a new frame instead of calling
# fillna(inplace=True) on the ndf['embarked'] selection: that chained in-place
# call is deprecated and, under pandas Copy-on-Write, only updates a temporary
# Series and leaves ndf unchanged.
ndf = ndf.assign(embarked=ndf['embarked'].fillna(most_freq))
ndf.describe(include='all')

S


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
count,714.0,714.0,714,714.0,714.0,714.0,714.0,714,714,714,714,714,714
unique,,,2,,,,,3,3,3,2,2,2
top,,,male,,,,,S,Third,man,True,no,True
freq,,,453,,,,,556,355,413,413,424,404
mean,0.406162,2.236695,,29.699118,0.512605,0.431373,34.694514,,,,,,
std,0.49146,0.83825,,14.526497,0.929783,0.853289,52.91893,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,
25%,0.0,1.0,,20.125,0.0,0.0,8.05,,,,,,
50%,0.0,2.0,,28.0,0.0,0.0,15.7417,,,,,,
75%,1.0,3.0,,38.0,1.0,1.0,33.375,,,,,,


In [6]:
# Choose the variables for the classification analysis:
# target = survived; features = pclass, sex, age, sibsp, parch, embarked.
Y = ndf.loc[:, 'survived']
X = ndf.loc[:, ['pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]

In [7]:
# One-hot encode the categorical 'sex' column into numeric indicator columns
# the model can consume, then append them to ndf column-wise.
onehot_sex = pd.get_dummies(ndf.loc[:, 'sex'])
ndf = pd.concat(objs=[ndf, onehot_sex], axis='columns')

In [8]:
# One-hot encode the port-of-embarkation codes; the indicator columns get a
# 'town' prefix and are appended to ndf column-wise.
onehot_embarked = pd.get_dummies(ndf.loc[:, 'embarked'], prefix='town')
ndf = pd.concat(objs=[ndf, onehot_embarked], axis='columns')

In [9]:
# Summarize ndf's columns, non-null counts and dtypes after the one-hot columns were appended.
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 18 columns):
survived      714 non-null int64
pclass        714 non-null int64
sex           714 non-null object
age           714 non-null float64
sibsp         714 non-null int64
parch         714 non-null int64
fare          714 non-null float64
embarked      714 non-null object
class         714 non-null category
who           714 non-null object
adult_male    714 non-null bool
alive         714 non-null object
alone         714 non-null bool
female        714 non-null uint8
male          714 non-null uint8
town_C        714 non-null uint8
town_Q        714 non-null uint8
town_S        714 non-null uint8
dtypes: bool(2), category(1), float64(2), int64(4), object(4), uint8(5)
memory usage: 67.0+ KB


In [10]:
# Rebuild the target and the feature matrix, using the one-hot indicator
# columns in place of the raw 'sex' and 'embarked' strings.
Y = ndf.loc[:, 'survived']
X = ndf.loc[:, ['pclass', 'female', 'male', 'age', 'sibsp', 'parch', 'town_C', 'town_Q', 'town_S']]

In [11]:
# For KNN classification, standardize the explanatory variables
# (mean 0, standard deviation 1).
from sklearn import preprocessing
print(X.head())
print('\n')

# print(X['pclass'].value_counts())
# print('\n')

# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence
# the scaling statistics — consider fitting on the training portion only.
# fit_transform(X) is the one-call form of fit(X).transform(X).
X = preprocessing.StandardScaler().fit_transform(X)
print(pd.DataFrame(X).head())

   pclass  female  male   age  sibsp  parch  town_C  town_Q  town_S
0       3       0     1  22.0      1      0       0       0       1
1       1       1     0  38.0      1      0       1       0       0
2       3       1     0  26.0      0      0       0       0       1
3       1       1     0  35.0      1      0       0       0       1
4       3       0     1  35.0      0      0       0       0       1


          0         1         2         3         4         5         6  \
0  0.911232 -0.759051  0.759051 -0.530377  0.524570 -0.505895 -0.471808   
1 -1.476364  1.317434 -1.317434  0.571831  0.524570 -0.505895  2.119506   
2  0.911232  1.317434 -1.317434 -0.254825 -0.551703 -0.505895 -0.471808   
3 -1.476364  1.317434 -1.317434  0.365167  0.524570 -0.505895 -0.471808   
4  0.911232 -0.759051  0.759051  0.365167 -0.551703 -0.505895 -0.471808   

          7         8  
0 -0.202031  0.533078  
1 -0.202031 -1.875896  
2 -0.202031  0.533078  
3 -0.202031  0.533078  
4 -0.202031  0.5330

In [12]:
# Split the data 70:30 into train and test sets; random_state is fixed at 10.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)
print(X_train.shape, X_test.shape, sep='\n')

(499, 9)
(215, 9)


In [13]:
# KNN 분류 분석으로 모델 생성
from sklearn.neighbors import KNeighborsClassifier

In [14]:
# Use a small, odd n_neighbors (here k = 5), as the notebook recommends.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train, y=Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [15]:
# Generate predictions for the test rows with the model fitted on the training data.
y_predict = knn.predict(X_test)

# Print the first ten predictions, then the first ten true labels.
print(y_predict[:10], Y_test.values[:10], sep='\n')

[0 0 1 0 0 1 1 1 0 0]
[0 0 1 0 0 1 1 1 0 0]


In [16]:
from sklearn import metrics
# Tabulate the true test labels against the predicted labels.
knn_matrix = metrics.confusion_matrix(y_true=Y_test, y_pred=y_predict)
print(knn_matrix)

[[109  16]
 [ 25  65]]


In [17]:
# Build and print the text classification report for the test predictions.
knn_report = metrics.classification_report(y_true=Y_test, y_pred=y_predict)
print(knn_report)

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       125
           1       0.80      0.72      0.76        90

    accuracy                           0.81       215
   macro avg       0.81      0.80      0.80       215
weighted avg       0.81      0.81      0.81       215



- 생존 여부 분류에 영향을 주는 변수를 선택한 뒤, k(참조할 최근접 이웃의 개수)를 지정하여 분류 분석을 수행합니다. k는 되도록 작은 홀수로 설정합니다. [이진(2개 항목) 분류 문제에서는 동률의 투표를 피하기 위해 홀수인 k를 선택하는 것이 바람직합니다.]
--------------------------
- 데이터 셋에서 클래스(생존자, 비생존자)별 데이터 수가 비슷하다면 정확도(accuracy)로, 클래스별 데이터 수가 크게 다르다면 F1 점수(f1-score)로 모델의 성능을 판단합니다.
--------------------------
- 통상적으로 k=1일때 overfitting 발생할 가능성이 높습니다.