In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

titanic_df = pd.read_csv('./titanic/titanic_train.csv')
y = titanic_df["Survived"]
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [23]:
def fillna(df):
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean') #missing_values=np.nan 모든 결측값을 대체한다. / strategy='mean' 평균으로 대체한다.
    age_array = df['Age'].to_numpy().reshape(-1, 1)
    imputer.fit(age_array)
    df['Age'] = imputer.transform(age_array)
    return df


def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    print(df.head(3), "\n\n")

    df = df.drop('Survived', axis=1, inplace=False)
    return df


# # 레이블
# def encode_features_label(df):
#     features = ['Cabin', 'Sex', 'Embarked']
#     le = LabelEncoder()
#     for feature in features:
#         le.fit(df[feature])
#         df[feature] = le.transform(df[feature])
#     return df



# 표준화 => 2차원 데이터
def stscaler(df):
    features = ['Age', 'Fare']
    sc = StandardScaler()
    for feature in features:
        df[[feature]] = sc.fit_transform(df[[feature]])
    return df
    

# 원핫
# one-hot 인코딩을 하면 많은 0 값을 포함하기 때문에, 이러한 데이터를 희소 행렬 형식으로 저장하면 메모리 사용량을 크게 줄일 수 있다.
# ColumnTransformer는 인코딩된 데이터를 OneHotEncoder를 통해 처리한 후에 희소 행렬 형태로 반환
def encode_features_onehot(df):
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 6, 7])], remainder='passthrough')
    df = ct.fit_transform(df)
    # df = pd.DataFrame(df.toarray()) # 희소 행렬을 Dense 형태로 변환 후 DataFrame으로 변환
    return df


titanic_df = fillna(titanic_df)
titanic_df = drop_features(titanic_df)

titanic_df = stscaler(titanic_df)

titanic_df = encode_features_onehot(titanic_df)
# titanic_df = encode_features_label(titanic_df)
print(titanic_df[:3])

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500     N        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250     N        S 


  (0, 1)	1.0
  (0, 148)	1.0
  (0, 153)	1.0
  (0, 154)	3.0
  (0, 155)	-0.5924805998028931
  (0, 156)	1.0
  (0, 158)	-0.5024451714361923
  (1, 0)	1.0
  (1, 83)	1.0
  (1, 150)	1.0
  (1, 154)	1.0
  (1, 155)	0.6387890120425208
  (1, 156)	1.0
  (1, 158)	0.7868452935884461
  (2, 0)	1.0
  (2, 148)	1.0
  (2, 153)	1.0
  (2, 154)	3.0
  (2, 155)	-0.2846631968415396
  (2, 158)	-0.4888542575852486


In [24]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    
    accuracy = accuracy_score(y_test , pred)
    
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    
    f1 = f1_score(y_test,pred)
    
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    
    print('오차 행렬')
    print(confusion, "\n")
    
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc), "\n")


def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    
    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(titanic_df, y, test_size=0.2)

#### KNN은 임계값 설정이 필요 없다.
> - 입력 데이터와 가장 가까운 k개의 이웃을 찾아 다수결로 클래스를 결정하는 알고리즘
> - k 값(이웃의 개수)과 거리 측정 방식(metric)과 같은 하이퍼파라미터가 중요한 역할

#### 로지스틱 회귀, SVM, 신경망 등의 다른 분류기에서는 예측 확률이 중요하기 때문에, 임계값 설정이 필요

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# 그리드 서치 수행
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'metric': ['euclidean', 'manhattan', 'cosine']
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5)
knr = grid_search.fit(X_train, y_train)


# 최적의 하이퍼 파라미터 출력
print("Best Parameters:", knr.best_params_)

# 훈련 데이터 성능 평가
print("Train Accuracy:", knr.score(X_train, y_train))

# 테스트 데이터 성능 평가 
print("Test Accuracy:", knr.score(X_test, y_test), "\n\n")


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 7}
Train Accuracy: 0.8370786516853933
Test Accuracy: 0.8659217877094972 




In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

# 테스트 데이터 예측
y_pred = knr.predict(X_test)
print(y_pred, "\n")

# 정확도 평가
accuracy = knr.score(X_test, y_test)
print("Accuracy:", accuracy, "\n")

# 오차행렬
pred_proba = knr.predict_proba(X_test)[:, 1] # 분류 결정 예측 확률 = 각 클래스에 대한 예측 확률을 반환
print(pred_proba[:4])

get_clf_eval(y_test , y_pred, pred_proba)

[0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1
 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 1 1
 0 0 1 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0
 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 1 1 1
 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] 

Accuracy: 0.8659217877094972 

[0.28571429 0.57142857 0.14285714 0.14285714]
오차 행렬
[[101  13]
 [ 11  54]] 

정확도: 0.8659, 정밀도: 0.8060, 재현율: 0.8308,    F1: 0.8182, AUC:0.8975 



In [40]:
knn_result = KNeighborsClassifier(n_neighbors=7, metric='manhattan')
knn_result.fit(X_train, y_train)


# 테스트 데이터 예측
y_pred_result = knn_result.predict(X_test)
print(y_pred_result, "\n")

# 오차행렬
pred_proba_result = knn_result.predict_proba(X_test) # 분류 결정 예측 확률 = 각 클래스에 대한 예측 확률을 반환
print(pred_proba_result[:4], "\n\n")

get_clf_eval(y_test, y_pred_result, pred_proba_result[:, 1])


# 훈련 데이터 성능 평가
print("Train Accuracy:", knr.score(X_train, y_train))

# 테스트 데이터 성능 평가 
print("Test Accuracy:", knr.score(X_test, y_test), "\n\n")

[0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1
 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 1 1
 0 0 1 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0
 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 1 1 1
 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] 

[[0.71428571 0.28571429]
 [0.42857143 0.57142857]
 [0.85714286 0.14285714]
 [0.85714286 0.14285714]] 


오차 행렬
[[101  13]
 [ 11  54]] 

정확도: 0.8659, 정밀도: 0.8060, 재현율: 0.8308,    F1: 0.8182, AUC:0.8975 

Train Accuracy: 0.8370786516853933
Test Accuracy: 0.8659217877094972 


