# 분류기 만들기

타이타닉 데이터의 생존여부 분류
- 규칙: 성별 (sex) = 1 => 생존 X

In [3]:
from sklearn.base import BaseEstimator
import numpy as np

class MyDummyClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    
    def predict(self, X):
        pred = np.zeros((X.shape[0], 1))
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i] == 1:
                pred[i] = 0
            else:
                pred[i] = 1
        return pred
        

In [11]:
import pandas as pd

titanic_df = pd.read_csv('./data/titanic.csv')

y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis = 1)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df) 
    df = drop_features(df)
    df = format_features(df)
    return df

In [13]:
X_titanic_df = transform_features(X_titanic_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [15]:
# 데이터셋 분할
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, 
                                                    y_titanic_df, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

In [16]:
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

In [18]:
from sklearn.metrics import accuracy_score

my_pred = myclf.predict(X_test)
accuracy_score(y_test, my_pred) # 남자 (Sex = 1)가 죽었을 것이라는 예측 정확도

0.7877094972067039

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, my_pred)

# x-axis: 예측값, y-axis: 실제값
# 첫 칸에 있는 92는 0이라고 예측했는데 실제로도 0 => 맞음
# 두번째 칸에 있는 18은 1이라고 예측했는데 실제로는 1 => 틀림
# 0은 Negative, 1은 Positive라고 부름



array([[92, 18],
       [20, 49]])

In [20]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test, my_pred)

(np.float64(0.7313432835820896), np.float64(0.7101449275362319))

# 로지스틱 회귀, 랜덤포레스트, KNN의 정밀도, 재현율 비교하기

In [21]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(confusion)
    print('*'*20)
    print(accuracy, precision, recall)

In [24]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=2000)
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[92 18]
 [16 53]]
********************
0.8100558659217877 0.7464788732394366 0.7681159420289855


정확도: 81.0%\
정밀도: 74.6%\
재현율: 76.8%

In [30]:
pred_proba = lr_clf.predict_proba(X_test)
pos_proba = pred_proba[:,1] # 양성클래스일 확률

threshold = 0.4 # 임계치
custom_proba = (pos_proba >= threshold).astype(int) # 임계치보다 크면 1
confusion_matrix(y_test, custom_proba)
get_clf_eval(y_test, custom_proba)

[[86 24]
 [13 56]]
********************
0.7932960893854749 0.7 0.8115942028985508


정확도: 79.3%\
정밀도: 70.0%\
재현율: 81.2%

정확도와 정밀도는 낮아졌지만 재현율은 올라갔음

# 정밀도와 재현율의 변화

정밀도와 재현율의 불균형이 심할 때, 혹은 비지니스의 요구사항이 있을 때 임계치를 조정해야 함

임계치를 낮추면 정밀도는 낮아지고 재현율은 올라감

In [31]:
from sklearn.metrics import f1_score
f1_score(y_test, pred)

np.float64(0.7571428571428571)

In [32]:
f1_score(y_test, custom_proba)

np.float64(0.7516778523489933)

# 평가 결과 확인하기

In [39]:
from sklearn.metrics import f1_score, classification_report
f1_score(y_test, pred) # 정밀도와 재현율의 평균

np.float64(0.7571428571428571)

In [36]:
print(classification_report(y_test, pred)) # 평가보고서

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       110
           1       0.75      0.77      0.76        69

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [None]:
import pandas as pd
pd.Series(lr_clf.coef_[0]).sort_values() #피처의 중요도 개수

1   -2.593416
0   -0.901628
3   -0.368137
7   -0.107352
4   -0.059052
6   -0.058762
2   -0.042756
5    0.001286
dtype: float64