# 혼돈 행렬(Confusion Matrix)

In [6]:
import numpy as np 
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split

In [15]:
class MyFakeClassifier(BaseEstimator):
    """Dummy baseline classifier that predicts class 0 for every sample.

    It learns nothing from the training data; it exists to show what the
    evaluation metrics look like for a model that never predicts the
    positive class.
    """

    def fit(self, X, y):
        """Accept training data without learning anything.

        Args:
            X: Training samples (ignored).
            y: Training labels (ignored).

        Returns:
            The estimator itself, as the scikit-learn estimator API expects,
            so calls such as ``clf.fit(X, y).predict(X)`` work.
        """
        return self

    def predict(self, X):
        """Return an all-zero prediction for each sample in ``X``.

        Args:
            X: Samples to predict; only ``len(X)`` is used.

        Returns:
            Integer array of zeros with shape ``(len(X), 1)``.
        """
        return np.zeros((len(X), 1), dtype=int)

In [16]:
from sklearn.datasets import load_digits

# Load the digits dataset and turn it into a binary problem:
# label 1 where the digit is 7, label 0 for every other digit.
digits = load_digits()
y = (digits.target == 7).astype(int)
# Hold out 20% of the samples for testing; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, test_size=0.2, random_state=2021
)

In [18]:
# Run the dummy classifier through the usual fit/predict cycle.
# Its predict() returns 0 for every test sample.
my_clf = MyFakeClassifier()
my_clf.fit(X_train, y_train)
my_pred = my_clf.predict(X_test)

## Confusion Matrix

In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the dummy predictions: rows are the true labels,
# columns are the predicted labels.
confusion_matrix(y_test, my_pred)

array([[326,   0],
       [ 34,   0]], dtype=int64)

## 정밀도(Precision), 재현율(Recall), F1 Score

In [21]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

In [22]:
def proc_nan(df):
    """Fill missing values in the ``Age`` and ``Embarked`` columns.

    Missing ``Age`` values are replaced by the mean of the column and
    missing ``Embarked`` values by the placeholder category ``'N'``.

    Args:
        df: DataFrame containing ``Age`` and ``Embarked`` columns. The
            columns are overwritten on this object.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If ``Age`` or ``Embarked`` is not a column of ``df``.
    """
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that form works on a temporary
    # Series, is deprecated in pandas 2.x, and leaves ``df`` unchanged when
    # Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna('N')
    return df

def drop_features(df):
    """Remove the columns that are not used as model features.

    Drops ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` from ``df``
    in place and returns the same DataFrame object.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    df.drop(columns=unused_columns, inplace=True)
    return df

def transform_feature(df):
    """Convert the string-valued ``Sex`` and ``Embarked`` columns to numbers.

    Each column is label-encoded independently and written back onto
    ``df``; the same DataFrame object is returned.
    """
    for column in ('Sex', 'Embarked'):
        # fit_transform refits on the column it is given, so every column
        # gets its own label-to-integer mapping.
        encoded = LabelEncoder().fit_transform(df[column])
        df[column] = encoded
    return df

def pre_process(df):
    """Apply the preprocessing steps defined above, in order.

    Runs ``proc_nan``, then ``drop_features``, then ``transform_feature``,
    feeding the result of each step into the next, and returns the final
    DataFrame.
    """
    pipeline = (proc_nan, drop_features, transform_feature)
    for step in pipeline:
        df = step(df)
    return df

In [13]:
# Load the Titanic training data. The path is relative, so it is resolved
# against the current working directory.
titanic_df = pd.read_csv('../00.data/titanic/train.csv')
# Separate the target column from the features, then preprocess the features.
y = titanic_df['Survived']
X = titanic_df.drop(['Survived'], axis=1)
X = pre_process(X)

FileNotFoundError: [Errno 2] No such file or directory: '../00.data/titanic/train.csv'

In [14]:

# 80/20 train/test split of the Titanic data with a fixed seed.
# This rebinds X_train/X_test/y_train/y_test, replacing the digits split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

NameError: name 'X' is not defined

In [None]:

from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split
# and predict class labels for the test split.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

In [None]:
# Predicted class probabilities for the test split; show the first five rows.
pred_proba = lr_clf.predict_proba(X_test)
pred_proba[:5, :]

In [None]:
# First five predicted class labels.
pred[:5]

In [None]:

# Confusion matrix of the logistic regression predictions on the test split
# (rows are true labels, columns are predicted labels).
confusion_matrix(y_test, pred)

In [None]:
# Precision: 43 / (16 + 43)
# If Type I errors were reduced from 16 to 10, precision would be
# 49 / (10 + 49) = 0.83
from sklearn.metrics import precision_score
precision_score(y_test, pred)

In [5]:
# Recall: 43 / (28 + 43)
# If Type II errors were reduced from 28 to 15, recall would be
# 56 / (15 + 56) = 0.79
from sklearn.metrics import recall_score
recall_score(y_test, pred)

NameError: name 'y_test' is not defined

In [4]:
# F1 score of the predictions on the test split.
from sklearn.metrics import f1_score
f1_score(y_test, pred)

NameError: name 'y_test' is not defined

In [3]:
# Accuracy of the predictions on the test split.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

NameError: name 'y_test' is not defined