In [1]:
from sklearn.base import BaseEstimator
import numpy as np

class MyDummyClassifier(BaseEstimator):
    # fit() 메서드는 아무것도 학습하지 않음.
    def fit(self, X, y=None):
            pass
    
    # predict() 메서드는 단순히 Sex 피처가 1이면 0, 그렇지 않으면 1로 예측함
    def predict(self, X):
        pred = np.zeros((X.shape[0],1))
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i] == 1:  # 성별이 1(남자)이면
                pred[i] = 0            # 생존을 0(사망)으로 처리
            else:                      # 그렇지 않으면 (여자일 경우)
                pred[i] = 1            # 생존을 1(생존)으로 처리
        
        return pred


In [2]:
def fillna(df):
    df['Age'].fillna(df['Age'].mean(),inplace=True)
    df['Cabin'].fillna('N',inplace=True)
    df['Embarked'].fillna('N',inplace=True)
    df['Fare'].fillna(0,inplace=True)
    return df

def drop_features(df):
    df.drop(columns = ['PassengerId','Name','Ticket'],inplace=True)
    return df


def format_features(df):
    from sklearn.preprocessing import LabelEncoder
    features = ['Cabin','Sex','Embarked']
    df.Cabin = df.Cabin.str[:1]
    for feature in features:
        df[feature] = LabelEncoder().fit_transform(df[feature])
    return df


def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 원본 데이터를 재로딩, 데이터 가공, 학습 데이터/테스트 데이터 분할.

titanic_df = pd.read_csv('data/titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived',axis=1)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df,y_titanic_df,test_size=0.2,random_state=0)

In [4]:
myclf = MyDummyClassifier()
myclf.fit(X_train,y_train)

mypredictions = myclf.predict(X_test)
print('Dummy Classifier의 정확도는 : {0:.4f}'.format(accuracy_score(y_test,mypredictions)))

Dummy Classifier의 정확도는 : 0.7877


In [5]:
class MyFakeClassifier(BaseEstimator):
    # fit() 메서드는 아무것도 학습하지 않음.
    def fit(self, X, y=None):
            pass
    # predict() 메서드는 단순히 Sex 피처가 1이면 0, 그렇지 않으면 1로 예측함
    def predict(self, X):
        return np.zeros((len(X),1),dtype=bool)

In [6]:
from sklearn.datasets import load_digits
digits = load_digits()
y = (digits.target ==7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(digits.data,y,test_size=0.2,random_state=11)

In [7]:
fake_clf = MyFakeClassifier()
fake_clf.fit(X_train,y_train)

In [8]:
pred = fake_clf.predict(X_test)

In [9]:
accuracy_score(y_test,pred)

0.9

In [10]:
from sklearn.metrics import confusion_matrix

In [11]:
confusion_matrix(y_test,pred)

array([[324,   0],
       [ 36,   0]], dtype=int64)

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [13]:
def get_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    print('오차행렬')
    print(confusion)
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율:{recall:.4f}')

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 원본 데이터를 재로딩, 데이터 가공, 학습 데이터/테스트 데이터 분할.

titanic_df = pd.read_csv('titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived',axis=1)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df,y_titanic_df,test_size=0.2,random_state=0)

FileNotFoundError: [Errno 2] No such file or directory: 'titanic_train.csv'

In [None]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [None]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train,y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test,pred)

In [None]:
pred_proba = lr_clf.predict_proba(X_test)
pred = lr_clf.predict(X_test)
pred_proba[:5], pred[:5]

In [None]:
# 임계값 조정방법

from sklearn.preprocessing import Binarizer

In [None]:
X = [[1,-1,2],[2,0,0],[0,1.1,1.2]]

In [None]:
binarizer = Binarizer(threshold=1.1)
binarizer.fit_transform(X)

In [None]:
custom_threshold = 0.6
pred_proba_1 = pred_proba[:,1].reshape(-1,1)
binarizer = Binarizer(threshold=custom_threshold)
custom_pred = binarizer.fit_transform(pred_proba_1)
get_clf_eval(y_test,custom_pred)

In [None]:
def get_eval_by_threshold(y_test,pred_proba):
    thresholds = np.linspace(0.4,0.6,5)
    for th in thresholds:
        pred_proba_1 = pred_proba[:,1].reshape(-1,1)
        binarizer = Binarizer(threshold=th)
        custom_pred = binarizer.fit_transform(pred_proba_1)
        print('임계값 : ',th)
        get_clf_eval(y_test,custom_pred)
    

In [None]:
np.linspace(0.4,0.6,5)

In [None]:
get_eval_by_threshold(y_test,pred_proba)

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
pred_proba_class1 = lr_clf.predict_proba(X_test)[:,1]

In [None]:
#정밀도, 재현율, threshold 세가지 배열이 존재
precision , recall, threshold = precision_recall_curve(y_test,pred_proba_class1)

In [None]:
precision.shape , recall.shape, threshold.shape

In [None]:
thr_index = np.arange(0,threshold.shape[0],15)

In [None]:
print(np.round(precision[thr_index],3))
print(np.round(recall[thr_index],3))


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as kicker

In [None]:
def precision_recall_curve_plot(y_test,pred_proba):
    predcision, recall, threshold = precision_recall_curve(y_test,pred_proba)
    threshold_boundary = threshold.shape[0]
    plt.plot(threshold,predcision[0:threshold_boundary], linestyle='--',label='precision')
    plt.plot(threshold,recall[0:threshold_boundary],label='recall')
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start,end,0.1),2))
    plt.legend()
    plt.grid()
    plt.show()
    

In [None]:
precision_recall_curve_plot(y_test,pred_proba_class1)

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1 = f1_score(y_test,pred)
f1

In [None]:
def get_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    f1 = f1_score(y_test,pred)
    print('오차행렬')
    print(confusion)
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율:{recall:.4f}, f1-스코어:{f1:.4f}')

In [None]:
def get_eval_by_threshold(y_test,pred_proba):
    thresholds = np.linspace(0.4,0.6,5)
    for th in thresholds:
        pred_proba_1 = pred_proba[:,1].reshape(-1,1)
        binarizer = Binarizer(threshold=th)
        custom_pred = binarizer.fit_transform(pred_proba_1)
        print('임계값 : ',th)
        get_clf_eval(y_test,custom_pred)
    

In [None]:
get_eval_by_threshold(y_test,pred_proba)

In [None]:
from sklearn.metrics import roc_curve

In [None]:
fprs, tprs, thresholds = roc_curve(y_test,pred_proba_class1)

In [None]:
thr_index = np.arange(1,thresholds.shape[0],5)
thr_index

In [None]:
fprs[thr_index]

In [None]:
tprs[thr_index]

In [None]:
thresholds[thr_index]

In [None]:
def roc_curve_plot(y_test,pred_proba_c1):
    # 임곗값에 따른 FPR, TPR 값을 반환받음
    fprs, tprs, thresholds = roc_curve(y_test,pred_proba_c1)
    plt.plot(fprs,tprs,label='ROC')
    plt.plot([0,1],[0,1],'k--',label='Random')
    start, end = plt.xlim()
    plt.legend()
    

In [None]:
roc_curve_plot(y_test,pred_proba_class1)