In [7]:
import numpy as np
from sklearn.base import BaseEstimator
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

class MyDummyClassifier(BaseEstimator):
    #fit매서드는 아무것도 학습하지 않음.
    def fit(self,x,y=None):
        pass
    #predict 매서드는 단순히 sex피처가 1이면 0, 그렇지 않으면 1로 예측함.
    def predict(self,x):
        pred = np.zeros((x.shape[0], 1))
        for i in range(x.shape[0]):
            if x['Sex'].iloc[i] == 1:
                pred[i] = 0
            else:
                pred[i] =1 
        return pred
    
def get_category(age):
    cat=''
    if age<=-1: cat='Unknown'
    elif age<=5: cat='baby'
    elif age<=12: cat='child'
    elif age<=18: cat='teenager'
    elif age<=25: cat='student'
    elif age<=35: cat='yong adult'
    elif age<=60: cat='adult'
    else: cat ='elderly'
    
    return cat

def fillna(df):   #NULL처리 함수
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N',inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

#머신러닝 알고리즘에 불필요한 요소 제거
def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    return df

#레이블 인코딩 수행.
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df 


titanic_df = pd.read_csv('./titanic_train.csv')

y_titanic_df = titanic_df['Survived']
x_titanic_df = titanic_df.drop('Survived', axis=1)
x_titanic_df = transform_features(x_titanic_df)

xtrain,xtest,ytrain,ytest = train_test_split(x_titanic_df, y_titanic_df, test_size=0.2, random_state=0)
myclf = MyDummyClassifier()
myclf.fit(xtrain, ytrain)

mypredictions = myclf.predict(xtest)
print('{0: 4f}'.format(accuracy_score(ytest, mypredictions)))


 0.787709


In [31]:
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.datasets import load_digits
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Binarizer

class MyFakeClassifier(BaseEstimator):
    #fit매서드는 아무것도 학습하지 않음.
    def fit(self,x,y=None):
        pass
    #predict 매서드는 단순히 sex피처가 1이면 0, 그렇지 않으면 1로 예측함.
    def predict(self,x): 
        return np.zeros( (len(x),1), dtype = bool)

digits = load_digits()

y = (digits.target == 7).astype(int)
    
xtrain,xtest,ytrain,ytest = train_test_split( digits.data , y, random_state=11)

print(ytest.shape)
print('테스트 세트 레이블 0과 1의 분포도')
print(pd.Series(ytest).value_counts())

fakeclf = MyFakeClassifier()
fakeclf.fit(xtrain,ytrain)
fakepred = fakeclf.predict(xtest)
print('{: 3f}'.format(accuracy_score(ytest, fakepred)))

#1~4사분면은, FP, TN, FN, TP 
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, fakepred)

(450,)
테스트 세트 레이블 0과 1의 분포도
0    405
1     45
dtype: int64
 0.900000


array([[405,   0],
       [ 45,   0]], dtype=int64)

In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def get_clf_eval(ytest, pred):
    confusion = confusion_matrix(ytest,pred)
    accuracy = accuracy_score(ytest, pred)
    precision = precision_score(ytest, pred)
    recall = recall_score(ytest, pred)
    print('오차행렬')
    print(confusion)
    f1= f1_score(ytest, pred)
    print('정확도: {0:4f}, 정밀도: {1:4f}, 재현율: {2:4f}, f1스코어: {3:4f}'.format(accuracy, precision, recall, f1))

def get_eval_by_threshold(ytest, pred_proba_c1, thresholds):
    print(thresholds)
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold = custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print("임계값: ",custom_threshold)
        get_clf_eval(ytest, custom_predict)
        


titanic_df = pd.read_csv('./titanic_train.csv')

y_titanic_df = titanic_df['Survived']
x_titanic_df = titanic_df.drop('Survived', axis=1)
x_titanic_df = transform_features(x_titanic_df)

xtrain,xtest,ytrain,ytest = train_test_split(x_titanic_df, y_titanic_df, test_size=0.20, random_state=11)

lr_clf = LogisticRegression()

lr_clf.fit(xtrain, ytrain)

pred = lr_clf.predict(xtest)
get_clf_eval(ytest, pred)

pred_proba = lr_clf.predict_proba(xtest)
print(pred_proba[:3])
#기본적으로 pred의 임계값은 0.5이다. 이것을 조정해서 정밀도와 재현율 트레이드를 한다.

pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1,1)], axis =1)
print(pred_proba_result[:3])
print('#######')


#분류 결정 임계값
custom_threshold = 0.5
pred_proba_1 = pred_proba[:, 1].reshape(-1,1)

binarizer = Binarizer(threshold = custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)

#get_clf_eval(ytest, custom_predict)



print("\n#########\n")
#precision_recall_curve() API를 이용해본다.
from sklearn.metrics import precision_recall_curve

pred_proba_class1 = lr_clf.predict_proba(xtest)[:,1]
precisions, recalls, thresholds = precision_recall_curve(ytest, pred_proba_class1)
print("반환된 분류 결정 임곗값 배열의 shape",thresholde.shape)

#너무 많아서 샘플로 10건만 추출함
thr_index = np.arange(0, thresholds.shape[0], 15)
print("임계값 배열 index", thr_index)
print("임계값",np.round(thresholds[thr_index],2))


print("정밀도: ",np.round(precisions[thr_index], 3))
print("재현율: ",np.round(recalls[thr_index], 3))

print("\n##########\n")

thresholds = [0.4, 0.45, 0.5 ,0.55, 0.6]
pred_proba = lr_clf.predict_proba(xtest)
get_eval_by_threshold(ytest, pred_proba[:, 1].reshape(-1,1), thresholds)


오차행렬
[[104  14]
 [ 13  48]]
정확도: 0.849162, 정밀도: 0.774194, 재현율: 0.786885, f1스코어: 0.780488
[[0.4618237  0.5381763 ]
 [0.8785559  0.1214441 ]
 [0.87721379 0.12278621]]
[[0.4618237  0.5381763  1.        ]
 [0.8785559  0.1214441  0.        ]
 [0.87721379 0.12278621 0.        ]]
#######

#########

반환된 분류 결정 임곗값 배열의 shape (143,)
임계값 배열 index [  0  15  30  45  60  75  90 105 120 135]
임계값 [0.1  0.12 0.14 0.19 0.28 0.4  0.57 0.67 0.82 0.95]
정밀도:  [0.389 0.44  0.466 0.539 0.647 0.729 0.836 0.949 0.958 1.   ]
재현율:  [1.    0.967 0.902 0.902 0.902 0.836 0.754 0.607 0.377 0.148]

##########

[0.4, 0.45, 0.5, 0.55, 0.6]
임계값:  0.4
오차행렬
[[99 19]
 [10 51]]
정확도: 0.837989, 정밀도: 0.728571, 재현율: 0.836066, f1스코어: 0.778626
임계값:  0.45
오차행렬
[[103  15]
 [ 12  49]]
정확도: 0.849162, 정밀도: 0.765625, 재현율: 0.803279, f1스코어: 0.784000
임계값:  0.5
오차행렬
[[104  14]
 [ 13  48]]
정확도: 0.849162, 정밀도: 0.774194, 재현율: 0.786885, f1스코어: 0.780488
임계값:  0.55
오차행렬
[[109   9]
 [ 15  46]]
정확도: 0.865922, 정밀도: 0.836364, 재현율: 0.754098, f1스코어: 0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
import numpy as np


def get_clf_eval(ytest, pred, pred_proba=None):
    confusion = confusion_matrix(ytest,pred)
    accuracy = accuracy_score(ytest, pred)
    precision = precision_score(ytest, pred)
    recall = recall_score(ytest, pred)
    #ROC-AUC추가
    roc_auc = metrics.roc_auc_score(ytest, pred_proba)
    
    print('오차행렬')
    print(confusion)
    f1= f1_score(ytest, pred)
    print('정확도: {0:4f}, 정밀도: {1:4f}, 재현율: {2:4f}, f1스코어: {3:4f}, AUC: {4:4f}'.format(accuracy, precision, recall, f1, roc_auc))

def get_eval_by_threshold(ytest, pred_proba_c1, thresholds):
    print(thresholds)
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold = custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print("임계값: ",custom_threshold)
        get_clf_eval(ytest, custom_predict)
        


titanic_df = pd.read_csv('./titanic_train.csv')

y_titanic_df = titanic_df['Survived']
x_titanic_df = titanic_df.drop('Survived', axis=1)
x_titanic_df = transform_features(x_titanic_df)

xtrain,xtest,ytrain,ytest = train_test_split(x_titanic_df, y_titanic_df, test_size=0.20, random_state=11)

lr_clf = LogisticRegression()

lr_clf.fit(xtrain, ytrain)

pred = lr_clf.predict(xtest)
confusion = confusion_matrix(ytest,pred)
print(confusion)
roc_auc = roc_auc_score(ytest, None)
print(roc_auc)
pred_proba = lr_clf.predict_proba(xtest)
thresholds = [0.4, 0.45, 0.5 ,0.55, 0.6]
pred_proba = lr_clf.predict_proba(xtest)
#get_eval_by_threshold(ytest, pred_proba[:, 1].reshape(-1,1), thresholds)

from sklearn.metrics import roc_curve

pred_proba_class1 = lr_clf.predict_proba(xtest)[:,1]

fprs, tprs, thresholds = roc_curve(ytest, pred_proba_class1)
thr_index = np.arange(1, thresholds.shape[0],5)


print("FPR", np.round(fprs[thr_index], 3))
print("TPR", np.round(tprs[thr_index], 3))

get_clf_eval(ytest, pred)

[[104  14]
 [ 13  48]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').