In [None]:
# `display` and `HTML` belong to the public IPython.display API; importing them
# through IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS so that elements with the `.container` class span the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import warnings
# Install a global 'ignore' filter, so no warning is shown from here on.
# NOTE(review): this presumably also hides library deprecation / fit-failure
# warnings raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import seaborn and immediately apply its default theme via sns.set().
import seaborn as sns; sns.set()
# Equivalent of the `%matplotlib inline` line magic.
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['font.family']='Malgun Gothic' # Korean (Hangul) font
# Disable the Unicode minus sign for tick labels.
# NOTE(review): presumably because the font above lacks that glyph — confirm.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import plot_confusion_matrix, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
import scikitplot as skplt
from joblib import dump, load

# 불균형데이터의 예측 성능

## 사례  ROC < PR 곡선 (불균형자료)

# Data Obtain

In [None]:
from sklearn.datasets import make_classification

# Synthetic two-class problem: 1000 samples with requested class weights of 0.99 / 0.01.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.99, 0.01],
    random_state=1,
)

# Scatter of the first two feature columns, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap='summer')

In [None]:
# Display the shapes of the feature matrix and of the label vector.
X.shape, y.shape

In [None]:
# Distinct labels in y together with how often each one occurs.
unique, counts = np.unique(y, return_counts=True)
# Present the result as a {label: count} mapping.
{label: count for label, count in zip(unique, counts)}

In [None]:
# Same class-frequency table, this time computed with pandas.
pd.Series(y).value_counts()

## 반반씩 나눠 봄 - stratify

In [None]:
# Half of the data for training, half for testing; stratify=y is passed so the
# split is stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=2,
    stratify=y,
)

In [None]:
# Per-class sample counts for the full data set and for each half of the split.
# The tags are padded by hand so the colons line up in the printed output.
print(
    *(
        f'{tag}: Class0={(labels==0).sum()}, Class1={(labels==1).sum()}'
        for tag, labels in (('Dataset', y), ('Train  ', y_train), ('Test   ', y_test))
    ),
    sep='\n',
)

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line shows the fitted estimator as the cell output.
clf = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
clf

In [None]:
from sklearn.metrics import classification_report

# Class probabilities (one column per class) and hard labels for the test half.
y_probas = clf.predict_proba(X_test)
y_pred   = clf.predict(X_test)

# Human-readable row names for classes 0 and 1 in the report.
target_names = ['Negative(0)', 'Positive(1)']
print(
    classification_report(
        y_test,
        y_pred,
        target_names=target_names,
    )
)

In [None]:
# scikitplot is already imported in the import cell above; repeated here by the author.
import scikitplot as skplt 
# ROC plot built from the true test labels and the full predict_proba output.
skplt.metrics.plot_roc(y_test, y_probas)

In [None]:
# Precision-recall plot for the same test-set predictions.
skplt.metrics.plot_precision_recall(y_test, y_probas)

In [None]:
# Second column of y_probas, rounded to two decimals and sorted ascending.
np.sort(y_probas[:, 1].round(2))

In [None]:
# How many test samples were assigned to each predicted label.
pd.Series(y_pred).value_counts()

## y_probas 분포
#### 한쪽에 몰려 있음

In [None]:
# Second probability column (the score used for the positive class below).
y_scores = y_probas[:, 1]

# 100-bin histogram of the scores; label and title strings are in Korean.
pd.DataFrame(y_scores).hist(bins=100)
plt.xlabel('양성판정확률')
plt.title('양성판정확률의 분포')

# Largest score, shown as the cell output.
y_scores.max()

In [None]:
# Collect the uncalibrated scores in a one-column DataFrame named 'y_predict'.
data = pd.DataFrame(y_scores, columns=['y_predict'])
# Bare expression: the frame is rendered as the cell output.
data

## Calibrate Probabilities

In [None]:
from sklearn.calibration import CalibratedClassifierCV
# The wrapped estimator is passed positionally on purpose: its keyword was
# `base_estimator` in older scikit-learn and `estimator` in newer releases,
# while the first positional slot is the estimator in both.
calibrated_clf = CalibratedClassifierCV(clf, cv=3)
# Fit the calibrated model on the training half (3-fold CV, as set above).
calibrated_clf.fit(X_train, y_train)

In [None]:
# First five rows of the calibrated class probabilities on the training data.
calibrated_clf.predict_proba(X_train)[:5, :]

### 골고루 균형있게 변화

In [None]:
# Calibrated class probabilities for the test half.
yhat = calibrated_clf.predict_proba(X_test)
# Second column, used as the positive-class probability.
model_probs = yhat[:, 1]
# 100-bin histogram of the calibrated scores; title and label strings are Korean.
pd.DataFrame(model_probs).hist(bins=100)
plt.title('양성판정확률의 분포')
plt.xlabel('양성판정확률')

In [None]:
# Add the calibrated scores as a second column (this mutates `data` in place).
data['y_predict_calibrated'] = yhat[:, 1]
# Histograms of the columns of `data` on a 15x4 figure.
data.hist(figsize=(15,4))

In [None]:
# Preview the first rows of the raw and calibrated scores.
data.head()

In [None]:
# Move the row index into a regular column and rename it to 'idx'
# (the result is only displayed; `data` itself is not reassigned).
data.reset_index().rename(columns={'index':'idx'})

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Scatter each score column against the row position: raw scores in red,
# calibrated scores in blue, each on its own wide 25x4 figure.
data.reset_index().plot(kind='scatter', x='index', y='y_predict',
                        c='red', figsize=(25,4))
data.reset_index().plot(kind='scatter', x='index', y='y_predict_calibrated',
                        c='blue', figsize=(25,4))

https://scikit-learn.org/stable/modules/calibration.html#calibration

# Imbalanced Classification with Python (by Jason Brownlee)
### ROC Curve의 최적 Threshold  찾기

https://machinelearningmastery.com/imbalanced-classification-with-python/

In [None]:
# Larger synthetic set: 10,000 samples, two features (none redundant), one
# cluster per class, a class weight of 0.99 for the first class, no label flipping.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=4,
)

# Scatter of the two features, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap='winter')

### LogisticRegression으로 학습
### roc_curve

In [None]:
# Stratified 50/50 hold-out split of the new data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=2, stratify=y
)

# Train the classifier; fit() returns the estimator, so the call is chained.
clf = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Second probability column (1-D here, unlike the earlier 2-D y_probas) and hard labels.
y_probas = clf.predict_proba(X_test)[:, 1]
y_pred   = clf.predict(X_test)

# ROC curve arrays: false-positive rate, true-positive rate and thresholds.
fpr, tpr, thresholds = roc_curve(y_test, y_probas)

#### y_probas 분포

In [None]:
# Histogram of the test-set scores stored in y_probas.
plt.hist(y_probas)

### Geometric Mean이 최대가 되는 점 찾기

In [None]:
# G-mean at every ROC threshold: square root of TPR times (1 - FPR).
gmeans = np.sqrt((1 - fpr) * tpr)
# Position of the largest G-mean.
ix = gmeans.argmax()
print(f'Best Threshold={thresholds[ix]}, G-mean={gmeans[ix]}')

In [None]:
# plot the roc curve for the model
# Dashed diagonal from (0,0) to (1,1) as the reference line; its legend label
# is a Korean string (kept as-is because it is rendered on the figure).
plt.plot([0,1], [0,1], linestyle='--', label='무작위')
# ROC points computed earlier from the logistic model's test-set scores.
plt.plot(fpr, tpr, marker='.', label='Logistic')
# Black marker at the point selected by index `ix`.
plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

## Youden’s J statistic을 이용하면 좀더 쉽게 계산

In [None]:
# Recompute the ROC arrays so this cell does not depend on the G-mean cell.
fpr, tpr, thresholds = roc_curve(y_test, y_probas)
# Youden's J statistic at each threshold: TPR minus FPR.
J = tpr - fpr
# Threshold at which J is largest.
ix = J.argmax()
best_thresh = thresholds[ix]
print('Best Threshold={}'.format(best_thresh))

### Precision-Recall Curve의 최적 Threshold  찾기

In [None]:
# calculate pr-curve
# NOTE: this rebinds `thresholds`, which previously held the ROC thresholds.
precision, recall, thresholds = precision_recall_curve(y_test, y_probas)

#### f1-score가 가장 커지는 점 찾기

In [None]:
# convert to f-measure
# Where precision and recall are both zero the plain formula is 0/0 = NaN, and
# np.argmax would then report the NaN position as the "best" threshold.
# Adding the boolean mask turns a zero denominator into 1, so F is 0 there.
pr_sum = precision + recall
fscore = (2 * precision * recall) / (pr_sum + (pr_sum == 0))
# locate the index of the largest f-measure
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]:.3f}, F-measure={fscore[ix]:.5f}')

In [None]:
# plot the Precision-Recall curve for the model
# Baseline: the fraction of test samples whose label is 1, drawn as a flat line.
no_skill = len(y_test[y_test==1]) / len(y_test)
plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
# Black marker at the (recall, precision) point selected by index `ix`.
plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
# The precision-recall points themselves.
plt.plot(recall, precision, marker='.', label='Logistic')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

### Optimal Threshold Tuning

#### threshold 보다 크면 1값을 갖도록 하는 함수

In [None]:
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
    """Turn positive-class probabilities into hard 0/1 labels.

    A value is labelled 1 when it is greater than or equal to ``threshold``
    and 0 otherwise.

    Parameters
    ----------
    pos_probs : array-like or scalar
        Probabilities of the positive class. Objects that already provide
        ``astype`` (NumPy arrays, pandas Series/DataFrames) are used as-is, so
        the result keeps their type. Plain Python sequences and scalars are
        converted to a NumPy array first, because they do not support the
        element-wise comparison.
    threshold : float
        Decision threshold.

    Returns
    -------
    Integer labels with the same shape as ``pos_probs``.
    """
    if not hasattr(pos_probs, 'astype'):
        pos_probs = np.asarray(pos_probs)
    return (pos_probs >= threshold).astype('int')

#### y_probas를 구함

In [None]:
# Train a fresh logistic model on the current training half (fit returns self).
model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
# Keep only the second probability column for the test half.
test_probas = model.predict_proba(X_test)
y_probas = test_probas[:, 1]

#### 여러 threshold 정의

In [None]:
# define thresholds
# Candidate cut-offs from 0 up to (but excluding) 1 in steps of 0.001.
thresholds = np.arange(0, 1, 0.001)

#### f1_score를 계산하는 함수를 가지고 threshold에 따라 f1_score 값을 계산

In [None]:
# F1 score of the thresholded predictions, one value per candidate threshold.
from sklearn.metrics import f1_score
f_scores = list(map(lambda t: f1_score(y_test, to_labels(y_probas, t)), thresholds))

# Threshold that produced the highest F1.
ix = np.argmax(f_scores)
print('Threshold={:.3f}, F-measure={:.5f}'.format(thresholds[ix], f_scores[ix]))

## 성과지표 요약

In [None]:
# Precision-recall arrays for the current model (rebinds `thresholds` again).
precision, recall, thresholds = precision_recall_curve(y_test, y_probas)
# Two-row table (precision, recall) rounded to two decimals; row labels are Korean.
display(pd.DataFrame([precision, recall], index=['정밀도', '재현율']).round(2))
# One-row table of the thresholds rounded to three decimals.
display(pd.DataFrame(thresholds, columns=['분류임계값']).round(3).T)

### 분류임계값에 따라 Positive 판정을 내리는 비율

In [None]:
# Share of test samples labelled positive at each threshold
# (the mean of the 0/1 labels equals their sum divided by the sample count).
scores = [np.mean(to_labels(y_probas, t)) for t in thresholds]

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, positive_rates=None):
    """Plot precision, recall, F1 and the positive-prediction rate against the threshold.

    Parameters
    ----------
    precisions, recalls : array
        Arrays with one more element than ``thresholds``; the final element of
        each is dropped before plotting so the lengths match.
    thresholds : array
        Decision thresholds used for the x-axis.
    positive_rates : sequence, optional
        Share of samples predicted positive at each threshold. When omitted,
        the notebook-level ``scores`` variable is used, which is what earlier
        versions of this function always did.

    Notes
    -----
    Sets ``figure.figsize`` and ``font.family`` in ``plt.rcParams``, so the
    change persists for later figures. Titles and labels are Korean strings.
    """
    if positive_rates is None:
        # Backward-compatible fallback to the notebook global.
        positive_rates = scores

    precisions, recalls = precisions[:-1], recalls[:-1]
    # F1 is 0/0 where precision and recall are both zero; adding the boolean
    # mask makes the denominator 1 there, so the curve shows 0 instead of NaN.
    pr_sum = precisions + recalls
    f1 = (2 * precisions * recalls) / (pr_sum + (pr_sum == 0))

    plt.rcParams["figure.figsize"] = (15,6)
    plt.rcParams['font.family']='Malgun Gothic' # Korean (Hangul) font
    plt.title('분류임계값과 (정밀도, 재현율, F1-score, 양성판정비율)', fontsize='16')
    plt.plot(thresholds, precisions, 'b:', label='정밀도(Precision)') # author's mnemonic: judge
    plt.plot(thresholds, recalls, 'y-.', label='재현율(Recall)') # author's mnemonic: prosecutor
    plt.plot(thresholds, f1, 'r-', label='f1 score(PR조화평균)') # author's mnemonic: defence lawyer
    plt.plot(thresholds, positive_rates, 'k--', label='양성판정비율') # q-ratio
    plt.ylabel('정밀도 /  재현율', fontsize='14')
    plt.xlabel('분류임계값', fontsize='14')
    plt.legend(loc='best', fontsize='14')

# Pass the positive-prediction rates explicitly instead of relying on the global.
plot_precision_recall_vs_threshold(precision, recall, thresholds, positive_rates=scores)

## 다양한 metrics로 Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# New stratified 50/50 split with a different seed (random_state=0).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=0,
    stratify=y,
)

# Unfitted base estimator that the searches below will clone and tune.
model = LogisticRegression(solver='lbfgs')

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
# With roughly 1% positives, plain unshuffled KFold can give folds with very
# few (or no) positive samples, which makes recall / precision / F-scores per
# fold unstable. StratifiedKFold keeps the class ratio the same in every fold.
kf = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
# Search space: 20 log-spaced C values and both L1 and L2 penalties.
# The solver is pinned to 'liblinear' inside the grid because the base model's
# 'lbfgs' solver does not support penalty='l1'; without this, every L1
# candidate fails to fit and is scored as NaN.
params = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}

In [None]:
# Randomized search that selects hyper-parameters by accuracy.
randomized = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    cv=kf,
    scoring='accuracy',
    random_state=1,
)
randomized.fit(X_train, y_train)
# Evaluate the search object's predictions on the held-out half.
y_predict = randomized.predict(X_test)
print(classification_report(y_test, y_predict))
print(randomized.best_estimator_)

In [None]:
# Same search, now selecting hyper-parameters by recall.
randomized = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    cv=kf,
    scoring='recall',
    random_state=1,
)
randomized.fit(X_train, y_train)
y_predict = randomized.predict(X_test)
print(classification_report(y_test, y_predict))
print(randomized.best_estimator_)

In [None]:
# Same search, now selecting hyper-parameters by precision.
randomized = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    cv=kf,
    scoring='precision',
    random_state=1,
)
randomized.fit(X_train, y_train)
y_predict = randomized.predict(X_test)
print(classification_report(y_test, y_predict))
print(randomized.best_estimator_)

In [None]:
# Same search, now selecting hyper-parameters by F1.
randomized = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    cv=kf,
    scoring='f1',
    random_state=1,
)
randomized.fit(X_train, y_train)
y_predict = randomized.predict(X_test)
print(classification_report(y_test, y_predict))
print(randomized.best_estimator_)

In [None]:
from sklearn.metrics import fbeta_score, make_scorer

# Custom scorer built from fbeta_score with beta=2.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# Same search, now selecting hyper-parameters by the F2 scorer.
randomized = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    cv=kf,
    scoring=ftwo_scorer,
    random_state=1,
)
randomized.fit(X_train, y_train)
y_predict = randomized.predict(X_test)
print(classification_report(y_test, y_predict))
print(randomized.best_estimator_)

### train과정의 값이 잘 맞는 것과 test set에 적용한 결과는 다름

In [None]:
# Report for the most recent search evaluated on the training half itself,
# for comparison with the test-set report printed above.
print(classification_report(y_train, randomized.predict(X_train)))