# Evaluation metrics (binary classification)

## Setting up
- Breast cancer data
- 2 classes
- 30 features
- SVC


Classes:
- `0` = Malignant - The tumor grows rapidly, invades and destroys nearby normal tissue, and spreads throughout the body.
- `1` = Benign - The tumor grows slowly and does not spread.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import seaborn as sns

# Breast cancer data
from sklearn.datasets import load_breast_cancer

# Fetch the breast cancer dataset and separate features from labels
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target
print(np.unique(y))
print(X.shape)

# Build a dataframe for inspection: feature columns first, then a leading
# 'class' column holding the readable class name instead of the 0/1 code
df = pd.DataFrame(X, columns=dataObj.feature_names)
df.insert(
    loc=0,
    column='class',
    value=pd.Series(y).map({0: dataObj.target_names[0], 1: dataObj.target_names[1]}),
)
display(df)

In [None]:
# Number of samples per class name
df.loc[:, 'class'].value_counts()

In [None]:
# Hold out 20% of the samples as a test set, stratified on the labels,
# with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=1
)

# Pipeline: feature scaling ('scl') followed by the SVC classifier ('clf')
pipe_svc = Pipeline(
    steps=[
        ('scl', StandardScaler()),
        ('clf', SVC(random_state=1)),
    ]
)

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Fit the pipeline on the training split, then predict labels for the test split
# (fit returns the fitted pipeline, so the two calls can be chained)
y_pred = pipe_svc.fit(X_train, y_train).predict(X_test)

In [None]:
# Raw confusion matrix as a plain array (functional, but not pretty);
# labels=[0, 1] lists class 0 first
confmat = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(confmat)

In [None]:
# Manual plot: heatmap of the confusion matrix stored in `confmat`.
# fmt='d' prints each cell as a plain integer; the default annotation format
# ('.2g') would switch to scientific notation for counts of 100 or more.
ax = sns.heatmap(confmat, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true labels on rows and predicted labels on columns
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
# Confusion matrix (more beautiful)
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(estimator=pipe_svc, X=X_test, y=y_test, labels=[0, 1])
plt.show()

### Note 

- With `labels=[0, 1]`, the class 0 samples that are correctly predicted as class 0 appear in the upper-left corner of the matrix.
- To change the ordering, use the `labels` argument; for example, `labels=[1, 0]` puts class 1 first.

In [None]:
# Same confusion matrix, but with class 1 listed first
confmat = confusion_matrix(y_test, y_pred, labels=[1, 0])
print(confmat)

In [None]:
# Plot the confusion matrix with class 1 listed first.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(estimator=pipe_svc, X=X_test, y=y_test, labels=[1, 0])
plt.show()

## Accuracy, Precision, Recall, and F1

- Be careful with the definition of the "positive" label. In this case, we want `0` (malignant, i.e. the patient has the disease) to be the positive class.
- Therefore, we need to set `pos_label=0` when calculating precision, recall, and F1.

In [None]:
from sklearn.metrics import accuracy_score, precision_score ,recall_score, f1_score

# Accuracy
ACC = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy:{ACC:6.3f}")

# Precision, with class 0 treated as the positive class
PRE = precision_score(y_true=y_test, y_pred=y_pred, pos_label=0)
print(f"Precision:{PRE:6.3f}")

# Recall, with class 0 treated as the positive class
REC = recall_score(y_true=y_test, y_pred=y_pred, pos_label=0)
print(f"Recall:{REC:6.3f}")

# F1, with class 0 treated as the positive class
F1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label=0)
# Print the F1 value itself (the recall value was previously printed here by mistake)
print(f"F1:{F1:6.3f}")

## Use `precision` score in grid search

- Scoring: https://scikit-learn.org/stable/modules/model_evaluation.html
- Note that when using `scoring='precision'`, the default parameters are used, which means that `pos_label=1`.

### Incorrect 

In [None]:
from sklearn.model_selection import GridSearchCV

c_gamma_range = [0.01, 0.1, 1.0, 10.0]
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned over C and gamma
set1 = {
    'clf__C': param_range,
    'clf__kernel': ['linear'],
}
set2 = {
    'clf__C': param_range,
    'clf__gamma': param_range,
    'clf__kernel': ['rbf'],
}
param_grid = [set1, set2]

# The string scorer 'precision' uses the default pos_label=1,
# which is NOT the class we want to treat as positive here
gs = GridSearchCV(pipe_svc, param_grid, scoring='precision', cv=10, n_jobs=-1)

# fit returns the fitted search object, so rebinding gs keeps the same instance
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

### Correct

In [None]:
from sklearn.metrics import make_scorer

# Wrap precision_score in a scorer so that pos_label=0 is passed on every evaluation
scorer = make_scorer(precision_score, pos_label=0)

# Grid search over the same parameter grid, now scored with the custom scorer
gs = GridSearchCV(pipe_svc, param_grid, scoring=scorer, cv=10, n_jobs=-1)

# fit returns the fitted search object, so rebinding gs keeps the same instance
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)