In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, matthews_corrcoef, f1_score

import pandas as pd

In [2]:
## Load the scikit-learn breast-cancer dataset and wrap its feature matrix in a
## DataFrame whose columns carry the feature names.
data = load_breast_cancer()
df = pd.DataFrame(data=data.data, columns=data.feature_names)

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
## (rows, columns) of the feature table
df.shape

(569, 30)

## Statistics when both classes contain the same data

In [4]:
## Build two copies of the same feature table that differ only in an
## artificial label column, so the features say nothing about the label.
df1 = df.assign(data_label=0)  ## first copy: every row labelled 0
df2 = df.assign(data_label=1)  ## second copy: every row labelled 1

print(df1.shape, df2.shape)

(569, 31) (569, 31)


In [5]:
## Concatenate the two labelled copies row-wise.
## ignore_index=True renumbers the rows 0..n-1; without it every index label
## would appear twice (once per copy), making label-based lookups ambiguous.
df_total = pd.concat([df1, df2], axis=0, ignore_index=True)
df_total.shape

(1138, 31)

In [6]:
## Features are every column except the artificial label; the target is that label.
X = df_total.drop(columns='data_label')
y = df_total['data_label']

In [7]:
## Fit a logistic-regression classifier, then predict on the very rows it was
## trained on.
clf = LogisticRegression(random_state=0)
clf.fit(X, y)
y_pred = clf.predict(X)

In [8]:
## Biased result: every sample ends up in the first prediction column.
confusion_matrix(y_true=y, y_pred=y_pred)

array([[569,   0],
       [569,   0]], dtype=int64)

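For a binary confusion matrix laid out as in the table below, the Matthews correlation coefficient (written MCM in this notebook; usually abbreviated MCC) is defined as follows. Note that `sklearn.metrics.confusion_matrix` orders rows and columns by sorted label, so for labels 0/1 its output reads `[[TN, FP], [FN, TP]]`, the reverse of the table.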

$$ MCM := \frac{TP \times TN - FP \times FN}{\sqrt{(TP + FP)(TP + FN)(TN + FP)(TN + FN)}} $$

|                         | Positive(pred) |  Negative(pred) |
| ----------------------- | ---------------| --------------- |
| Positive(Ground Truth)  |  TP            |  FN             |
| Negative (Ground Truth) |  FP            |  TN             |

In general, $-1 \le MCM \le 1$.

If $MCM \approx 1$, the classifier's predictions are almost perfect.

If $MCM \approx 0$, the classifier is no better than random guessing.

If $MCM \approx -1$, the classifier's predictions are almost completely inverted.

MCM also remains informative when the dataset is imbalanced.

In [9]:
## A coefficient of 0: the predictions are no better than chance.
matthews_corrcoef(y_true=y, y_pred=y_pred)

0.0

In [10]:
## Not good: the F1 score is 0 as well.
f1_score(y_true=y, y_pred=y_pred)

0.0

## Statistics when the classes contain different data

In [11]:
## Fresh feature table for the second experiment.
df_all = pd.DataFrame(data=data.data, columns=data.feature_names)
df_all.shape

(569, 30)

In [12]:
## Attach the dataset's own target labels as a new column (modifies df_all in place).
df_all['target'] = data.target
df_all.shape

(569, 31)

In [13]:
## Boolean mask over the columns: True for every column except 'target'.
df_all.columns != 'target'

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False])

In [14]:
## Features are every column except 'target'; the target is that column.
X = df_all.drop(columns='target')
y = df_all['target']

In [15]:
## Show both shapes to check that X and y have the same number of rows.
X.shape, y.shape

((569, 30), (569,))

In [16]:
## Fit on the real labels. max_iter is set far above scikit-learn's default of
## 100, presumably to let the solver converge on the unscaled features.
clf = LogisticRegression(random_state=0, max_iter=10000)
clf.fit(X, y)
## NOTE: predictions are made on the training rows themselves, so the scores
## computed from them measure fit to the training data, not generalisation.
y_pred = clf.predict(X)

In [17]:
## Good classifier :) most samples land on the diagonal.
confusion_matrix(y_true=y, y_pred=y_pred)

array([[197,  15],
       [  9, 348]], dtype=int64)

In [18]:
## MCM is close to 1!
matthews_corrcoef(y_true=y, y_pred=y_pred)

0.9094952936272364

In [19]:
## The F1 score is high as well!
f1_score(y_true=y, y_pred=y_pred)

0.9666666666666667