In [1]:
import numpy as np
import pandas as pd

# Effects of class imbalance

Consider:

- 10 people, 2 of whom have Corona

In [2]:
# Ten people numbered 1..10; the ground-truth labels mark the last two as positive.
people = [person_id for person_id in range(1, 11)]
corona = np.repeat([0, 1], [8, 2]).astype(int)

In [3]:
# Ground-truth table: one row per person with their true Corona status.
df = pd.DataFrame(
    data={'Person': people, 'Corona': corona},
    columns=['Person', 'Corona'],
)
df

Unnamed: 0,Person,Corona
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,1
9,10,1


- Test A, which is positive for the two people with Corona and negative for everyone else

In [4]:
# Test A reproduces the true labels exactly. Take a copy rather than an alias so
# that an in-place edit to `test_a` can never silently change `corona`.
test_a = corona.copy()

In [5]:
# Attach Test A's results as the 'Test_A' column. `assign` overwrites an existing
# column of that name instead of appending a duplicate, so re-running this cell
# leaves `df` unchanged.
df = df.assign(Test_A=test_a)

- Test B, which is negative for everyone.

In [6]:
# Test B reports a negative (0) result for all ten people.
test_b = np.zeros(10, dtype=int)

In [7]:
# Attach Test B's results as the 'Test_B' column. `assign` overwrites an existing
# column of that name instead of appending a duplicate, so re-running this cell
# leaves `df` unchanged.
df = df.assign(Test_B=test_b)
df

Unnamed: 0,Person,Corona,Test_A,Test_B
0,1,0,0,0
1,2,0,0,0
2,3,0,0,0
3,4,0,0,0
4,5,0,0,0
5,6,0,0,0
6,7,0,0,0
7,8,0,0,0
8,9,1,1,0
9,10,1,1,0


Which test do we prefer?

Which test does a machine learning algorithm prefer?

- It depends on the metric (loss function) it is optimized for

Assume machine learning algorithm A predicts a positive case with the probabilities:

In [8]:
# Algorithm A: probability 0.4 for the first eight people, 0.6 for the last two.
proba_a = np.repeat([0.4, 0.6], [8, 2])

and algorithm B predicts a positive case with the probabilities:

In [9]:
# Algorithm B: the same near-zero probability of a positive case for everyone.
proba_b = [0.01] * 10

In [10]:
# Attach both algorithms' predicted probabilities. `assign` overwrites existing
# 'Proba_A' / 'Proba_B' columns instead of appending duplicates, so re-running
# this cell leaves `df` unchanged.
df = df.assign(Proba_A=proba_a, Proba_B=proba_b)
df

Unnamed: 0,Person,Corona,Test_A,Test_B,Proba_A,Proba_B
0,1,0,0,0,0.4,0.01
1,2,0,0,0,0.4,0.01
2,3,0,0,0,0.4,0.01
3,4,0,0,0,0.4,0.01
4,5,0,0,0,0.4,0.01
5,6,0,0,0,0.4,0.01
6,7,0,0,0,0.4,0.01
7,8,0,0,0,0.4,0.01
8,9,1,1,0,0.6,0.01
9,10,1,1,0,0.6,0.01


Let's check the log-loss (lower is better):

In [11]:
from sklearn.metrics import log_loss

In [14]:
# Log-loss of algorithm A's predicted probabilities against the true labels.
log_loss(df.loc[:, 'Corona'], df.loc[:, 'Proba_A'])

0.5108256237659907

In [15]:
# Log-loss of algorithm B's predicted probabilities against the true labels.
log_loss(df.loc[:, 'Corona'], df.loc[:, 'Proba_B'])

0.9290743058804193