In [1]:
import sys
sys.path.append('..')

import numpy as np
from sklearn.metrics import confusion_matrix
from pandas import crosstab
from korr import confusion

## Test 1 - Small Datasets
For tiny datasets it really does not matter which implementation you use.

In [2]:
# Fix the seed of NumPy's global RNG so the benchmark input is reproducible.
np.random.seed(43)
# Draw a 100x2 normal sample and binarise it by sign. Unpacking the transpose
# yields the two columns directly: column 0 serves as the predictions and
# column 1 as the ground truth (both remain views into the same 2-D array).
y_pred, y_true = (np.random.normal(size=(100, 2)) > 0).astype(int).T

In [3]:
%%time
# Benchmark pandas' crosstab on the Test 1 label vectors (100 observations).
# The resulting table is deleted right away: this cell exists only for the
# timing that %%time reports.
cm = crosstab(y_true, y_pred)  # pandas
del cm

CPU times: user 30.6 ms, sys: 4.2 ms, total: 34.8 ms
Wall time: 33.6 ms


In [4]:
%%time
# Benchmark sklearn's confusion_matrix on the Test 1 label vectors
# (100 observations). The result is deleted right away: this cell exists only
# for the timing that %%time reports.
cm = confusion_matrix(y_true, y_pred) # sklearn
del cm

CPU times: user 2.37 ms, sys: 1.45 ms, total: 3.82 ms
Wall time: 2.44 ms


In [5]:
%%time
# Benchmark korr's confusion on the Test 1 label vectors (100 observations).
# The result is deleted right away: this cell exists only for the timing that
# %%time reports.
cm = confusion(y_true, y_pred)  # korr
del cm

CPU times: user 183 µs, sys: 59 µs, total: 242 µs
Wall time: 215 µs


## Test 2 - Medium Sized Data
At this size, pandas' `crosstab` starts to outperform sklearn's `confusion_matrix`.

In [6]:
# Fix the seed of NumPy's global RNG so the benchmark input is reproducible.
np.random.seed(43)
# Draw a 50000x2 normal sample and binarise it by sign. Unpacking the
# transpose yields the two columns directly: column 0 serves as the
# predictions and column 1 as the ground truth (both remain views into the
# same 2-D array).
y_pred, y_true = (np.random.normal(size=(50000, 2)) > 0).astype(int).T

In [7]:
%%time
# Benchmark pandas' crosstab on the Test 2 label vectors (50,000 observations).
# The resulting table is deleted right away: this cell exists only for the
# timing that %%time reports.
cm = crosstab(y_true, y_pred)  # pandas
del cm

CPU times: user 33.5 ms, sys: 6.31 ms, total: 39.8 ms
Wall time: 51.8 ms


In [8]:
%%time
# Benchmark sklearn's confusion_matrix on the Test 2 label vectors
# (50,000 observations). The result is deleted right away: this cell exists
# only for the timing that %%time reports.
cm = confusion_matrix(y_true, y_pred) # sklearn
del cm

CPU times: user 76.3 ms, sys: 4.32 ms, total: 80.6 ms
Wall time: 106 ms


In [9]:
%%time
# Benchmark korr's confusion on the Test 2 label vectors (50,000 observations).
# The result is deleted right away: this cell exists only for the timing that
# %%time reports.
cm = confusion(y_true, y_pred)  # korr
del cm

CPU times: user 4.71 ms, sys: 3.6 ms, total: 8.31 ms
Wall time: 4.01 ms


## Test 3 - Millions of Obs
* sklearn's `confusion_matrix` does not seem to scale well.
* pandas' `crosstab` does well. I guess the difference from `korr.confusion` is just its error-checking routines.

In [10]:
# Fix the seed of NumPy's global RNG so the benchmark input is reproducible.
np.random.seed(43)
# Draw a 10000000x2 normal sample and binarise it by sign. Unpacking the
# transpose yields the two columns directly: column 0 serves as the
# predictions and column 1 as the ground truth (both remain views into the
# same 2-D array).
y_pred, y_true = (np.random.normal(size=(10000000, 2)) > 0).astype(int).T

In [11]:
%%time
# Benchmark pandas' crosstab on the Test 3 label vectors
# (10,000,000 observations). The resulting table is deleted right away: this
# cell exists only for the timing that %%time reports.
cm = crosstab(y_true, y_pred)  # pandas
del cm

CPU times: user 1.38 s, sys: 925 ms, total: 2.3 s
Wall time: 3.18 s


In [12]:
%%time
# Benchmark sklearn's confusion_matrix on the Test 3 label vectors
# (10,000,000 observations). The result is deleted right away: this cell
# exists only for the timing that %%time reports.
cm = confusion_matrix(y_true, y_pred) # sklearn
del cm

CPU times: user 12.8 s, sys: 768 ms, total: 13.6 s
Wall time: 14.5 s


In [13]:
%%time
# Benchmark korr's confusion on the Test 3 label vectors
# (10,000,000 observations). The result is deleted right away: this cell
# exists only for the timing that %%time reports.
cm = confusion(y_true, y_pred)  # korr
del cm

CPU times: user 367 ms, sys: 12.8 ms, total: 380 ms
Wall time: 385 ms
