# The AUC from example 1 of the Calf paper

While calfpy yields an AUC of 0.875 in example 1 from the Calf paper [1], calfcv produces an AUC of 0.82.

In [1]:
# Author: Rolf Carlson, Carlson Research LLC, <hrolfrc@gmail.com>
# License: 3-clause BSD

### Get the data

In [2]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from calfcv import CalfCV, Calf

In [3]:
# Read the example data set; row 0 of the CSV supplies the column names
input_file = "../../../data/n2.csv"
df = pd.read_csv(input_file, sep=",", header=0)

# The outcome (diagnosis) lives in the first column, named ctrl/case
Y = df['ctrl/case']
# Every other column is an input feature
X = df.drop(columns='ctrl/case')

# Feature names come straight from the header row
features = X.columns.tolist()

# Human-readable names for the two outcome codes
Y_names = Y.replace({0: 'non_psychotic', 1: 'pre_psychotic'})

# glmnet requires float64 (the builtin float maps to float64 in numpy)
x = X.to_numpy(dtype=float)
y = Y.to_numpy(dtype=float)

### Data overview

Here we look at the feature names, the total number of data values, the shape of the data, and the category balance.

In [4]:
# Peek at the first five feature names
features[:5]

['ADIPOQ', 'SERPINA3', 'AMBP', 'A2M', 'ACE']

In [5]:
# Total number of entries in x (rows * columns), not the number of features
x.size

9720

In [6]:
# (rows, columns) of the input matrix: one row per sample, one column per feature
x.shape

(72, 135)

In [7]:
# Category balance: count of label 1 (pre_psychotic), then label 0 (non_psychotic)
print(*(list(Y).count(label) for label in (1, 0)))

32 40


In [8]:
# Number of outcome values (y is one-dimensional, so this is its length)
y.shape[0]

72

### Predict diagnoses

In [9]:
# Fit Calf on the full data set, then predict probabilities for the same samples
calf_model = Calf().fit(x, y)
y_pred = calf_model.predict_proba(x)
# In-sample AUC, scored on the second probability column
roc_auc_score(y_true=y, y_score=y_pred[:, 1])

0.78359375

The class probabilities predicted by Calf

In [10]:
# One row per sample; the second column is the score passed to roc_auc_score above
# (presumably ordered [class 0, class 1] - confirm against Calf's documentation)
y_pred

array([[0.73105858, 0.26894142],
       [0.65967676, 0.34032324],
       [0.67352505, 0.32647495],
       [0.47684355, 0.52315645],
       [0.49403688, 0.50596312],
       [0.63393725, 0.36606275],
       [0.55006102, 0.44993898],
       [0.66281609, 0.33718391],
       [0.62506968, 0.37493032],
       [0.59501702, 0.40498298],
       [0.71191589, 0.28808411],
       [0.55821739, 0.44178261],
       [0.52240219, 0.47759781],
       [0.71537548, 0.28462452],
       [0.6175692 , 0.3824308 ],
       [0.63651384, 0.36348616],
       [0.62584695, 0.37415305],
       [0.55156817, 0.44843183],
       [0.60770431, 0.39229569],
       [0.64037289, 0.35962711],
       [0.44494103, 0.55505897],
       [0.71615345, 0.28384655],
       [0.39693984, 0.60306016],
       [0.52301264, 0.47698736],
       [0.45921654, 0.54078346],
       [0.4121336 , 0.5878664 ],
       [0.62750285, 0.37249715],
       [0.33856961, 0.66143039],
       [0.43123335, 0.56876665],
       [0.59092639, 0.40907361],
       [0.

In [11]:
# Fit CalfCV on the full data set, then predict probabilities for the same samples
calfcv_model = CalfCV().fit(x, y)
y_pred = calfcv_model.predict_proba(x)
# In-sample AUC, scored on the second probability column
roc_auc_score(y_true=y, y_score=y_pred[:, 1])

0.8242187500000001

The class probabilities predicted by CalfCV

In [12]:
# One row per sample; the second column is the score passed to roc_auc_score above
# (presumably ordered [class 0, class 1] - confirm against CalfCV's documentation)
y_pred

array([[0.57255396, 0.42744604],
       [0.57190911, 0.42809089],
       [0.49905191, 0.50094809],
       [0.37396104, 0.62603896],
       [0.43673843, 0.56326157],
       [0.42049986, 0.57950014],
       [0.58727104, 0.41272896],
       [0.73105858, 0.26894142],
       [0.55325599, 0.44674401],
       [0.53275445, 0.46724555],
       [0.60855296, 0.39144704],
       [0.71632012, 0.28367988],
       [0.47160497, 0.52839503],
       [0.6000901 , 0.3999099 ],
       [0.5182401 , 0.4817599 ],
       [0.61569573, 0.38430427],
       [0.44520824, 0.55479176],
       [0.61067857, 0.38932143],
       [0.43740406, 0.56259594],
       [0.50806646, 0.49193354],
       [0.45940988, 0.54059012],
       [0.5844512 , 0.4155488 ],
       [0.6167177 , 0.3832823 ],
       [0.45002242, 0.54997758],
       [0.48864786, 0.51135214],
       [0.51369577, 0.48630423],
       [0.37743322, 0.62256678],
       [0.45003696, 0.54996304],
       [0.61891498, 0.38108502],
       [0.5935698 , 0.4064302 ],
       [0.

In [13]:
# Refit Calf and take hard class labels instead of probabilities
calf_classifier = Calf().fit(x, y)
y_pred = calf_classifier.predict(x)
# With 0/1 predictions the ROC curve has a single operating point, so this
# AUC is the balanced accuracy of the predicted labels
roc_auc_score(y_true=y, y_score=y_pred)

0.696875

The classes predicted by Calf

In [14]:
# Class labels returned by Calf().fit(x, y).predict(x), one entry per sample
y_pred

array([0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1.,
       0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 0.])

In [15]:
# Refit CalfCV and take hard class labels instead of probabilities
calfcv_classifier = CalfCV().fit(x, y)
y_pred = calfcv_classifier.predict(x)
# With 0/1 predictions the ROC curve has a single operating point, so this
# AUC is the balanced accuracy of the predicted labels
roc_auc_score(y_true=y, y_score=y_pred)

0.709375

The classes predicted by CalfCV

In [16]:
# Class labels returned by CalfCV().fit(x, y).predict(x), one entry per sample
y_pred

array([0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 1.])

### References:
[1] Jeffries, C.D., Ford, J.R., Tilson, J.L. et al. A greedy regression algorithm with coarse weights offers novel advantages. Sci Rep 12, 5440 (2022). https://doi.org/10.1038/s41598-022-09415-2