
# Small pathological example

This example shows that the AUC reported by the R version of Calf may be meaningless when the data are collinear.

In [41]:
# Author: Rolf Carlson, Carlson Research LLC, <hrolfrc@gmail.com>
# License: 3-clause BSD

Here is a description of the problem

In [42]:
#   For five subjects, suppose the control = 0, case = 1 status is
# 
#   0
#   0
#   0
#   1
#   1
# 
#   Suppose for five markers the observed values (column = a marker) are
# 
#   0.3801	0.2484	-0.1280	-0.5741	1.0631
#   -0.9703	-0.5551	-0.3680	1.1324	-1.0930
#   0.5148	-0.9927	0.2833	1.0068	0.5449
#   1.1880	1.5985	-1.2621	-0.5094	0.5316
#   -1.1126	-0.2992	1.4748	-1.0558	-1.0467
# 
#   What weight vector provides us with AUC = 1.0?
# 
#   Running the problem through calf gets the following:
# 
#   Marker  Weight
#     F4       1
#     F2      -1
#     F5       1
# 
#   AUC: 1.0
#   Final p-value: 0.05582771541247467

In [43]:
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso
import numpy as np
import re
import pprint

# Seed the generator so that any random draws made from `rng` are identical
# after every "Restart Kernel -> Run All". The value 42 matches the
# random_state passed to permutation_importance later in this notebook.
rng = np.random.default_rng(42)
import seaborn as sns
sns.set_theme(style="darkgrid")

## Create the problem

In [44]:
# Observed marker values as raw text: one subject per row, one marker per
# column, columns separated by tabs.
X_str = """0.3801	0.2484	-0.1280	-0.5741	1.0631
-0.9703	-0.5551	-0.3680	1.1324	-1.0930
0.5148	-0.9927	0.2833	1.0068	0.5449
1.1880	1.5985	-1.2621	-0.5094	0.5316
-1.1126	-0.2992	1.4748	-1.0558	-1.0467
"""

# Collapse every run of whitespace (tabs and newlines included) to one space,
# leaving a flat, space-separated stream of 25 numbers.
X_str = re.sub(r"\s+", " ", X_str)

# Parse the tokens as floats and fold them into a 5 x 5 matrix
# (rows = subjects, columns = markers).
X = np.array([float(token) for token in X_str.split()], dtype='float64').reshape(5, 5)

# Control (0) / case (1) status per subject, stored as a 5 x 1 column vector.
y = np.array([[0], [0], [0], [1], [1]])

# Marker labels F1..F5, in column order of X.
feature_names = [f"F{number}" for number in range(1, 6)]

In [45]:
X

array([[ 0.3801,  0.2484, -0.128 , -0.5741,  1.0631],
       [-0.9703, -0.5551, -0.368 ,  1.1324, -1.093 ],
       [ 0.5148, -0.9927,  0.2833,  1.0068,  0.5449],
       [ 1.188 ,  1.5985, -1.2621, -0.5094,  0.5316],
       [-1.1126, -0.2992,  1.4748, -1.0558, -1.0467]])

In [46]:
y

array([[0],
       [0],
       [0],
       [1],
       [1]])

## Generalization failure

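Lasso, with its default settings, does not learn enough to predict the data. Note that `score` for a regressor such as Lasso is the coefficient of determination $R^2$, not a classification accuracy; a value of 0.00 means the model does no better than always predicting the mean of `y`.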

In [47]:
# Fit a Lasso model with default hyperparameters on the full data set.
clf = Lasso()
clf.fit(X, y)

# For a scikit-learn regressor, score() is the coefficient of determination
# (R^2) on the data passed in, even though the message labels it "Accuracy".
print("Accuracy on the data: {:.2f}".format(clf.score(X, y)))

Accuracy on the data: 0.00


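None of the features is identified as important: every permutation importance below is exactly zero, so no feature passes the selection threshold and the loop prints nothing.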

In [48]:
# Permutation importance of each feature for the fitted model, using 10
# shuffles per feature and a fixed seed.
result = permutation_importance(
    clf,
    X,
    y,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

selected_features = []
perm_sorted_idx = []

# Visit the features from highest to lowest mean importance.
for idx in np.argsort(result.importances_mean)[::-1]:
    mean_importance = result.importances_mean[idx]
    std_importance = result.importances_std[idx]

    # Importance is defined as mean - 2 * stdev rather than the mean alone,
    # because that produced a better prediction; a feature is kept only when
    # this value exceeds the 0.003 cut-off.
    if not (mean_importance - 2 * std_importance > 0.003):
        continue

    selected_features.append(feature_names[idx])
    perm_sorted_idx.append(idx)
    print(f"{feature_names[idx]: <8}{mean_importance: .3f} +/- {std_importance: .3f}")

# Dump the raw result (per-repeat importances, means and standard deviations).
pprint.pprint(result)

{'importances': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 'importances_mean': array([0., 0., 0., 0., 0.]),
 'importances_std': array([0., 0., 0., 0., 0.])}
