# ARFS CollinearityThreshold bug testing

## import and setup

In [1]:
import arfs
import arfs.feature_selection as arfsfs
import janitor
import numpy as np
import pandas as pd
from arfs.utils import load_data

%load_ext watermark
%watermark -u -n -t -v -m -iv

Last updated: Fri Feb 09 2024 20:59:36

Python implementation: CPython
Python version       : 3.10.13
IPython version      : 8.21.0

Compiler    : Clang 16.0.6 
OS          : Darwin
Release     : 21.4.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit

arfs   : 2.2.2
janitor: 0.26.0
numpy  : 1.25.2
pandas : 2.2.0



In [2]:
def style_df(styler, threshold=0.85):
    """Highlight strongly associated cells and shorten the displayed numbers.

    Written to be passed to ``DataFrame.style.pipe``.

    Parameters
    ----------
    styler : pandas Styler
        The styler to configure. It is configured through its own methods and
        the same object is returned.
    threshold : float, default 0.85
        Magnitude from which a cell is highlighted. The default equals the
        collinearity threshold used throughout this notebook.

    Returns
    -------
    The `styler` that was passed in.
    """
    # Positive side: values from +threshold upwards (no upper bound given).
    styler.highlight_between(left=threshold)
    # Negative side: values from -1 up to -threshold.
    styler.highlight_between(left=-1, right=-threshold)
    styler.format(precision=3)
    return styler


####################
# ARFS versions w/ verbose printing
####################


def _most_collinear_old(association_matrix, threshold, verbose):
    cols_to_drop = [
        column
        for column in association_matrix.columns
        if any(association_matrix.loc[:, column].abs() > threshold)
    ]
    rows_to_drop = [
        row
        for row in association_matrix.index
        if any(association_matrix.loc[row, :].abs() > threshold)
    ]
    to_drop = list(set(cols_to_drop).union(set(rows_to_drop)))
    most_collinear_series = (
        association_matrix[to_drop].abs().sum(axis=1).sort_values(ascending=False)
    )
    most_collinear_series += (
        association_matrix[to_drop].abs().sum(axis=0).sort_values(ascending=False)
    )
    most_collinear_series /= 2
    if verbose:
        print(association_matrix[[most_collinear_series.index[0]]].abs().max())
    return most_collinear_series.index[0], to_drop


def _recursive_collinear_elimination_old(association_matrix, threshold, verbose=False):
    """Repeatedly remove the feature chosen by `_most_collinear_old`.

    Parameters
    ----------
    association_matrix : pandas.DataFrame
        Association matrix; a copy is worked on, the argument is not modified.
    threshold : float
        Passed through to `_most_collinear_old`.
    verbose : bool, default False
        Passed through to `_most_collinear_old`.

    Returns
    -------
    list
        The removed features, in the order they were removed.
    """
    remaining = association_matrix.copy()
    removed = []

    candidate, flagged = _most_collinear_old(remaining, threshold, verbose)
    # Stop as soon as nothing is flagged any more.
    while flagged:
        if candidate not in removed:
            removed.append(candidate)
            # Remove the feature from both axes before the next round.
            remaining = remaining.drop(columns=candidate, index=candidate)
        candidate, flagged = _most_collinear_old(remaining, threshold, verbose)

    return removed


####################
# updated versions w/ verbose printing
####################
def _most_collinear(association_matrix, threshold, verbose):

    cols_to_drop = association_matrix.loc[
        :, (association_matrix > threshold).any(axis=0)
    ].columns.values
    rows_to_drop = association_matrix.loc[
        (association_matrix > threshold).any(axis=1), :
    ].index.values
    to_drop = list(set(cols_to_drop).union(set(rows_to_drop)))
    if not to_drop:
        return None, None
    most_collinear_series = association_matrix.loc[:, to_drop].sum(axis=0)
    most_collinear_series += association_matrix.loc[to_drop, :].sum(axis=1)
    most_collinear_series /= 2
    if verbose:
        print(
            association_matrix[
                [most_collinear_series.sort_values(ascending=False).index[0]]
            ].max()
        )
    return most_collinear_series.sort_values(ascending=False).index[0], to_drop


def _recursive_collinear_elimination(association_matrix, threshold, verbose=False):
    """Repeatedly remove the feature chosen by `_most_collinear`.

    Parameters
    ----------
    association_matrix : pandas.DataFrame
        Association matrix; its absolute values are taken once up front and the
        argument itself is not modified.
    threshold : float
        Passed through to `_most_collinear`.
    verbose : bool, default False
        Passed through to `_most_collinear`.

    Returns
    -------
    list
        The removed features, in the order they were removed.
    """
    remaining = association_matrix.copy().abs()
    eliminated = []

    feature, flagged = _most_collinear(remaining, threshold, verbose)
    # `flagged` is None or empty once nothing exceeds the threshold.
    while flagged:
        if feature not in eliminated:
            eliminated.append(feature)
            # Remove the feature from both axes before the next round.
            remaining = remaining.drop(columns=feature, index=feature)
        feature, flagged = _most_collinear(remaining, threshold, verbose)

    return eliminated

## Load and preprocess the data

In [3]:
# Load the "cancer" example dataset bundled with arfs.
cancer = load_data(name="cancer")
X = cancer.data
y = cancer.target.astype(int)
display(X.head(), X.shape)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,random_num1,random_num2,genuine_num
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.496714,0,-0.24934
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,-0.138264,1,-0.04441
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.647689,3,0.128395
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1.52303,0,-0.079921
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,-0.234153,0,-0.094302


(569, 33)

In [4]:
# Fit the selector once; the cells below reuse its `assoc_matrix_`.
arfs_selector = arfsfs.CollinearityThreshold(0.85, n_jobs=7).fit(X)
X_filtered = arfs_selector.transform(X)

## Timing comparisons between ARFS 2.2.2 and my proposed updates

In [5]:
%%timeit
# Time the updated implementation defined in this notebook.
# NOTE: names assigned under %%timeit are not kept in the notebook namespace,
# which is why the result is computed again in the comparison cell below.
most_collinear_features = _recursive_collinear_elimination(
    arfs_selector.assoc_matrix_, threshold=0.85
)

13.3 ms ± 281 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%%timeit
# Time the local copy of the ARFS implementation (the `_old` functions above).
most_collinear_features_old = _recursive_collinear_elimination_old(
    arfs_selector.assoc_matrix_, threshold=0.85
)

52.8 ms ± 228 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
%%timeit
# Time the function shipped in the installed arfs package; presumably a sanity
# check that the local `_old` copy performs like the released code.
most_collinear_features_old = arfsfs.unsupervised._recursive_collinear_elimination(
    arfs_selector.assoc_matrix_, threshold=0.85
)

54.6 ms ± 2.89 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Demonstrating the bug

This is a test example of the bug I was seeing.  
Here we can see the old version is removing features that it shouldn't.

In [8]:
# Updated implementation from this notebook.
most_collinear_features = _recursive_collinear_elimination(
    arfs_selector.assoc_matrix_, threshold=0.85
)
# Implementation shipped in the installed arfs package.
most_collinear_features_old = arfsfs.unsupervised._recursive_collinear_elimination(
    arfs_selector.assoc_matrix_, threshold=0.85
)
# True only if every feature removed by the updated code is also removed by the old code.
np.isin(most_collinear_features, most_collinear_features_old).all()

False

In [9]:
# Features removed by the updated code but not by the old code.
np.array(most_collinear_features)[
    np.isin(most_collinear_features, most_collinear_features_old, invert=True)
]

array(['radius error', 'worst texture'], dtype='<U20')

In [10]:
# Features removed by the old code but not by the updated code.
np.array(most_collinear_features_old)[
    np.isin(most_collinear_features_old, most_collinear_features, invert=True)
]

array(['mean area', 'compactness error', 'mean texture', 'area error'],
      dtype='<U20')

So we can clearly see that we are getting different results.  
We can dive into this a little with the print statement I added to the versions above.  
This print statement shows which feature is removed at each step and what its maximum association value is.

In [11]:
# Re-run the local ARFS copy with verbose=True to print, for each step, the
# removed feature and the largest absolute association left in its column.
# Stored under its own (correctly spelled) name so the result compared in the
# cells above is not overwritten.
most_collinear_features_old_verbose = _recursive_collinear_elimination_old(
    arfs_selector.assoc_matrix_,
    threshold=0.85,
    verbose=True,
)

worst perimeter    0.993708
dtype: float64
mean concave points    0.921391
dtype: float64
worst radius    0.984015
dtype: float64
mean perimeter    0.997855
dtype: float64
mean concavity    0.884103
dtype: float64
worst area    0.959213
dtype: float64
worst concave points    0.855434
dtype: float64
mean area    0.987357
dtype: float64
mean radius    0.735864
dtype: float64
perimeter error    0.972794
dtype: float64
worst concavity    0.892261
dtype: float64
compactness error    0.803269
dtype: float64
mean compactness    0.865809
dtype: float64
mean texture    0.912045
dtype: float64
area error    0.95183
dtype: float64
concave points error    0.771804
dtype: float64


In [12]:
# Association columns of three features removed by the old code; cells at or
# beyond the highlight bounds set in `style_df` are marked.
arfs_selector.assoc_matrix_[
    [
        "mean radius",
        "compactness error",
        "concave points error",
    ]
].style.pipe(style_df)

Unnamed: 0,mean radius,compactness error,concave points error
area error,0.736,0.285,0.416
compactness error,0.206,0.0,0.744
concave points error,0.376,0.744,0.0
concavity error,0.194,0.801,0.772
fractal dimension error,-0.043,0.803,0.611
genuine_num,-0.238,-0.214,-0.185
mean area,0.987,0.213,0.372
mean compactness,0.506,0.739,0.642
mean concave points,0.823,0.49,0.616
mean concavity,0.677,0.67,0.683


We can see that `compactness error` and `concave points error` should never  
have been removed, as neither has an association above the 0.85 threshold.

In [13]:
# Same verbose trace as above, this time with the updated implementation.
most_col_features_new = _recursive_collinear_elimination(
    arfs_selector.assoc_matrix_, threshold=0.85, verbose=True
)

mean concavity    0.921391
dtype: float64
mean concave points    0.910155
dtype: float64
mean compactness    0.865809
dtype: float64
worst concave points    0.855434
dtype: float64
worst perimeter    0.993708
dtype: float64
worst concavity    0.892261
dtype: float64
worst radius    0.984015
dtype: float64
perimeter error    0.972794
dtype: float64
mean perimeter    0.997855
dtype: float64
worst area    0.959213
dtype: float64
radius error    0.95183
dtype: float64
mean radius    0.987357
dtype: float64
worst texture    0.912045
dtype: float64


## Illustrating the sorting problem

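The problem here is that the first series is sorted *before* the second one is added to it.  
In-place addition (`+=`) matches values by index label and keeps the order of the first series,  
so the result is not sorted by the combined value and its first entry is not necessarily the largest.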

In [14]:
# Two small series with the same labels but different value rankings.
series_1 = pd.Series({"A": 2, "B": 1, "C": 3})
series_2 = pd.Series({"A": 2.1, "B": 3.3, "C": 1.2})

In [15]:
# Each series sorted on its own: note that the two orderings differ.
print(
    series_1.sort_values(ascending=False),
    series_2.sort_values(ascending=False),
    sep="\n",
)

C    3
A    2
B    1
dtype: int64
B    3.3
A    2.1
C    1.2
dtype: float64


In [16]:
# Reproduces the pattern used in the old code: sort first, then add in place.
presort_series = series_1.sort_values(ascending=False)
# `+=` matches values by label and keeps the order of the left-hand series,
# so the printed result stays in the order C, A, B and is not sorted by value.
presort_series += series_2.sort_values(ascending=False)
print(presort_series)

C    4.2
A    4.1
B    4.3
dtype: float64


In [17]:
# Add first, sort afterwards. The sum is built as a NEW series: binding
# `postsort_series` to `series_1` and then using `+=` would modify `series_1`
# itself, so re-running this or any earlier cell would give different output.
postsort_series = series_1 + series_2
print(postsort_series.sort_values(ascending=False))

B    4.3
C    4.2
A    4.1
dtype: float64


## Illustrating the selecting problem

The problem here is that `df[]` selects columns, but `df.sum(axis=1)` returns one value per row,  
so when the two sums are combined into `most_collinear_series`, you have a mixture of row and column labels.  
Combined with the sorting problem above, this meant that all features (or more specifically,  
any feature in the df's index) were being used in the sort call and could be dropped if they  
had the highest average collinearity to the features in `to_drop`.

In [18]:
# A frame of ones whose index and columns share only the label "A".
df_1 = pd.DataFrame(1.0, index=["1", "2", "A"], columns=["A", "B"])
to_drop = ["A"]
print(df_1)

     A    B
1  1.0  1.0
2  1.0  1.0
A  1.0  1.0


In [19]:
# `df_1[to_drop]` selects columns; summing along axis=1 gives one value per ROW label.
print(df_1[to_drop].sum(axis=1))

1    1.0
2    1.0
A    1.0
dtype: float64


In [20]:
# Summing along axis=0 gives one value per selected COLUMN label only.
print(df_1[to_drop].sum(axis=0))

A    3.0
dtype: float64


In [21]:
# Indexed by every row label ("1", "2", "A").
combined_series = df_1[to_drop].sum(axis=1)
# Indexed by the selected column ("A") only; labels without a partner become NaN
# in the in-place add, while the row-label index of the left-hand side is kept.
combined_series += df_1[to_drop].sum(axis=0)
print(combined_series)

1    NaN
2    NaN
A    4.0
dtype: float64
