# Task 3: Feature importance

In [23]:
import pickle
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV, LinearRegression, Ridge
from scipy.stats import pearsonr
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Load data

In [27]:
# Deserialize the (X, y) pair stored in the task's pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open("./task3_feature-importance_data.pickle", "rb") as data_file:
    dataset = pickle.load(data_file)
X, y = dataset

## Fit model

In [28]:
# Two-step pipeline: standardise the features, then fit RidgeCV with its default
# settings. make_pipeline derives the step names from the class names, so the
# regressor stays reachable as model["ridgecv"].
scaling_step = StandardScaler()
ridge_step = RidgeCV()
model = make_pipeline(scaling_step, ridge_step)
# Keep the fit call as the last expression so the fitted pipeline is displayed.
model.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('ridgecv', RidgeCV(alphas=array([ 0.1,  1. , 10. ])))])

## Analyse correlations and model coefficients

In [29]:
# Pearson correlation between each feature column and the target.
# pearsonr returns the pair (r, p-value); evaluate it once per column and unpack
# both numbers from the same result instead of running the test twice per feature.
pearson_results = [pearsonr(X[:, i], y) for i in range(X.shape[1])]
correlations_r = [result[0] for result in pearson_results]  # correlation coefficient r
correlations_p = [result[1] for result in pearson_results]  # p-value of the correlation test

In [30]:
# One row per feature: the ridge coefficient (fitted on the standardised features
# inside the pipeline) next to the feature's Pearson r and p-value against y.
feature_labels = [f"feature {i}" for i in range(X.shape[1])]
summary_columns = {
    "coef": model["ridgecv"].coef_,
    "correlation r": correlations_r,
    "correlation p": correlations_p,
}
df = pd.DataFrame(summary_columns, index=feature_labels)
df

Unnamed: 0,coef,correlation r,correlation p
feature 0,-1.529528,-0.034627,0.273966
feature 1,0.516905,0.009749,0.7581588
feature 2,59.958697,0.49661,2.160929e-63
feature 3,0.05312,0.049881,0.1149366
feature 4,68.390912,0.55458,1.019725e-81
feature 5,72.159892,0.577704,4.339894999999999e-90
feature 6,-2.134884,-0.02104,0.5063249
feature 7,39.197821,-0.009722,0.7588043
feature 8,40.087218,0.024289,0.4429447
feature 9,1.91571,0.013164,0.677574


In [31]:
# Features 7 and 8 are binary: list the distinct values found in each column.
tuple(np.unique(X[:, column]) for column in (7, 8))

(array([0., 1.]), array([0., 1.]))

In [32]:
# Features 7 and 8 are mutually exclusive: count the rows where both equal 1.
is_feature_7 = X[:, 7] == 1
is_feature_8 = X[:, 8] == 1
(is_feature_7 & is_feature_8).sum()

0

In [33]:
# Features 7 and 8 together cover nearly the whole dataset:
# fraction of rows where at least one of the two equals 1.
is_feature_7 = X[:, 7] == 1
is_feature_8 = X[:, 8] == 1
(is_feature_7 | is_feature_8).sum() / X.shape[0]

0.99

## Tasks

Features 7 and 8 seem to be important features for the model (both have coefficients > 30!). However, on closer inspection, they are both binary, mutually exclusive, and together cover nearly the whole dataset. They are also barely correlated with the outcome on their own. I would not expect both of them to have such high importance for the model, and on top of that for both coefficients to be positive! What is going on?