In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
import math
from sklearn.metrics.pairwise import rbf_kernel
from numpy import genfromtxt
from sklearn.preprocessing import OneHotEncoder
from numpy.linalg import eig
from numpy.linalg import norm

In [10]:
X = pd.read_csv("life_expectancy_X.csv").to_numpy()
Y = pd.read_csv("life_expectancy_y.csv").to_numpy()
n = len(Y)
Y = np.reshape(Y,(n,1))

In [11]:
# Adding Bias to X
X = np.hstack((X, np.zeros((X.shape[0], 1))))
# Scale the data between 0 and 1
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

Use HSIC between each feature and label to identify the features that have the strongest dependency on the label.

In [12]:
def HSIC(X, Y):
    γ = 1 
    K_x = rbf_kernel(X, gamma=1)
    K_y = rbf_kernel(X, gamma=1)
    C = np.eye(n) - (1/n) * np.ones((n,n))
    return np.trace(K_x.dot(C).dot(K_y).dot(C))/ (n*n)

features = ['Exercise amount', 
            'Amount of supportive relationships' ,
            'Number of siblings', 
            'Alcohol / Drugs / Smoking consumption', 
            'Height', 'Attractiveness' ,'work ethicsm', 'bias']

HSIC_list = []
for f in X.T:
    f = np.reshape(f, (n,1))
    HSIC_list.append(HSIC(f, Y))


In [13]:
df = pd.DataFrame([HSIC_list], columns=features)
# Display the DataFrame
print(df)

   Exercise amount  Amount of supportive relationships  Number of siblings  \
0         0.015822                            0.017959            0.026307   

   Alcohol / Drugs / Smoking consumption    Height  Attractiveness  \
0                               0.016401  0.016604        0.016032   

   work ethicsm          bias  
0      0.015122  7.476939e-31  


Use ISM to identify the most important features.

In [24]:
γ = 1
X = genfromtxt("life_expectancy_X.csv", delimiter=',')
y = genfromtxt("life_expectancy_y.csv")
Y = np.reshape(y, (len(y), 1))
# Scale the data between 0 and 1
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [32]:
Y_0 = OneHotEncoder(categories='auto', sparse=False).fit_transform(Y)
K_r = Y_0.dot(np.transpose(Y_0))
R = np.transpose((np.transpose(K_r - np.mean(K_r, axis=0))) - np.mean(np.transpose((K_r - np.mean(K_r, axis=0))), axis=0))
D = np.diag(np.sum(R, axis=0))
L = D - R
Φ = -X.T.dot(L).dot(X) 
eigen_val, W_1 = eig(Φ)
W_1 = W_1[:, ::-1][:, 0:Φ.shape[0]]
eigen_val = np.flip(eigen_val)[0:Φ.shape[0]]
Σ = np.diag(eigen_val)



In [27]:
eigen_val_0 = np.array([1,1,1,1,1,1,1])
while True:
    K_x = rbf_kernel(X.dot(W_1), gamma = γ)
    Ψ =  R * K_x
    D = np.diag(np.sum(Ψ,axis=0))
    L = D - Ψ 
    Φ = -X.T.dot(L).dot(X)
    eigen_val, W_1 = eig(Φ)
    W_1 = W_1[:, ::-1][:, 0:Φ.shape[0]]
    eigen_val = np.flip(eigen_val)[0:Φ.shape[0]]
    Σ = np.diag(eigen_val)
    if norm(eigen_val_0 - eigen_val) < 0.01: break
    eigen_val_0 = eigen_val

In [33]:
name = np.array([['Exercise amount'], 
            ['Amount of supportive relationships'] ,
            ['Number of siblings'], 
            ['Alcohol / Drugs / Smoking consumption'], 
            ['Height'], ['Attractiveness'] ,['work ethic']])

eigen_val, W_1 = eig(Φ)
W_1 = W_1[:, ::-1][:, 0:Φ.shape[0]]
eigen_val = np.flip(eigen_val)[0:Φ.shape[0]]
Σ = np.diag(eigen_val)
W_1 = np.round(W_1,3)
print(np.hstack((W_1, name)))


[['-0.249' '0.449' '0.236' '0.131' '-0.166' '0.781' '0.162'
  'Exercise amount']
 ['-0.038' '-0.479' '-0.395' '-0.584' '-0.302' '0.423' '-0.031'
  'Amount of supportive relationships']
 ['0.103' '-0.179' '-0.027' '0.068' '-0.032' '-0.076' '0.972'
  'Number of siblings']
 ['-0.195' '0.49' '-0.706' '0.11' '-0.387' '-0.242' '0.052'
  'Alcohol / Drugs / Smoking consumption']
 ['-0.666' '0.145' '0.338' '-0.542' '-0.025' '-0.337' '0.117' 'Height']
 ['0.62' '0.329' '0.322' '-0.365' '-0.502' '-0.135' '0.002'
  'Attractiveness']
 ['0.245' '0.409' '-0.267' '-0.445' '0.691' '0.123' '0.106'
  'work ethicsm']]


What is the difference between the results from ISM and measuring via HSIC feature by feature?