In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
import math
from sklearn.metrics.pairwise import rbf_kernel
from numpy import genfromtxt
from sklearn.preprocessing import OneHotEncoder
from numpy.linalg import eig
from numpy.linalg import norm

In [2]:
X = pd.read_csv("life_expectancy_X.csv").to_numpy()
Y = pd.read_csv("life_expectancy_y.csv").to_numpy()
n = len(Y)
Y = np.reshape(Y,(n,1))

In [3]:
# Adding Bias to X
X = np.hstack((X, np.ones((X.shape[0], 1))))
# Scale the data between 0 and 1
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

Use HSIC between each feature and label to identify the features that have the strongest dependency on the label.

In [4]:
def HSIC(X, Y):
    γ = 1 
    K_x = rbf_kernel(X, gamma=1)
    K_y = rbf_kernel(X, gamma=1)
    C = np.eye(n) - ((1/n) * np.ones((n,n)))
    return np.trace(K_x.dot(C).dot(K_y).dot(C))/ (n*n)

features = np.array([['Exercise amount'], 
            ['supportive relationships'] ,
            ['Num siblings'], 
            ['Alcohol / Drugs consumption'], 
            ['Height'], ['Attractiveness'] ,['work ethic'], ['bias']])

HSIC_list = []
for f in X.T:
    f = np.reshape(f, (n,1))
    HSIC_list.append(HSIC(f, Y))


In [5]:
# Convert features to a list
features_list = features.flatten().tolist()

# Create DataFrame
df = pd.DataFrame(data=[HSIC_list], columns=features_list)
df.head()

Unnamed: 0,Exercise amount,supportive relationships,Num siblings,Alcohol / Drugs consumption,Height,Attractiveness,work ethic,bias
0,0.015822,0.017959,0.026307,0.016401,0.016604,0.016032,0.015122,7.476939e-31


Use ISM to identify the most important features.

In [6]:
γ = 1
X = genfromtxt("life_expectancy_X.csv", delimiter=',')
y = genfromtxt("life_expectancy_y.csv")
Y = np.reshape(y, (len(y), 1))
# Scale the data between 0 and 1
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [7]:
Y_0 = OneHotEncoder(categories='auto', sparse=False).fit_transform(Y)
K_r = Y_0.dot(np.transpose(Y_0))
R = np.transpose((np.transpose(K_r - np.mean(K_r, axis=0))) - np.mean(np.transpose((K_r - np.mean(K_r, axis=0))), axis=0))
D = np.diag(np.sum(R, axis=0))
L = D - R
Φ = -X.T.dot(L).dot(X) 
eigen_val, W_1 = eig(Φ)
W_1 = W_1[:, ::-1][:, 0:Φ.shape[0]]
eigen_val = np.flip(eigen_val)[0:Φ.shape[0]]
Σ = np.diag(eigen_val)



In [9]:
eigen_val_0 = np.array([1,1,1,1,1,1,1])
while True:
    K_x = rbf_kernel(X.dot(W_1), gamma = γ)
    Ψ =  R * K_x
    D = np.diag(np.sum(Ψ,axis=0))
    L = D - Ψ 
    Φ = -X.T.dot(L).dot(X)
    eigen_val, W_1 = eig(Φ)
    W_1 = W_1[:, ::-1][:, 0:Φ.shape[0]]
    eigen_val = np.flip(eigen_val)[0:Φ.shape[0]]
    Σ = np.diag(eigen_val)
    if norm(eigen_val_0 - eigen_val) < 0.01: break
    eigen_val_0 = eigen_val

In [13]:
name = np.array([['Exercise amount'], 
            ['Amount of supportive relationships'] ,
            ['Number of siblings'], 
            ['Alcohol / Drugs / Smoking consumption'], 
            ['Height'], ['Attractiveness'] ,['work ethic']])

eigen_val, W_1 = eig(Φ)
W_1 = W_1[:, ::-1][:, 0:Φ.shape[0]]
eigen_val = np.flip(eigen_val)[0:Φ.shape[0]]
index_of_largest_eigen = np.argmax(eigen_val)
W_1 = W_1[index_of_largest_eigen]
W_1 = np.round(W_1,3)

In [14]:
# Convert name and W_1 to DataFrame
df = pd.DataFrame(data=W_1, columns=['Weights'], index=name.flatten())

In [15]:
print(df)

                                       Weights
Exercise amount                          0.415
Amount of supportive relationships       0.262
Number of siblings                       0.486
Alcohol / Drugs / Smoking consumption   -0.315
Height                                   0.631
Attractiveness                          -0.081
work ethic                              -0.133


What is the difference between the results from ISM and measuring via HSIC feature by feature?



For ISM, the Alcohol and Drug Consumption, Attractiveness and work ethic seem to negatively impact life expectancy with Alcohol and Drug consumption being the largest negative weight having the greatest negative impact. Height is the highest weighted positive amount in ISM. While in HSIC supportive relationships and Alcohol and Drug consumption seem to be the highest impacted features. 