# 2) This notebook contains the experiments on the Heart Statlog dataset with Altruist

In [1]:
# Silence every warning for the rest of the notebook so the cell outputs stay
# readable. NOTE(review): this filter matches all categories, so deprecation
# or convergence warnings raised by the libraries below are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [2]:
import urllib
import urllib.request

import ipywidgets as widgets
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from ipywidgets import interact, interactive, fixed, interact_manual
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC

from altruist import Altruist
from fi_techniques import FeatureImportance

Using TensorFlow backend.


Firstly, we load the dataset and we set the feature and class names

In [3]:
# Download the Heart Statlog data file and parse it into a 2-D numeric array
# whose last column is the class label and whose other columns are features.
url="http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat"
# The response is used as a context manager so the HTTP connection is closed
# as soon as parsing is done instead of being left open for the whole session.
with urllib.request.urlopen(url) as raw_data:
    # NOTE(review): the name `credit` looks like a leftover from another
    # notebook; it is kept so that any cell relying on it keeps working.
    credit = np.genfromtxt(raw_data)

# Every column but the last is a feature; the last column is the target.
X = credit[:, :-1]
# Shift the raw labels down by one so they can index `class_names`
# (presumably 1/2 in the file -> 0/1 here; verify against the dataset docs).
y = [int(label - 1) for label in credit[:, -1]]

feature_names = ['age','sex','chest pain','resting blood pressure','serum cholestoral',
               'fasting blood sugar','resting ecg results','maximum heart rate achieved','exercise induced angina','oldpeak',
               'the slope of the peak exercise','number of major vessels','reversable defect']
class_names = ['absence','presence']

# Labelled view of the feature matrix, used only for inspection below.
heart_statlog = pd.DataFrame(X,columns=feature_names)

We can display the first few instances to see the features and their values

In [4]:
# Show the first rows of the feature table (the class label is not a column here).
heart_statlog.head()

Unnamed: 0,age,sex,chest pain,resting blood pressure,serum cholestoral,fasting blood sugar,resting ecg results,maximum heart rate achieved,exercise induced angina,oldpeak,the slope of the peak exercise,number of major vessels,reversable defect
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0


Moreover, we can use pandas.describe() to see the ranges of each feature. For example, we observe that the range of age is 29 to 77

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
heart_statlog.describe()

Unnamed: 0,age,sex,chest pain,resting blood pressure,serum cholestoral,fasting blood sugar,resting ecg results,maximum heart rate achieved,exercise induced angina,oldpeak,the slope of the peak exercise,number of major vessels,reversable defect
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


Then we check how many instances the training data contains

In [6]:
# Number of rows (instances) in the feature matrix X.
len(X)

270

We have 270 instances. We are going to use the built-in grid search of scikit-learn (GridSearchCV) to find and train the best classifier for this dataset

In [7]:
# Scaler handed to every pipeline built in the cells below as its 'scaler'
# step; it is configured to rescale each feature to the range [-1, 1].
scaler = MinMaxScaler(feature_range=(-1,1))

In [8]:
# Registries filled in by the model-selection cells below; both are keyed by
# the same integer model id.
classifiers = dict()  # id -> [fitted estimator, label string with its CV score]
scalers = dict()      # id -> scaler taken from the same fitted pipeline

In [9]:
# Random forest: scaling followed by the classifier, scored with F1 over
# 10 cross-validation folds.
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('rf', RandomForestClassifier(random_state=0)),
])
# Each entry holds only the selected value; the trailing comment lists the
# candidates of the wider search.
parameters = [dict(
    rf__max_depth=[5],            # 1, 5, 7, 10
    rf__max_features=['sqrt'],    # 'sqrt', 'log2', 0.75, None
    rf__bootstrap=[False],        # True, False
    rf__min_samples_leaf=[5],     # 1, 2, 5, 10, 0.10
    rf__n_estimators=[500],       # 10, 100, 500, 1000
)]
clf = GridSearchCV(estimator=pipe, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X, y)

# Pull the fitted steps out of the winning pipeline by name and register them
# under model id 1.
scaler_rf = clf.best_estimator_.named_steps['scaler']
rf = clf.best_estimator_.named_steps['rf']
classifiers[1] = [rf, "Random Forests: " + str(clf.best_score_)]
scalers[1] = scaler_rf

In [10]:
# SVM: scaling followed by an SVC with probability estimates enabled, scored
# with F1 over 10 cross-validation folds.
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('svm', SVC(probability=True, random_state=77)),
])
# Wider candidate grid, kept for reference:
#parameters = [
#  {'svm__C': [-3, 1, 3, 10, 100, 1000], 'svm__kernel': ['linear']},
#  {'svm__C': [-3, 1, 3, 10, 100, 1000], 'svm__gamma': [0.1, 0.01, 0.001, 0.0001], 'svm__kernel': ['rbf']},
#]
# Grid narrowed to the configuration marked as best.
parameters = dict(svm__C=[100], svm__gamma=[0.001], svm__kernel=['rbf'])
clf = GridSearchCV(estimator=pipe, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X, y)

# Pull the fitted steps out of the winning pipeline by name and register them
# under model id 2.
scaler_svm = clf.best_estimator_.named_steps['scaler']
svm = clf.best_estimator_.named_steps['svm']
classifiers[2] = [svm, "SVM: " + str(clf.best_score_)]
scalers[2] = scaler_svm

In [11]:
# Logistic regression: scaling followed by the classifier, scored with F1 over
# 10 cross-validation folds.
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('lr', LogisticRegression(random_state=77)),
])
# Wider candidate grid, kept for reference:
#parameters = [
#  {'lr__C': [-3, 1, 3, 10, 100, 1000], 'lr__penalty': ['l1'], 'lr__solver': ['liblinear', 'saga']},
#  {'lr__C': [-3, 1, 3, 10, 100, 1000], 'lr__penalty': ['l2'], 'lr__solver': ['newton-cg', 'lbfgs', 'sag','saga']}
#]
# Grid narrowed to the configuration marked as best.
parameters = dict(lr__C=[1], lr__penalty=['l1'], lr__solver=['saga'])
clf = GridSearchCV(estimator=pipe, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X, y)

# Pull the fitted steps out of the winning pipeline by name and register them
# under model id 3.
scaler_lr = clf.best_estimator_.named_steps['scaler']
lr = clf.best_estimator_.named_steps['lr']
classifiers[3] = [lr, "Logistic Regression: " + str(clf.best_score_)]
scalers[3] = scaler_lr

In [12]:
# Neural network: scaling followed by an MLP with early stopping, scored with
# F1 over 10 cross-validation folds.
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('nn', MLPClassifier(early_stopping=True, random_state=77)),
])
# Wider candidate grid, kept for reference:
#parameters = {
#    'nn__hidden_layer_sizes': [(2,10),(5,10),(10,100),(20,200),(50,500), (100,1000)],
#    'nn__activation': ['logistic', 'tanh', 'relu'],
#    'nn__solver': ['sgd', 'adam'],
#    'nn__alpha': [0.000001,0.0001,0.001, 0.01, 0.1],
#    'nn__learning_rate': ['constant', 'invscaling', 'adaptive']}
# Grid narrowed to a single configuration.
parameters = dict(
    nn__hidden_layer_sizes=[(100, 1000)],
    nn__activation=['tanh'],
    nn__solver=['adam'],
    nn__alpha=[0.000001],
    nn__learning_rate=['constant'],
)
clf = GridSearchCV(estimator=pipe, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X, y)

# Pull the fitted steps out of the winning pipeline by name and register them
# under model id 4.
scaler_nn = clf.best_estimator_.named_steps['scaler']
nn = clf.best_estimator_.named_steps['nn']
classifiers[4] = [nn, "Neural Network: " + str(clf.best_score_)]
scalers[4] = scaler_nn

In [13]:
# Display each registered model together with its label, which embeds the
# cross-validated F1 score reported by the grid search.
classifiers

{1: [RandomForestClassifier(bootstrap=False, max_depth=5, max_features='sqrt',
                         min_samples_leaf=5, n_estimators=500, random_state=0),
  'Random Forests: 0.8188916011524707'],
 2: [SVC(C=100, gamma=0.001, probability=True, random_state=77),
  'SVM: 0.8195089355089354'],
 3: [LogisticRegression(C=1, penalty='l1', random_state=77, solver='saga'),
  'Logistic Regression: 0.8120600762065292'],
 4: [MLPClassifier(activation='tanh', alpha=1e-06, early_stopping=True,
                hidden_layer_sizes=(100, 1000), random_state=77),
  'Neural Network: 0.7701775669029673']}

In [None]:
@interact(eli_5=False, lime=True, shap=True, perm_importance=True, intristic=True, cl=(1,4))
def g(eli_5, lime, shap, perm_importance, intristic, cl=1):
    """Run Altruist over every instance and report a mean score per technique.

    The data in ``X`` is transformed with the scaler registered for classifier
    ``cl``. Each instance is passed to ``Altruist.find_untruthful_features``,
    and for every selected feature-importance technique the number of entries
    it returns is recorded. The mean of these counts over all instances is
    printed per technique.

    Parameters
    ----------
    eli_5 : bool
        Include ``fi.fi_eli``; ignored when ``cl`` is 2 or 3.
    lime : bool
        Include ``fi.fi_lime``.
    shap : bool
        Include ``fi.fi_shap``.
    perm_importance : bool
        Include ``fi.fi_perm_imp``.
    intristic : bool
        Include the model-specific technique: ``fi.fi_rf`` when ``cl`` is 1,
        ``fi.fi_coef_lr`` when ``cl`` is 3, nothing otherwise.
    cl : int
        Key into ``classifiers`` / ``scalers`` selecting the model to inspect.

    Returns
    -------
    None
        Results are printed.
    """
    print(classifiers[cl][1])
    X_t = scalers[cl].transform(X)
    fi = FeatureImportance(X_t, y, feature_names, class_names)
    fi_names = {fi.fi_lime:'Lime',fi.fi_shap:'Shap',fi.fi_eli:'Eli5',fi.fi_perm_imp:'Permuation Importance',fi.fi_rf:'Pseudo-Intristic RFs', fi.fi_coef_lr:'Intristic LR'}

    # (enabled?, technique) pairs, in the order the results are reported.
    candidates = [
        (eli_5 and cl not in (2, 3), fi.fi_eli),
        (lime, fi.fi_lime),
        (shap, fi.fi_shap),
        (perm_importance, fi.fi_perm_imp),
        (intristic and cl == 1, fi.fi_rf),
        (intristic and cl == 3, fi.fi_coef_lr),
    ]
    fis = [technique for enabled, technique in candidates if enabled]
    if not fis:
        # Nothing to evaluate: skip the per-instance pass entirely.
        print("No feature importance technique selected.")
        return

    # One list of per-instance counts for each selected technique.
    fis_scores = [[] for _ in fis]
    for count, instance in enumerate(X_t, start=1):
        if count % 50 == 0:
            print(count,"/",len(X_t),"..",end=", ")
        # NOTE(review): the Altruist object is rebuilt for every instance with
        # identical arguments; it could probably be created once before the
        # loop — confirm that it keeps no per-instance state first.
        altruistino = Altruist(classifiers[cl][0], X_t, fis, feature_names, None)
        untruthful_features = altruistino.find_untruthful_features(instance)
        for i in range(len(untruthful_features[0])):
            fis_scores[i].append(len(untruthful_features[0][i]))
    print(len(X_t),"/",len(X_t))

    # Report the mean count per technique, without rebinding `fi`.
    for technique, scores in zip(fis, fis_scores):
        print(fi_names[technique],np.array(scores).mean())