# The interpretations using LIME

# Heart dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt


# Feature columns of the heart CSV that must share a single dtype.
heart_feature_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]

heart = pd.read_csv('data/datasets_33180_43520_heart.csv', sep=',')

# LIME consumes a numpy.ndarray in which all features have the same type, so
# each listed column is converted to float64. Columns not listed above keep
# the dtype pandas inferred when reading the file.
heart = heart.astype({column: np.float64 for column in heart_feature_columns})


# Wine dataset

In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# Feature columns of the wine CSV that must share a single dtype.
wine_feature_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

# Read the cleaned wine data and discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved - TODO confirm).
wine = pd.read_csv('data/wine_limpo.csv', sep=',').drop(columns=['Unnamed: 0'])

# LIME consumes a numpy.ndarray in which all features have the same type, so
# each listed column is converted to float64. Columns not listed above keep
# the dtype pandas inferred when reading the file.
wine = wine.astype({column: np.float64 for column in wine_feature_columns})

# Diabetes dataset

In [3]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
import datetime as dt

# Feature columns of the diabetes CSV that must share a single dtype.
diabetes_feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age',
]

# Read the cleaned diabetes data and discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved - TODO confirm).
diabetes = pd.read_csv('data/diabetes_limpo.csv', sep=',').drop(columns=['Unnamed: 0'])

# LIME consumes a numpy.ndarray in which all features have the same type, so
# each listed column is converted to float64. Columns not listed above keep
# the dtype pandas inferred when reading the file.
diabetes = diabetes.astype({column: np.float64 for column in diabetes_feature_columns})


# LIME 

## Choosing 4 instances (TP, FP, TN, FN) of each dataset randomly from DT to be our baseline

Code to randomly choose 4 instances (TP, FP, TN, FN) from each dataset. We chose the decision tree as the baseline classifier: its confusion-matrix outcomes determine which instances are picked.

### Get the samples from the baseline

In [5]:
import numpy as np
import pandas as pd
import lime
import lime.lime_tabular
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
sys.path.append('./explanation_helper')
import expla_helper as explanation
from sklearn.model_selection import train_test_split

# Seed NumPy's global generator before sampling (presumably so the instances
# picked by sample_baseline_model are repeatable - TODO confirm it draws from
# numpy.random).
np.random.seed(1)


def _sample_from_baseline(csv_path, model_path):
    """Read one dataset, load its serialized baseline model and sample from it.

    Args:
        csv_path: path of the comma-separated dataset file.
        model_path: path of the serialized decision-tree model (DT_*.pkl).

    Returns:
        A ``(frame, model, sampled)`` tuple: the DataFrame exactly as read
        from ``csv_path``, the loaded model, and whatever
        ``explanation.sample_baseline_model(model, frame)`` returns.
    """
    frame = pd.read_csv(csv_path, sep=',')
    baseline_model = serializer.load_model(file_name=model_path)
    sampled = explanation.sample_baseline_model(baseline_model, frame)
    return frame, baseline_model, sampled


# NOTE(review): each call below rebinds heart / wine / diabetes to the frame
# freshly read from disk. That replaces the versions prepared in the earlier
# cells (float64 casts, 'Unnamed: 0' dropped) - confirm this is intended,
# since later cells read `heart` again.
heart, model, heart_index = _sample_from_baseline(
    'data/datasets_33180_43520_heart.csv', 'serialized_model/DT_heart.pkl')
# print(heart_index)

wine, model, wine_index = _sample_from_baseline(
    'data/wine_limpo.csv', 'serialized_model/DT_wine.pkl')
# print(wine_index)

diabetes, model, diabetes_index = _sample_from_baseline(
    'data/diabetes_limpo.csv', 'serialized_model/DT_diabetes.pkl')
# print(diabetes_index)

## MLP

### Heart dataset - instance 01

In [9]:
# import numpy as np
import lime
import lime.lime_tabular
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.model_selection import train_test_split


# Explain the first baseline-sampled heart instance with the serialized MLP.
model_file_name = 'serialized_model/MLP_heart.pkl'
model = serializer.load_model(model_file_name)

# Work on a deep copy so the shared `heart` frame is not touched by this cell.
dataset = heart.copy(deep=True)

# Class labels in Portuguese: 'doente' = sick, 'saudável' = healthy.
# NOTE(review): the label order is assumed to match the model's class order - confirm.
target_names = np.array(['doente', 'saudável'])

folder = 'explanation/'
# Strip the directory prefix and the extension, leaving e.g. 'MLP_heart'.
model_dataset = model_file_name.replace('serialized_model/', '').replace('.pkl', '')
instance_index = heart_index[0]
# All columns except one, which is taken to be the target.
num_features = dataset.shape[1] - 1

output_template = '{0}{1}_instance_{2}_num_features_{3}.html'
output_file = output_template.format(folder, model_dataset, instance_index, num_features)

# NOTE(review): `explain_it` is neither defined nor imported in the visible
# cells - confirm it exists in the kernel before this cell runs.
explain_it(model, dataset, target_names, num_features, instance_index, output_file)

### Heart dataset - instance 02

In [10]:
# import numpy as np
import lime
import lime.lime_tabular
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.model_selection import train_test_split
# MLP_diabetes.pkl
# MLP_wine.pkl

# Explain the second baseline-sampled heart instance with the serialized MLP.
model_file_name = 'serialized_model/MLP_heart.pkl'
model = serializer.load_model(model_file_name)

# Work on a deep copy so the shared `heart` frame is not touched by this cell.
dataset = heart.copy(deep=True)

# Class labels in Portuguese: 'doente' = sick, 'saudável' = healthy.
# NOTE(review): the label order is assumed to match the model's class order - confirm.
target_names = np.array(['doente', 'saudável'])

folder = 'explanation/'
# Strip the directory prefix and the extension, leaving e.g. 'MLP_heart'.
model_dataset = model_file_name.replace('serialized_model/', '').replace('.pkl', '')
instance_index = heart_index[1]
# All columns except one, which is taken to be the target.
num_features = dataset.shape[1] - 1

output_template = '{0}{1}_instance_{2}_num_features_{3}.html'
output_file = output_template.format(folder, model_dataset, instance_index, num_features)

# NOTE(review): `explain_it` is neither defined nor imported in the visible
# cells - confirm it exists in the kernel before this cell runs.
explain_it(model, dataset, target_names, num_features, instance_index, output_file)

### Heart dataset - instance 03

In [11]:
# import numpy as np
import lime
import lime.lime_tabular
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.model_selection import train_test_split
# MLP_diabetes.pkl
# MLP_wine.pkl

# Explain the third baseline-sampled heart instance with the serialized MLP.
model_file_name = 'serialized_model/MLP_heart.pkl'
model = serializer.load_model(model_file_name)

# Work on a deep copy so the shared `heart` frame is not touched by this cell.
dataset = heart.copy(deep=True)

# Class labels in Portuguese: 'doente' = sick, 'saudável' = healthy.
# NOTE(review): the label order is assumed to match the model's class order - confirm.
target_names = np.array(['doente', 'saudável'])

folder = 'explanation/'
# Strip the directory prefix and the extension, leaving e.g. 'MLP_heart'.
model_dataset = model_file_name.replace('serialized_model/', '').replace('.pkl', '')
instance_index = heart_index[2]
# All columns except one, which is taken to be the target.
num_features = dataset.shape[1] - 1

output_template = '{0}{1}_instance_{2}_num_features_{3}.html'
output_file = output_template.format(folder, model_dataset, instance_index, num_features)

# NOTE(review): `explain_it` is neither defined nor imported in the visible
# cells - confirm it exists in the kernel before this cell runs.
explain_it(model, dataset, target_names, num_features, instance_index, output_file)

### Heart dataset - instance 04

In [12]:
# import numpy as np
import lime
import lime.lime_tabular
import sys
sys.path.append('./ml_helper')
import ml_helper_train_model as model_train
import ml_helper_serialize_model as serializer
from sklearn.model_selection import train_test_split
# MLP_diabetes.pkl
# MLP_wine.pkl

# Explain the fourth baseline-sampled heart instance with the serialized MLP.
model_file_name = 'serialized_model/MLP_heart.pkl'
model = serializer.load_model(model_file_name)

# Work on a deep copy so the shared `heart` frame is not touched by this cell.
dataset = heart.copy(deep=True)

# Class labels in Portuguese: 'doente' = sick, 'saudável' = healthy.
# NOTE(review): the label order is assumed to match the model's class order - confirm.
target_names = np.array(['doente', 'saudável'])

folder = 'explanation/'
# Strip the directory prefix and the extension, leaving e.g. 'MLP_heart'.
model_dataset = model_file_name.replace('serialized_model/', '').replace('.pkl', '')
instance_index = heart_index[3]
# All columns except one, which is taken to be the target.
num_features = dataset.shape[1] - 1

output_template = '{0}{1}_instance_{2}_num_features_{3}.html'
output_file = output_template.format(folder, model_dataset, instance_index, num_features)

# NOTE(review): `explain_it` is neither defined nor imported in the visible
# cells - confirm it exists in the kernel before this cell runs.
explain_it(model, dataset, target_names, num_features, instance_index, output_file)

### Wine dataset

### Diabetes dataset

## Random Forest

### Heart dataset

### Wine dataset

### Diabetes dataset

## Naive Bayes

### Heart dataset

### Wine dataset

### Diabetes dataset

## Decision Tree

### Heart dataset

### Wine dataset

### Diabetes dataset

## KNN

### Heart dataset

### Wine dataset

### Diabetes dataset

## Gaussian Process

### Heart dataset

### Wine dataset

### Diabetes dataset

## SVM

### Heart dataset

### Wine dataset

### Diabetes dataset

# SHAP

In [13]:
# import sklearn
# import shap
# from sklearn.model_selection import train_test_split

# # print the JS visualization code to the notebook
# shap.initjs()

# # train a SVM classifier
# X_train,X_test,Y_train,Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)
# svm = sklearn.svm.SVC(kernel='rbf', probability=True)
# svm.fit(X_train, Y_train)

# # use Kernel SHAP to explain test set predictions
# explainer = shap.KernelExplainer(svm.predict_proba, X_train, link="logit")
# shap_values = explainer.shap_values(X_test, nsamples=100)

# # plot the SHAP values for the Setosa output of the first instance
# shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")


In [14]:
# shap.force_plot(explainer.expected_value[0], shap_values[0], X_test, link="logit")

In [15]:
# shap.summary_plot(shap_values, X_test.iloc[0:1000,:])

# Meu exemplo