In [None]:
import sys
import os
# Make the project-local `diro2c/` and `data/` directories (resolved against the
# current working directory) importable by appending them to the module search path.
sys.path.extend(os.getcwd() + subdir for subdir in ('/diro2c/', '/data/'))

In [None]:
import pandas as pd
from data.getdata import loaddata
from data.split3fold import split3fold
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from data.runningexampleblackboxes import *
import pickle
import numpy as np
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.inspection import permutation_importance

# Select matplotlib's 'ggplot' style sheet for subsequent plots.
plt.style.use('ggplot')

In [None]:
# Hyperparameter grids handed to GridSearchCV in the training cells below.

# Black box A (SVM): candidate values for C, the kernel type and gamma.
param_grid_SVM = dict(
    C=[0.1, 1, 10, 100],
    kernel=['rbf', 'sigmoid', 'linear'],
    gamma=['scale', 'auto'],
)

# Black box B (Gaussian Naive Bayes): ten log-spaced values from 1e0 down to 1e-9.
param_grid_NB = dict(var_smoothing=np.logspace(start=0, stop=-9, num=10))

### Bank Marketing dataset

In [None]:
# Load the Bank Marketing data as two datasets (A and B) plus column metadata.
# NOTE(review): here the 4th/5th return values are bound as (discrete, continuous),
# while the Compas cell binds them as (continuous, discrete) — confirm which order
# `loaddata` actually returns. Neither name is read again in the visible cells.
dataA, dataB, cols, discrete, continuous, le = loaddata('bankmarketing')

# Split each dataset three ways with identical fractions and seed. The first part
# is used to fit the black box, the second to score it; the third is bound to
# testA / testB.
(blackboxtrainA, trainA, testA), (blackboxtrainB, trainB, testB) = (
    split3fold(dataset, 0.4, 0.2, random_state=1) for dataset in (dataA, dataB)
)

#### Training

##### Data A

In [None]:
# Hyperparameter search for black box A (SVM) on the Bank Marketing data:
# 3-fold cross-validated grid search over param_grid_SVM, fitted on the
# black-box training split only.
svmA = svm.SVC(random_state=1)
grid_searchA = GridSearchCV(svmA, param_grid_SVM, cv=3, verbose=4)
CV_svmA = grid_searchA.fit(
    blackboxtrainA[cols].values,
    blackboxtrainA['y'].values,
)
# Show the best parameter combination found by the search.
print(CV_svmA.best_params_)

In [None]:
# Refit black box A (SVM) for Bank Marketing with fixed hyperparameters.
# The original note records {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} —
# presumably the best_params_ printed by the grid search; re-check if the data changes.
svmA = svm.SVC(random_state=1, C=0.1, gamma = 'scale', kernel = 'linear')
svmA.fit(blackboxtrainA[cols].values, blackboxtrainA['y'].values)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which a bare open() inside pickle.dump() does not.
filename = 'blackboxes/bankmarketingA.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svmA, model_file)

# Score on trainA, the split that was not used for fitting above.
pred = svmA.predict(trainA[cols].values)
accA = accuracy_score(trainA['y'].values, pred)
precAmacro = precision_score(trainA['y'].values, pred, average='macro')
recAmacro = recall_score(trainA['y'].values, pred, average='macro')
print("Accuracy on X_train: ",accA)

##### Data B

In [None]:
# Hyperparameter search for black box B (Gaussian Naive Bayes) on the Bank
# Marketing data: 3-fold cross-validated grid search over param_grid_NB,
# fitted on the black-box training split only.
nbB = GaussianNB()
grid_searchB = GridSearchCV(nbB, param_grid_NB, cv=3, verbose=4)
CV_nbB = grid_searchB.fit(
    blackboxtrainB[cols].values,
    blackboxtrainB['y'].values,
)
# Show the best parameter combination found by the search.
print(CV_nbB.best_params_)

In [None]:
# Refit black box B (Gaussian Naive Bayes) for Bank Marketing with a fixed
# var_smoothing. The original note records {'var_smoothing': 0.01} — presumably the
# best_params_ printed by the grid search; re-check if the data changes.
nbB = GaussianNB(var_smoothing = 0.01)
nbB.fit(blackboxtrainB[cols].values, blackboxtrainB['y'].values)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which a bare open() inside pickle.dump() does not.
filename = 'blackboxes/bankmarketingB.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(nbB, model_file)

# Score on trainB, the split that was not used for fitting above.
pred = nbB.predict(trainB[cols].values)
accB = accuracy_score(trainB['y'].values, pred)
precBmacro = precision_score(trainB['y'].values, pred, average='macro')
recBmacro = recall_score(trainB['y'].values, pred, average='macro')
print("Accuracy on X_train: ", accB)

##### Performance table

In [None]:
# One row per metric, one column per black box (A = SVM, B = Naive Bayes),
# using the Bank Marketing scores computed in the two training cells above.
performance_bankmarketing = pd.DataFrame(
    {'Metric': ['Accuracy', 'Precision (macro)', 'Recall (macro)']}
).assign(**{
    'Data A': [accA, precAmacro, recAmacro],
    'Data B': [accB, precBmacro, recBmacro],
})

##### Parameter table

In [None]:
# Stringified hyperparameters of each fitted Bank Marketing black box, as a
# two-column frame: 'index' (parameter name) and 'Bank-Marketing' (its value).
parameter_bankmarketingA = pd.DataFrame.from_dict(
    {name: str(value) for name, value in svmA.get_params().items()},
    orient='index',
    columns=['Bank-Marketing'],
).reset_index()

parameter_bankmarketingB = pd.DataFrame.from_dict(
    {name: str(value) for name, value in nbB.get_params().items()},
    orient='index',
    columns=['Bank-Marketing'],
).reset_index()

### Compas dataset

In [None]:
# Load the Compas data as two datasets (A and B) plus column metadata; this rebinds
# the names previously holding the Bank Marketing data.
# NOTE(review): here the 4th/5th return values are bound as (continuous, discrete),
# while the Bank Marketing cell binds them as (discrete, continuous) — confirm which
# order `loaddata` actually returns. Neither name is read again in the visible cells.
dataA, dataB, cols, continuous, discrete, le = loaddata('compas')

# Split each dataset three ways with identical fractions and seed. The first part
# is used to fit the black box, the second to score it; the third is bound to
# testA / testB.
(blackboxtrainA, trainA, testA), (blackboxtrainB, trainB, testB) = (
    split3fold(dataset, 0.4, 0.2, random_state=1) for dataset in (dataA, dataB)
)

#### Training

##### Data A

In [None]:
# Hyperparameter search for black box A (SVM) on the Compas data:
# 3-fold cross-validated grid search over param_grid_SVM, fitted on the
# black-box training split only.
svmA = svm.SVC(random_state=1)
grid_searchA = GridSearchCV(svmA, param_grid_SVM, cv=3, verbose=4)
CV_svmA = grid_searchA.fit(
    blackboxtrainA[cols].values,
    blackboxtrainA['y'].values,
)
# Show the best parameter combination found by the search.
print(CV_svmA.best_params_)

In [None]:
# Refit black box A (SVM) for Compas with fixed hyperparameters.
# The original note records {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'} —
# presumably the best_params_ printed by the grid search; re-check if the data changes.
svmA = svm.SVC(random_state=1, C=100, gamma = 'scale', kernel = 'rbf')
svmA.fit(blackboxtrainA[cols].values, blackboxtrainA['y'].values)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which a bare open() inside pickle.dump() does not.
filename = 'blackboxes/compasA.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svmA, model_file)

# Score on trainA, the split that was not used for fitting above.
# The micro-averaged scores are computed here, but the Compas performance table
# below only reads accuracy and the macro averages.
pred = svmA.predict(trainA[cols].values)
accA = accuracy_score(trainA['y'].values, pred)
precAmicro = precision_score(trainA['y'].values, pred, average='micro')
precAmacro = precision_score(trainA['y'].values, pred, average='macro')
recAmicro = recall_score(trainA['y'].values, pred, average='micro')
recAmacro = recall_score(trainA['y'].values, pred, average='macro')
print("Accuracy on X_train: ",accA)

##### Data B

In [None]:
# Hyperparameter search for black box B (Gaussian Naive Bayes) on the Compas
# data: 3-fold cross-validated grid search over param_grid_NB, fitted on the
# black-box training split only.
nbB = GaussianNB()
grid_searchB = GridSearchCV(nbB, param_grid_NB, cv=3, verbose=4)
CV_nbB = grid_searchB.fit(
    blackboxtrainB[cols].values,
    blackboxtrainB['y'].values,
)
# Show the best parameter combination found by the search.
print(CV_nbB.best_params_)

In [None]:
# Refit black box B (Gaussian Naive Bayes) for Compas with a fixed var_smoothing.
# NOTE(review): unlike the other training cells, the source of this value is not
# recorded here — presumably the grid-search result printed above; confirm.
nbB = GaussianNB(var_smoothing = 0.0001)
nbB.fit(blackboxtrainB[cols].values, blackboxtrainB['y'].values)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which a bare open() inside pickle.dump() does not.
filename = 'blackboxes/compasB.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(nbB, model_file)

# Score on trainB, the split that was not used for fitting above.
# The micro-averaged scores are computed here, but the Compas performance table
# below only reads accuracy and the macro averages.
pred = nbB.predict(trainB[cols].values)
accB = accuracy_score(trainB['y'].values, pred)
precBmicro = precision_score(trainB['y'].values, pred, average='micro')
precBmacro = precision_score(trainB['y'].values, pred, average='macro')
recBmicro = recall_score(trainB['y'].values, pred, average='micro')
recBmacro = recall_score(trainB['y'].values, pred, average='macro')
print("Accuracy on X_train: ", accB)

#### Permutation importance

In [None]:
# Stack the Compas trainA and trainB splits row-wise into one frame; the permutation
# importance cells below evaluate both black boxes on it. Row labels are kept as-is
# (no ignore_index), so the index may contain duplicates.
val = pd.concat([trainA, trainB])

In [None]:
# Permutation importance of each feature for black box A (SVM), measured on the
# combined A+B frame `val` with 30 shuffles per feature and a fixed seed.
# `permutation_importance` comes from the notebook's import cell at the top, so it
# is not re-imported here.
r = permutation_importance(svmA, val[cols].values, val.y.values, n_repeats=30, random_state=0)

In [None]:
# List the features from highest to lowest mean permutation importance, one per
# line as "<name><mean> +/- <std>" (name left-aligned in 8 columns).
# Negative values for permutation importance indicate that the predictions on the
# shuffled (or noisy) data are more accurate than on the real data.
for idx in r.importances_mean.argsort()[::-1]:
    mean, std = r.importances_mean[idx], r.importances_std[idx]
    print(f"{cols[idx]:<8}{mean:.3f} +/- {std:.3f}")

In [None]:
# Same analysis for black box B (Gaussian Naive Bayes): permutation importance on
# the combined frame `val` (30 shuffles per feature, fixed seed), then the features
# listed from highest to lowest mean importance. This rebinds `r`.
r = permutation_importance(
    nbB,
    val[cols].values,
    val.y.values,
    n_repeats=30,
    random_state=0,
)
for idx in r.importances_mean.argsort()[::-1]:
    mean, std = r.importances_mean[idx], r.importances_std[idx]
    print(f"{cols[idx]:<8}{mean:.3f} +/- {std:.3f}")

##### Performance and parameter tables

In [None]:
# One row per metric, one column per black box (A = SVM, B = Naive Bayes),
# using the Compas scores computed in the two training cells above.
performance_compas = pd.DataFrame(
    {'Metric': ['Accuracy', 'Precision (macro)', 'Recall (macro)']}
).assign(**{
    'Data A': [accA, precAmacro, recAmacro],
    'Data B': [accB, precBmacro, recBmacro],
})

# Stringified hyperparameters of each fitted Compas black box, as a two-column
# frame: 'index' (parameter name) and 'Compas' (its value).
parameter_compasA = pd.DataFrame.from_dict(
    {name: str(value) for name, value in svmA.get_params().items()},
    orient='index',
    columns=['Compas'],
).reset_index()

parameter_compasB = pd.DataFrame.from_dict(
    {name: str(value) for name, value in nbB.get_params().items()},
    orient='index',
    columns=['Compas'],
).reset_index()

## Summary Performance table

In [None]:
# Join the two per-dataset tables on their shared 'Metric' column, then relabel the
# columns with a two-level header (dataset, black box). The merge leaves the
# Bank Marketing columns first and the Compas columns second, matching the labels.
performance = performance_bankmarketing.merge(performance_compas, on='Metric')
performance.columns = pd.MultiIndex.from_tuples(
    [('', 'Metric')]
    + [(dataset, split)
       for dataset in ('Bank-Marketing', 'Compas')
       for split in ('Data A', 'Data B')]
)

In [None]:
# Display the combined performance table as this cell's output.
performance

## Summary Parameter table

In [None]:
# For each black box, build a table with one row per tuned hyperparameter: the
# searched grid values (as text), the value used for Compas, the value used for
# Bank Marketing, and a model label. The merges join on the shared 'index' column
# (the parameter name), so only parameters present in the search grid are kept.
parameterA = (
    pd.DataFrame.from_dict(
        {name: str(values) for name, values in param_grid_SVM.items()},
        orient='index',
        columns=['List of values'],
    )
    .reset_index()
    .merge(parameter_compasA)
    .merge(parameter_bankmarketingA)
    .assign(Model='Black box A: SVM')
)

parameterB = (
    pd.DataFrame.from_dict(
        {name: str(values) for name, values in param_grid_NB.items()},
        orient='index',
        columns=['List of values'],
    )
    .reset_index()
    .merge(parameter_compasB)
    .merge(parameter_bankmarketingB)
    .assign(Model='Black box B: Naive Bayes')
)

In [None]:
# Stack both per-model tables, fix the column order and expose the parameter-name
# column under the header 'Parameter'.
parameter = (
    pd.concat([parameterA, parameterB])
    [['Model', 'index', 'List of values', 'Bank-Marketing', 'Compas']]
    .rename(columns={'index': 'Parameter'})
)

### Running Example 1

In [None]:
# Running example 1 uses the two classifier classes pulled in by the star import
# from data.runningexampleblackboxes; each is instantiated and pickled as-is,
# with no fitting in this notebook.
# The context managers guarantee each file handle is flushed and closed, which a
# bare open() inside pickle.dump() does not.
filename = 'blackboxes/running1A.sav'
mod = FClassifier()
with open(filename, 'wb') as model_file:
    pickle.dump(mod, model_file)

filename = 'blackboxes/running1B.sav'
mod = SClassifier()
with open(filename, 'wb') as model_file:
    pickle.dump(mod, model_file)

### Running Example 2

In [None]:
# For 'running2' the result of loaddata is unpacked into two values, unlike the
# six-value unpacking used for the other datasets. `data` is indexed with `cols`
# and read via `.y` below (presumably a DataFrame — confirm). This rebinds `cols`,
# replacing the Compas column list.
data, cols = loaddata('running2')

In [None]:
# Running example 2, black box A: a linear-kernel SVM fitted on the whole
# 'running2' dataset (no train/test split is made in this cell).
svmA = svm.SVC(random_state=1, kernel = 'linear')
svmA.fit(data[cols].values, data.y)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which a bare open() inside pickle.dump() does not.
filename = 'blackboxes/running2A.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svmA, model_file)

In [None]:
# Running example 2, black box B: an RBF-kernel SVM (gamma fixed at 0.2) fitted on
# the same whole 'running2' dataset as black box A.
svmB = svm.SVC(random_state=1, kernel = 'rbf', gamma = 0.2)
svmB.fit(data[cols].values, data.y)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which a bare open() inside pickle.dump() does not.
filename = 'blackboxes/running2B.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svmB, model_file)