In [83]:
import os, sys, pickle
import numpy as np 
import pandas as pd 

# sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# shap
import shap 

# Directories
# Resolve the project folders relative to the current working directory.
# os.path.join is used so the path separator is right on every platform.
data_dir = os.path.join(os.getcwd(), 'data')
code_dir = os.path.join(os.getcwd(), 'fpdash')
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if code_dir not in sys.path:
    sys.path.append(code_dir)

# custom modules
from cbr import prep

#### Import data

In [113]:
# Dataset id handed to prep.openmlwrapper in the next cell.
# Alternative that was previously assigned here and immediately overwritten:
#   1590  -> adult
data_id = 31 # german credit dataset

In [114]:
# Load and preprocess the selected dataset through the project's wrapper.
# The returned object is indexed by string keys further down
# (e.g. 'X_train', 'y_train', 'X_test', 'y_test', 'X_train_decoded', 'X_test_decoded').
data = prep.openmlwrapper(
    data_id=data_id,
    n_samples=3000,
    test_size=0.5,
    scale=True,
    random_state=1,
    verbose=True,
)

Start preprocessing...
...Used all 1000 samples from dataset 31.
...Filled missing values.
...Decoded to original feature values.
...Scaled data.
Preprocessing done.


  return self.partial_fit(X, y)
  X_train = pd.DataFrame(scaler.transform(X_train), columns=list(X_train))
  X_test = pd.DataFrame(scaler.transform(X_test), columns=list(X_test))


##### Split test data further
* train: 
    - true class known
    - used during training
* test: 
    - true class known
    - not used during training
* application: 
    - true class "unknown"
    - not used during training

In [115]:
# Halve the held-out data into a "pre" part (kept as labelled test data) and a
# "post" part (treated as application data). The labels get a fresh 0..n-1
# index before splitting.
y_test_reindexed = data['y_test'].reset_index(drop=True)
pre_post_parts = train_test_split(
    data['X_test'],
    y_test_reindexed,
    test_size=0.5,
    random_state=1,
)
(data['X_test_pre'],
 data['X_test_post'],
 data['y_test_pre'],
 data['y_test_post']) = pre_post_parts

In [116]:
from sklearn.metrics import roc_auc_score, brier_score_loss


#### Train classifier

In [117]:
# Fit a 100-tree random forest on the training split (fixed seed for
# reproducibility) and report accuracy on the train / test / application parts.
clf = RandomForestClassifier(n_estimators = 100, n_jobs=-2, random_state=1)
clf.fit(data['X_train'], np.array(data['y_train']).ravel())
print('Training Accuracy:    %.2f' % clf.score(data['X_train'], data['y_train']))
print('Test Accuracy:        %.2f' % clf.score(data['X_test_pre'], data['y_test_pre']))
print()
print('Application Accuracy: %.3f' % clf.score(data['X_test_post'], data['y_test_post']))

# Score = second column of predict_proba, taken with a column slice instead of
# a Python-level loop over rows; kept as a list like before.
y_app_score = list(clf.predict_proba(data['X_test_post'])[:, 1])
# `.values` already yields a 1-D array for a Series; Series.ravel() is
# deprecated in recent pandas releases.
y_app_true = data['y_test_post']['class'].values
print('Application AUC:      %.3f' % roc_auc_score(y_app_true, y_app_score))
# Arguments are passed positionally because the keyword for the probabilities
# was renamed (y_prob -> y_proba) in newer scikit-learn releases.
print('Application Brier:    %.3f' % brier_score_loss(y_app_true, y_app_score))

Training Accuracy:    1.00
Test Accuracy:        0.74

Application Accuracy: 0.768
Application AUC:      0.775
Application Brier:    0.158


#### Create casebase and alerts
* The case base consists of instances from the training dataset and test dataset.
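* The alert data consists of instances from the application dataset. The intended filter keeps only instances for which the model predicted a positive; that filter is currently commented out in the code below, so all application instances are used as alerts.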

In [118]:
# Row labels of the two halves of the test split; pre_indices is used in the
# next cell to pick the matching rows of the decoded test data.
pre_indices, post_indices = (
    data['X_test_pre'].index,
    data['X_test_post'].index,
)

In [119]:
# --- Case base: training rows followed by the "pre" half of the test rows ---
for base_key, base_frames in (
    ('X_base', [data['X_train'], data['X_test_pre']]),
    ('y_base', [data['y_train'], data['y_test_pre']]),
):
    data[base_key] = pd.concat(base_frames).reset_index(drop=True)

# Decoded test rows with a fresh 0..n-1 index, so they can be selected with
# the row labels taken from the pre/post split.
X_test_decoded_reindexed = data['X_test_decoded'].reset_index(drop=True)
data['X_base_decoded'] = pd.concat(
    [data['X_train_decoded'], X_test_decoded_reindexed.iloc[pre_indices]]
).reset_index(drop=True)

# --- Alerts: rows of the "post" (application) half ---
# Predictions are stored against the row labels of the application labels.
y_test_post_pred = (
    pd.DataFrame({'prediction' : clf.predict(data['X_test_post'])})
    .assign(index=data['y_test_post'].index)
    .set_index('index')
)
# The filter on positive predictions is currently disabled, so every
# application row becomes an alert:
# alert_indices = y_test_post_pred[y_test_post_pred['prediction']==1].index
alert_indices = y_test_post_pred.index
for alert_key, post_key in (('X_alert', 'X_test_post'), ('y_alert', 'y_test_post')):
    data[alert_key] = data[post_key].copy().loc[alert_indices].reset_index(drop=True)
data['X_alert_decoded'] = X_test_decoded_reindexed.loc[alert_indices].reset_index(drop=True)

#### Retrieve metadata
* Retrieve prediction probabilities (case base + alerts)
* Retrieve historical performance (case base)

In [120]:
# Compute prediction probabilities: second column of predict_proba, taken with
# a column slice and kept as a list.
y_base_score = list(clf.predict_proba(data['X_base'])[:, 1])
y_alert_score = list(clf.predict_proba(data['X_alert'])[:, 1])

# Compute performance for cases in the case base.
# Lookup from (predicted label, true label) to the confusion-matrix cell.
outcome_by_pred_true = {
    (1, 1): 'TP',
    (1, 0): 'FP',
    (0, 0): 'TN',
    (0, 1): 'FN',
}
y_base_pred = clf.predict(data['X_base'])
base_performance = []
for pred, true in zip(y_base_pred, data['y_base'].values.ravel()):
    outcome = outcome_by_pred_true.get((pred, true))
    if outcome is None:
        # Previously such a pair was skipped silently, leaving the list shorter
        # than the scores and failing later with an opaque length error.
        raise ValueError(
            'Expected binary 0/1 labels, got prediction=%r, true=%r' % (pred, true)
        )
    base_performance.append(outcome)

# gather metadata
meta_base = pd.DataFrame({'performance' : base_performance, 'score' : y_base_score})
meta_alert = pd.DataFrame({'score' : y_alert_score})

#### Compute SHAP

In [121]:
def _positive_class_shap(shap_output):
    """Return the SHAP value matrix for the class at index 1.

    ``TreeExplainer.shap_values`` has used two layouts for classifiers:
    a list with one 2-D array per class, and (in newer shap releases) a single
    3-D array whose last axis is the class. Plain ``[1]`` is only correct for
    the list layout; on a 3-D array it would select sample 1 instead.

    Raises:
        ValueError: if the output is neither a list nor a 3-D array.
    """
    if isinstance(shap_output, list):
        return shap_output[1]
    shap_array = np.asarray(shap_output)
    if shap_array.ndim == 3:
        return shap_array[:, :, 1]
    raise ValueError(
        'Unexpected shap_values output with shape %r' % (shap_array.shape,)
    )


# Explain the fitted forest and keep, per row, the contributions towards the
# class at index 1, labelled with the feature names.
explainer = shap.TreeExplainer(clf)
SHAP_base = pd.DataFrame(_positive_class_shap(explainer.shap_values(X=data['X_base'])),
                         columns=list(data['X_base']))
SHAP_alert = pd.DataFrame(_positive_class_shap(explainer.shap_values(X=data['X_alert'])),
                          columns=list(data['X_alert']))
print('Explained.')

Explained.


#### Fit nearest neighbors (NN) on the SHAP values of the case base

In [122]:
# Nearest-neighbour search over the SHAP values of the case base:
# brute-force Euclidean distance, 10 neighbours per query, single job.
nn_settings = dict(n_neighbors=10, algorithm='brute', metric='euclidean', n_jobs=1)
nn = NearestNeighbors(**nn_settings).fit(SHAP_base)
print('Initialized nearest neighbor.')

Initialized nearest neighbor.


#### Save files

In [123]:
# Set jobs to 1
# Switch the fitted forest from n_jobs=-2 (used while training above) to a
# single job before it is pickled in the next cell.
# NOTE(review): presumably so the saved model runs single-threaded wherever
# clf.pickle is loaded — confirm against the consumer of that file.
# set_params returns the estimator, so its repr is shown as the cell output.
clf.set_params(n_jobs=1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [124]:
# Everything is written to the data directory defined at the top of the
# notebook (data_dir); create it first so a fresh checkout does not fail.
os.makedirs(data_dir, exist_ok=True)

# Save classifier and the nearest-neighbour model fitted on SHAP values
for pickle_name, fitted_model in (('clf.pickle', clf), ('nn.pickle', nn)):
    with open(os.path.join(data_dir, pickle_name), 'wb') as handle:
        pickle.dump(fitted_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# (file name, frame) pairs; all are written as CSV without the index column.
csv_outputs = [
    # Save case base
    ('X_base.csv', data['X_base']),
    ('X_base_decoded.csv', data['X_base_decoded']),
    ('meta_base.csv', meta_base),
    ('SHAP_base.csv', SHAP_base),
    ('y_base.csv', data['y_base']),
    # Save alerts
    ('X_alert.csv', data['X_alert']),
    ('X_alert_decoded.csv', data['X_alert_decoded']),
    ('meta_alert.csv', meta_alert),
    ('SHAP_alert.csv', SHAP_alert),
    ('y_alert.csv', data['y_alert']),
    # Save training data separately
    ('X_train.csv', data['X_train']),
]
for csv_name, frame in csv_outputs:
    frame.to_csv(os.path.join(data_dir, csv_name), index=False)

print('Saved!')

Saved!


In [126]:
# Preview the first alert scores instead of dumping the whole frame.
meta_alert.head(10)

Unnamed: 0,score
0,0.54
1,0.62
2,0.44
3,0.06
4,0.30
5,0.11
6,0.27
7,0.10
8,0.07
9,0.01
