In [1]:
import os, sys, pickle
import numpy as np 
import pandas as pd 

# sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# shap
import shap 

# Locations relative to the current working directory: the data folder and the
# project's source package. The package folder is appended to the import path
# so that the custom `cbr` module imported right after this can be resolved.
data_dir, code_dir = (os.getcwd() + suffix for suffix in ('/data', '/fpdash'))
sys.path.append(code_dir)

# custom modules
from cbr import prep

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


#### Import data

In [2]:
# Fetch and preprocess OpenML dataset 31 through the project's wrapper.
# Later cells read the keys 'X_train', 'y_train', 'X_test', 'y_test',
# 'X_train_decoded' and 'X_test_decoded' from the returned object.
# NOTE(review): n_samples presumably caps the number of rows drawn; the run log
# reports that all 1000 available samples were used -- confirm in cbr.prep.
data = prep.openmlwrapper(
    data_id=31,
    n_samples=2000,
    test_size=0.5,
    scale=True,
    random_state=1,
    verbose=True,
)

Start preprocessing...
...Used all 1000 samples from dataset 31.
...Filled missing values.
...Decoded to original feature values.
...Scaled data.
Preprocessing done.


  return self.partial_fit(X, y)
  X_train = pd.DataFrame(scaler.transform(X_train), columns=list(X_train))
  X_test = pd.DataFrame(scaler.transform(X_test), columns=list(X_test))


##### Split test data further
* train: 
    - true class known
    - used during training
* test pre: 
    - true class known
    - not used during training
* test post: 
    - true class unknown
    - not used during training

In [3]:
# Halve the held-out test set into a "pre" part (labels treated as known) and
# a "post" part (labels treated as unknown); seeded for reproducibility.
(data['X_test_pre'],
 data['X_test_post'],
 data['y_test_pre'],
 data['y_test_post']) = train_test_split(
    data['X_test'],
    data['y_test'],
    test_size=0.5,
    random_state=1,
)

#### Train classifier

In [4]:
# Fit a 100-tree random forest on the training split. The labels are flattened
# to a 1-D array before fitting. n_jobs=-2 asks joblib to use all CPUs but one.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-2, random_state=1)
y_train_flat = np.array(data['y_train']).ravel()
clf.fit(data['X_train'], y_train_flat)

# Report mean accuracy on each of the three partitions.
for report, split in (
    ('Training Accuracy:    %.2f', 'train'),
    ('Test (pre) Accuracy:  %.2f', 'test_pre'),
    ('Test (post) Accuracy: %.2f', 'test_post'),
):
    print(report % clf.score(data['X_' + split], data['y_' + split]))

Training Accuracy:    1.00
Test (pre) Accuracy:  0.74
Test (post) Accuracy: 0.77


#### Create casebase and alerts
* The case base consists of instances from the training dataset and test-pre dataset.
* The alert data consists of instances from test-post dataset.

In [5]:
# Row labels of the two test partitions. They are used further down as
# positions (via .iloc) to pick the matching rows of data['X_test_decoded'].
# NOTE(review): that only lines up if data['X_test'] carries a default 0..n-1
# index and X_test_decoded has the same row order -- confirm in cbr.prep.
pre_indices, post_indices = (
    data[part].index for part in ('X_test_pre', 'X_test_post')
)

In [6]:
# Case base: training rows followed by the test-pre rows, renumbered from 0.
# The decoded frame is assembled in the same order so rows stay aligned.
data['X_base'] = pd.concat(
    [data['X_train'], data['X_test_pre']], ignore_index=True
)
data['y_base'] = pd.concat(
    [data['y_train'], data['y_test_pre']], ignore_index=True
)
decoded_pre = data['X_test_decoded'].iloc[pre_indices]  # positional lookup
data['X_base_decoded'] = pd.concat(
    [data['X_train_decoded'], decoded_pre], ignore_index=True
)

# Alerts: independent copies of the test-post rows, renumbered from 0.
for source_key, alert_key in (('X_test_post', 'X_alert'), ('y_test_post', 'y_alert')):
    data[alert_key] = data[source_key].copy().reset_index(drop=True)
decoded_post = data['X_test_decoded'].iloc[post_indices]  # positional lookup
data['X_alert_decoded'] = decoded_post.reset_index(drop=True)

#### Retrieve metadata
* Retrieve prediction probabilities (case base + alerts)
* Retrieve historical performance (case base)

In [7]:
# Prediction probabilities: keep the second column of predict_proba, i.e. the
# class listed second in clf.classes_ (class 1 when labels are 0/1).
y_base_score = [proba[1] for proba in clf.predict_proba(data['X_base'])]
y_alert_score = [proba[1] for proba in clf.predict_proba(data['X_alert'])]

# Performance for cases in the case base: compare each prediction with the
# known true class and name the resulting confusion-matrix cell.
# Pairs that match none of the four cells are skipped.
y_base_pred = clf.predict(data['X_base'])
outcome_names = {(1, 1): 'TP', (1, 0): 'FP', (0, 0): 'TN', (0, 1): 'FN'}
base_performance = [
    outcome_names[(pred, true)]
    for pred, true in zip(y_base_pred, data['y_base'].values.ravel())
    if (pred, true) in outcome_names
]

# Gather metadata: outcome + score for the case base, score only for alerts.
meta_base = pd.DataFrame({'performance': base_performance, 'score': y_base_score})
meta_alert = pd.DataFrame({'score': y_alert_score})

#### Compute SHAP

In [8]:
# Explain the forest with SHAP and keep the values at index 1 of the result,
# one column per feature, for both the case base and the alerts.
# NOTE(review): the [1] indexing assumes shap_values returns one array per
# class; other shap releases may return a single 3-D array -- verify against
# the installed version.
explainer = shap.TreeExplainer(clf)

base_shap_per_class = explainer.shap_values(X=data['X_base'])
SHAP_base = pd.DataFrame(base_shap_per_class[1], columns=list(data['X_base']))

alert_shap_per_class = explainer.shap_values(X=data['X_alert'])
SHAP_alert = pd.DataFrame(alert_shap_per_class[1], columns=list(data['X_alert']))

print('Explained.')

Explained.


#### Fit nearest neighbors (NN) on SHAP values

In [9]:
# Find nearest SHAP neighbors
nn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='euclidean', n_jobs=-2)
nn.fit(SHAP_base)
print('Initialized nearest neighbor.')

Initialized nearest neighbor.


#### Save files

In [10]:
# Persist every artefact under <current working directory>/data.
out_dir = os.getcwd() + '/data'
os.makedirs(out_dir, exist_ok=True)  # create the folder if it is missing

# Save the fitted classifier and the nearest-neighbour index built on SHAP values
for filename, model in (('clf.pickle', clf), ('nn.pickle', nn)):
    with open(out_dir + '/' + filename, 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the case base and the alerts as CSV files (row index not written).
csv_outputs = (
    # Case base
    ('X_base.csv', data['X_base']),
    ('X_base_decoded.csv', data['X_base_decoded']),
    ('meta_base.csv', meta_base),
    ('SHAP_base.csv', SHAP_base),
    # Alerts
    ('X_alert.csv', data['X_alert']),
    ('X_alert_decoded.csv', data['X_alert_decoded']),
    # Fix: this file previously received meta_base, so the alert metadata was
    # never saved and meta_alert.csv held case-base rows instead.
    ('meta_alert.csv', meta_alert),
    ('SHAP_alert.csv', SHAP_alert),
)
for filename, frame in csv_outputs:
    frame.to_csv(out_dir + '/' + filename, index=False)

print('Saved!')

Saved!
