# Simple RFT Classifier with ExplainerDashboard

ExplainerDashboard is an interactive Dashboard where you can see the effect of different settings on your Classifier

In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report

from emissions.data import load_data, clean_data, split
from emissions.trainer import MakeTransformer

from explainerdashboard import ClassifierExplainer, ExplainerDashboard

In [4]:
# get the data and split
df = load_data('../../data/sample201320.csv')
df = clean_data(df)
X_train, X_test, y_train, y_test = split(df)

[32m----------------start loading data----------------[0m
[34mData loaded: 187503 records[0m
[32m----------------start cleaning data----------------[0m
[31m
Records in input data: 187503[0m
[34m
Share of Pass and Fail before cleaning:[0m
[34mFail: 7%
Pass: 93%[0m
[34m
Unique vehicles in Fail: 10813[0m
[34mUnique vehicles in Pass: 84908[0m

Records with missing GVWR: 8125
[31m
Records after droping rows where GVWR is < 1000 or missing: 179373[0m
[31m
Records after keeping only the earliest test within a month for each vehicle: 165732[0m

Records where ODOMETER = 0: 796
[31m
Records after droping rows where ODOMETER is missing: 164855[0m
[31m
Records after droping rows where MILE_YEAR > 40,000: 163891[0m
[31m
Records in output data:163891[0m
[34m
Share of Pass and Fail after cleaning:[0m
[34mFail: 7%
Pass: 93%[0m
[34m
Unique vehicles in Fail: 10194[0m
[34mUnique vehicles in Pass: 78573[0m
['VEHICLE_TYPE' 'MODEL_YEAR' 'VEHICLE_AGE' 'MILE_YEAR' 'GVWR'
 'EN

In [6]:
# choose important columns
cols = ['MODEL_YEAR','VEHICLE_AGE','MILE_YEAR', 'ENGINE_WEIGHT_RATIO','MAKE']

# transform rare MAKE into other
mt = MakeTransformer().fit(X_train[cols])
print("\nMAKEs don't belong to other:", mt.makes_keep)
X_train_update = mt.transform(X_train[cols])
print('\nNumber of unique makes in train', X_train_update.MAKE.nunique())
X_test_update = mt.transform(X_test[cols])
print('\nNumber of unique makes in test', X_test_update.MAKE.nunique())
        
# transform MAKE into one-hot numeric array
enc = OneHotEncoder(handle_unknown='ignore')
MAKE_train = pd.DataFrame(enc.fit_transform(X_train_update[['MAKE']]).toarray())
MAKE_train = MAKE_train.add_prefix('MAKE_')
MAKE_test = pd.DataFrame(enc.fit_transform(X_test_update[['MAKE']]).toarray())
MAKE_test = MAKE_test.add_prefix('MAKE_')

# drop MAKE and add the one-hot numeric array to form one new data frame
X_train_rel = X_train_update.drop('MAKE',axis=1)
X_train_rel.reset_index(drop=True, inplace=True)
MAKE_train.reset_index(drop=True, inplace=True)
X_train_rel = pd.concat([X_train_rel, MAKE_train],axis=1)
X_test_rel = X_test_update.drop('MAKE',axis=1)
X_test_rel.reset_index(drop=True, inplace=True)
MAKE_test.reset_index(drop=True, inplace=True)
X_test_rel = pd.concat([X_test_rel, pd.DataFrame(MAKE_test)],axis=1)


57 make labels each account for less than 1.0% of cars and together account for 9.98% of cars

MAKEs don't belong to other: ['bmw', 'buick', 'chevrolet', 'chrysler', 'dodge', 'ford', 'gmc', 'honda', 'hyundai', 'jeep', 'kia', 'lexus', 'mazda', 'mitsubishi', 'nissan', 'pontiac', 'subaru', 'toyota', 'volkswagen']

Number of unique makes in train 20

Number of unique makes in test 20


In [8]:
# RandomForestClassifier based on settings from GridSearch
model = RandomForestClassifier(n_estimators=1000,n_jobs=-1,max_depth=30,
                              min_samples_leaf=1,min_samples_split=2)
model.fit(X_train_rel, y_train)
y_pred = model.predict(X_test_rel)
tmp = confusion_matrix(y_test,y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     30496
           1       0.26      0.02      0.04      2283

    accuracy                           0.93     32779
   macro avg       0.60      0.51      0.50     32779
weighted avg       0.88      0.93      0.90     32779



In [14]:
# use Explainer Dashboard with less estimators
e_model = RandomForestClassifier(n_estimators=10,n_jobs=-1,max_depth=30,
                              min_samples_leaf=1,min_samples_split=2)
e_model.fit(X_train_rel, y_train)
explainer = ClassifierExplainer(e_model, X_test_rel, y_test, n_jobs=-1)
explainer.dump("RFT_explainer.joblib")

Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)


In [15]:
db = ExplainerDashboard(explainer,shap_interaction=False,mode='inline')
db.run()

Building ExplainerDashboard..
Generating layout...
Calculating shap values...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating prediction probabilities...
Calculating classification_dfs...
Calculating predictions...
Calculating pr auc curves...
Calculating pred_percentiles...
Calculating roc auc curves...
Calculating confusion matrices...
Calculating liftcurve_dfs...
Calculating metrics...
Calculating ShadowDecTree for each individual decision tree...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard inline (terminate it with ExplainerDashboard.terminate(8050))
