# Simple Random Forest (RFT) Classifier with ExplainerDashboard

ExplainerDashboard is an interactive dashboard that lets you explore how different settings affect your classifier.

In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report

from emissions.data import load_data, clean_data, split
from emissions.trainer import MakeTransformer

from explainerdashboard import ClassifierExplainer, ExplainerDashboard

In [3]:
# Load the sample CSV, clean it, and split it into train/test features and targets.
# NOTE(review): the saved output of this cell shows
# KeyError: "['AFTER_COVID'] not in index" -- presumably the sample file or the
# cleaning step no longer provides that column; confirm before relying on a re-run.
sample_csv = '../../data/sample201320.csv'
df = load_data(sample_csv)
df = clean_data(df)
X_train, X_test, y_train, y_test = split(df)

[32m----------------start loading data----------------[0m


KeyError: "['AFTER_COVID'] not in index"

In [None]:
# choose important columns
cols = ['MODEL_YEAR','VEHICLE_AGE','MILE_YEAR', 'ENGINE_WEIGHT_RATIO','MAKE']

# transform rare MAKE into other
# The transformer is fitted on the training split only and then applied,
# unchanged, to both splits.
mt = MakeTransformer().fit(X_train[cols])
print("\nMAKEs don't belong to other:", mt.makes_keep)
X_train_update = mt.transform(X_train[cols])
print('\nNumber of unique makes in train', X_train_update.MAKE.nunique())
X_test_update = mt.transform(X_test[cols])
print('\nNumber of unique makes in test', X_test_update.MAKE.nunique())

# transform MAKE into one-hot numeric array
# The encoder is fitted on the training split ONLY; the test split is encoded
# with transform() so that both frames share the same columns in the same
# order. Re-fitting on the test split would derive the categories from the
# test data, so the positional MAKE_<i> columns could differ in number and
# meaning from the ones the model is trained on. With handle_unknown='ignore',
# a MAKE not seen during fit is encoded as an all-zero row.
enc = OneHotEncoder(handle_unknown='ignore')
MAKE_train = pd.DataFrame(enc.fit_transform(X_train_update[['MAKE']]).toarray())
MAKE_train = MAKE_train.add_prefix('MAKE_')
MAKE_test = pd.DataFrame(enc.transform(X_test_update[['MAKE']]).toarray())
MAKE_test = MAKE_test.add_prefix('MAKE_')

# drop MAKE and add the one-hot numeric array to form one new data frame
# The feature frames get a fresh 0..n-1 index so the column-wise concat lines
# up row by row with the one-hot frames (which are built from arrays and so
# already carry a default 0..n-1 index).
X_train_rel = X_train_update.drop('MAKE', axis=1).reset_index(drop=True)
X_train_rel = pd.concat([X_train_rel, MAKE_train], axis=1)
X_test_rel = X_test_update.drop('MAKE', axis=1).reset_index(drop=True)
X_test_rel = pd.concat([X_test_rel, MAKE_test], axis=1)

# Guard against train/test feature drift before any model sees the data.
assert list(X_train_rel.columns) == list(X_test_rel.columns), \
    "train and test feature columns differ"

In [None]:
# RandomForestClassifier based on settings from GridSearch
model = RandomForestClassifier(n_estimators=1000,n_jobs=-1,max_depth=30,
                              min_samples_leaf=1,min_samples_split=2)
model.fit(X_train_rel, y_train)
y_pred = model.predict(X_test_rel)

# Evaluate on the held-out split. The confusion matrix was previously computed
# into a throwaway variable and never shown; print it next to the report.
# In scikit-learn's convention rows are true labels, columns predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
print(classification_report(y_test, y_pred))

In [None]:
# Fit a much smaller forest (10 estimators) for use with ExplainerDashboard.
e_model = RandomForestClassifier(
    n_estimators=10,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
)
e_model.fit(X_train_rel, y_train)

# Wrap the fitted model together with the test features/targets, then save
# the explainer to disk.
explainer = ClassifierExplainer(e_model, X_test_rel, y_test, n_jobs=-1)
explainer.dump("RFT_explainer.joblib")

In [None]:
# Build the dashboard from the explainer (mode='inline', SHAP interaction
# values switched off) and start it.
db = ExplainerDashboard(
    explainer,
    mode='inline',
    shap_interaction=False,
)
db.run()