# Analysis of Random Forest Model

In [None]:
import altair as alt
import joblib
import numpy as np
import os
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, accuracy_score, roc_curve

# Load SHAP's JavaScript visualisation code into the notebook; the
# interactive force plot rendered further down depends on it.
shap.initjs()

In [None]:
# Seed reused wherever the notebook needs reproducible sampling.
RANDOM_SEED = 17

# Data and model paths are resolved relative to the parent of the
# directory the notebook is run from (symlinks resolved).
notebook_dir = os.path.realpath(os.curdir)
project_base = os.path.dirname(notebook_dir)

In [None]:
# All four processed CSVs live in the same folder under the project base.
cleaned_data_dir = os.path.join(project_base, 'data', 'cleaned_data')

# Feature matrices.
X_train_processed = pd.read_csv(os.path.join(cleaned_data_dir, 'processed_train_data.csv'))
X_test_processed = pd.read_csv(os.path.join(cleaned_data_dir, 'processed_test_data.csv'))

# Matching labels, loaded as DataFrames.
y_train = pd.read_csv(os.path.join(cleaned_data_dir, 'processed_train_y.csv'))
y_test = pd.read_csv(os.path.join(cleaned_data_dir, 'processed_test_y.csv'))

In [None]:
# Restore the previously trained random forest from disk.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
rf_model_path = os.path.join(project_base, 'data', 'trained_models', 'rf_clf.pkl')
rf_clf = joblib.load(rf_model_path)

In [None]:
# Hard class predictions for the test features; the metric cells below
# compare these against y_test.
rf_test_preds = rf_clf.predict(X_test_processed)

### Classification Report

In [None]:
# Compute both summaries first, then print them in the same order as before.
test_accuracy = accuracy_score(y_test, rf_test_preds)
test_report = classification_report(y_test, rf_test_preds)

print(f'Accuracy Score:\n\n{test_accuracy}\n')
print(f'Classification Report:\n\n{test_report}\n')

### Confusion Matrix

In [None]:
# Confusion matrix of true test labels against the forest's predictions.
test_confusion = confusion_matrix(y_test, rf_test_preds)
print(f'Confusion Matrix:\n\n{test_confusion}')

### RF Variable Importance

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Importance of each feature as reported by the fitted forest.
importances = rf_clf.feature_importances_

# Spread of each feature's importance across the individual trees;
# used as the error bars in the plots below.
per_tree_importances = [tree.feature_importances_ for tree in rf_clf.estimators_]
std = np.std(per_tree_importances, axis=0)

# One row per feature, labelled with the column names of the test features.
feature_names = list(X_test_processed)
forest_importances = pd.DataFrame({'feature_importance': importances}, index=feature_names)
# forest_importances['std'] = std
# forest_importances.reset_index(inplace=True)

In [None]:
# Preview the importances of the first 32 features.
forest_importances.head(32)

In [None]:
# Work on an independent copy of the first 32 rows so the full table is untouched.
forest_importances_slim = forest_importances.head(32).copy()

fig, ax = plt.subplots()
# Bars show each feature's importance; error bars show the per-tree spread
# for the same 32 features.
forest_importances_slim.plot.bar(yerr=std[:32], ax=ax)
ax.set_ylabel("Mean decrease in impurity")
ax.set_title("Feature importances using MDI")
# Kept as the final statement: it returns None, so the cell prints no repr.
fig.set_size_inches(12.5, 6.5)
# fig.tight_layout()

In [None]:
# Altair encodes columns, not the DataFrame index, so expose the feature
# names as an 'index' column and attach the per-tree standard deviation
# as a 'std' column. Built locally so this cell does not depend on the
# commented-out lines in the cell that creates `forest_importances`.
importance_chart_df = forest_importances.assign(std=std).reset_index()

# Both layers share the same y title so the layered axis shows one label.
importance_axis = alt.Y('feature_importance:Q', title='Feature Importance')

bars = alt.Chart().mark_bar().encode(
    x='index:O',
    y=importance_axis,
)

# The spread is already computed, so draw it with the yError channel
# (centre +/- std) rather than asking Altair to aggregate a confidence
# interval from raw samples.
error_bars = alt.Chart().mark_errorbar().encode(
    x='index:O',
    y=importance_axis,
    yError='std:Q',
)

alt.layer(bars, error_bars, data=importance_chart_df)

### Error by Sentence Length

### Ablation Analysis (Accuracy With Different Feature Subsets)

In [None]:
# train on all features except word2vec features

# rf = RandomForestClassifier()
# rf.fit(X_train_processed.iloc[:, :26], y_train.values)
# preds = rf.predict(X_test_processed.iloc[:, :26])
# print(f'Accuracy Score:\n\n{accuracy_score(y_test, preds)}\n')
# # 0.70089

In [None]:
# train on only word2vec features

# rf = RandomForestClassifier()
# rf.fit(X_train_processed.iloc[:, 26:], y_train.values)
# preds = rf.predict(X_test_processed.iloc[:, 26:])
# print(f'Accuracy Score:\n\n{accuracy_score(y_test, preds)}\n')
# # 0.66633

### ROC Curve

In [None]:
# Class probabilities for the test set; column 1 is used as the score.
rf_test_preds_probas = rf_clf.predict_proba(X_test_processed)
positive_class_scores = rf_test_preds_probas[:, 1]

# False/true positive rates at each threshold, collected for plotting.
fpr, tpr, thresholds = roc_curve(y_test, positive_class_scores)
roc_df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))

In [None]:
# ROC curve: true positive rate plotted against false positive rate.
roc_line = alt.Chart(roc_df).mark_line()
roc_line.encode(x='fpr', y='tpr')

### Precision Recall Curve

In [None]:
# Scores are rounded to two decimals before building the curve
# (presumably to limit the number of distinct thresholds - confirm).
rounded_scores = np.round(rf_test_preds_probas[:, 1], 2)

precision_, recall_, threshold = precision_recall_curve(y_test, rounded_scores)
pr_df = pd.DataFrame(dict(precision=precision_, recall=recall_))

In [None]:
# Precision-recall curve: precision plotted against recall.
pr_line = alt.Chart(pr_df).mark_line()
pr_line.encode(x='recall', y='precision')

## SHAP Values

In [None]:
# Build one tree explainer around the fitted forest; every SHAP cell
# below reuses it.
explainer = shap.TreeExplainer(rf_clf)

### SHAP to Interpret the Specific Observations (Local Explanation)

In [None]:
# Inspect one hand-picked training row.
idx = 57

# Double brackets keep the row as a one-row DataFrame, the form
# predict_proba is given here.
observation = X_train_processed.iloc[[idx]]
true_label = y_train.iloc[idx]
positive_proba = rf_clf.predict_proba(observation)[:, 1]

print(f"Observation true label: {true_label}")
print(f"Observation predicted label (proba): {positive_proba}")

observation

In [None]:
# Scan the first 1000 training rows for false positives (true label 0 but
# predicted class-1 probability above 0.5), printing the index of each and
# stopping after the fifth.
# The later SHAP cells read `observation`, so it is deliberately left bound
# to the last row examined: the fifth false positive, or row 999 if fewer
# than five are found.
count = 0
for idx in range(1000):
    observation = X_train_processed.iloc[[idx]]
    actual = y_train.iloc[idx].values[0]
    pred = rf_clf.predict_proba(observation)[0, 1]

    # Skip anything that is not a false positive.
    if not (actual == 0 and pred > 0.5):
        continue

    print(idx)
    count += 1
    if count >= 5:
        break

In [None]:
# Raw feature array for whichever row the search loop above left in
# `observation`.
observation.values

In [None]:
# SHAP values for that single row, passed in as a plain array.
# NOTE(review): the cells below index the result with [1], which assumes
# shap_values() returns one array per class - confirm for the installed
# SHAP version.
shap_values = explainer.shap_values(observation.values)

In [None]:
# one SHAP value for every variable (entry 1 of the result, presumably the
# positive class)
shap_values[1].shape

In [None]:
# If you need to check in what unit the explainer reports the model output,
# inspect this attribute:
explainer.model.tree_output

In [None]:
# Explainer's baseline for entry 1: presumably the average predicted value
# of "1" across the entire data set - the next cell computes the mean
# predicted probability for comparison.
explainer.expected_value[1]

In [None]:
# Mean predicted class-1 probability over the training features, for
# comparison with explainer.expected_value[1] in the previous cell.
# The training features are held in X_train_processed; the name used here
# before (processed_train_df) is not defined anywhere in this notebook.
predicted_probas = rf_clf.predict_proba(X_train_processed)
np.mean(predicted_probas[:, 1])

In [None]:
# Gap between this row's predicted class-1 probability and the explainer's
# baseline; the next cell sums the row's SHAP values for comparison.
rf_clf.predict_proba(observation)[:, 1] - explainer.expected_value[1]

In [None]:
# Total of the row's SHAP values for entry 1, to compare with the gap
# computed in the previous cell.
np.sum(shap_values[1])

In [None]:
# Force plot for the single row: starts from the explainer's baseline and
# shows each feature's SHAP contribution, labelled with the row's values.
shap.force_plot(explainer.expected_value[1], shap_values[1], features=observation)

### SHAP to Interpret the Model (Global Explanation)

In [None]:
# Reproducible random sample of 25 training rows for the global explanation.
observations = X_train_processed.sample(n=25, random_state=RANDOM_SEED)

In [None]:
%time
shap_values = explainer.shap_values(observations)

In [None]:
# Shape of entry 1 of the SHAP result for the sampled rows.
shap_values[1].shape

In [None]:
# shap.force_plot(explainer.expected_value[1], shap_values[1], features=observations)

In [None]:
# Summary plot of the sampled rows' SHAP values (entry 1), with the rows'
# feature values supplied alongside.
shap.summary_plot(shap_values[1], features=observations)