# Random Forest Model + Explainable AI Techniques (XAI)

## Data Preparation

In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

Load Train Dataset

In [2]:
# Read the Farm-Flow training split; the path is relative to this notebook.
train_csv_path = "../0_Datasets/Farm-Flow/train.csv"
df_train = pd.read_csv(train_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: '../0_Datasets/Farm-Flow/train.csv'

In [None]:
# Show the training frame as this cell's rich output.
df_train

Load Test Dataset

In [None]:
# Read the Farm-Flow test split; the path is relative to this notebook.
test_csv_path = "../0_Datasets/Farm-Flow/test.csv"
df_test = pd.read_csv(test_csv_path)

In [None]:
# Show the test frame as this cell's rich output.
df_test

-----
## Train and Test Datasets

Drop Multiclass Column

In [None]:
# Remove the multiclass 'traffic' column from both frames.
# errors="ignore" makes the cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
df_train = df_train.drop(columns="traffic", errors="ignore")
df_test = df_test.drop(columns="traffic", errors="ignore")

Excluding the target variable

In [None]:
# Every column except the target column 'is_attack' is treated as a feature.
X_columns = df_train.columns.drop(["is_attack"])

Create a feature matrix X by selecting only the columns specified in X_columns. Then convert the selected data into a NumPy array.

In [None]:
# Feature matrix as a NumPy array, with columns in the order given by X_columns.
X = df_train.loc[:, X_columns].to_numpy()

Create the target vector y from the `is_attack` column as a NumPy array.

In [None]:
# Target vector taken from the 'is_attack' column.
y = df_train.loc[:, "is_attack"].values

Split into training and testing sets

In [None]:
# Hold out 25% of the rows. The split is stratified on y and uses a fixed
# seed, so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

Load Previously Trained Model

In [None]:
# Location of the Random Forest model saved by the modeling phase.
model_folder = "../2_Modeling_Phase/Saved-Models/"
model_filename = "Farm-Flow_RF_Random_Forest_Model.joblib"
model_path = f"{model_folder}{model_filename}"

# NOTE: joblib.load unpickles the file, so only load model files you trust.
model = joblib.load(model_path)

Get Feature Names and Class Names

In [None]:
# Column names of the feature matrix, as a plain Python list.
feature_names = [*X_columns]
# Display names for the two classes, ordered by encoded label.
class_names = ["Normal", "Malicious"]
# Encoded label -> display name, derived from each name's position above.
response_dict = dict(enumerate(class_names))

Generate Prediction

In [None]:
# Predicted labels of the loaded model for the held-out split X_test.
pred = model.predict(X_test)

Labeled DataFrames

In [None]:
# Wrap the NumPy splits in DataFrames so the columns carry feature names.
X_train_labeled = pd.DataFrame(data=X_train, columns=feature_names)
X_test_labeled = pd.DataFrame(data=X_test, columns=feature_names)

# The predictions and both target vectors are 1-D arrays, so each becomes a Series.
pred_series, y_test_target_series, y_train_target_series = (
    pd.Series(values) for values in (pred, y_test, y_train)
)

Create a subset of the Train DF for faster training

In [None]:
# Keep a stratified 10% sample of the training split (NumPy arrays).
# random_state is fixed so the same rows are drawn on every run; without a
# seed the subset changed each time the cell was executed.
subset_percentage = 0.1
X_subset, _, y_subset, _ = train_test_split(
    X_train,
    y_train,
    test_size=1 - subset_percentage,
    stratify=y_train,
    random_state=42,
)

In [None]:
# Keep a stratified 10% sample of the labeled training frame and its targets.
# random_state is fixed so the same rows are drawn on every run; without a
# seed the subset changed each time the cell was executed.
subset_percentage = 0.1
X_subset_labeled, _, y_subset_labeled, _ = train_test_split(
    X_train_labeled,
    y_train_target_series,
    test_size=1 - subset_percentage,
    stratify=y_train_target_series,
    random_state=42,
)

Row to explain

In [None]:
# Index of the row to explain.
idx: int = 0

---

## yellowbrick

### How it works
Yellowbrick is a suite of visual diagnostic tools called "Visualizers" that extend the scikit-learn API to allow human steering of the model selection process.

### How it applies

### Repository:
- https://github.com/DistrictDataLabs/yellowbrick

### Paper:
- https://github.com/DistrictDataLabs/yellowbrick/blob/develop/paper/paper.md

In [None]:
# One-time setup: uncomment to install yellowbrick (prefer `%pip` so the
# package is installed into the environment of the running kernel).
#!pip install yellowbrick

In [None]:
from yellowbrick.classifier import ROCAUC

# ROC/AUC curve for the loaded model.
# The curve is scored on the held-out split instead of the full matrix X, so
# it is no longer evaluated on the very rows that are passed to fit(). This
# matches how the ClassificationReport cell uses X_train / X_test.
visualizer = ROCAUC(model, binary=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

Rank1D and Rank2D evaluate single features or pairs of features using a variety of metrics that score the features on the scale [-1, 1] or [0, 1] allowing them to be ranked. A similar concept to SPLOMs, the scores are visualized on a lower-left triangle heatmap so that patterns between pairs of features can be easily discerned for downstream analysis.

In [None]:
from yellowbrick.features import Rank1D

# Rank each feature individually with the Shapiro-Wilk ranking algorithm.
visualizer = Rank1D(algorithm="shapiro", features=feature_names)

visualizer.fit(X, y)        # fit the visualizer on the data
visualizer.transform(X)     # transform the data
visualizer.show()           # finalize and render the figure

In [None]:
from yellowbrick.features import Rank2D

# Pairwise comparison of every feature pair using the covariance metric; the
# scores are returned ranked as a lower-left triangle diagram.
visualizer = Rank2D(algorithm="covariance", features=feature_names)

visualizer.fit(X, y)        # fit the visualizer on the data
visualizer.transform(X)     # transform the data
visualizer.show()           # finalize and render the figure

In [None]:
# Same pairwise ranking as a Rank2D visualizer, this time using the Pearson
# ranking algorithm.
visualizer = Rank2D(algorithm="pearson", features=feature_names)

visualizer.fit(X, y)        # fit the visualizer on the data
visualizer.transform(X)     # transform the data
visualizer.show()           # finalize and render the figure

RadViz is a multivariate data visualization algorithm that plots each feature dimension uniformly around the circumference of a circle then plots points on the interior of the circle such that the point normalizes its values on the axes from the center to each arc. This mechanism allows as many dimensions as will easily fit on a circle, greatly expanding the dimensionality of the visualization.

In [None]:
from yellowbrick.features import RadViz

# RadViz plot of all features, with points labeled by class name.
# (The redundant `visualizer = visualizer = ...` chained assignment was removed.)
visualizer = RadViz(classes=class_names, features=feature_names)

visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.show()         # Finalize and render the visualizer

In [None]:
from yellowbrick.features import ParallelCoordinates

# Parallel-coordinates plot over all features, with lines labeled by class name.
visualizer = ParallelCoordinates(features=feature_names, classes=class_names)

visualizer.fit(X, y)        # fit the visualizer on the data
visualizer.transform(X)     # transform the data
visualizer.show()           # finalize and render the figure

In [None]:
# Parallel-coordinates plot with 'standard' normalization, drawn from a 10%
# sample of the instances.
# (The redundant `visualizer = visualizer = ...` chained assignment was removed.)
visualizer = ParallelCoordinates(
    classes=class_names,
    features=feature_names,
    normalize='standard',
    sample=0.1,
)

visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.show()         # Finalize and render the visualizer

The PCA decomposition visualizer projects the high-dimensional feature matrix onto its principal components so that the instances can be plotted in a scatter plot, colored by the target.

In [None]:
from yellowbrick.features import PCADecomposition

# Project the scaled features onto their principal components and plot them.
# The previous `center=False, col=y` arguments are not documented
# PCADecomposition options, so they were dropped. The target is supplied via
# the y passed to fit_transform, `classes` provides the legend labels, and
# `random_state` keeps the decomposition reproducible.
# NOTE(review): confirm `classes` / `random_state` against the installed
# yellowbrick version.
visualizer = PCADecomposition(scale=True, classes=class_names, random_state=42)
visualizer.fit_transform(X, y)
visualizer.show()

In [None]:
from yellowbrick.classifier import ClassificationReport, ROCAUC, ClassBalance,  ConfusionMatrix

In [None]:
# Classification report for the loaded model, labeled with the class names.
visualizer = ClassificationReport(model, classes=class_names)

visualizer.fit(X_train, y_train)    # fit the visualizer with the training split
visualizer.score(X_test, y_test)    # evaluate on the held-out split
# Binding the return value keeps the cell from echoing it as output.
g = visualizer.show()