In [3]:
# Step 1: Install the required library
!pip install shap

# Step 2: Import all necessary packages
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Step 3: Load and prepare the data
print("Loading data...")
df = pd.read_csv('data.csv')

# Drop serial number if present
if 'sno' in df.columns:
    df = df.drop("sno", axis=1)

# Encode categorical variables automatically
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(str).str.strip()   # clean spaces
    df[col] = pd.factorize(df[col])[0]          # convert categories to integers

# Drop missing values
df.dropna(inplace=True)

print("Data preparation complete.")

# Step 4: Split the data for training and testing
X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Data splitting complete.")

# Step 5: Define and train the Random Forest model
rf_grid = {
    "n_estimators": np.arange(10, 1000, 50),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2)
}
rs_rfc = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=rf_grid,
    n_iter=20,
    cv=5,
    verbose=1,
    random_state=42
)
print("\n--- Training Random Forest model... ---")
rs_rfc.fit(X_train, y_train)
print("--- Model training complete. ---")

# Step 6: Perform SHAP analysis and generate the plot
print("\n--- Performing SHAP Analysis... ---")
best_model = rs_rfc.best_estimator_
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

print("\n--- Generating SHAP Summary Plot ---")
shap.summary_plot(shap_values[1], X_test)

Loading data...
Data preparation complete.
Data splitting complete.

--- Training Random Forest model... ---
Fitting 5 folds for each of 20 candidates, totalling 100 fits
--- Model training complete. ---

--- Performing SHAP Analysis... ---

--- Generating SHAP Summary Plot ---


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.