# Feature Importance
This python file will be to perform Feature Importance using Shapley values to determine the ranking of features in terms of how much they influence the model in effectively predicting the crime codes. The file will start with preprocessing the data similar to previous learning files then performing feature importance with Shapley values


**Authors:** Kevin Lu, Shrusti Jain, Smeet Patel, Taobo Liao


# Imports and Graph Configurations

In [18]:
import numpy as np
import pandas as pd
import time
import datetime
import tensorflow as tf
import random
import matplotlib
import torch
import torch.nn as nn
#%matplotlib notebook
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.offsetbox as offsetbox
from matplotlib.ticker import StrMethodFormatter
from google.colab import drive
from sklearn import datasets, linear_model

In [19]:
!pip install shap
import shap



In [20]:
#for some reason, this needs to be in a separate cell
params={
    "font.size":15,
    "lines.linewidth":5,
}
plt.rcParams.update(params)

In [21]:
#download train and debug and models
!gdown 1enR3DLH7iDuI0mG8rV3Z21tPhdZZRXOv
!gdown 1zeyltSH_KaN0qQCRCiZR8kXOG6VUXU9T
!gdown 1GOlOSBBJdWWdh8z2oS19aEROm-3oTB4d
!gdown 1_tvxGoEQEMFialDlTOjGmLaU8xz7_Yom
!gdown 1enYGeEeWNFDr-qHQuCNnb4q6yV7O9cpg

Downloading...
From (original): https://drive.google.com/uc?id=1enR3DLH7iDuI0mG8rV3Z21tPhdZZRXOv
From (redirected): https://drive.google.com/uc?id=1enR3DLH7iDuI0mG8rV3Z21tPhdZZRXOv&confirm=t&uuid=427909fb-a13b-4876-b671-c0c06a459393
To: /content/train.pkl
100% 224M/224M [00:07<00:00, 30.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zeyltSH_KaN0qQCRCiZR8kXOG6VUXU9T
To: /content/debug.pkl
100% 11.2M/11.2M [00:00<00:00, 147MB/s]
Downloading...
From: https://drive.google.com/uc?id=1GOlOSBBJdWWdh8z2oS19aEROm-3oTB4d
To: /content/basic_nn_model.pkl
100% 658k/658k [00:00<00:00, 88.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_tvxGoEQEMFialDlTOjGmLaU8xz7_Yom
To: /content/basic_nn_model_time_sensitive.pkl
100% 659k/659k [00:00<00:00, 127MB/s]
Downloading...
From: https://drive.google.com/uc?id=1enYGeEeWNFDr-qHQuCNnb4q6yV7O9cpg
To: /content/basic_nn_model_part12.pkl
100% 637k/637k [00:00<00:00, 104MB/s]


In [22]:
# Basic Neural Network with 3 hidden layers, BatchNorm and dropout
class BasicNN(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, dropout_rate):
        super(BasicNN, self).__init__()
        self.layers = nn.Sequential(
            # Input layer
            nn.Linear(input_size, hidden_sizes[0]),
            nn.BatchNorm1d(hidden_sizes[0]),
            nn.ReLU(),

            # Hidden Layer 1
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.BatchNorm1d(hidden_sizes[1]),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),

            # Hidden Layer 2
            nn.Linear(hidden_sizes[1], hidden_sizes[2]),
            nn.BatchNorm1d(hidden_sizes[2]),
            nn.ReLU(),

            # Hidden Layer 3 (New layer added)
            nn.Linear(hidden_sizes[2], hidden_sizes[3]),
            nn.BatchNorm1d(hidden_sizes[3]),
            nn.ReLU(),

            # Output layer
            nn.Linear(hidden_sizes[3], output_size),
        )

    def forward(self, x):
        return self.layers(x)


In [23]:
# Load dataframe objects from the Deep Learning notebook
crime_df_train = pd.read_pickle('/content/train.pkl')
crime_df_debug = pd.read_pickle('/content/debug.pkl')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

input_size = 297
hidden_sizes = [256, 180, 120, 80]
output_size = 69
learning_rate = 1e-3
dropout_rate = 0.2
milestones = [10, 15]

model = BasicNN(input_size=input_size, hidden_sizes=hidden_sizes, output_size=output_size, dropout_rate=dropout_rate)

# Load pre-trained weights into the model
model.load_state_dict(torch.load('/content/basic_nn_model.pkl', map_location=device))
model.to(device)
model.eval()

  model.load_state_dict(torch.load('/content/basic_nn_model.pkl', map_location=device))


BasicNN(
  (layers): Sequential(
    (0): Linear(in_features=297, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=180, bias=True)
    (4): BatchNorm1d(180, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Dropout(p=0.2, inplace=False)
    (7): Linear(in_features=180, out_features=120, bias=True)
    (8): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Linear(in_features=120, out_features=80, bias=True)
    (11): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): Linear(in_features=80, out_features=69, bias=True)
  )
)

# Preprocessing

In [24]:
# Add a binary column indicating if Vict Age is 0
crime_df_train['Vict Age Was 0'] = (crime_df_train['Vict Age'] == 0).astype(int)

# Select relevant columns for analysis
selected_columns = [
    'Status',
    'Weapon Used Cd',
    'Vict Descent',
    'Vict Sex',
    'Vict Age',
    'Mocodes',
    'Crm Cd',
    'Part 1-2',
    'Rpt Dist No',
    'AREA',
    'TIME OCC',
    'DATE OCC',
    'Premis Cd',
    'Vict Age Was 0'
]

# Create a DataFrame with only the selected columns
crime_selected_df = crime_df_train[selected_columns]
crime_selected_df.head()

Unnamed: 0,Status,Weapon Used Cd,Vict Descent,Vict Sex,Vict Age,Mocodes,Crm Cd,Part 1-2,Rpt Dist No,AREA,TIME OCC,DATE OCC,Premis Cd,Vict Age Was 0
0,AA,,O,M,0,,510,1,784,7,2130,2020-03-01,101.0,1
1,IC,,O,M,47,1822 1402 0344,330,1,182,1,1800,2020-02-08,128.0,0
2,IC,,X,X,19,0344 1251,480,1,356,3,1700,2020-11-04,502.0,0
3,IC,,O,M,19,0325 1501,343,1,964,9,2037,2020-03-10,405.0,0
4,IC,,H,M,28,1822 1501 0930 2004,354,2,666,6,1200,2020-08-17,102.0,0


In [25]:
counts = crime_selected_df['Crm Cd'].value_counts()
codes = counts[counts>500].index.tolist()

In [26]:
def convert_to_minutes(military_time):
    """
    Convert military time to minutes from midnight.

    Parameters:
    military_time (int): Time in military format, e.g., 2305 for 11:05 PM.

    Returns:
    int: Total minutes from midnight.
    """
    # Ensure the time is a four-digit string (e.g., '2305')
    military_time = str(military_time).zfill(4)

    # Extract hours and minutes from the string
    hours = int(military_time[:2])
    minutes = int(military_time[2:])

    # Calculate and return the total minutes from midnight
    total_minutes = hours * 60 + minutes
    return total_minutes

# Apply the convert_to_minutes function to 'TIME OCC' column
crime_selected_df['TIME OCC'] = crime_selected_df['TIME OCC'].apply(convert_to_minutes)

# Function to one-hot encode specified categorical columns
def one_hot_encode(df, columns):
    """
    Apply one-hot encoding to specified columns in the DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    columns (list): List of columns to one-hot encode.

    Returns:
    pd.DataFrame: DataFrame with one-hot encoded columns.
    """
    labels = []
    for column in columns:

        # Create one-hot encoded columns for each category in the column
        one_hot = pd.get_dummies(df[column], prefix=column)

        # Convert one-hot encoded DataFrame to integer type for compactness
        one_hot = one_hot.astype(int)

        # Replace the original column with the one-hot encoded columns
        df[column] = one_hot.values.tolist()

        # Add corresponding labels to labels
        labels += one_hot.columns.to_list()
    return df, labels

# Function to multi-hot encode 'Mocodes' column where each row may contain multiple codes
def multi_hot_encode_mocodes(df):
    """
    Multi-hot encode the 'Mocodes' column.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: DataFrame with 'Mocodes' column as multi-hot encoded vectors.
    """
    # Initialize a set of all unique Mocodes for multi-hot encoding
    mocode_counts = {}

    # Populate the set with unique Mocodes from each row (handling NaN values)
    for mocode_str in df['Mocodes'].dropna():
        mocode_str = str(mocode_str)
        mocodes = mocode_str.split(' ')
        for mocode in mocodes:
            mocode_counts[mocode] = mocode_counts.get(mocode, 0) + 1

    filtered_mocodes = {mocode for mocode, count in mocode_counts.items() if count > 1000}
    filtered_mocodes.add('NaN')
    mocode_index = {mocode: idx for idx, mocode in enumerate(sorted(filtered_mocodes))}

    # Define a helper function to encode Mocodes into a binary vector
    def encode_mocodes(mocode_str):
        # Split the Mocode string into individual codes, or set to 'NaN' if empty
        if isinstance(mocode_str, str):
            mocodes = mocode_str.split()
        else:
            mocodes = ['NaN']

        # Initialize a zero vector and set indices for each Mocode found
        encoded_vector = [0] * len(mocode_index)
        for mocode in mocodes:
            if mocode in mocode_index:
                encoded_vector[mocode_index[mocode]] = 1
        return encoded_vector

    # Apply the encoding function to the 'Mocodes' column
    df['Mocodes'] = df['Mocodes'].apply(encode_mocodes)
    mocodes = [f'Mocode_{mocode}' for mocode in list(mocode_index.keys())]
    return df, mocodes

# Specify columns to one-hot encode
columns_to_encode = ['Status', 'Vict Descent', 'Vict Sex', 'Weapon Used Cd']

# Apply one-hot encoding to specified columns and store the result in a new DataFrame
crime_selected_one_hot_df, labels = one_hot_encode(crime_selected_df.copy(), columns_to_encode)

crime_selected_one_hot_df, mocodes = multi_hot_encode_mocodes(crime_selected_one_hot_df.copy())
crime_selected_one_hot_df["Part 1-2"] -= 1
crime_selected_one_hot_df.head()
ungrouped_feature_names = ["Date Occ", "Vict Age Was 0", "Vict Age", "Rpt Dist No", "Area", "Time Occurred", "Premise Code"] + labels + mocodes
print(len(ungrouped_feature_names))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_selected_df['TIME OCC'] = crime_selected_df['TIME OCC'].apply(convert_to_minutes)


297


In [27]:
# Create the final X dataset for feature importance after preprocessing
crime_selected_one_hot_df['DATE OCC INT'] = crime_selected_one_hot_df['DATE OCC'].astype('int64') // (10**9 * 60 * 60 * 24)
crime_selected_one_hot_df [['Status','Weapon Used Cd','Vict Descent','Vict Sex','Vict Age','Crm Cd','Part 1-2','Rpt Dist No','AREA','TIME OCC','DATE OCC','Premis Cd','Vict Age Was 0']].isna().any()
crime_selected_one_hot_df['Premis Cd'] = crime_selected_one_hot_df['Premis Cd'].fillna(0)
non_list = crime_selected_one_hot_df[['DATE OCC INT','Vict Age Was 0', 'Vict Age', 'Rpt Dist No', 'AREA', 'TIME OCC','Premis Cd']].to_numpy(dtype=np.float32)
X = np.concatenate([non_list, np.array(crime_selected_one_hot_df['Status'].to_list()), np.array(crime_selected_one_hot_df['Weapon Used Cd'].to_list()), np.array(crime_selected_one_hot_df['Vict Descent'].to_list()), np.array(crime_selected_one_hot_df['Vict Sex'].to_list()), np.array(crime_selected_one_hot_df['Mocodes'].to_list())], axis=1)

In [28]:
# Creating the Y dataset for feature importance evaluation
Y = crime_selected_one_hot_df['Crm Cd'].to_numpy()
unique_classes = np.unique(Y)
counts = crime_selected_df['Crm Cd'].value_counts()
retained_classes = counts[counts>500].index.tolist()
removed_classes = counts[counts<=500].index.tolist()
class_to_index = {cls: idx for idx, cls in enumerate(retained_classes)}
class_to_index.update({cls: len(retained_classes) for cls in removed_classes})
Y_indices = np.array([class_to_index[cls] for cls in Y])
size = crime_selected_one_hot_df.shape[0]
weights = counts[counts>500].to_list()
weights.append(counts[counts<=500].sum())
weights = (np.sqrt(crime_selected_one_hot_df['Crm Cd'].shape[0]/np.array(weights)))/10
Y_one_hot = np.zeros((len(Y), len(retained_classes)+1), dtype=np.float32)
Y_one_hot[np.arange(len(Y)), Y_indices] = 1
Y_part12 = crime_selected_one_hot_df['Part 1-2'].to_numpy()
print(f"One-hot encoded Y shape: {Y_one_hot.shape}")

One-hot encoded Y shape: (986500, 69)


In [29]:
import torch
from torch.utils.data import Dataset, DataLoader

# Initializing our dataset for x features and y labels
class CrimeDataset(Dataset):
    def __init__(self, features, crimes):
        self.features = features
        self.labels = crimes

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = self.features[idx]
        y = self.labels[idx]
        return x, y

# Utilize T4 GPU for faster computation
device = 'cuda' if torch.cuda.is_available() else 'cpu'
X_torch = torch.tensor(X, dtype=torch.float32).to(device)
print(X_torch.shape)
Y_torch = torch.tensor(Y_one_hot, dtype=torch.float32).to(device)
dataset = CrimeDataset(X_torch, Y_torch)

# Initialization of our dataset will be 70/20/10 split being training, validation, and test respectively
train_set, val_set, test_set = torch.utils.data.random_split(dataset, [.70, .20, .10])
train_loader = DataLoader(train_set, batch_size=4096, shuffle=True)
val_loader = DataLoader(val_set, batch_size=4096, shuffle=False)
test_loader = DataLoader(test_set, batch_size=4096, shuffle=False)

torch.Size([986500, 297])


# Feature Importance

In [None]:
def model_predict(X):
    X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(X_tensor)
        return torch.softmax(outputs, dim=1).cpu().numpy()  # Return probabilities

# Select a single batch of data from the training set for SHAP analysis
batch_data, _ = next(iter(train_loader))
batch_data = batch_data.cpu().numpy()

explainer = shap.Explainer(model_predict, batch_data)

# Compute SHAP values for the batch
shap_values = explainer(batch_data)

PermutationExplainer explainer: 4097it [13:48,  4.90it/s]


In [None]:
print(shap)

<module 'shap' from '/usr/local/lib/python3.10/dist-packages/shap/__init__.py'>


In [None]:
average_shap_values = np.mean(shap_values.values, axis=0)  # Shape: (297, 69)
for target_class in range(69):
    class_shap_values = average_shap_values[:, target_class]  # Shape: (297,)

    sorted_indices = np.argsort(-np.abs(class_shap_values))  # Sort by absolute importance
    sorted_shap_values = class_shap_values[sorted_indices]
    sorted_feature_names = np.array(ungrouped_feature_names)[sorted_indices]

    # Plot the averaged SHAP values for this class
    plt.figure(figsize=(10, 6))
    plt.barh(sorted_feature_names[:20], sorted_shap_values[:20])  # Top 20 features
    plt.xlabel("Average SHAP Value")
    plt.ylabel("Feature")
    plt.title(f"Averaged Feature Importance for Class {target_class}")
    plt.gca().invert_yaxis()  # Invert y-axis for better readability
    plt.tight_layout()
    plt.savefig(f"averaged_summary_plot_class_{target_class}.png")
    plt.close()

Actual feature descriptions:
* Feature 1: Date Occured
* Feature 2: Vict Age Was 0 (Unlisted)
* Feature 3: Vict Age
* Feature 4: Rpt Dist No
* Feature 5: Area
* Feature 6: Time Occurred
* Feature 7: Premise Code
* Features 8-13: Status
* Features 14-92: Weapon Used Cd
* Features 93-112: Vict Descent
* Features 113-117: Vict Sex
* Features 118-297: Mocodes

In [None]:
mean_abs_shap_values = np.mean(np.abs(shap_values.values), axis=(0, 2))  # Shape: (297,)
# Sort features by their mean absolute SHAP value
sorted_indices = np.argsort(-mean_abs_shap_values)
sorted_shap_values = mean_abs_shap_values[sorted_indices]
sorted_feature_names = np.array(ungrouped_feature_names)[sorted_indices]

most_important_feature = sorted_feature_names[0]
most_important_value = sorted_shap_values[0]

print("\nTop 10 Features Across All Classes:")
for feature, importance in zip(sorted_feature_names[:10], sorted_shap_values[:10]):
    print(f"Feature: {feature}, Mean Absolute SHAP Value: {importance:.4f}")



Top 10 Features Across All Classes:
Feature: Feature 7, Mean Absolute SHAP Value: 0.0069
Feature: Feature 143, Mean Absolute SHAP Value: 0.0064
Feature: Feature 3, Mean Absolute SHAP Value: 0.0056
Feature: Feature 136, Mean Absolute SHAP Value: 0.0030
Feature: Feature 75, Mean Absolute SHAP Value: 0.0027
Feature: Feature 1, Mean Absolute SHAP Value: 0.0023
Feature: Feature 168, Mean Absolute SHAP Value: 0.0019
Feature: Feature 297, Mean Absolute SHAP Value: 0.0019
Feature: Feature 6, Mean Absolute SHAP Value: 0.0013
Feature: Feature 279, Mean Absolute SHAP Value: 0.0011


From this trial, we can see that most of the important features are Mocodes. This makes sense because mocodes provides a detailed description of the environment of the crime scene before it happens. Such as "Stranger" or "Domestic Violence" etc.

In [None]:
true_shap_values = np.empty(12)
true_shap_values[:7] = mean_abs_shap_values[:7]
true_shap_values[7] = mean_abs_shap_values[7:12].sum()
true_shap_values[8] = mean_abs_shap_values[13:91].sum()
true_shap_values[9] = mean_abs_shap_values[92:111].sum()
true_shap_values[10] = mean_abs_shap_values[112:116].sum()
true_shap_values[11] = mean_abs_shap_values[117:].sum()
true_feature_names = ["Date Occ", "Vict Age Was 0", "Vict Age", "Rpt Dist No", "Area", "Time Occurred", "Premise Code", "Status", "Weapon Used Cd", "Vict Descent", "Vict Sex", "Mocodes"]
sorted_true_indices = np.argsort(-true_shap_values)
sorted_true_shap_values = true_shap_values[sorted_true_indices]
sorted_true_feature_names = np.array(true_feature_names)[sorted_true_indices]

most_important_true_feature = sorted_true_feature_names[0]
most_important_true_value = sorted_true_shap_values[0]
print("SHAP Values of Each Feature Ranked")
for feature, importance in zip(sorted_true_feature_names, sorted_true_shap_values):
    print(f"Feature: {feature}, Summed Mean Absolute SHAP Value: {importance:.4f}")

SHAP Values of Each Feature Ranked
Feature: Mocodes, Summed Mean Absolute SHAP Value: 0.0322
Feature: Premise Code, Summed Mean Absolute SHAP Value: 0.0069
Feature: Vict Age, Summed Mean Absolute SHAP Value: 0.0056
Feature: Weapon Used Cd, Summed Mean Absolute SHAP Value: 0.0056
Feature: Date Occ, Summed Mean Absolute SHAP Value: 0.0023
Feature: Vict Descent, Summed Mean Absolute SHAP Value: 0.0017
Feature: Vict Sex, Summed Mean Absolute SHAP Value: 0.0013
Feature: Time Occurred, Summed Mean Absolute SHAP Value: 0.0013
Feature: Rpt Dist No, Summed Mean Absolute SHAP Value: 0.0011
Feature: Vict Age Was 0, Summed Mean Absolute SHAP Value: 0.0009
Feature: Status, Summed Mean Absolute SHAP Value: 0.0008
Feature: Area, Summed Mean Absolute SHAP Value: 0.0002


This confirms our expectation that Mocodes would provide the most help in determining the type of crime. This can be further strengthened by the fact that without Mocodes, we were only able to reach around 50% accuracy. Whereas with Mocodes, we could reach 65-70% accuracy in our deep learning model.

# Part 1-2 Feature Importance

In [None]:
part12_model = BasicNN(input_size=input_size, hidden_sizes=hidden_sizes, output_size=2, dropout_rate=dropout_rate)

# Load pre-trained weights into the model
part12_model.load_state_dict(torch.load('/content/basic_nn_model_part12.pkl', map_location=device))
part12_model.to(device)
part12_model.eval()

  part12_model.load_state_dict(torch.load('/content/basic_nn_model_part12.pkl', map_location=device))


BasicNN(
  (layers): Sequential(
    (0): Linear(in_features=297, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=180, bias=True)
    (4): BatchNorm1d(180, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Dropout(p=0.2, inplace=False)
    (7): Linear(in_features=180, out_features=120, bias=True)
    (8): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Linear(in_features=120, out_features=80, bias=True)
    (11): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): Linear(in_features=80, out_features=2, bias=True)
  )
)

In [None]:
def part12_model_predict(X):
    X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
    part12_model.eval()
    with torch.no_grad():
        outputs = part12_model(X_tensor)
        return torch.softmax(outputs, dim=1).cpu().numpy()  # Return probabilities

# Select a single batch of data from the training set for SHAP analysis
batch_data, _ = next(iter(train_loader))
batch_data = batch_data.cpu().numpy()

explainer = shap.Explainer(part12_model_predict, batch_data)

# Compute SHAP values for the batch
shap_values = explainer(batch_data)

PermutationExplainer explainer: 4097it [08:55,  7.52it/s]


Feature: Mocodes, Summed Mean Absolute SHAP Value: 0.0322
Feature: Premise Code, Summed Mean Absolute SHAP Value: 0.0069
Feature: Vict Age, Summed Mean Absolute SHAP Value: 0.0056
Feature: Weapon Used Cd, Summed Mean Absolute SHAP Value: 0.0056
Feature: Date Occ, Summed Mean Absolute SHAP Value: 0.0023
Feature: Vict Descent, Summed Mean Absolute SHAP Value: 0.0017
Feature: Vict Sex, Summed Mean Absolute SHAP Value: 0.0013
Feature: Time Occurred, Summed Mean Absolute SHAP Value: 0.0013
Feature: Rpt Dist No, Summed Mean Absolute SHAP Value: 0.0011
Feature: Vict Age Was 0, Summed Mean Absolute SHAP Value: 0.0009
Feature: Status, Summed Mean Absolute SHAP Value: 0.0008
Feature: Area, Summed Mean Absolute SHAP Value: 0.0002


In [None]:
average_shap_values = np.mean(shap_values.values, axis=0)  # Shape: (297, 69)
for target_class in range(69):
    class_shap_values = average_shap_values[:, target_class]  # Shape: (297,)

    sorted_indices = np.argsort(-np.abs(class_shap_values))  # Sort by absolute importance
    sorted_shap_values = class_shap_values[sorted_indices]
    sorted_feature_names = np.array(ungrouped_feature_names)[sorted_indices]

    # Plot the averaged SHAP values for this class
    plt.figure(figsize=(10, 6))
    plt.barh(sorted_feature_names[:20], sorted_shap_values[:20])  # Top 20 features
    plt.xlabel("Average SHAP Value")
    plt.ylabel("Feature")
    plt.title(f"Averaged Feature Importance for Class {target_class}")
    plt.gca().invert_yaxis()  # Invert y-axis for better readability
    plt.tight_layout()
    plt.savefig(f"averaged_summary_plot_class_{target_class}.png")
    plt.close()

In [None]:
true_shap_values = np.empty(12)
true_shap_values[:7] = mean_abs_shap_values[:7]
true_shap_values[7] = mean_abs_shap_values[7:12].sum()
true_shap_values[8] = mean_abs_shap_values[13:91].sum()
true_shap_values[9] = mean_abs_shap_values[92:111].sum()
true_shap_values[10] = mean_abs_shap_values[112:116].sum()
true_shap_values[11] = mean_abs_shap_values[117:].sum()
true_feature_names = ["Date Occ", "Vict Age Was 0", "Vict Age", "Rpt Dist No", "Area", "Time Occurred", "Premise Code", "Status", "Weapon Used Cd", "Vict Descent", "Vict Sex", "Mocodes"]
sorted_true_indices = np.argsort(-true_shap_values)
sorted_true_shap_values = true_shap_values[sorted_true_indices]
sorted_true_feature_names = np.array(true_feature_names)[sorted_true_indices]

most_important_true_feature = sorted_true_feature_names[0]
most_important_true_value = sorted_true_shap_values[0]

for feature, importance in zip(sorted_true_feature_names, sorted_true_shap_values):
    print(f"Feature: {feature}, Summed Mean Absolute SHAP Value: {importance:.4f}")

# Feature Importance over Time

In [30]:
time_sensitive_model = BasicNN(input_size=input_size, hidden_sizes=hidden_sizes, output_size=output_size, dropout_rate=dropout_rate)
# Load pre-trained weights into the model
time_sensitive_model.load_state_dict(torch.load('/content/basic_nn_model_time_sensitive.pkl', map_location=device))
time_sensitive_model.to(device)
time_sensitive_model.eval()

  time_sensitive_model.load_state_dict(torch.load('/content/basic_nn_model_time_sensitive.pkl', map_location=device))


BasicNN(
  (layers): Sequential(
    (0): Linear(in_features=297, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=180, bias=True)
    (4): BatchNorm1d(180, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Dropout(p=0.2, inplace=False)
    (7): Linear(in_features=180, out_features=120, bias=True)
    (8): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Linear(in_features=120, out_features=80, bias=True)
    (11): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): Linear(in_features=80, out_features=69, bias=True)
  )
)

In [None]:
def time_sensitive_model_predict(X):
    X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
    time_sensitive_model.eval()
    with torch.no_grad():
        outputs = time_sensitive_model(X_tensor)
        return torch.softmax(outputs, dim=1).cpu().numpy()  # Return probabilities

# Select a single batch of data from the training set for SHAP analysis
batch_data, _ = next(iter(train_loader))
batch_data = batch_data.cpu().numpy()

explainer = shap.Explainer(time_sensitive_model_predict, batch_data)

# Compute SHAP values for the batch
shap_values = explainer(batch_data)

PermutationExplainer explainer: 4097it [07:05,  9.46it/s]


In [None]:
average_shap_values = np.mean(shap_values.values, axis=0)  # Shape: (297, 69)
for target_class in range(69):
    class_shap_values = average_shap_values[:, target_class]  # Shape: (297,)

    sorted_indices = np.argsort(-np.abs(class_shap_values))  # Sort by absolute importance
    sorted_shap_values = class_shap_values[sorted_indices]
    sorted_feature_names = np.array(ungrouped_feature_names)[sorted_indices]

    # Plot the averaged SHAP values for this class
    plt.figure(figsize=(10, 6))
    plt.barh(sorted_feature_names[:20], sorted_shap_values[:20])  # Top 20 features
    plt.xlabel("Average SHAP Value")
    plt.ylabel("Feature")
    plt.title(f"Averaged Feature Importance for Class {target_class}")
    plt.gca().invert_yaxis()  # Invert y-axis for better readability
    plt.tight_layout()
    plt.savefig(f"averaged_summary_plot_class_{target_class}.png")
    plt.close()

In [None]:
true_shap_values = np.empty(12)
true_shap_values[:7] = mean_abs_shap_values[:7]
true_shap_values[7] = mean_abs_shap_values[7:12].sum()
true_shap_values[8] = mean_abs_shap_values[13:91].sum()
true_shap_values[9] = mean_abs_shap_values[92:111].sum()
true_shap_values[10] = mean_abs_shap_values[112:116].sum()
true_shap_values[11] = mean_abs_shap_values[117:].sum()
true_feature_names = ["Date Occ", "Vict Age Was 0", "Vict Age", "Rpt Dist No", "Area", "Time Occurred", "Premise Code", "Status", "Weapon Used Cd", "Vict Descent", "Vict Sex", "Mocodes"]
sorted_true_indices = np.argsort(-true_shap_values)
sorted_true_shap_values = true_shap_values[sorted_true_indices]
sorted_true_feature_names = np.array(true_feature_names)[sorted_true_indices]

most_important_true_feature = sorted_true_feature_names[0]
most_important_true_value = sorted_true_shap_values[0]

for feature, importance in zip(sorted_true_feature_names, sorted_true_shap_values):
    print(f"Feature: {feature}, Summed Mean Absolute SHAP Value: {importance:.4f}")