In [22]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import shap
from rdt import HyperTransformer
import warnings
warnings.simplefilter(action='ignore', category=Warning)
# Data Preparation and Preprocessing
# Load the raw table; the last column is treated as the label and every
# other column as a feature.
# NOTE(review): 'dataaset' looks like a typo, but the string must match the
# file name on disk, so it is left untouched - confirm before renaming.
filepath = './data/heloc_dataaset.csv'
df = pd.read_csv(filepath)

ht = HyperTransformer()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # 20% of the data will be used for testing

# Learn the transformer configuration and fit it on the TRAINING features only.
X_train = train_df.iloc[:, :-1]
ht.detect_initial_config(data=X_train)
X_train = ht.fit_transform(X_train)
y_train = train_df.iloc[:, -1]

# Apply the already-fitted transformer to the test features. Re-detecting the
# config and re-fitting on the test split (as was done before) would encode
# train and test with different fitted mappings and leak test statistics into
# the preprocessing step.
X_test = test_df.iloc[:, :-1]
X_test = ht.transform(X_test)
y_test = test_df.iloc[:, -1]

# Resetting indices for direct indexing
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Fit the decision tree whose predictions are later compared against the
# logistic-regression predictions (fixed seed for reproducibility).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Fit the logistic regression that SHAP explains; the training features are
# handed to the explainer as its second argument, and the explainer is then
# evaluated on every test row.
lr_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)
explainer = shap.Explainer(lr_model, X_train)
shap_values = explainer(X_test)

# Store results and counts of k values
results = []
k_counts = {}

# Loop-invariant quantities, computed once instead of once per (instance, k):
#   * per-feature training means used as the "neutral" fill value,
#   * the test features as a plain float matrix for positional access,
#   * the decision-tree prediction for every test row.
# Plain arrays are used on purpose: feature positions returned by argsort are
# positional, so indexing a label-indexed Series with them would depend on
# pandas' integer-key fallback behaviour.
n_features = X_train.shape[1]
feature_means = X_train.mean(axis=0).to_numpy(dtype=float)
test_matrix = X_test.to_numpy(dtype=float)
dt_predictions = dt_model.predict(test_matrix)

# Instance-wise Feature Analysis
# For each test row, find the smallest k (starting at 2) such that keeping the
# row's k features with the largest |SHAP| values - and replacing all other
# features with their training mean - makes the logistic regression predict
# the true label. Rows for which no k succeeds are not recorded.
for instance_index in range(len(X_test)):
    true_label = y_test.iloc[instance_index]
    dt_prediction = dt_predictions[instance_index]

    # Feature positions ordered by ascending |SHAP|; the last k entries are
    # the k most influential features for this instance.
    ranked_features = np.argsort(np.abs(shap_values.values[instance_index]))

    for k in range(2, n_features + 1):
        top_k_features = ranked_features[-k:]

        # Modify the instance to keep only top k features, setting others to their mean
        modified_instance = feature_means.copy()
        modified_instance[top_k_features] = test_matrix[instance_index, top_k_features]

        lr_prediction = lr_model.predict(modified_instance.reshape(1, -1))[0]

        if lr_prediction == true_label:
            results.append([
                instance_index,
                k,
                true_label,
                dt_prediction,
                lr_prediction,
                dt_prediction == lr_prediction,
                list(top_k_features),
            ])

            k_counts[k] = k_counts.get(k, 0) + 1
            break

# Persist the per-instance results, then report how many instances were
# resolved at each k together with the overall feature / test-row counts.
result_columns = [
    'Instance_Index',
    'Num_Features_Used',
    'True_Label',
    'DT_Prediction',
    'LR_Prediction',
    'Matches_DT',
    'Top_k_Features',
]
results_df = pd.DataFrame(results, columns=result_columns)
results_df.to_csv('decision_tree_feature_influence_results_mean.csv', index=False)

print("Counts of predictions retained for each k:")
for k in sorted(k_counts):
    count = k_counts[k]
    print(f"k = {k}: {count}")
print(f"Total number of features: {X_train.shape[1]}")
print(f"Total number of Test Instances: {X_test.shape[0]}")
print("\nResults saved to 'decision_tree_feature_influence_results_mean.csv'")


Counts of predictions retained for each k:
k = 2: 1348
k = 3: 169
k = 4: 29
k = 5: 106
k = 6: 20
k = 7: 14
k = 8: 13
k = 9: 9
k = 10: 3
k = 11: 4
k = 13: 6
k = 14: 1
k = 15: 1
k = 18: 1
Total number of features: 24
Total number of Test Instances: 2092

Results saved to 'decision_tree_feature_influence_results_mean.csv'
