In [7]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import warnings
import shap
print("SHAP is working correctly!")

SHAP is working correctly!


In [8]:

from sklearn.model_selection import train_test_split

# Load the CSV file
df = pd.read_csv("lexical_features/lexical_features_20250525_184014.csv")  # e.g., "remove_duplicate_from_combined_dataset.csv"

# Separate features (X) and target label (y)
X = df.drop(columns=["label", "url", "source"], errors='ignore')  # Drop non-feature columns
y = df["label"]

# First split: 70% train, 30% temp
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

# Second split: 15% validation, 15% test from remaining 30%
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")


NameError: name 'X' is not defined

In [3]:
df = pd.read_csv('lexical_features/lexical_features_20250525_184014.csv')
print("Data loaded successfully. Shape:", df.shape)


Data loaded successfully. Shape: (665795, 53)


In [4]:
# Filter out inactive rows
df = df[df['is_active'] == 1]

# Separate features and target
X = df.drop(columns=['label', 'domain', 'is_active'])
y = df['label']

# Handle numeric/non-numeric columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
X = X[numeric_cols]  # Drop non-numeric columns

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Data preprocessing completed.")

KeyError: 'is_active'

In [44]:
# Initialize models
rf = RandomForestClassifier(random_state=42)
svm = SVC(kernel='rbf', probability=True, random_state=42)
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# Train individual models
print("Training models...")
rf.fit(X_train_scaled, y_train)
svm.fit(X_train_scaled, y_train)
xgb.fit(X_train_scaled, y_train)

# Create ensemble
ensemble = VotingClassifier(estimators=[
    ('rf', rf),
    ('svm', svm),
    ('xgb', xgb)
], voting='soft')
ensemble.fit(X_train_scaled, y_train)
print("All models trained successfully.")

Training models...
All models trained successfully.


In [18]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = (model.predict_proba(X_test)[:, 1] >= 0.5).astype(int)
    y_prob = model.predict_proba(X_test)[:, 1]

    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }

# Evaluate all models
results = pd.DataFrame([
    evaluate_model(rf, X_test_scaled, y_test, 'Random Forest'),
    evaluate_model(svm, X_test_scaled, y_test, 'SVM'),
    evaluate_model(xgb, X_test_scaled, y_test, 'XGBoost'),
    evaluate_model(ensemble, X_test_scaled, y_test, 'Ensemble')
])

print("\nModel Evaluation Results:")
print(results.drop(columns=['Confusion Matrix']))


Model Evaluation Results:
           Model  Accuracy  Precision    Recall  F1 Score   ROC AUC
0  Random Forest  0.824074   0.968750  0.632653  0.765432  0.932722
1            SVM  0.842593   0.880952  0.755102  0.813187  0.908682
2        XGBoost  0.824074   0.788462  0.836735  0.811881  0.909201
3       Ensemble  0.833333   0.844444  0.775510  0.808511  0.936700


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def evaluate_model(model, X_test, y_test, model_name, threshold=0.5):
    """
    Evaluate model performance with customizable probability threshold

    Parameters:
    - model: Trained model
    - X_test: Test features
    - y_test: True labels
    - model_name: Name of the model
    - threshold: Probability threshold for classification (default=0.5)
    """
    try:
        y_prob = model.predict_proba(X_test)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        return {
            'Model': model_name,
            'Threshold': threshold,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc_score(y_test, y_prob),
            'Confusion Matrix': confusion_matrix(y_test, y_pred),
            'Positive Rate': y_pred.mean()  # Percentage of positive predictions
        }
    except Exception as e:
        print(f"Error evaluating {model_name}: {str(e)}")
        return None

# Define custom thresholds for each model
model_thresholds = {
    'Random Forest': 0.43,   # Default threshold
    'SVM': 0.59,            # SVM often needs lower threshold
    'XGBoost': 0.45,       # Slightly more conservative
    'Ensemble': 0.55        # Default threshold
}

# Evaluate all models with their respective thresholds
evaluation_results = []
for model_name, model in [('Random Forest', rf),
                         ('SVM', svm),
                         ('XGBoost', xgb),
                         ('Ensemble', ensemble)]:
    result = evaluate_model(
        model,
        X_test_scaled,
        y_test,
        model_name,
        threshold=model_thresholds[model_name]
    )
    if result is not None:
        evaluation_results.append(result)

# Create results DataFrame
results = pd.DataFrame(evaluation_results)

# Print formatted results
print("\nModel Evaluation Results with Custom Thresholds:")
print(results.drop(columns=['Confusion Matrix']).to_string(index=False))

# Optional: Display confusion matrices separately
print("\nConfusion Matrices:")
for _, row in results.iterrows():
    print(f"\n{row['Model']} (Threshold={row['Threshold']}):")
    print(row['Confusion Matrix'])

NameError: name 'rf' is not defined

In [46]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import pandas as pd

def cross_validate_model(model, X, y, model_name, threshold=0.5, n_splits=10):
    """
    Perform stratified k-fold cross-validation with custom threshold

    Parameters:
    - model: Model object
    - X: Features (numpy array or pandas DataFrame)
    - y: Target (numpy array or pandas Series)
    - model_name: Name of model
    - threshold: Probability threshold
    - n_splits: Number of cross-validation folds

    Returns:
    - Dictionary of mean metrics across all folds
    - Full confusion matrix
    """
    # Convert to numpy arrays if they're pandas objects
    X_array = X.values if hasattr(X, 'values') else X
    y_array = y.values if hasattr(y, 'values') else y

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    metrics = {
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1': [],
        'ROC AUC': [],
        'Positive Rate': []
    }
    conf_matrices = []

    for train_idx, test_idx in skf.split(X_array, y_array):
        X_train, X_test = X_array[train_idx], X_array[test_idx]
        y_train, y_test = y_array[train_idx], y_array[test_idx]

        # Clone model to avoid refitting the same object
        model_clone = clone(model)
        model_clone.fit(X_train, y_train)

        try:
            y_prob = model_clone.predict_proba(X_test)[:, 1]
            y_pred = (y_prob >= threshold).astype(int)

            # Store metrics for this fold
            metrics['Accuracy'].append(accuracy_score(y_test, y_pred))
            metrics['Precision'].append(precision_score(y_test, y_pred, zero_division=0))
            metrics['Recall'].append(recall_score(y_test, y_pred))
            metrics['F1'].append(f1_score(y_test, y_pred))
            metrics['ROC AUC'].append(roc_auc_score(y_test, y_prob))
            metrics['Positive Rate'].append(y_pred.mean())
            conf_matrices.append(confusion_matrix(y_test, y_pred))
        except Exception as e:
            print(f"Error in fold: {str(e)}")
            continue

    # Calculate mean metrics
    mean_metrics = {
        'Model': model_name,
        'Threshold': threshold,
        'Accuracy': np.mean(metrics['Accuracy']),
        'Precision': np.mean(metrics['Precision']),
        'Recall': np.mean(metrics['Recall']),
        'F1 Score': np.mean(metrics['F1']),
        'ROC AUC': np.mean(metrics['ROC AUC']),
        'Positive Rate': np.mean(metrics['Positive Rate']),
        'Confusion Matrix': sum(conf_matrices),  # Sum across all folds
        'Std Accuracy': np.std(metrics['Accuracy']),
        'Std F1': np.std(metrics['F1'])
    }

    return mean_metrics

# Define custom thresholds for each model
model_thresholds = {
    'Random Forest': 0.44, #44
    'SVM': 0.65, #60 -90, 65 -91
    'XGBoost': 0.47,
    'Ensemble': 0.61
}

# Perform cross-validation for each model
cv_results = []
for model_name, model in [('Random Forest', rf),
                         ('SVM', svm),
                         ('XGBoost', xgb),
                         ('Ensemble', ensemble)]:
    print(f"\nRunning 10-fold CV for {model_name}...")
    result = cross_validate_model(
        model,
        X_train_scaled,  # Using training data for CV
        y_train,
        model_name,
        threshold=model_thresholds[model_name]
    )
    cv_results.append(result)

# Create and display results DataFrame
cv_df = pd.DataFrame(cv_results)
print("\nCross-Validation Results (10-fold):")
print(cv_df.drop(columns=['Confusion Matrix']).to_string(index=False))

# Display aggregated confusion matrices
print("\nAggregated Confusion Matrices (Sum across all folds):")
for _, row in cv_df.iterrows():
    print(f"\n{row['Model']} (Threshold={row['Threshold']}):")
    print(row['Confusion Matrix'])

# Final test set evaluation
print("\nFinal Evaluation on Test Set:")
test_results = []
for model_name, model in [('Random Forest', rf),
                         ('SVM', svm),
                         ('XGBoost', xgb),
                         ('Ensemble', ensemble)]:
    res = evaluate_model(
        model,
        X_test_scaled,
        y_test,
        model_name,
        threshold=model_thresholds[model_name]
    )
    if res is not None:
        test_results.append(res)

test_df = pd.DataFrame(test_results)
print(test_df.drop(columns=['Confusion Matrix']).to_string(index=False))


Running 10-fold CV for Random Forest...

Running 10-fold CV for SVM...

Running 10-fold CV for XGBoost...

Running 10-fold CV for Ensemble...

Cross-Validation Results (10-fold):
        Model  Threshold  Accuracy  Precision  Recall  F1 Score  ROC AUC  Positive Rate  Std Accuracy   Std F1
Random Forest       0.44     0.880   0.925000    0.90  0.873333 0.883333          0.430      0.228254 0.205372
          SVM       0.65     0.825   0.733333    0.70  0.683333 0.883333          0.325      0.191377 0.383333
      XGBoost       0.47     0.775   0.791667    0.75  0.720000 0.875000          0.415      0.249249 0.308113
     Ensemble       0.61     0.845   0.833333    0.75  0.750000 0.883333          0.345      0.176706 0.309570

Aggregated Confusion Matrices (Sum across all folds):

Random Forest (Threshold=0.44):
[[24  3]
 [ 2 16]]

SVM (Threshold=0.65):
[[25  2]
 [ 6 12]]

XGBoost (Threshold=0.47):
[[22  5]
 [ 5 13]]

Ensemble (Threshold=0.61):
[[25  2]
 [ 5 13]]

Final Evaluation on Te

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation function for test set
def evaluate_model(model, X_test, y_test, model_name, threshold=0.5):
    try:
        y_prob = model.predict_proba(X_test)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        return {
            'Model': model_name,
            'Threshold': threshold,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc_score(y_test, y_prob),
            'Positive Rate': y_pred.mean(),
            'Confusion Matrix': confusion_matrix(y_test, y_pred),
            'Std Accuracy': 0.0,  # Not applicable here
            'Std F1': 0.0  # Not applicable here
        }
    except Exception as e:
        print(f"Error evaluating {model_name}: {e}")
        return None

# Cross-validation function
def cross_validate_model(model, X, y, model_name, threshold=0, n_splits=10):
    X_array = X.values if hasattr(X, 'values') else X
    y_array = y.values if hasattr(y, 'values') else y

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    metrics = {k: [] for k in ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC', 'Positive Rate']}
    conf_matrices = []

    for train_idx, test_idx in skf.split(X_array, y_array):
        X_train, X_test = X_array[train_idx], X_array[test_idx]
        y_train, y_test = y_array[train_idx], y_array[test_idx]

        model_clone = clone(model)
        model_clone.fit(X_train, y_train)

        try:
            y_prob = model_clone.predict_proba(X_test)[:, 1]
            y_pred = (y_prob >= threshold).astype(int)

            metrics['Accuracy'].append(accuracy_score(y_test, y_pred))
            metrics['Precision'].append(precision_score(y_test, y_pred, zero_division=0))
            metrics['Recall'].append(recall_score(y_test, y_pred))
            metrics['F1'].append(f1_score(y_test, y_pred))
            metrics['ROC AUC'].append(roc_auc_score(y_test, y_prob))
            metrics['Positive Rate'].append(y_pred.mean())
            conf_matrices.append(confusion_matrix(y_test, y_pred))
        except Exception as e:
            print(f"Error in fold: {e}")

    return {
        'Model': model_name,
        'Threshold': threshold,
        'Accuracy': np.mean(metrics['Accuracy']),
        'Precision': np.mean(metrics['Precision']),
        'Recall': np.mean(metrics['Recall']),
        'F1 Score': np.mean(metrics['F1']),
        'ROC AUC': np.mean(metrics['ROC AUC']),
        'Positive Rate': np.mean(metrics['Positive Rate']),
        'Confusion Matrix': sum(conf_matrices),
        'Std Accuracy': np.std(metrics['Accuracy']),
        'Std F1': np.std(metrics['F1'])
    }

# Define thresholds
model_thresholds = {
    'Random Forest': 0.44,
    'SVM': 0.65,
    'XGBoost': 0.47,
    'Ensemble': 0.61
}

# Perform CV
cv_results = []
for model_name, model in [('Random Forest', rf), ('SVM', svm), ('XGBoost', xgb), ('Ensemble', ensemble)]:
    print(f"\nRunning 10-fold CV for {model_name}...")
    result = cross_validate_model(model, X_train_scaled, y_train, model_name, threshold=model_thresholds[model_name])
    cv_results.append(result)

cv_df = pd.DataFrame(cv_results)
print("\nCross-Validation Results (10-fold):")
print(cv_df.drop(columns=['Confusion Matrix']).to_string(index=False))

# Plot confusion matrices
print("\nConfusion Matrices (Cross-Validation):")
for _, row in cv_df.iterrows():
    cm = row['Confusion Matrix']
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{row['Model']} (Threshold={row['Threshold']})")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

# Final test set evaluation
print("\nFinal Evaluation on Test Set:")
test_results = []
for model_name, model in [('Random Forest', rf), ('SVM', svm), ('XGBoost', xgb), ('Ensemble', ensemble)]:
    res = evaluate_model(model, X_test_scaled, y_test, model_name, threshold=model_thresholds[model_name])
    if res:
        test_results.append(res)

test_df = pd.DataFrame(test_results)
print(test_df.drop(columns=['Confusion Matrix']).to_string(index=False))

# Plot test set confusion matrices
print("\nConfusion Matrices (Test Set):")
for _, row in test_df.iterrows():
    cm = row['Confusion Matrix']
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
    plt.title(f"Test Set: {row['Model']} (Threshold={row['Threshold']})")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()


NameError: name 'rf' is not defined

In [7]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = (model.predict_proba(X_test)[:, 1] >= 0.).astype(int)
    y_prob = model.predict_proba(X_test)[:, 1]

    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }

# Evaluate all models
results = pd.DataFrame([
    evaluate_model(rf, X_test_scaled, y_test, 'Random Forest'),
    evaluate_model(svm, X_test_scaled, y_test, 'SVM'),
    evaluate_model(xgb, X_test_scaled, y_test, 'XGBoost'),
    evaluate_model(ensemble, X_test_scaled, y_test, 'Ensemble')
])

print("\nModel Evaluation Results:")
print(results.drop(columns=['Confusion Matrix']))

NameError: name 'rf' is not defined

In [8]:
# Section 5: Final Robust SHAP Analysis (Complete Fixed Version)
import shap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from warnings import filterwarnings
from tqdm import tqdm

# Disable warnings
filterwarnings('ignore')

### 1. SHAP Initialization ###
def initialize_shap_explainers(models_dict, X_train, feature_names):
    """Initialize SHAP explainers with proper shape handling"""
    explainers = {}
    print("Initializing SHAP explainers...")

    # Create background data
    background = shap.sample(X_train, min(100, len(X_train)))

    # Tree-based models
    if 'Random Forest' in models_dict:
        try:
            explainers['Random Forest'] = shap.TreeExplainer(
                models_dict['Random Forest'],
                feature_perturbation="tree_path_dependent"
            )
        except Exception as e:
            print(f"❌ Random Forest explainer failed: {str(e)}")
            explainers['Random Forest'] = None

    if 'XGBoost' in models_dict:
        try:
            explainers['XGBoost'] = shap.TreeExplainer(
                models_dict['XGBoost'],
                feature_perturbation="tree_path_dependent"
            )
        except Exception as e:
            print(f"❌ XGBoost explainer failed: {str(e)}")
            explainers['XGBoost'] = None

    # SVM and Ensemble
    if 'SVM' in models_dict:
        try:
            def svm_predict(X):
                return models_dict['SVM'].predict_proba(X)

            explainers['SVM'] = shap.KernelExplainer(
                svm_predict,
                background,
                silent=True
            )
        except Exception as e:
            print(f"❌ SVM explainer failed: {str(e)}")
            explainers['SVM'] = None

    if 'Ensemble' in models_dict:
        try:
            def ensemble_predict(X):
                return models_dict['Ensemble'].predict_proba(X)

            explainers['Ensemble'] = shap.KernelExplainer(
                ensemble_predict,
                background,
                silent=True
            )
        except Exception as e:
            print(f"❌ Ensemble explainer failed: {str(e)}")
            explainers['Ensemble'] = None

    print("✅ SHAP explainers initialized")
    return explainers

### 2. SHAP Value Computation ###
def compute_shap_values(explainers, X_test, feature_names, max_samples=100):
    """Compute SHAP values with proper shape handling"""
    shap_values = {}
    X_test_sample = X_test[:max_samples]

    print("\nComputing SHAP values...")

    for name, explainer in explainers.items():
        if explainer is None:
            continue

        try:
            if isinstance(explainer, shap.TreeExplainer):
                vals = explainer.shap_values(X_test_sample)
                # Handle different tree explainer outputs
                if isinstance(vals, list):
                    # Binary classification - take class 1 values
                    shap_values[name] = vals[1]
                elif len(vals.shape) == 3:
                    # Multi-class - take class 1 values
                    shap_values[name] = vals[:, :, 1]
                else:
                    shap_values[name] = vals
            else:
                # For KernelExplainer (SVM/Ensemble)
                vals = np.zeros((len(X_test_sample), len(feature_names)))

                for i in tqdm(range(len(X_test_sample)), desc=f"Computing {name} SHAP"):
                    sample_shap = explainer.shap_values(
                        X_test_sample[i:i+1],
                        silent=True,
                        nsamples=200
                    )

                    # Handle both binary and multi-class cases
                    if isinstance(sample_shap, list):
                        vals[i] = sample_shap[1][0]  # Class 1 probabilities
                    elif len(sample_shap.shape) == 3:
                        vals[i] = sample_shap[0, :, 1]  # Class 1 for multi-class
                    else:
                        vals[i] = sample_shap[0]  # Single output

                shap_values[name] = vals

            # Validate shape
            if shap_values[name].shape[1] != len(feature_names):
                print(f"⚠️ Adjusting {name} SHAP shape from {shap_values[name].shape} to match {len(feature_names)} features")
                if shap_values[name].shape[1] > len(feature_names):
                    shap_values[name] = shap_values[name][:, :len(feature_names)]
                else:
                    padded = np.zeros((shap_values[name].shape[0], len(feature_names)))
                    padded[:, :shap_values[name].shape[1]] = shap_values[name]
                    shap_values[name] = padded

            print(f"✅ {name} SHAP computed - Shape: {shap_values[name].shape}")
        except Exception as e:
            print(f"❌ {name} SHAP computation failed: {str(e)}")
            shap_values[name] = None

    return shap_values, X_test_sample

### 3. SHAP Visualization (Fixed for Random Forest) ###
def generate_shap_visualizations(shap_values, feature_names, X_test):
    """Generate visualizations with robust error handling and show value impact"""
    print("\nGenerating SHAP visualizations...")

    sample_size = min(100, len(X_test))
    X_sample = X_test[:sample_size]
    X_df = pd.DataFrame(X_sample, columns=feature_names)

    for name, values in shap_values.items():
        if values is None:
            continue

        try:
            # Prepare SHAP values
            sample_values = values[:sample_size]
            if len(sample_values.shape) == 3:
                sample_values = sample_values[:, :, 1]  # Class 1 only

            ### 1. Summary Dot Plot ###
            plt.figure(figsize=(12, 8))
            shap.summary_plot(
                sample_values,
                X_df,
                show=False,
                max_display=20
            )
            plt.title(f'{name} SHAP Summary (Dot)', pad=20)
            plt.tight_layout()
            plt.savefig(f"shap_{name.lower().replace(' ', '_')}_summary_dot.png",
                        bbox_inches='tight', dpi=150)
            plt.close()

            ### 2. Optional Bar Plot ###
            plt.figure(figsize=(12, 8))
            shap.summary_plot(
                sample_values,
                X_df,
                show=False,
                plot_type='bar',
                max_display=20
            )
            plt.title(f'{name} SHAP Summary (Bar)', pad=20)
            plt.tight_layout()
            plt.savefig(f"shap_{name.lower().replace(' ', '_')}_summary_bar.png",
                        bbox_inches='tight', dpi=150)
            plt.close()

            ### 3. Dependence Plots for Top Features ###
            mean_abs = np.mean(np.abs(sample_values), axis=0)
            top_idx = np.argsort(mean_abs)[-5:][::-1]  # Top 5 for detailed dependence

            for idx in top_idx:
                feature = feature_names[idx]
                plt.figure(figsize=(10, 6))
                shap.dependence_plot(
                    feature,
                    sample_values,
                    X_df,
                    show=False
                )
                plt.title(f"{name} Dependence Plot: {feature}")
                plt.tight_layout()
                plt.savefig(f"shap_{name.lower().replace(' ', '_')}_dependence_{feature}.png",
                            bbox_inches='tight', dpi=150)
                plt.close()

            ### 4. Custom Importance Plot ###
            plt.figure(figsize=(12, 8))
            plt.barh(np.array(feature_names)[top_idx], mean_abs[top_idx])
            plt.title(f'{name} Top Features')
            plt.tight_layout()
            plt.savefig(f"shap_{name.lower().replace(' ', '_')}_importance.png",
                        bbox_inches='tight', dpi=150)
            plt.close()

            print(f"✅ Saved {name} visualizations (summary, bar, dependence)")
        except Exception as e:
            print(f"❌ {name} visualization failed: {str(e)}")

### 4. Update CSV with SHAP ###
def update_csv_with_shap(df, shap_values, feature_names, X_test_sample, models_dict):
    """Add SHAP explanations to CSV"""
    print("\nUpdating CSV with SHAP explanations...")

    df_out = df.iloc[:len(X_test_sample)].copy()

    # Add predictions
    for name, model in models_dict.items():
        df_out[f"{name.lower()}_prob"] = model.predict_proba(X_test_sample)[:, 1]
        df_out[f"{name.lower()}_pred"] = (df_out[f"{name.lower()}_prob"] >= 0.5).astype(int)

    # Add SHAP explanations
    for name, values in shap_values.items():
        if values is None:
            continue

        explanations = []
        for i in range(len(X_test_sample)):
            # Ensure we have 1D array
            sample_values = values[i] if len(values.shape) == 2 else values[i, :, 1]

            # Get top 3 features
            top3_idx = np.argsort(np.abs(sample_values))[-3:][::-1]
            explanation = ", ".join([
                f"{feature_names[j]} ({'↑' if sample_values[j] > 0 else '↓'}{abs(sample_values[j]):.3f})"
                for j in top3_idx
            ])
            explanations.append(explanation)

        df_out[f"{name.lower()}_explanation"] = explanations

    df_out.to_csv('final_predictions_with_shap.csv', index=False)
    print("✅ Saved predictions with SHAP explanations")

### Execution Workflow ###
# 1. Prepare models dict
models_dict = {
    'Random Forest': rf,
    'SVM': svm,
    'XGBoost': xgb,
    'Ensemble': ensemble
}

# 2. Get feature names
feature_names = X.columns.tolist()

# 3. Initialize explainers
explainers = initialize_shap_explainers(models_dict, X_train_scaled, feature_names)

# 4. Compute SHAP values
shap_values, X_test_sample = compute_shap_values(
    explainers,
    X_test_scaled,
    feature_names,
    max_samples=100
)

# 5. Generate visualizations
generate_shap_outputs(shap_values, feature_names, X_test_sample)

# 6. Update CSV
update_csv_with_shap(df, shap_values, feature_names, X_test_sample, models_dict)

print("\n✔️ SHAP analysis completed successfully!")

NameError: name 'rf' is not defined

In [48]:
# Final Corrected SHAP Analysis
import shap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from warnings import filterwarnings
from tqdm import tqdm

# Disable warnings
filterwarnings('ignore')

### 1. SHAP Initialization (Fixed for Tree Models) ###
def initialize_shap_explainers(models_dict, X_train, feature_names):
    """Initialize SHAP explainers with correct configurations"""
    explainers = {}
    print("Initializing SHAP explainers...")

    # Background data for KernelExplainer
    background = shap.sample(X_train, min(50, len(X_train)))

    # Tree-based models (with correct parameters)
    if 'Random Forest' in models_dict:
        try:
            explainers['Random Forest'] = shap.TreeExplainer(
                models_dict['Random Forest'],
                feature_perturbation="tree_path_dependent"
            )
        except Exception as e:
            print(f"❌ Random Forest explainer failed: {str(e)}")

    if 'XGBoost' in models_dict:
        try:
            explainers['XGBoost'] = shap.TreeExplainer(
                models_dict['XGBoost'],
                feature_perturbation="tree_path_dependent"
            )
        except Exception as e:
            print(f"❌ XGBoost explainer failed: {str(e)}")

    # SVM and Ensemble
    if 'SVM' in models_dict:
        try:
            explainers['SVM'] = shap.KernelExplainer(
                lambda X: models_dict['SVM'].predict_proba(X)[:, 1],  # Only class 1 probabilities
                background,
                silent=True
            )
        except Exception as e:
            print(f"❌ SVM explainer failed: {str(e)}")

    if 'Ensemble' in models_dict:
        try:
            explainers['Ensemble'] = shap.KernelExplainer(
                lambda X: models_dict['Ensemble'].predict_proba(X)[:, 1],  # Only class 1 probabilities
                background,
                silent=True
            )
        except Exception as e:
            print(f"❌ Ensemble explainer failed: {str(e)}")

    print("✅ SHAP explainers initialized")
    return {k: v for k, v in explainers.items() if v is not None}

### 2. SHAP Value Computation (Fixed Shape Handling) ###
def compute_shap_values(explainers, X_test, feature_names):
    """Compute SHAP values with proper shape handling"""
    shap_values = {}
    print("\nComputing SHAP values...")

    for name, explainer in explainers.items():
        try:
            if isinstance(explainer, shap.TreeExplainer):
                # For tree models, get values and ensure correct shape
                vals = explainer.shap_values(X_test)
                if isinstance(vals, list):
                    shap_values[name] = vals[1]  # Class 1 for binary classification
                else:
                    shap_values[name] = vals
            else:
                # For KernelExplainer, process samples individually
                vals = np.zeros((len(X_test), len(feature_names)))

                for i in tqdm(range(len(X_test)), desc=f"Computing {name} SHAP"):
                    sample_shap = explainer.shap_values(
                        X_test[i:i+1],
                        silent=True,
                        nsamples=100
                    )
                    vals[i] = sample_shap[0] if isinstance(sample_shap, np.ndarray) else sample_shap

                shap_values[name] = vals

            print(f"✅ {name} SHAP computed - Shape: {shap_values[name].shape}")
        except Exception as e:
            print(f"❌ {name} SHAP computation failed: {str(e)}")
            shap_values[name] = None

    return {k: v for k, v in shap_values.items() if v is not None}

### 3. Visualization (Fixed for All Models) ###
def generate_shap_visualizations(shap_values, feature_names, X_test):
    """Generate visualizations with robust error handling"""
    print("\nGenerating SHAP visualizations...")

    # Use first 100 samples or all if fewer
    sample_size = min(100, len(X_test))
    X_sample = X_test[:sample_size]

    for name, values in shap_values.items():
        if values is None:
            continue

        try:
            # Prepare sample values
            sample_values = values[:sample_size]
            if len(sample_values.shape) == 3:
                sample_values = sample_values[:, :, 1]  # Class 1 for 3D arrays

            # 1. Summary Plot
            plt.figure(figsize=(12, 8))
            shap.summary_plot(
                sample_values,
                pd.DataFrame(X_sample, columns=feature_names),
                show=False,
                max_display=20
            )
            plt.title(f'{name} Feature Importance', pad=20)
            plt.tight_layout()
            plt.savefig(f"shap_{name.lower().replace(' ', '_')}_summary.png",
                      bbox_inches='tight', dpi=150)
            plt.close()

            # 2. Custom Importance Plot
            plt.figure(figsize=(12, 8))
            mean_abs = np.mean(np.abs(sample_values), axis=0)
            top_idx = np.argsort(mean_abs)[-20:][::-1]
            plt.barh(np.array(feature_names)[top_idx], mean_abs[top_idx])
            plt.title(f'{name} Top Features')
            plt.tight_layout()
            plt.savefig(f"shap_{name.lower().replace(' ', '_')}_importance.png",
                      bbox_inches='tight', dpi=150)
            plt.close()

            print(f"✅ Saved {name} visualizations")
        except Exception as e:
            print(f"❌ {name} visualization failed: {str(e)}")

### 4. Full Dataset Export (Fixed Length Mismatch) ###
def export_full_results(df, shap_values, feature_names, X_test, models_dict):
    """Export complete results with proper formatting"""
    print("\nExporting full results...")

    # Ensure we're working with the correct subset of data
    result_df = df.iloc[:len(X_test)].copy()

    # Add predictions
    for name, model in models_dict.items():
        try:
            result_df[f"{name}_prob"] = model.predict_proba(X_test)[:, 1]
            result_df[f"{name}_pred"] = (result_df[f"{name}_prob"] >= 0.5).astype(int)
        except Exception as e:
            print(f"❌ Error adding predictions for {name}: {str(e)}")

    # Add SHAP explanations
    for name, values in shap_values.items():
        if values is None:
            continue

        explanations = []
        for i in range(len(X_test)):
            try:
                if len(values.shape) == 3:
                    sample_values = values[i, :, 1]  # Class 1 for 3D arrays
                else:
                    sample_values = values[i]

                top3 = np.argsort(np.abs(sample_values))[-3:][::-1]
                parts = []
                for idx in top3:
                    val = sample_values[idx]
                    arrow = "↑" if val > 0 else "↓"
                    parts.append(f"{feature_names[idx]} ({arrow}{abs(val):.3f})")
                explanations.append(", ".join(parts))
            except Exception as e:
                explanations.append(f"Error: {str(e)}")
                print(f"⚠️ Explanation error for {name} sample {i}: {str(e)}")

        result_df[f"{name}_explanation"] = explanations

    # Save results
    result_df.to_csv('full_predictions_with_shap.csv', index=False)
    result_df.head(20).to_csv('sample_predictions_with_shap.csv', index=False)

    print("✅ Saved full results to 'full_predictions_with_shap.csv'")
    print("✅ Saved sample results to 'sample_predictions_with_shap.csv'")

    return result_df

### Execution Workflow ###
# 1. Prepare models
models_dict = {
    'Random Forest': rf,
    'SVM': svm,
    'XGBoost': xgb,
    'Ensemble': ensemble
}

# 2. Get feature names
feature_names = X.columns.tolist()
print(f"\nFeatures ({len(feature_names)}): {feature_names[:5]}...")

# 3. Initialize explainers with correct parameters
explainers = initialize_shap_explainers(models_dict, X_train_scaled, feature_names)

# 4. Compute SHAP values with proper shape handling
shap_values = compute_shap_values(explainers, X_test_scaled, feature_names)

# 5. Generate visualizations (using sample)
generate_shap_visualizations(shap_values, feature_names, X_test_scaled)

# 6. Export full results with proper data alignment
final_results = export_full_results(df, shap_values, feature_names, X_test_scaled, models_dict)

print("\n✔️ Analysis completed successfully!")
print("\nSample of explanations:")
print(final_results[['domain'] + [c for c in final_results.columns if 'explanation' in c]].head(10))


Features (67): ['domain_length', 'has_subdomain', 'num_dots', 'num_hyphens', 'num_slash']...
Initializing SHAP explainers...
✅ SHAP explainers initialized

Computing SHAP values...
✅ Random Forest SHAP computed - Shape: (108, 67, 2)
✅ XGBoost SHAP computed - Shape: (108, 67)


Computing SVM SHAP: 100%|████████████████████████████████████████████████████████████| 108/108 [00:49<00:00,  2.18it/s]


✅ SVM SHAP computed - Shape: (108, 67)


Computing Ensemble SHAP: 100%|███████████████████████████████████████████████████████| 108/108 [01:08<00:00,  1.57it/s]


✅ Ensemble SHAP computed - Shape: (108, 67)

Generating SHAP visualizations...
✅ Saved Random Forest visualizations
✅ Saved XGBoost visualizations
✅ Saved SVM visualizations
✅ Saved Ensemble visualizations

Exporting full results...
✅ Saved full results to 'full_predictions_with_shap.csv'
✅ Saved sample results to 'sample_predictions_with_shap.csv'

✔️ Analysis completed successfully!

Sample of explanations:
                                               domain  \
1   www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...   
4   thewhiskeydregs.com/wp-content/themes/widescre...   
9   horizonsgallery.com/js/bin/ssl1/_id/www.paypal...   
11  docs.google.com/spreadsheet/viewform?formkey=d...   
21  optimistic-pessimism.com/aoluserupdatealert.in...   
26  jameshowardmusic.com/wp-content/themes/widescr...   
27                                       xini.eu/00Qe   
31  horizonsgallery.com/js/bin/ssl/_id/www.paypal....   
36  docs.google.com/a/unmsm.edu.pe/spreadsheet/vie...   
40               

# New Section