# Counterfactual Explanations POC - House Prices

This notebook demonstrates how to use DiCE-ML for generating counterfactual explanations in regression problems using the House Prices dataset. We'll show how to explain model predictions and suggest actionable changes to achieve desired house prices.

## 1. Settings

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from feature_engine.selection import DropFeatures
from feature_engine.imputation import ArbitraryNumberImputer, CategoricalImputer
from feature_engine.encoding import OrdinalEncoder

# Modelling
import lightgbm as lgb
import shap
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Counterfactuals
import dice_ml

%matplotlib inline

In [4]:
# Show the installed scikit-learn version so the environment used for this
# run is recorded next to the results.
import sklearn

sklearn.__version__

'1.5.0'

In [5]:
# Show the installed feature_engine version so the environment used for this
# run is recorded next to the results.
import feature_engine

feature_engine.__version__

'1.6.2'

## 2. Data Loading and Exploration

In [7]:
# Read the train and test CSV files from the local data folder
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Report (rows, columns) for each frame as a quick sanity check
for split_label, split_frame in (("Training", train_df), ("Test", test_df)):
    print(f"{split_label} data shape: {split_frame.shape}")

Training data shape: (1460, 81)
Test data shape: (1459, 80)


In [None]:
# Display basic information about the dataset by printing the summary that
# pandas' DataFrame.info() produces for the training frame.
train_df.info()

In [None]:
# Display summary statistics for the training frame (last expression of the
# cell, so the resulting table is rendered as the cell output).
train_df.describe()

In [None]:
# Summary of the target with extra tail percentiles (1%, 5%, 95%, 99%)
# requested alongside the quartiles.
train_df['SalePrice'].describe(
    percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
)

In [None]:
# Two side-by-side panels: raw target values (left) and their density (right)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: one point per house, plotted against its row index
sns.scatterplot(ax=ax1, data=train_df, x=train_df.index, y='SalePrice')
ax1.set(
    title='Sale Price Distribution',
    xlabel='House Index',
    ylabel='Sale Price ($)',
)

# Right panel: filled kernel density estimate of the target
sns.kdeplot(ax=ax2, data=train_df['SalePrice'], fill=True)
ax2.set(
    title='Sale Price Density Distribution',
    xlabel='Sale Price ($)',
    ylabel='Density',
)

# Tighten spacing so labels do not overlap, then render
plt.tight_layout()
plt.show()

In [None]:
# Count nulls per column and express them as a share of all training rows
missing_values = train_df.isnull().sum()
missing_percentages = (missing_values / len(train_df)) * 100

# Keep only the columns that actually contain at least one null
has_missing = missing_values > 0
missing_info = pd.DataFrame(
    {
        'Missing Values': missing_values[has_missing],
        'Percentage': missing_percentages[has_missing],
    }
)

# Worst-affected columns first; last expression so the table is displayed
missing_info.sort_values(by='Missing Values', ascending=False)

In [None]:
# Calculate the correlation matrix for numerical features only.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.corr() no
# longer silently drops non-numeric columns and instead raises on the string
# columns present in this dataset.
correlation_matrix = train_df.corr(numeric_only=True)

# Create a figure with a larger size
plt.figure(figsize=(20, 16))

# Create heatmap
sns.heatmap(correlation_matrix,
            annot=True,  # Show correlation values
            cmap='coolwarm',  # Color scheme
            center=0,  # Center the colormap at 0
            fmt='.2f',  # Format correlation values to 2 decimal places
            square=True,  # Make the plot square
            linewidths=0.5)  # Add lines between cells

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

# Print top correlations with SalePrice
print("\nTop 10 features most correlated with SalePrice:")
price_correlations = correlation_matrix['SalePrice'].sort_values(ascending=False)
# Drop the target by label rather than assuming it sorts into position 0
print(price_correlations.drop(labels='SalePrice').head(10))


## 3. Feature Engineering and Preprocessing

In [None]:
# Name of the column the model will learn to predict
TARGET = 'SalePrice'

# Features are every column except the target and the 'Id' column
X = train_df.drop(columns=[TARGET, 'Id'])
y = train_df[TARGET]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for split_label, split_frame in (("Training", x_train), ("Test", x_test)):
    print(f"{split_label} set shape: {split_frame.shape}")

In [12]:
# Columns removed before modelling (original note: ~80% missing values)
NULL_COLS = ["PoolQC", "MiscFeature", "Alley", "Fence"]

# 1. Transformer that removes the columns listed above
drop_missing_cols_obj = DropFeatures(features_to_drop=NULL_COLS)

# Fit on the training split only, then apply the same drop to both splits
drop_missing_cols_obj.fit(x_train)
x_train = drop_missing_cols_obj.transform(x_train)
x_test = drop_missing_cols_obj.transform(x_test)

In [14]:
# Identify numerical and categorical columns from the training split.
# This cell was previously commented out (and referenced a non-existent
# `X_train`), yet the imputers/encoder further down need both lists, so the
# notebook could not run top-to-bottom on a fresh kernel.
numerical_cols = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()

print(f"Number of numerical features: {len(numerical_cols)}")
print(f"Number of categorical features: {len(categorical_cols)}")

In [None]:
# Column groups are derived from the training split here so this cell does not
# depend on hidden kernel state (they were never defined on a fresh run).
numerical_cols = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()

# Handle missing values: numeric nulls become 0, categorical nulls become the
# explicit label 'missing'
num_imputer = ArbitraryNumberImputer(variables=numerical_cols, arbitrary_number=0)
cat_imputer = CategoricalImputer(variables=categorical_cols, fill_value='missing')

# Encode categorical variables as integers
encoder = OrdinalEncoder(variables=categorical_cols)

# Apply transformations, fitting on the training split only.
# The splits are named x_train / x_test (lower-case) in the cells above.
X_train_processed = num_imputer.fit_transform(x_train)
X_train_processed = cat_imputer.fit_transform(X_train_processed)
# OrdinalEncoder's default encoding_method is 'ordered', which ranks the
# categories by the target mean and therefore needs y at fit time.
X_train_processed = encoder.fit_transform(X_train_processed, y_train)

# Transform test set with the already-fitted transformers (no refitting)
# NOTE(review): categories that appear only in the test split are presumably
# encoded as NaN by the encoder's default `unseen` handling - confirm.
X_test_processed = num_imputer.transform(x_test)
X_test_processed = cat_imputer.transform(X_test_processed)
X_test_processed = encoder.transform(X_test_processed)

In [None]:
# Feature selection using Recursive Feature Elimination with Cross-Validation (RFECV)
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LassoCV

# Initialize base estimator for RFECV
base_estimator = LassoCV(cv=5, random_state=42)

# Create RFECV object: remove one feature per step, score each subset with
# 5-fold cross-validated negative MSE
rfecv = RFECV(
    estimator=base_estimator,
    step=1,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

# Fit RFECV
rfecv.fit(X_train_processed, y_train)

# Get selected features
selected_features = X_train_processed.columns[rfecv.support_].tolist()
print(f"\nNumber of selected features: {len(selected_features)}")
print("\nSelected features:")
print(selected_features)

# Plot number of features vs cross-validation score.
# `grid_scores_` was removed in scikit-learn 1.2; the per-subset mean score
# now lives in `cv_results_['mean_test_score']`. Scores are negative MSE, so
# they are negated to plot plain MSE.
mean_cv_scores = rfecv.cv_results_['mean_test_score']
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(mean_cv_scores) + 1), -mean_cv_scores)
plt.xlabel('Number of Features')
plt.ylabel('Cross-Validation Score (MSE)')
plt.title('Recursive Feature Elimination with Cross-Validation')
plt.show()

# Update processed datasets with selected features
X_train_processed = X_train_processed[selected_features]
X_test_processed = X_test_processed[selected_features]



## 4. Model Training

In [None]:
def _print_metrics(header, y_true, y_pred):
    """Print RMSE, MAE and R2 for one data split under the given header line."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(header)
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2: {r2:.2f}")


# Gradient-boosted regressor with fixed hyperparameters and a fixed seed
model = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
model.fit(X_train_processed, y_train)

# Predictions on both splits, reused by the metrics below
train_preds = model.predict(X_train_processed)
test_preds = model.predict(X_test_processed)

# Report the same three metrics for the training and held-out splits
_print_metrics("Training Metrics:", y_train, train_preds)
_print_metrics("\nTest Metrics:", y_test, test_preds)

## 5. Counterfactual Explanations

In [None]:
# DiCE needs one frame holding the features plus the outcome column; here the
# outcome is the model's own prediction on the test split
dice_data = X_test_processed.assign(predicted_price=test_preds)

# Data wrapper: every feature column is declared as continuous
feature_names = X_test_processed.columns.tolist()
dice_data_obj = dice_ml.Data(
    dataframe=dice_data,
    continuous_features=feature_names,
    outcome_name='predicted_price',
)

# Model wrapper around the fitted regressor, using DiCE's sklearn backend
dice_model = dice_ml.Model(
    model=model,
    backend="sklearn",
    model_type="regressor",
)

# Explainer that uses DiCE's "random" method
dice_explainer = dice_ml.Dice(dice_data_obj, dice_model, method="random")

In [None]:
# Select a sample house for counterfactual explanation
sample_idx = 0  # You can change this to analyze different houses
query_instance = X_test_processed.iloc[[sample_idx]]

# Get current prediction
current_price = model.predict(query_instance)[0]
print(f"Current predicted price: ${current_price:,.2f}")

# Example features to vary. Feature selection may have removed some of them,
# and DiCE raises if asked to vary a column it does not know, so keep only the
# ones that are still present in the processed data.
candidate_features = ['GrLivArea', 'OverallQual', 'YearBuilt']
features_to_vary = [
    name for name in candidate_features if name in query_instance.columns
]
if not features_to_vary:
    raise ValueError(
        "None of the candidate features "
        f"{candidate_features} survived feature selection; "
        "choose features from X_test_processed.columns instead."
    )

# Generate counterfactuals
cf_examples = dice_explainer.generate_counterfactuals(
    query_instance,
    total_CFs=3,
    desired_range=(current_price * 1.1, current_price * 1.2),  # 10-20% higher price
    features_to_vary=features_to_vary,
    random_seed=42,  # the "random" explainer samples candidates; seed for reproducible output
)

# Display counterfactuals
cf_examples.visualize_as_dataframe(show_only_changes=True)