<a href="https://colab.research.google.com/github/PriyamvadaSingh-B/EXPLAINABLE-AI-B-45/blob/main/XAI_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt

In [None]:
%pip install dice-ml

Collecting dice-ml
  Downloading dice_ml-0.12-py3-none-any.whl.metadata (20 kB)
Collecting raiutils>=0.4.0 (from dice-ml)
  Downloading raiutils-0.4.2-py3-none-any.whl.metadata (1.4 kB)
Downloading dice_ml-0.12-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading raiutils-0.4.2-py3-none-any.whl (17 kB)
Installing collected packages: raiutils, dice-ml
Successfully installed dice-ml-0.12 raiutils-0.4.2


In [None]:
# Load the 'titanic' example dataset through seaborn's dataset loader.
df = sns.load_dataset('titanic')


In [None]:
# Quick look at the full frame before any column selection.
print("Initial shape:", df.shape)
print(df.head())

Initial shape: (891, 15)
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [None]:
# Keep the target plus the seven modelling features, then capitalise the target name.
kept_columns = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
df = df[kept_columns].copy().rename(columns={'survived': 'Survived'})

In [None]:
# Column groups routed to the numeric and categorical preprocessing pipelines.
# 'pclass' is listed with the categorical columns, so it is one-hot encoded rather than scaled.
numeric_features = ['age', 'sibsp', 'parch', 'fare']
categorical_features = ['pclass', 'sex', 'embarked']

In [None]:
# Keep only rows whose target value is present.
df = df[df['Survived'].notna()]

In [None]:
# 80/20 split, stratified on the target and seeded for repeatability.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Survived'],
)


In [None]:
# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# Categorical columns: fill gaps with the most frequent value, then one-hot encode
# into a dense array; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group to its pipeline; columns not listed (e.g. the target) are dropped
# and output names are kept without the 'num__' / 'cat__' prefixes.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder='drop',
    verbose_feature_names_out=False,
)

In [None]:
# Fit the imputers, scaler and encoder on the training split only, so the
# test split is transformed with statistics it did not contribute to.
preprocessor.fit(train_df)

In [None]:
# Preprocessed training matrix, labelled with the transformer's output names
# and aligned to the original row labels of train_df.
train_matrix = preprocessor.transform(train_df)
X_train = pd.DataFrame(
    train_matrix,
    columns=preprocessor.get_feature_names_out(),
    index=train_df.index,
)

In [None]:
# Preprocessed test matrix, labelled with the transformer's output names
# and aligned to the original row labels of test_df.
test_matrix = preprocessor.transform(test_df)
X_test = pd.DataFrame(
    test_matrix,
    columns=preprocessor.get_feature_names_out(),
    index=test_df.index,
)

In [None]:
# Integer target vectors for both splits.
y_train, y_test = (split['Survived'].astype(int) for split in (train_df, test_df))

In [None]:
# Sanity check: both matrices must have the same number of preprocessed columns.
print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)

X_train shape: (712, 12) X_test shape: (179, 12)


In [None]:
# Baseline linear classifier trained on the preprocessed features.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

In [None]:
# Random forest trained on the same preprocessed features; this is the model
# later handed to DiCE for counterfactual generation.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

In [None]:
def evaluate(model, X, y, name):
    """Score a fitted classifier on ``(X, y)``, print a report, and return the metrics.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X : feature matrix passed to ``model.predict``.
    y : true labels compared against the predictions.
    name : label used in the printed header and stored under ``'model'``.

    Returns
    -------
    dict with keys ``'model'``, ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``.
    Precision, recall and F1 use ``zero_division=0``.
    """
    predictions = model.predict(X)
    scores = {
        'accuracy': accuracy_score(y, predictions),
        'precision': precision_score(y, predictions, zero_division=0),
        'recall': recall_score(y, predictions, zero_division=0),
        'f1': f1_score(y, predictions, zero_division=0),
    }
    print(f"--- {name} ---")
    print(
        f"Accuracy: {scores['accuracy']:.4f} | Precision: {scores['precision']:.4f} | "
        f"Recall: {scores['recall']:.4f} | F1: {scores['f1']:.4f}"
    )
    print(classification_report(y, predictions, zero_division=0))
    return {'model': name, **scores}


In [None]:
# Evaluate both classifiers on the held-out split, in the same order as before.
results = [
    evaluate(lr, X_test, y_test, "Logistic Regression"),
    evaluate(rf, X_test, y_test, "Random Forest"),
]


--- Logistic Regression ---
Accuracy: 0.8045 | Precision: 0.7931 | Recall: 0.6667 | F1: 0.7244
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

--- Random Forest ---
Accuracy: 0.8156 | Precision: 0.8103 | Recall: 0.6812 | F1: 0.7402
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.68      0.74        69

    accuracy                           0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179



In [None]:
# One row per model, built from the dicts returned by evaluate().
metrics_df = pd.DataFrame(results)
print(metrics_df)

                 model  accuracy  precision    recall        f1
0  Logistic Regression  0.804469   0.793103  0.666667  0.724409
1        Random Forest  0.815642   0.810345  0.681159  0.740157


In [None]:
# Random-forest predictions for the test split, plus the matching row labels
# (X_test keeps test_df's index, so these labels can be used with test_df.loc).
test_preds = rf.predict(X_test)
test_indices = X_test.index


In [None]:
# Row labels of the test passengers that the random forest predicts as class 0.
# The former `if len(neg_indices) == 0: pass` guard did nothing and was removed.
neg_indices = [idx for idx, pred in zip(test_indices, test_preds) if pred == 0]

In [None]:
# remove the following lines to fix the error
# raise ValueError("No negative predictions found in test set.")
# chosen_idx = neg_indices[0]  # choose first negative predicted example
# print("Chosen test index:", chosen_idx)

In [None]:
# Pick the first test passenger that the random forest predicts as class 1;
# this row is the query instance for the counterfactual search further down.
pos_indices = [idx for idx, pred in zip(test_indices, test_preds) if pred == 1]
if not pos_indices:
    print("No positive predictions found in test set.")
else:
    chosen_idx = pos_indices[0]
    print("Chosen test index:", chosen_idx)
    original_row = test_df.loc[chosen_idx]
    print("Original raw row:\n", original_row)

Chosen test index: 241
Original raw row:
 Survived         1
pclass           3
sex         female
age            NaN
sibsp            1
parch            0
fare          15.5
embarked         Q
Name: 241, dtype: object


In [None]:
import dice_ml
from dice_ml.data import Data
from dice_ml.model import Model


In [None]:
# DiCE
import dice_ml
from dice_ml.data import Data
from dice_ml.model import Model
from dice_ml import Dice  # Corrected import path

In [None]:
# DiCE data interface built from the raw (un-preprocessed) training rows.
dice_training_frame = train_df.reset_index(drop=True)
dice_data = Data(
    dataframe=dice_training_frame,
    continuous_features=['age', 'sibsp', 'parch', 'fare'],
    outcome_name='Survived',
)


In [None]:
import dice_ml
from dice_ml.data import Data
from dice_ml.model import Model
from dice_ml import Dice  # the actual explainer

# DiCE data interface: raw training rows, with the continuous columns declared
# (the remaining feature columns are handled as categorical by DiCE).
dice_data = Data(dataframe=train_df.reset_index(drop=True),
                 continuous_features=numeric_features,
                 outcome_name='Survived')

# `dice_data` describes the raw columns, but `rf` was fitted on the preprocessor's
# scaled / one-hot output. Chain the already-fitted preprocessor and classifier so
# the model given to DiCE accepts raw rows and counterfactuals come back in raw units.
rf_on_raw_features = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', rf)])
dice_model = Model(model=rf_on_raw_features, backend='sklearn')

exp = Dice(dice_data, dice_model, method='random')

In [None]:
# Build the single-row raw query here: the notebook otherwise only defines
# `instance_for_dice` in a later cell, so this call fails on a fresh top-to-bottom run.
instance_for_dice = original_row[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']].to_frame().T.reset_index(drop=True)

# The random search is stochastic; a fixed seed keeps the generated
# counterfactuals (and every table derived from them) repeatable.
dice_cf = exp.generate_counterfactuals(instance_for_dice,
                                       total_CFs=3,
                                       desired_class="opposite",
                                       random_seed=42)


  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


In [None]:
# Re-creates the DiCE data interface from the raw training rows
# (same arguments as the earlier definition of `dice_data`).
dice_training_frame = train_df.reset_index(drop=True)
dice_data = Data(
    dataframe=dice_training_frame,
    continuous_features=['age', 'sibsp', 'parch', 'fare'],
    outcome_name='Survived',
)

In [None]:
# `rf` was fitted on the preprocessor's output, while `dice_data` describes raw
# columns; wrap the fitted preprocessor and classifier together so the model
# handed to DiCE can score raw rows.
dice_model = Model(
    model=Pipeline(steps=[('preprocessor', preprocessor), ('classifier', rf)]),
    backend='sklearn',
)

In [None]:
# Counterfactual explainer using DiCE's 'random' search method.
exp = Dice(dice_data, dice_model, method='random')

In [None]:
# Turn the chosen raw row (a Series) into a one-row DataFrame of feature columns only.
raw_feature_columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
instance_for_dice = original_row[raw_feature_columns].to_frame().T.reset_index(drop=True)


In [None]:
# Show the raw query row that is passed to DiCE.
print("Instance for DiCE (raw):")
print(instance_for_dice)

Instance for DiCE (raw):
  pclass     sex  age sibsp parch  fare embarked
0      3  female  NaN     1     0  15.5        Q


In [None]:
# Read the counterfactual table. If the result object has no top-level
# `final_cfs_df`, fall back to the first per-query entry of `cf_examples_list`
# instead of silently swallowing the error and leaving `cf_df` undefined.
try:
    cf_df = dice_cf.final_cfs_df.copy()
except AttributeError:
    cf_df = dice_cf.cf_examples_list[0].final_cfs_df.copy()

In [None]:
# Counterfactual table for the first (and only) query instance; copied so later edits
# do not touch the DiCE result object.
cf_df = dice_cf.cf_examples_list[0].final_cfs_df.copy()

In [None]:
# Rich display of the generated counterfactual rows.
print("Counterfactuals (raw values):")
display(cf_df)

Counterfactuals (raw values):


Unnamed: 0,age,sibsp,parch,fare,pclass_1,pclass_2,pclass_3,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,Survived
0,2.872243,0.478335,-0.466183,-0.339817,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0
1,0.707851,0.478335,-0.466183,-0.339817,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0
2,-0.081135,0.478335,3.912444,1.103758,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0


In [None]:
# Stack the query row on top of the counterfactual rows (target column removed)
# and label each row: 'original', 'cf_1', 'cf_2', ...
query_rows = instance_for_dice.reset_index(drop=True)
cf_feature_rows = cf_df.drop(columns=['Survived']).reset_index(drop=True)
row_labels = ['original'] + [f'cf_{k}' for k in range(1, len(cf_df) + 1)]

combined = pd.concat([query_rows, cf_feature_rows], axis=0)
combined.insert(0, 'example', row_labels)
combined = combined.reset_index(drop=True)

In [None]:
# For every feature column, a list of booleans (one per row of `combined`)
# saying whether that row's value is unequal to the original row's value.
orig = combined.loc[0, :]
changed = [
    (col, list((combined[col] != orig[col]).values))
    for col in combined.columns
    if col not in ['example']
]

In [None]:
def highlight_changes(df):
    """Add a 'changed_features' column naming the columns where each row differs from row 0.

    Row label 0 is the reference (original) instance and gets an empty string.
    Every other row gets a comma-separated list of the feature columns whose
    value is not equal to the reference value. Rows are addressed as
    ``df.loc[i, col]`` for ``i in range(len(df))``, so ``df`` must carry the
    default 0..n-1 index.

    Two missing values are treated as equal, so a feature that is NaN in both
    the reference row and the compared row is not reported as changed. The
    'example' label column and any pre-existing 'changed_features' column are
    never compared, which makes repeated calls on the same frame stable.

    The column is written into ``df`` itself and the same object is returned.
    """
    def _differs(value, reference):
        # NaN != NaN evaluates to True, so "both missing" has to be checked first.
        if pd.isna(value) and pd.isna(reference):
            return False
        return value != reference

    feature_cols = [c for c in df.columns if c not in ('example', 'changed_features')]
    changed_feats = []
    for i in range(len(df)):
        if i == 0:
            # The reference row has nothing to be compared against.
            changed_feats.append('')
            continue
        diffs = [c for c in feature_cols if _differs(df.loc[i, c], df.loc[0, c])]
        changed_feats.append(", ".join(diffs))
    df['changed_features'] = changed_feats
    return df

In [None]:
# Annotate a copy so `combined` itself keeps only the feature columns.
combined_display = highlight_changes(combined.copy())
display(combined_display)

Unnamed: 0,example,pclass,sex,age,sibsp,parch,fare,embarked,pclass_1,pclass_2,pclass_3,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,changed_features
0,original,3.0,female,,1.0,0.0,15.5,Q,,,,,,,,,
1,cf_1,,,2.872243,0.478335,-0.466183,-0.339817,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,"pclass, sex, age, sibsp, parch, fare, embarked..."
2,cf_2,,,0.707851,0.478335,-0.466183,-0.339817,,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,"pclass, sex, age, sibsp, parch, fare, embarked..."
3,cf_3,,,-0.081135,0.478335,3.912444,1.103758,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,"pclass, sex, age, sibsp, parch, fare, embarked..."


In [None]:
# For each feature, count how many counterfactual rows differ from the original row.
# Only rows 1..n are counted (row 0 is the original itself), and a value that is
# missing in both the original and a counterfactual is not treated as a change;
# a plain `!=` would count NaN vs NaN as different and inflate the totals.
feature_change_counts = {}
for col in combined.columns:
    if col in ['example', 'changed_features']:
        continue
    original_value = combined.loc[0, col]
    cf_values = combined[col].iloc[1:]
    if pd.isna(original_value):
        differs = cf_values.notna()
    else:
        differs = cf_values != original_value
    feature_change_counts[col] = int(differs.sum())

In [None]:
# Report how often each feature was altered across the counterfactuals.
print("Feature change frequency across generated CFs:\n", feature_change_counts)


Feature change frequency across generated CFs:
 {'pclass': 3, 'sex': 3, 'age': 4, 'sibsp': 3, 'parch': 3, 'fare': 3, 'embarked': 3, 'pclass_1': 4, 'pclass_2': 4, 'pclass_3': 4, 'sex_female': 4, 'sex_male': 4, 'embarked_C': 4, 'embarked_Q': 4, 'embarked_S': 4}


In [None]:
print("\nActionability checks:")
for i in range(1, len(combined_display)):
    changed_text = combined_display.loc[i, 'changed_features']
    # Recover the exact column names from the comma-separated text. A substring
    # test on the joined text would also fire for any column whose name merely
    # contains 'age' or 'fare'.
    changed_names = {name.strip() for name in changed_text.split(',') if name.strip()}

    print(f"\nCF {i}:")
    print("Changed features:", changed_text)
    # The values are printed for manual review only; nothing is validated here.
    # Plausibility hints: age within 0-120, fare non-negative.
    if 'age' in changed_names:
        age_value = combined_display.loc[i, 'age']
        print(f"  age -> {age_value} (valid range 0-120?)")
    if 'fare' in changed_names:
        fare_value = combined_display.loc[i, 'fare']
        print(f"  fare -> {fare_value} (>=0?)")


Actionability checks:

CF 1:
Changed features: pclass, sex, age, sibsp, parch, fare, embarked, pclass_1, pclass_2, pclass_3, sex_female, sex_male, embarked_C, embarked_Q, embarked_S
  age -> 2.87224267 (valid range 0-120?)
  fare -> -0.33981696389004046 (>=0?)

CF 2:
Changed features: pclass, sex, age, sibsp, parch, fare, embarked, pclass_1, pclass_2, pclass_3, sex_female, sex_male, embarked_C, embarked_Q, embarked_S
  age -> 0.70785143 (valid range 0-120?)
  fare -> -0.33981696389004046 (>=0?)

CF 3:
Changed features: pclass, sex, age, sibsp, parch, fare, embarked, pclass_1, pclass_2, pclass_3, sex_female, sex_male, embarked_C, embarked_Q, embarked_S
  age -> -0.08113533197997112 (valid range 0-120?)
  fare -> 1.10375757 (>=0?)


In [None]:
def preprocess_raw_row(raw_row_df):
    """Run raw feature rows through the notebook's ``preprocessor`` and flatten the result.

    ``raw_row_df`` must be a DataFrame containing the seven raw feature columns;
    only those columns are passed to ``preprocessor.transform``. The transformed
    output is returned as a 1-D NumPy array.
    """
    raw_columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
    transformed = preprocessor.transform(raw_row_df[raw_columns])
    flat_vector = np.asarray(transformed).reshape(-1)
    return flat_vector

In [None]:
# Pass the single-row DataFrame instance_for_dice
orig_vect = preprocess_raw_row(instance_for_dice)
distances = []
for i in range(1, len(combined_display)):
    # Select the row as a single-row DataFrame
    cf_raw_df = combined_display.loc[i:i, ['pclass','sex','age','sibsp','parch','fare','embarked']]
    cf_vect = preprocess_raw_row(cf_raw_df)
    euclid = np.linalg.norm(orig_vect - cf_vect)  # Euclidean
    manhattan = np.sum(np.abs(orig_vect - cf_vect))
    distances.append({'cf': f'cf_{i}', 'euclidean': float(euclid), 'manhattan': float(manhattan)})

In [None]:
# One row per counterfactual, indexed by its 'cf_<n>' label.
dist_df = pd.DataFrame(distances).set_index('cf')
print("\nDistances (preprocessed feature space):")
display(dist_df)


Distances (preprocessed feature space):


Unnamed: 0_level_0,euclidean,manhattan
cf,Unnamed: 1_level_1,Unnamed: 2_level_1
cf_1,2.921894,7.347502
cf_2,3.036433,7.513787
cf_3,5.565517,11.659061


In [None]:
# Dense ranks (1 = closest to the original) under each distance measure.
dist_df = dist_df.assign(
    rank_euclid=dist_df['euclidean'].rank(method='dense'),
    rank_manhattan=dist_df['manhattan'].rank(method='dense'),
)
print("\nDistance ranks:")
display(dist_df)


Distance ranks:


Unnamed: 0_level_0,euclidean,manhattan,rank_euclid,rank_manhattan
cf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cf_1,2.921894,7.347502,1.0,1.0
cf_2,3.036433,7.513787,2.0,2.0
cf_3,5.565517,11.659061,3.0,3.0


In [None]:
# Written reflection on the counterfactual results, emitted as cell output.
print("""
Reflection (short):
- Counterfactuals show minimal changes required to flip the prediction (e.g., change pclass or age).
- They improve trust by answering "what-if" questions: they provide actionable suggestions (if realistic).
- Limitations: Some CFs may propose changes that are not actionable (e.g., change birth year) unless constraints made immutable.
- Application beyond Titanic: Loan application decisions -> "If you increased income by X or reduced existing debts by Y, your loan would be approved."
""")


Reflection (short):
- Counterfactuals show minimal changes required to flip the prediction (e.g., change pclass or age).
- They improve trust by answering "what-if" questions: they provide actionable suggestions (if realistic).
- Limitations: Some CFs may propose changes that are not actionable (e.g., change birth year) unless constraints made immutable.
- Application beyond Titanic: Loan application decisions -> "If you increased income by X or reduced existing debts by Y, your loan would be approved."



In [None]:
# Persist both tables to the working directory. The comparison table drops its
# positional index; the distance table keeps its 'cf' index as the first column.
combined_display.to_csv("counterfactuals_table.csv", index=False)
dist_df.to_csv("counterfactuals_distances.csv")

In [None]:
# Confirmation message naming the two files written by the previous cell.
print("Saved 'counterfactuals_table.csv' and 'counterfactuals_distances.csv' in current directory.")

Saved 'counterfactuals_table.csv' and 'counterfactuals_distances.csv' in current directory.
