In [2]:
# import DiCE
import dice_ml
from dice_ml.utils import helpers  # helper functions

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel (mode 2 = reload all modules
# before each cell execution).
%load_ext autoreload
%autoreload 2

In [4]:
# Load the Adult income dataset through DiCE's helper module; the result is
# kept in `dataset` and reused by every cell below.
dataset = helpers.load_adult_income_dataset()

In [5]:
# Preview the first rows to see the feature columns and the `income` outcome.
dataset.head()


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,White,Female,60,0
1,30,Self-Employed,Assoc,Married,Professional,White,Male,65,1
2,32,Private,Some-college,Married,White-Collar,White,Male,50,0
3,20,Private,Some-college,Single,Service,White,Female,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,White,Male,50,0


In [6]:
# Describe the data to DiCE: the full dataframe, which of its columns are
# continuous, and which column holds the outcome.
d = dice_ml.Data(
    dataframe=dataset,
    continuous_features=['age', 'hours_per_week'],
    outcome_name='income',
)

In [7]:
# Separate the outcome column from the feature columns.
target = dataset["income"]
datasetX = dataset.drop("income", axis=1)

# Split data into train and test (80/20). The split is stratified on the
# outcome and seeded so the same rows land in each partition on every run.
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=target)

# Every feature column that is not listed as numerical is one-hot encoded.
numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present in the training data.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# The forest is seeded so that the fitted model (and therefore the
# counterfactuals generated from it below) is reproducible on a fresh
# Restart & Run All; without a seed each run trains a different forest.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=0))])
model = clf.fit(x_train, y_train)

In [8]:
# Hand the fitted pipeline to DiCE's model object, declaring which backend
# (here: sklearn) the model comes from.
backend = 'sklearn'
m = dice_ml.Model(
    model=model,
    backend=backend,
)

## Generate diverse counterfactuals



It can be observed that the random sampling method produces less sparse CFs than DiCE's current implementation. The sparsity issue with random sampling worsens as `total_CFs` increases.

Further, different sets of counterfactuals can be generated with different random seeds.



In [9]:
# initiate DiCE with the random-sampling method
exp_random = dice_ml.Dice(d, m, method="random")

# Use two training rows as the query instances. `.iloc` makes it explicit
# that rows are selected by position (the 5th and 6th rows), independent of
# the shuffled index labels left behind by train_test_split.
query_instances = x_train.iloc[4:6]

# First run: 2 counterfactuals per query instance. No random_seed is passed,
# so DiCE's default seed applies (default random seed is 17).
dice_exp_random = exp_random.generate_counterfactuals(query_instances, total_CFs=2, desired_class="opposite", verbose=False)
dice_exp_random.visualize_as_dataframe(show_only_changes=True)

# Second run: 4 counterfactuals per query instance with an explicit,
# non-default seed, to show that a different seed yields a different set.
# Note: this rebinds `dice_exp_random` to the second result.
dice_exp_random = exp_random.generate_counterfactuals(query_instances, total_CFs=4, desired_class="opposite", random_seed=9)
dice_exp_random.visualize_as_dataframe(show_only_changes=True)

  cfs_df = cfs_df.append(rows_to_add)
100%|██████████| 2/2 [00:00<00:00,  3.49it/s]

Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,27,Private,School,Single,Blue-Collar,White,Male,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,71.0,-,Assoc,-,-,-,-,-,1
1,54.0,-,Assoc,-,-,-,-,-,1


Query instance (original outcome : 1)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,31,Self-Employed,Some-college,Married,Sales,Other,Male,60,1



Diverse Counterfactual set (new outcome: 0.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,-,-,Prof-school,Divorced,-,-,-,-,0
1,-,-,-,-,-,White,-,33.0,0


  cfs_df = cfs_df.append(rows_to_add)
100%|██████████| 2/2 [00:00<00:00,  4.80it/s]

Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,27,Private,School,Single,Blue-Collar,White,Male,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,87.0,-,Assoc,-,-,-,-,-,1
1,67.0,-,Assoc,-,-,-,-,-,1
2,61.0,-,Assoc,-,-,-,-,-,1
3,72.0,-,Assoc,-,-,-,-,-,1


Query instance (original outcome : 1)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,31,Self-Employed,Some-college,Married,Sales,Other,Male,60,1



Diverse Counterfactual set (new outcome: 0.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,-,-,-,-,Other/Unknown,-,-,16.0,0
1,-,-,-,Single,-,-,-,13.0,0
2,44.0,-,-,Separated,-,-,-,-,0
3,19.0,-,-,-,-,-,-,-,0
