1) Define the groud-truth concepts  
2) Train the TabCBM in unsup-concept setting
3) Evaluate performance using several metrics 
4) Supervised-concepts setting

In [1]:
import scipy
import joblib
import sklearn
import numpy as np
import pandas as pd
import os.path as osp
import matplotlib.pyplot as plt

import tensorflow as tf

from tabcbm.models.architectures import construct_encoder, construct_decoder
from tabcbm.models.architectures import construct_end_to_end_model
from tabcbm.models.tabcbm import TabCBM

# Read the data 

In [2]:
# Reading already preprocessed train and test data 

data_dir = 'D:\\PycharmProjects\\AMMISproject\\data\\processed_data'
dataset = 'dataco'

x_train_std = joblib.load(osp.join(data_dir, dataset, 'x_train_std.joblib'))
x_test_std = joblib.load(osp.join(data_dir, dataset, 'x_test_std.joblib'))

x_train = joblib.load(osp.join(data_dir, dataset, 'x_train.joblib'))
x_test = joblib.load(osp.join(data_dir, dataset, 'x_test.joblib'))

y_train = joblib.load(osp.join(data_dir, dataset, 'y_train.joblib'))
y_test = joblib.load(osp.join(data_dir, dataset, 'y_test.joblib'))

print('Shape of the training set: ', x_train_std.shape)
print('Shape of the test set: ', x_test_std.shape)
print('Shape of the trainigb targets: ', y_train.shape)


Shape of the training set:  (138212, 38)
Shape of the test set:  (34553, 38)
Shape of the trainigb targets:  (138212,)


In [None]:
x_train_reduced = 

In [4]:
# Defining the concepts 

aggregated_concepts = {
    'Shipment': ['Type', 'Days for shipment (scheduled)', 'Shipping Mode'],
    'Customer': ['Customer Zipcode', 'Customer Segment'],
    'Department': ['Department Name', 'Market'],
    'Store': ['Latitude', 'Longitude'],
    'Order': ['Order Id', 'Order City', 'Order Country', 'order date (DateOrders)',
              'Order Profit Per Order', 'Order Status', 'Sales', 'Order Item Discount',
              'order_year', 'order_month', 'order_day'],
    'ProductCategory': ['Category Name']
}

# In the preprocessed data the naming of the columns differs, so we have to define
# expanded features and put them as a value of a corresponding concept
extended_concepts = {}
for concept, features in aggregated_concepts.items():
    extended_features = []
    for value in features:
            [extended_features.append(column) for column in x_test.columns if value in column]
   
    extended_concepts[concept] = extended_features


In [5]:
# Creating masks for each concept 

concepts_num = len(aggregated_concepts)
total_features_num = x_test.shape[1] 

concepts_masks = pd.DataFrame(0, columns=x_test.columns, index=list(aggregated_concepts.keys()))

for concept, features in extended_concepts.items():
    for feature in features:
        concepts_masks.loc[concept, feature] = 1

In [6]:
# Checking whether the features correspond to the defined concept "Shipment" 

concepts_masks.loc['Shipment', :]

Days for shipment (scheduled)    1
Category Name                    0
Customer Zipcode                 0
Department Name                  0
Latitude                         0
Longitude                        0
Order City                       0
Order Country                    0
Sales                            0
Order Id                         0
Order Item Discount              0
Order Profit Per Order           0
order_year                       0
order_month                      0
order_day                        0
Type_CASH                        1
Type_DEBIT                       1
Type_PAYMENT                     1
Type_TRANSFER                    1
Customer Segment_Consumer        0
Customer Segment_Corporate       0
Customer Segment_Home Office     0
Market_Africa                    0
Market_Europe                    0
Market_LATAM                     0
Market_Pacific Asia              0
Market_USCA                      0
Shipping Mode_First Class        1
Shipping Mode_Same D

## Create main components for TabCBM

In [7]:
# Parameters defining the architecture we will use

input_shape = x_train_std.shape[1:]
num_outputs = len(set(y_train))
encoder_units = [16, 16]
decoder_units = [16]
latent_dims = 16
learning_rate = 0.001
validation_size = 0.1

print('Input shape: ', input_shape)
print('Number of outputs: ', num_outputs)

Input shape:  (38,)
Number of outputs:  2


In [8]:
# Next, we build the feature to latent code encoder model (i.e., phi)
encoder = construct_encoder(input_shape, encoder_units, latent_dims)

In [9]:
# Then, we build the concept to label model  (i.e., the label predictor f)

decoder_inputs = tf.keras.Input(shape=[latent_dims])
decoder_graph = construct_decoder(decoder_units, num_outputs)
decoder = tf.keras.Model(
    decoder_inputs,
    decoder_graph(decoder_inputs),
    name="decoder",
)

In [10]:
# We then put them both together to make an end-to-end model we can pretrain

end_to_end_model, encoder, decoder = construct_end_to_end_model(input_shape,
                                                                encoder,
                                                                decoder,
                                                                num_outputs,
                                                                learning_rate)

end_to_end_model.summary()

pretrain_epochs = 30
batch_size = 512
pretrain_hist = end_to_end_model.fit(
    x=x_train_std,
    y=y_train,
    epochs=pretrain_epochs,
    batch_size=batch_size,
    validation_split=validation_size,
    verbose=1,
)

Epoch 1/30
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - binary_accuracy: 0.5841 - loss: 0.6313 - val_binary_accuracy: 0.6945 - val_loss: 0.5464
Epoch 2/30
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 838us/step - binary_accuracy: 0.6932 - loss: 0.5431 - val_binary_accuracy: 0.6950 - val_loss: 0.5428
Epoch 3/30
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step - binary_accuracy: 0.6978 - loss: 0.5387 - val_binary_accuracy: 0.6947 - val_loss: 0.5419
Epoch 4/30
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 854us/step - binary_accuracy: 0.6952 - loss: 0.5396 - val_binary_accuracy: 0.6948 - val_loss: 0.5412
Epoch 5/30
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 892us/step - binary_accuracy: 0.6973 - loss: 0.5380 - val_binary_accuracy: 0.6949 - val_loss: 0.5407
Epoch 6/30
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 895us/step - binary_accuracy: 0.698

In [20]:
# Evaluate pretrained model

# We will accumulate all metrics/results in the same dictionary
results = {}

end_to_end_preds = end_to_end_model.predict(
    x_test_std,
    batch_size=batch_size,
)

# We assume that we have outputed logits
if np.min(end_to_end_preds) < 0.0 or np.max(end_to_end_preds) > 1:
    end_to_end_preds = tf.math.sigmoid(end_to_end_preds).numpy()
end_to_end_preds = (end_to_end_preds >= 0.5).astype(np.int32)
results['pre_train_acc'] = sklearn.metrics.accuracy_score(
    y_test,
    end_to_end_preds,
)
results['pre_train_auc'] = sklearn.metrics.roc_auc_score(
    y_test,
    end_to_end_preds,
)
print(f"Pretrained model task accuracy: {results['pre_train_acc']*100:.2f}%")


[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 982us/step
Pretrained model task accuracy: 65.26%


# Construct TabCBM

For this, we will first compute the empirical covariance matrix in order for us to learn useful masks using a similar approach to that proposed by SEFS:

In [11]:
# Construct the training set's empirical covariance matrix
# NOTE: This step can be very computationally expensive/intractable in large
#       datasets. In those cases, one may ignore the covariance matrix when
#       performing TabCBM's pretraining at the potential cost of performance or
#       more accurate concept discovery.
cov_mat = np.corrcoef(x_train_std.T)
print(cov_mat)

[[ 1.00000000e+00  1.00105563e-03  8.38634556e-03 ... -3.34490471e-03
   6.45576389e-03  7.41082360e-04]
 [ 1.00105563e-03  1.00000000e+00  2.93211877e-03 ...  1.57344435e-03
  -2.49050682e-03 -4.45254638e-03]
 [ 8.38634556e-03  2.93211877e-03  1.00000000e+00 ...  3.03096663e-03
   5.79808624e-03  1.24462552e-03]
 ...
 [-3.34490471e-03  1.57344435e-03  3.03096663e-03 ...  1.00000000e+00
  -1.99639348e-01 -1.38666258e-01]
 [ 6.45576389e-03 -2.49050682e-03  5.79808624e-03 ... -1.99639348e-01
   1.00000000e+00 -2.07627716e-01]
 [ 7.41082360e-04 -4.45254638e-03  1.24462552e-03 ... -1.38666258e-01
  -2.07627716e-01  1.00000000e+00]]


In [12]:
# Number of concepts we want to discover
n_concepts = 6

# Set the weights for the different regularisers in the loss
coherence_reg_weight = 0.1  # $lambda_{co}
diversity_reg_weight = 5  # $lambda_{div}
feature_selection_reg_weight = 5  # $lambda_{spec}
gate_estimator_weight = 10  # Gate prediction regularizer for SEFS's pre-text task

# Select how many neighbors to use for the coherency loss (must be less than the batch size!)
top_k = 256

In [13]:
# Generate a dictionary with the parameters to use for TabCBM as we will have
# to use the same parameters twice:
tab_cbm_params = dict(
    features_to_concepts_model=encoder,  # The $\phi$ sub-model
    concepts_to_labels_model=decoder,  # The $f$ sub-model
    latent_dims=latent_dims,  # The dimensionality of the concept embeddings $m$
    n_concepts=n_concepts,  # The number of concepts to discover $k^\prime$
    cov_mat=cov_mat,  # The empirical covariance matrix
    loss_fn=end_to_end_model.loss,  # The downstream task loss function
    # Then we provide all the regularizers weights
    coherence_reg_weight=coherence_reg_weight,
    diversity_reg_weight=diversity_reg_weight,
    feature_selection_reg_weight=feature_selection_reg_weight,
    gate_estimator_weight=gate_estimator_weight,
    top_k=top_k,

    # And indicate that we will not be providing any supervised concepts! Change
    # this is training concepts (e.g., `c_train`) are provided/known during
    # training
    n_supervised_concepts=0,
    concept_prediction_weight=0,

    # The accuracy metric to use for logging performance
    acc_metric=(
        lambda y_true, y_pred: tf.math.reduce_mean(
            tf.keras.metrics.sparse_categorical_accuracy(
                y_true,
                y_pred,
            )
        )
    ),

    # ANd architectural details of the self-supervised reconstruction modules
    concept_generator_units=[64],
    rec_model_units=[64],
)

In [14]:
# Mask Generator Self-supervised Training

# Next, we proceed to do the SELF-SUPERVISED TRAINING of the MASK
# GENERATORS for TabCBM. For this, we will follow a similar approach
# to that of SEFS. Our TabCBM module allows one to do this by setting
# the self_supervised_mode flag to True before calling the .fit() method:

# We can now construct our TabCBM model which we will first self-supervise!
ss_tabcbm = TabCBM(self_supervised_mode=True,  **tab_cbm_params)
ss_tabcbm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate,))
ss_tabcbm.summary()




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
self_supervised_train_epochs = 50
print("TabCBM self-supervised training stage...")
ss_tabcbm_hist = ss_tabcbm.fit(
    x=x_train_std,
    y=y_train,
    validation_split=validation_size,
    epochs=self_supervised_train_epochs,
    batch_size=batch_size,
    verbose=1,
)

print("\tTabCBM self-supervised training completed")

TabCBM self-supervised training stage...
Epoch 1/50




[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.0000e+00 - avg_concept_size: 0.0000e+00 - avg_features_rec_loss: 0.4546 - avg_mask_rec_loss: 6.7664 - loss: 43.3261 - max_probability: 0.0000e+00 - mean_probability: 0.0000e+00 - min_probability: 0.0000e+00 - prob_sparsity_loss: 0.0000e+00 - reg_loss_closest: 0.0000e+00 - reg_loss_similarity: 0.0000e+00 - task_loss: 0.0000e+00 - val_avg_features_rec_loss: 0.4930 - val_avg_mask_rec_loss: 6.4329 - val_loss: 41.5554
Epoch 2/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0000e+00 - avg_concept_size: 0.0000e+00 - avg_features_rec_loss: 0.4995 - avg_mask_rec_loss: 6.3714 - loss: 41.2253 - max_probability: 0.0000e+00 - mean_probability: 0.0000e+00 - min_probability: 0.0000e+00 - prob_sparsity_loss: 0.0000e+00 - reg_loss_closest: 0.0000e+00 - reg_loss_similarity: 0.0000e+00 - task_loss: 0.0000e+00 - val_avg_features_rec_loss: 0.5171 - val_avg_mask_rec_loss: 6.29

In [16]:
# First we will instantiate a new TabCBM that is NOT in self-supervised mode,
# and we will load its weights so that they are the same as the model whose
# mask generators have been pre-trained using the SS loss.
tabcbm_supervised = TabCBM(
    self_supervised_mode=False,
    # Notice how we provide as concept generators the concept generators of the
    # SS TabCBM:
    concept_generators=ss_tabcbm.concept_generators,
    # as well as the feature probability masks:
    prior_masks=ss_tabcbm.feature_probabilities,
    **tab_cbm_params,
)
tabcbm_supervised.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
tabcbm_supervised.summary()



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
max_epochs = 100   
print("TabCBM self-supervised training stage...")

tabcbm_hist = tabcbm_supervised.fit(
    x=x_train_std,
    y=y_train,
    validation_split=validation_size,
    epochs=max_epochs,
    batch_size=batch_size,
    verbose=1,
)

print("\tTabCBM supervised training completed")

TabCBM self-supervised training stage...
Epoch 1/100
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.4253 - avg_concept_size: 18.5285 - avg_features_rec_loss: 0.0000e+00 - avg_mask_rec_loss: 0.0000e+00 - loss: 3.4303 - max_probability: 0.7183 - mean_probability: 0.4932 - min_probability: 0.2583 - prob_sparsity_loss: 2.4660 - reg_loss_closest: 0.0086 - reg_loss_similarity: 0.3887 - task_loss: 0.5841 - val_accuracy: 0.4157 - val_loss: 3.0908 - val_prob_sparsity_loss: 2.2570 - val_reg_loss_closest: 0.0120 - val_reg_loss_similarity: 0.2641 - val_task_loss: 0.5817
Epoch 2/100
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.4285 - avg_concept_size: 14.4770 - avg_features_rec_loss: 0.0000e+00 - avg_mask_rec_loss: 0.0000e+00 - loss: 2.9764 - max_probability: 0.6645 - mean_probability: 0.4376 - min_probability: 0.2157 - prob_sparsity_loss: 2.1881 - reg_loss_closest: 0.0127 - reg_loss_similarity: 0.2585 - task_lo

# Evaluate TabCBM

In [21]:
test_y_pred, test_concept_scores = tabcbm_supervised.predict(
    x_test_std,
    batch_size=batch_size,
)

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [73]:
(tf.math.sigmoid(test_y_pred).numpy() > 0.5).sum(), test_y_pred.shape[0]

(12050, 34553)

In [22]:
if np.min(test_y_pred) < 0.0 or np.max(test_y_pred) > 1:
        # Then we assume that we have outputed logits
        test_preds = tf.math.sigmoid(test_y_pred).numpy()
        
test_preds = (test_preds > 0.5).astype(np.int32)
results['acc'] = sklearn.metrics.accuracy_score(
    y_test,
    test_preds,
)
results['auc'] = sklearn.metrics.roc_auc_score(
    y_test,
    test_preds,
)

print(f"Accuracy is {results['acc']*100:.2f}%")

Accuracy is 69.46%


In [23]:
test_concept_scores 

array([[0.7425234 , 0.5505919 , 0.4662737 , 0.54138654, 0.53836936,
        0.54039216],
       [0.6041201 , 0.49845678, 0.5351493 , 0.5050446 , 0.5057303 ,
        0.5035707 ],
       [0.46470454, 0.6613883 , 0.4395699 , 0.571951  , 0.58076847,
        0.57156   ],
       ...,
       [0.5239042 , 0.5532364 , 0.5398632 , 0.5070664 , 0.5112675 ,
        0.5100211 ],
       [0.60356987, 0.56256855, 0.53227943, 0.5095377 , 0.51488054,
        0.5111668 ],
       [0.59099615, 0.52526677, 0.53711325, 0.5034985 , 0.503288  ,
        0.5027602 ]], dtype=float32)

## Comparison of the calculated and ground-truth masks 

In [36]:
masks

array([[1.08940911e-03, 1.34003858e-04, 1.45111571e-05, 4.93960215e-05,
        1.25427076e-04, 8.61415756e-05, 6.06893009e-05, 5.33836464e-05,
        9.76440642e-05, 1.08725522e-04, 7.26769562e-04, 1.08637221e-04,
        7.99953414e-05, 1.02713042e-04, 1.35997325e-04, 2.94317051e-05,
        1.29061773e-05, 7.50097824e-05, 1.70446001e-05, 1.23426325e-05,
        3.84046543e-05, 1.09178468e-03, 6.34411583e-04, 1.40441944e-05,
        2.44487263e-03, 1.93002471e-03, 7.08360749e-04, 2.55774218e-03,
        7.97282066e-03, 2.81668338e-03, 1.22982380e-03, 3.64154221e-05,
        1.56433834e-05, 3.60276863e-05, 1.62242126e-04, 4.55317859e-05,
        1.50544441e-03, 1.02118787e-03],
       [7.41930818e-03, 1.23410340e-04, 7.80004993e-05, 6.63273458e-05,
        6.89990047e-05, 6.47282368e-03, 1.72660715e-04, 9.84538492e-05,
        4.92036816e-05, 5.92560500e-05, 5.81707100e-05, 1.52397755e-04,
        7.67082965e-05, 9.86133018e-05, 1.47198036e-04, 7.07884319e-05,
        9.75037547e-05,

In [38]:
# The masks are stored as logits, so we need to turn them to probabilities using
# a sigmoid
masks = tf.sigmoid(tabcbm_supervised.feature_probabilities).numpy()

print("Thresholded concept masks learnt by TabCBM:")
for i, mask in enumerate((masks>0.0005).astype(np.int32)):
    print("\tFor concept", i, "we are selecting the following features", mask)
print("-" * 80)
print("-" * 80)
print("For comparison, the ground truth concept masks are")
for i, mask in enumerate(concepts_masks.to_numpy()):
    print("\tFor GROUND-TRUTH concept", i, " the following features are relevant", mask)

Thresholded concept masks learnt by TabCBM:
	For concept 0 we are selecting the following features [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 1 1 1 0 0 0 0 0 1
 1]
	For concept 1 we are selecting the following features [1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
 0]
	For concept 2 we are selecting the following features [1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0
 0]
	For concept 3 we are selecting the following features [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
 0]
	For concept 4 we are selecting the following features [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
 0]
	For concept 5 we are selecting the following features [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
 0]
--------------------------------------------------------------------------------
-----------------------------------------------------------------------

# Load pretrained TabCBM 

In [None]:
tabcbm_pretrained = tf.keras.models.load_model(r'D:\\PycharmProjects\\AMMISproject\\trained_tabcbm_4concepts_1500epochs\\tabcbm_supervised.keras')
