In [1]:
import joblib
import numpy as np
import pandas as pd
import os.path as osp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf

# Read the data 

In [2]:
data_dir = 'D:\\PycharmProjects\\AMMISproject\\data\\processed_data'
dataset = 'dataco'

x_train_std = joblib.load(osp.join(data_dir, dataset, 'x_train_std.joblib'))
x_test_std = joblib.load(osp.join(data_dir, dataset, 'x_test_std.joblib'))

x_train = joblib.load(osp.join(data_dir, dataset, 'x_train.joblib'))
x_test = joblib.load(osp.join(data_dir, dataset, 'x_test.joblib'))

y_train = joblib.load(osp.join(data_dir, dataset, 'y_train.joblib'))
y_test = joblib.load(osp.join(data_dir, dataset, 'y_test.joblib'))


In [10]:
print('Shape of the training set: ', x_train.shape)
print('Shape of the test set: ', x_test.shape)
print('Shape of the trainigb targets: ', y_train.shape)

Shape of the training set:  (138212, 38)
Shape of the test set:  (34553, 38)
Shape of the trainigb targets:  (138212,)


## Create main components for TabCBM

In [9]:
# Parameters defining the architecture we will use

input_shape = x_train.shape[1:]
num_outputs = len(set(y_train))
encoder_units = [16, 16]
decoder_units = [16]
latent_dims = 16     
learning_rate = 0.001  
validation_size = 0.1 

print('Input shape: ', input_shape)
print('Number of outputs: ', num_outputs)

Input shape:  (38,)
Number of outputs:  2


In [17]:
# Next, we build the feature to latent code encoder model (i.e., phi)

encoder_inputs = tf.keras.Input(shape=input_shape)
encoder_compute_graph = encoder_inputs

# Include the fully connected bottleneck here
for i, units in enumerate(encoder_units):
    encoder_compute_graph = tf.keras.layers.Dense(
        units,
        activation='relu',
        name=f"encoder_dense_{i}",
    )(encoder_compute_graph)

# TIme to generate the latent code here
encoder_compute_graph = tf.keras.layers.Dense(
    latent_dims,
    activation=None,
    name="encoder_bypass_channel",
)(encoder_compute_graph)

encoder = tf.keras.Model(
    encoder_inputs,
    encoder_compute_graph,
    name="encoder",
)
encoder.summary()

In [None]:
# Then, we build the concept to label model  (i.e., the label predictor f)

decoder_inputs = tf.keras.Input(shape=[latent_dims])
decoder_layers = [
    tf.keras.layers.Dense(
        units,
        activation=tf.nn.relu,
        name=f"decoder_dense_{i+1}",
    ) for i, units in enumerate(decoder_units)
]
decoder_graph = tf.keras.Sequential(decoder_layers + [
    tf.keras.layers.Dense(
        num_outputs if num_outputs > 2 else 1,
        activation=None,
        name="decoder_model_output",
    )
])
decoder = tf.keras.Model(
    decoder_inputs,
    decoder_graph(decoder_inputs),
    name="decoder",
)
decoder.summary()

In [23]:
# We then put them both together to make an end-to-end model we can pretrain

end_to_end_inputs = tf.keras.Input(shape=input_shape)
latent = encoder(end_to_end_inputs)
end_to_end_model_compute_graph = decoder(latent)
# Now time to collapse all the concepts again back into a single vector
end_to_end_model = tf.keras.Model(
    end_to_end_inputs,
    end_to_end_model_compute_graph,
    name="complete_model",
)
end_to_end_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=(
        tf.keras.losses.BinaryCrossentropy(from_logits=True) if (num_outputs <= 2)
        else tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    ),
    metrics=[
        "accuracy" if (num_outputs <= 2)
        else "sparse_categorical_accuracy"
    ],
)
end_to_end_model.summary()

# Latent code model pre-training (using end-to-end model)

In [24]:
pretrain_epochs = 70
batch_size = 512
pretrain_hist = end_to_end_model.fit(
    x=x_train_std,
    y=y_train,
    epochs=pretrain_epochs,
    batch_size=batch_size,
    validation_split=validation_size,
    verbose=1,
)

Epoch 1/70
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6079 - loss: 0.6418 - val_accuracy: 0.6927 - val_loss: 0.5508
Epoch 2/70
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 886us/step - accuracy: 0.6940 - loss: 0.5464 - val_accuracy: 0.6942 - val_loss: 0.5433
Epoch 3/70
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 839us/step - accuracy: 0.6945 - loss: 0.5414 - val_accuracy: 0.6946 - val_loss: 0.5420
Epoch 4/70
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 822us/step - accuracy: 0.6972 - loss: 0.5380 - val_accuracy: 0.6948 - val_loss: 0.5423
Epoch 5/70
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step - accuracy: 0.6971 - loss: 0.5383 - val_accuracy: 0.6941 - val_loss: 0.5415
Epoch 6/70
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 877us/step - accuracy: 0.6957 - loss: 0.5385 - val_accuracy: 0.6946 - val_loss: 0.5410
Epoch 7/70
[1m243

In [25]:
import scipy
import sklearn

# We will accumulate all metrics/results in the same dictionary
results = {}

# Make test predictions for the test set
end_to_end_preds = end_to_end_model.predict(
    x_test_std,
    batch_size=batch_size,
)

# Get accuracy/AUC using the corresponding test labels
if ((len(end_to_end_preds.shape) == 2)) and (end_to_end_preds.shape[-1] >= 2):
    # Then we are using multi-class outputs
    preds = scipy.special.softmax(
        end_to_end_preds,
        axis=-1,
    )

    one_hot_labels = tf.keras.utils.to_categorical(y_test)
    results['pre_train_acc'] = sklearn.metrics.accuracy_score(
        y_test,
        np.argmax(preds, axis=-1),
    )

    # And select just the labels that are in fact being used
    results['pre_train_auc'] = sklearn.metrics.roc_auc_score(
        one_hot_labels,
        preds,
        multi_class='ovo',
    )
else:
    # Otherwise we are dealing with simple binary outputs
    if np.min(end_to_end_preds) < 0.0 or np.max(end_to_end_preds) > 1:
        # Then we assume that we have outputed logits
        end_to_end_preds = tf.math.sigmoid(end_to_end_preds).numpy()
    end_to_end_preds = (end_to_end_preds >= 0.5).astype(np.int32)
    results['pre_train_acc'] = sklearn.metrics.accuracy_score(
        y_test,
        end_to_end_preds,
    )
    results['pre_train_auc'] = sklearn.metrics.roc_auc_score(
        y_test,
        end_to_end_preds,
    )
print(f"Pretrained model task accuracy: {results['pre_train_acc']*100:.2f}%")

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 994us/step
Pretrained model task accuracy: 69.66%


# Construct TabCBM

We are now ready to construct a TabCBM. For this, we will first compute the
empirical covariance matrix in order for us to learn useful masks using a
similar approach to that proposed by SEFS:

In [26]:
# Construct the training set's empirical covariance matrix
# NOTE: This step can be very computationally expensive/intractable in large
#       datasets. In those cases, one may ignore the covariance matrix when
#       performing TabCBM's pretraining at the potential cost of performance or
#       more accurate concept discovery.
cov_mat = np.corrcoef(x_train.T)
print(cov_mat)

[[ 1.00000000e+00  1.00105563e-03  8.38634556e-03 ... -3.34490471e-03
   6.45576389e-03  7.41082360e-04]
 [ 1.00105563e-03  1.00000000e+00  2.93211877e-03 ...  1.57344435e-03
  -2.49050682e-03 -4.45254638e-03]
 [ 8.38634556e-03  2.93211877e-03  1.00000000e+00 ...  3.03096663e-03
   5.79808624e-03  1.24462552e-03]
 ...
 [-3.34490471e-03  1.57344435e-03  3.03096663e-03 ...  1.00000000e+00
  -1.99639348e-01 -1.38666258e-01]
 [ 6.45576389e-03 -2.49050682e-03  5.79808624e-03 ... -1.99639348e-01
   1.00000000e+00 -2.07627716e-01]
 [ 7.41082360e-04 -4.45254638e-03  1.24462552e-03 ... -1.38666258e-01
  -2.07627716e-01  1.00000000e+00]]


In [27]:
from tabcbm.models.tabcbm import TabCBM

# Number of concepts we want to discover
n_concepts = 2

# Set the weights for the different regularisers in the loss
coherence_reg_weight = 0.1  # $lambda_{co}
diversity_reg_weight = 5 # $lambda_{div}
feature_selection_reg_weight = 5 # $lambda_{spec}
gate_estimator_weight = 10 # Gate prediction regularizer for SEFS's pre-text task

# Select how many neighbors to use for the coherency loss (must be less than
# the batch size!)
top_k = 256

# Generate a dictionary with the parameters to use for TabCBM as we will have
# to use the same parameters twice:
tab_cbm_params = dict(
    features_to_concepts_model=encoder,  # The $\phi$ sub-model
    concepts_to_labels_model=decoder,  # The $f$ sub-model
    latent_dims=latent_dims,  # The dimensionality of the concept embeddings $m$
    n_concepts=n_concepts,  # The number of concepts to discover $k^\prime$
    cov_mat=cov_mat,  # The empirical covariance matrix
    loss_fn=end_to_end_model.loss,  # The downstream task loss function
    # Then we provide all the regularizers weights
    coherence_reg_weight=coherence_reg_weight,
    diversity_reg_weight=diversity_reg_weight,
    feature_selection_reg_weight=feature_selection_reg_weight,
    gate_estimator_weight=gate_estimator_weight,
    top_k=top_k,

    # And indicate that we will not be providing any supervised concepts! Change
    # this is training concepts (e.g., `c_train`) are provided/known during
    # training
    n_supervised_concepts=0,
    concept_prediction_weight=0,

    # The accuracy metric to use for logging performance
    acc_metric=(
        lambda y_true, y_pred: tf.math.reduce_mean(
            tf.keras.metrics.sparse_categorical_accuracy(
                y_true,
                y_pred,
            )
        )
    ),

    # ANd architectural details of the self-supervised reconstruction modules
    concept_generator_units=[64],
    rec_model_units=[64],
)

ModuleNotFoundError: No module named 'tabcbm.models'