In [1]:
%load_ext autoreload
%autoreload 2

from iit import generate_data, get_iit_distribution_dataset_both
from transformers import BertModel, BertTokenizer
from model_classifier import TorchDeepNeuralEmbeddingClassifier
from model_regression import TorchLinearEmbeddingRegression
import model_classifier_iit
from model_classifier_iit import TorchDeepNeuralClassifierIIT
from model_regression_iit import TorchLinearEmbeddingRegressionIIT
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
import numpy as np
import torch
import torch.nn as nn
import vsm
from sklearn.metrics import classification_report, r2_score 


2022-07-14 21:35:02.602918: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-14 21:35:02.602941: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Vocabulary of digit words; list position is the word's numeric value.
vals = [
    'zero', 'one', 'two', 'three', 'four',
    'five', 'six', 'seven', 'eight', 'nine',
]

# Second argument handed to `generate_data`
# (presumably the fraction of examples used for training -- TODO confirm).
train_test_split = 0.9
X_train, y_train, X_test, y_test = generate_data(
    vals, train_test_split)

In [3]:
X_train[:5]

[['four', 'nine', 'five'],
 ['zero', 'four', 'three'],
 ['nine', 'three', 'six'],
 ['five', 'nine', 'nine'],
 ['four', 'nine', 'two']]

In [4]:
y_train[:5]

[64, 17, 0, 15, 48]

In [5]:
X_test[:5]

[['four', 'zero', 'two'],
 ['six', 'four', 'one'],
 ['three', 'one', 'three'],
 ['four', 'one', 'two'],
 ['five', 'three', 'seven']]

### Training a feed-forward network using randomized embeddings.

In [4]:
# Hyper-parameters for the randomly-initialised embedding classifier.
# `output_size` is presumably the number of distinct label values -- TODO confirm.
output_size, num_inputs, num_layers, embed_dim = 163, 3, 2, 5

# The last two positional arguments sit where the BERT-embedding cell passes
# `bert_embed` and `freeze_embedding`; here they are `None` and `False`.
mod = TorchDeepNeuralEmbeddingClassifier(
    vals, output_size, num_inputs, num_layers, embed_dim, None, False)

In [None]:
_ = mod.fit(X_train, y_train)

In [None]:
preds = mod.predict(X_test)

print("\nClassification report:")
print(classification_report(y_test, preds, zero_division=1))

### Training a feed-forward network using BERT embeddings.

In [None]:
bert_weights_name = 'bert-base-uncased'
# Initialize a BERT tokenizer and BERT model based on
# `bert_weights_name`:
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)

In [None]:
bert_embed = vsm.create_subword_pooling_vsm(
    vals, bert_tokenizer, bert_model, layer=1, pool_func=vsm.mean_pooling)

In [None]:
bert_embed

In [None]:
# Same classifier configuration as the random-embedding model, except that the
# embedding width is taken from the BERT-derived matrix and the embedding is
# frozen.
output_size, num_inputs, num_layers = 163, 3, 2
# NOTE: this rebinds the notebook-global `embed_dim`; any later cell that reads
# `embed_dim` will see the width of `bert_embed` rather than 5.
embed_dim = bert_embed.shape[1]
freeze_embedding = True

mod_bert_embed = TorchDeepNeuralEmbeddingClassifier(
    vals, output_size, num_inputs, num_layers,
    embed_dim, bert_embed, freeze_embedding)

In [None]:
_ = mod_bert_embed.fit(X_train, y_train)

In [None]:
preds = mod_bert_embed.predict(X_test)

print("\nClassification report:")
print(classification_report(y_test, preds, zero_division=1))

### Train a BERT model by tokenizing inputs 

In [None]:
class HfBertClassifierModel(nn.Module):
    """BERT encoder followed by a single linear classification layer.

    Parameters
    ----------
    n_classes : int
        Output width of the classification layer.
    weights_name : str
        Name passed to `BertModel.from_pretrained`.
    """

    def __init__(self, n_classes, weights_name='bert-base-uncased'):
        super().__init__()
        self.n_classes = n_classes
        self.weights_name = weights_name

        # Pretrained encoder, switched to training mode.
        self.bert = BertModel.from_pretrained(self.weights_name)
        self.bert.train()

        # Classifier input width is read off BERT's word-embedding table.
        word_embeddings = self.bert.embeddings.word_embeddings
        self.hidden_dim = word_embeddings.embedding_dim

        # The only newly introduced parameters: the classification layer.
        self.classifier_layer = nn.Linear(self.hidden_dim, self.n_classes)

    def forward(self, indices, mask):
        """Return classifier outputs computed from BERT's `pooler_output`.

        `indices` is passed to BERT as its first positional argument and
        `mask` as `attention_mask`.
        """
        encoded = self.bert(indices, attention_mask=mask)
        pooled = encoded.pooler_output
        return self.classifier_layer(pooled)



In [None]:
class HfBertClassifier(TorchShallowNeuralClassifier):
    """`TorchShallowNeuralClassifier` variant that tokenizes raw strings with a
    BERT tokenizer and uses `HfBertClassifierModel` as its graph.

    Parameters
    ----------
    weights_name : str
        Name passed to `BertTokenizer.from_pretrained` and forwarded to
        `HfBertClassifierModel`.
    *args, **kwargs
        Forwarded unchanged to the base-class constructor.
    """

    def __init__(self, weights_name, *args, **kwargs):
        self.weights_name = weights_name
        self.tokenizer = BertTokenizer.from_pretrained(self.weights_name)
        super().__init__(*args, **kwargs)
        # Register the extra constructor argument in the base class's
        # `params` list.
        self.params += ['weights_name']

    def build_graph(self):
        """Create the BERT model with one output per class seen in `build_dataset`."""
        return HfBertClassifierModel(self.n_classes_, self.weights_name)

    def build_dataset(self, X, y=None):
        """Tokenize `X` and wrap the result in a `TensorDataset`.

        Without `y` the dataset holds (input ids, attention mask). With `y`,
        `classes_` and `n_classes_` are (re)computed from the labels and the
        dataset additionally holds each label's index in the sorted class
        list.
        """
        encoded = self.tokenizer.batch_encode_plus(
            X,
            max_length=None,
            add_special_tokens=True,
            padding='longest',
            return_attention_mask=True)
        token_ids = torch.tensor(encoded['input_ids'])
        attention = torch.tensor(encoded['attention_mask'])

        # Prediction-time path: no labels to encode.
        if y is None:
            return torch.utils.data.TensorDataset(token_ids, attention)

        self.classes_ = sorted(set(y))
        self.n_classes_ = len(self.classes_)
        label_to_index = {
            label: position for position, label in enumerate(self.classes_)}
        targets = torch.tensor([label_to_index[label] for label in y])
        return torch.utils.data.TensorDataset(token_ids, attention, targets)

In [None]:
def conversion(X):
    """Join each example's tokens into one space-separated string.

    Returns a new list with one string per element of `X`.
    """
    return [' '.join(tokens) for tokens in X]

In [None]:
# Fine-tuning configuration for the tokenized-input BERT classifier; every
# keyword is forwarded to the base-class constructor.
mod_bert = HfBertClassifier(
    'bert-base-uncased',
    max_iter=5,
    batch_size=8,
    n_iter_no_change=5,
    early_stopping=True,
    hidden_dim=100,
    eta=5e-5,
)

In [None]:
X_train_bert = conversion(X_train)
X_test_bert = conversion(X_test)
X_train_bert[:5]

In [None]:
_ = mod_bert.fit(X_train_bert, y_train)

In [None]:
preds = mod_bert.predict(X_test_bert)

print("\nClassification report:")
# Use the same `zero_division` setting as the other two classifier reports in
# this notebook so that the three models are scored under identical rules.
print(classification_report(y_test, preds, zero_division=1))

### Create IIT Dataset that computes y + z then x * (y + z)

In [5]:
V1 = 1
first_train_data_V1, first_test_data_V1 = get_iit_distribution_dataset_both(V1, vals, 0.9, True)

In [6]:
first_x_base_train_V1, first_y_base_train_V1, first_x_source_train_V1, first_y_source_train_V1 = first_train_data_V1
first_x_base_test_V1, first_y_base_test_V1, first_x_source_test_V1, first_y_source_test_V1 = first_test_data_V1

In [7]:
first_x_base_train_V1[:3]

[['three', 'eight', 'nine'], ['nine', 'two', 'two'], ['four', 'nine', 'four']]

In [10]:
first_x_source_train_V1[:3]

[['five', 'two', 'two'], ['two', 'six', 'eight'], ['four', 'one', 'nine']]

In [11]:
first_y_source_train_V1[:3]

[0, 70, 30]

In [8]:
train_size = len(first_x_base_train_V1)
interventions_train_V1 = torch.zeros(train_size)

In [9]:
test_size = len(first_x_base_test_V1)
interventions_test_V1 = torch.zeros(test_size)

### Training model without IIT on dataset 

In [10]:
class InterventionableTorchLinearEmbeddingRegression(TorchLinearEmbeddingRegression):
    """Regression model whose per-layer outputs can be read and overwritten.

    Forward hooks are registered on every entry of `self.layers`. A hook can
    replace a column slice of a layer's output ("set") and/or record a column
    slice of it ("get"). Coordinates are dicts with keys `"layer"`, `"start"`,
    `"end"`; a set-coordinate additionally carries an `"intervention"` tensor.

    `self.layers`, `self.model` and `self.device` are not defined here; they
    are assumed to be provided by the base class.
    """

    def __init__(self, **base_kwargs):
        super().__init__(**base_kwargs)

    def make_hook(self, gets, sets, layer):
        """Build the forward hook for layer index `layer`.

        Parameters
        ----------
        gets, sets : dict or None
            Map from layer index to a list of coordinate dicts.
        layer : int
            Index of the layer the hook will be attached to.

        Returns
        -------
        callable
            A hook with the `(module, inputs, output)` signature. It returns
            the (possibly modified) output, so that value is what downstream
            layers receive.
        """
        def hook(model, inputs, output):
            layer_gets = gets.get(layer, []) if gets is not None else []
            layer_sets = sets.get(layer, []) if sets is not None else []
            # Interventions are applied first so that recorded activations
            # reflect the overwritten values.
            for coord in layer_sets:
                output = torch.cat(
                    [output[:, :coord["start"]],
                     coord["intervention"],
                     output[:, coord["end"]:]],
                    dim=1)
            for coord in layer_gets:
                key = f'{coord["layer"]}-{coord["start"]}-{coord["end"]}'
                self.activation[key] = output[:, coord["start"]:coord["end"]]
            return output
        return hook

    def _gets_sets(self, gets=None, sets=None):
        """Register one hook per layer and return the removable handles."""
        handlers = []
        for layer, module in enumerate(self.layers):
            hook = self.make_hook(gets, sets, layer)
            handlers.append(module.register_forward_hook(hook))
        return handlers

    def retrieve_activations(self, X, get, sets):
        """Run `self.model` on `X` and return the activation slice named by `get`.

        Parameters
        ----------
        X : torch.Tensor
            Model input; cast to a long tensor and moved to `self.device`.
        get : dict or None
            Coordinate of the slice to read.
        sets : dict or None
            Coordinate of the slice to overwrite, including its
            `"intervention"` tensor. The caller's dict is not modified.

        Returns
        -------
        torch.Tensor or None
            The recorded slice, or `None` when `get` is `None`.
        """
        if sets is not None and "intervention" in sets:
            # Work on a shallow copy so the caller's coordinate dict is not
            # mutated by the dtype/device conversion.
            sets = dict(sets)
            sets["intervention"] = sets["intervention"].type(torch.FloatTensor).to(self.device)
        X = X.type(torch.LongTensor).to(self.device)
        self.activation = {}
        get_val = {get["layer"]: [get]} if get is not None else None
        set_val = {sets["layer"]: [sets]} if sets is not None else None
        handlers = self._gets_sets(get_val, set_val)
        try:
            # The return value is not needed; the hooks fill `self.activation`.
            self.model(X)
        finally:
            # Always detach the hooks, even if the forward pass raises, so
            # they do not leak into later calls.
            for handler in handlers:
                handler.remove()
        if get is None:
            return None
        return self.activation[f'{get["layer"]}-{get["start"]}-{get["end"]}']



In [20]:
model_regression = InterventionableTorchLinearEmbeddingRegression(vocab=vals, num_inputs=3,
                                                 num_layers=2, hidden_dim=50,
                                                 embed_dim=5, max_iter=2000)

In [33]:
_ = model_regression.fit(X_train, y_train)

Finished epoch 2000 of 2000; error is 177.48551940917974

In [34]:
model_regression.score(X_train, y_train)

0.8450343640660294

In [35]:
model_regression.score(X_test, y_test)

0.878311991525721

### Causal Abstraction Analysis on dataset 1

In [66]:
embedding_dim = 5

first_alignment = {
    "V1": {"layer": 1, "start": embedding_dim, "end": embedding_dim*3}, 
}

In [37]:
def interchange_intervention(model, base, source, get_coord, output_coord):
    """Run `base` through `model` with one activation slice taken from `source`.

    The slice at `get_coord` is first read from a run on `source`, then
    written into the same location during a run on `base`; the slice at
    `output_coord` from that second run is returned.

    `get_coord` is left unmodified: the intervention tensor is attached to a
    copy, so a shared alignment dict does not accumulate stale tensors.
    """
    intervention = model.retrieve_activations(source, get_coord, None)
    set_coord = dict(get_coord, intervention=intervention)
    return model.retrieve_activations(base, output_coord, set_coord)

In [38]:
output_coord = {"layer": 2, "start": 0, "end": 1}

In [39]:
def ii_evaluation(X_base, X_source, model,
                variable, output_coord, alignment):
    """Run one interchange intervention per (base, source) pair.

    Parameters
    ----------
    X_base, X_source : iterable of torch.Tensor
        Paired inputs; each element gets a leading batch dimension of 1
        before being passed to the model.
    model
        Object accepted by `interchange_intervention`.
    variable
        Key into `alignment` selecting the coordinate to swap.
    output_coord : dict
        Coordinate of the network output to read.
    alignment : dict
        Map from variable name to coordinate dict.

    Returns
    -------
    list of numpy.ndarray
        One rounded prediction array per pair.
    """
    predictions = []
    for base, source in zip(X_base, X_source):
        network_output = interchange_intervention(
            model,
            base.unsqueeze(0),
            source.unsqueeze(0),
            alignment[variable],
            output_coord
        )
        # `.cpu()` is required before `.numpy()` when the model runs on a
        # non-CPU device; it is a no-op for CPU tensors.
        rounded = torch.round(network_output).detach().cpu()
        predictions.append(rounded.squeeze(1).numpy())
    return predictions

In [40]:
def convert_input(X, vocab=None):
    """Map sequences of vocabulary words to a stacked tensor of indices.

    Parameters
    ----------
    X : iterable of sequences of str
        Examples to convert; all examples must have the same length.
    vocab : sequence of str, optional
        Vocabulary whose positions define the indices. Defaults to the
        notebook-level `vals`.

    Returns
    -------
    torch.Tensor
        One row per example, holding each word's position in `vocab`.

    Raises
    ------
    KeyError
        If a word is not in the vocabulary.
    """
    if vocab is None:
        vocab = vals
    index = {word: position for position, word in enumerate(vocab)}
    rows = [torch.tensor([index[w] for w in ex]) for ex in X]
    return torch.stack(rows)

In [41]:
# Interchange-intervention evaluation of the non-IIT regression model for V1.
# NOTE(review): the intervened predictions are scored against the *base* labels
# here, whereas the IIT evaluation cells score counterfactual predictions
# against the `*_y_source_*` labels -- confirm which target is intended.
first_v1_eval = ii_evaluation(convert_input(first_x_base_test_V1), 
                        convert_input(first_x_source_test_V1),
                        model_regression, "V1", output_coord,
                        first_alignment)
print("Counterfactual V1 evaluation")
print(r2_score(first_y_base_test_V1, first_v1_eval))

Counterfactual V1 evaluation
0.6037107827372421


### IIT on Dataset 1 

In [64]:
V1 = 0
# Use a single local constant for both the intervention coordinates and the
# model's embedding width. The notebook-global `embed_dim` is rebound to
# `bert_embed.shape[1]` by the BERT-embedding cell, so reading it here would
# yield coordinates that do not match a model built with an embedding width
# of 5 when the notebook is run top to bottom.
iit_embed_dim = 5
coords = {
    V1: [{"layer": 1, "start": iit_embed_dim, "end": iit_embed_dim*3}],
}

V1_model = TorchLinearEmbeddingRegressionIIT(
    hidden_dim=50,
    hidden_activation=torch.nn.ReLU(),
    num_layers=2,
    id_to_coords=coords,
    embed_dim=iit_embed_dim,
    vocab=vals,
    num_inputs=3,
    max_iter=2000)

In [65]:
_ = V1_model.fit(
    first_x_base_train_V1, 
    [first_x_source_train_V1], 
    first_y_base_train_V1, 
    first_y_source_train_V1, 
    interventions_train_V1)

Finished epoch 2000 of 2000; error is 143.54090881347656

In [68]:
first_IIT_preds_V1, first_base_preds_V1 = V1_model.iit_predict(
    first_x_base_test_V1, [first_x_source_test_V1], interventions_test_V1
)

In [69]:
print('Standard Evaluation')
print(r2_score(first_y_base_test_V1, first_base_preds_V1.detach()))
      
print("V1 counterfactual evaluation")
print(r2_score(first_y_source_test_V1, first_IIT_preds_V1.detach()))

Standard Evaluation
0.9764884536513211
V1 counterfactual evaluation
0.8254405690167311


### Create IIT Dataset that computes xy and yz then xy + yz

In [42]:
V1 = 1
V2 = 2

train_data_V1, test_data_V1 = get_iit_distribution_dataset_both(V1, vals)
train_data_V2, test_data_V2 = get_iit_distribution_dataset_both(V2, vals)

In [43]:
x_base_train_V1, y_base_train_V1, x_source_train_V1, y_source_train_V1 = train_data_V1
x_base_test_V1, y_base_test_V1, x_source_test_V1, y_source_test_V1 = test_data_V1

x_base_train_V2, y_base_train_V2, x_source_train_V2, y_source_train_V2 = train_data_V2
x_base_test_V2, y_base_test_V2, x_source_test_V2, y_source_test_V2 = test_data_V2

In [44]:
train_size = len(x_base_train_V1)
interventions_train_V1 = torch.zeros(train_size)
interventions_train_V2 = torch.ones(train_size)

In [45]:
test_size = len(x_base_test_V1)
interventions_test_V1 = torch.zeros(test_size)
interventions_test_V2 = torch.ones(test_size)

In [46]:
x_base_train_V1[:3]

[['two', 'one', 'six'],
 ['seven', 'zero', 'three'],
 ['eight', 'three', 'seven']]

In [27]:
x_source_train_V1[:3]

[['nine', 'two', 'two'], ['three', 'two', 'six'], ['zero', 'two', 'one']]

In [None]:
y_source_train_V1[:3]

In [47]:
# Stack the V1 and V2 training splits (V1 rows first) so a single model can be
# trained on both intervention types at once.
X_base_train_both = np.concatenate(
    (np.asarray(x_base_train_V1), np.asarray(x_base_train_V2)))

# The source inputs are wrapped in a one-element list.
X_sources_train_both = [
    np.concatenate(
        (np.asarray(x_source_train_V1), np.asarray(x_source_train_V2)))
]

y_base_train_both = np.concatenate(
    (np.asarray(y_base_train_V1), np.asarray(y_base_train_V2)))

y_source_train_both = np.concatenate(
    (np.asarray(y_source_train_V1), np.asarray(y_source_train_V2)))

interventions_train_both = np.concatenate(
    (np.asarray(interventions_train_V1), np.asarray(interventions_train_V2)))

In [48]:
# Stack the V1 and V2 test splits (V1 rows first), mirroring the layout of the
# combined training arrays.
X_base_test_both = np.concatenate(
    (np.asarray(x_base_test_V1), np.asarray(x_base_test_V2)))

# The source inputs are wrapped in a one-element list.
X_sources_test_both = [
    np.concatenate(
        (np.asarray(x_source_test_V1), np.asarray(x_source_test_V2)))
]

y_base_test_both = np.concatenate(
    (np.asarray(y_base_test_V1), np.asarray(y_base_test_V2)))

y_source_test_both = np.concatenate(
    (np.asarray(y_source_test_V1), np.asarray(y_source_test_V2)))

interventions_test_both = np.concatenate(
    (np.asarray(interventions_test_V1), np.asarray(interventions_test_V2)))

### Causal Abstraction Analysis for Dataset 2

In [None]:
alignment = {
    "V1": {"layer": 1, "start": 0, "end": embedding_dim*3}, 
    "V2": {"layer": 1, "start": embedding_dim*3, "end": embedding_dim*6}}

In [57]:
# Interchange-intervention evaluation of `model_regression` for V1 on the
# second dataset.
# NOTE(review): predictions are scored against `y_base_test_V1`, whereas the
# IIT evaluation cells score counterfactual predictions against
# `y_source_test_V1` -- confirm which target is intended.
v1_eval = ii_evaluation(convert_input(x_base_test_V1), 
                        convert_input(x_source_test_V1),
                        model_regression, "V1", output_coord,
                        alignment)
print("Counterfactual V1 evaluation")
print(r2_score(y_base_test_V1, v1_eval))

Counterfactual V1 evaluation
0.7384461377571498


In [58]:
# Interchange-intervention evaluation of `model_regression` for V2 on the
# second dataset.
# NOTE(review): predictions are scored against `y_base_test_V2`, whereas the
# IIT evaluation cell scores counterfactual predictions against
# `y_source_test_V2` -- confirm which target is intended.
v2_eval = ii_evaluation(convert_input(x_base_test_V2), 
                        convert_input(x_source_test_V2),
                        model_regression, "V2", output_coord, 
                        alignment)
print("Counterfactual V2 evaluation")
print(r2_score(y_base_test_V2, v2_eval))

Counterfactual V2 evaluation
0.559269172200812


### IIT on Dataset 2

In [51]:
# Intervention ids: these match the values stored in the intervention tensors
# built earlier (zeros for V1, ones for V2).
V1 = 0
V2 = 1
# `embedding_dim` comes from the "Causal Abstraction Analysis on dataset 1"
# cell, so that cell must have been run before this one.
id_to_coords = {
    V1: [{"layer": 1, "start": 0, "end": embedding_dim*3}], 
    V2: [{"layer": 1, "start": embedding_dim*3, "end": embedding_dim*6}]    
}

# `num_inputs` is the notebook-global set in the classifier configuration cells.
# NOTE(review): `batch_size=1028` may have been meant as 1024 -- confirm.
both_regression_model = TorchLinearEmbeddingRegressionIIT(
    hidden_dim=50, 
    hidden_activation=torch.nn.ReLU(), 
    num_layers=2, 
    id_to_coords=id_to_coords, 
    vocab=vals,
    num_inputs=num_inputs,
    embed_dim=5,
    max_iter=2000,
    batch_size=1028)

In [52]:
_ = both_regression_model.fit(
    X_base_train_both, 
    X_sources_train_both, 
    y_base_train_both, 
    y_source_train_both,
    interventions_train_both)

Finished epoch 2000 of 2000; error is 67.24396467208862

In [53]:
IIT_preds_V1, base_preds_V1 = both_regression_model.iit_predict(
    x_base_test_V1, [x_source_test_V1], interventions_test_V1
)

In [54]:
print('Standard Evaluation')
print(r2_score(y_base_test_V1, base_preds_V1.detach()))
      
print("V1 counterfactual evaluation")
print(r2_score(y_source_test_V1, IIT_preds_V1.detach()))

Standard Evaluation
0.9969886890860198
V1 counterfactual evaluation
0.9695129683757866


In [55]:
IIT_preds_V2, base_preds_V2 = both_regression_model.iit_predict(
    x_base_test_V2, [x_source_test_V2], interventions_test_V2
)

In [56]:
print("V2 counterfactual evaluation")
print(r2_score(y_source_test_V2, IIT_preds_V2.detach()))

V2 counterfactual evaluation
0.9952701298011521
