In [41]:
%load_ext autoreload
%autoreload 2

from iit import generate_data, get_iit_distribution_dataset_both
from transformers import BertModel, BertTokenizer
from model_classifier import TorchDeepNeuralEmbeddingClassifier
import model_classifier_iit
from model_classifier_iit import TorchDeepNeuralClassifierIIT
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
import numpy as np
import torch
import torch.nn as nn
import vsm
from sklearn.metrics import classification_report

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
vals = ['zero', 'one', 'two', 'three', 'four',
       'five', 'six', 'seven', 'eight', 'nine']

train_test_split = 0.9
X_train, y_train, X_test, y_test = generate_data(vals, train_test_split)

In [6]:
len(X_train)

900

In [3]:
X_train[:5]

[['four', 'nine', 'four'],
 ['nine', 'nine', 'zero'],
 ['six', 'one', 'one'],
 ['seven', 'three', 'four'],
 ['five', 'nine', 'three']]

In [4]:
y_train[:5]

[52, 81, 12, 49, 60]

In [5]:
X_test[:5]

[['eight', 'six', 'five'],
 ['six', 'nine', 'two'],
 ['two', 'one', 'zero'],
 ['nine', 'five', 'five'],
 ['two', 'five', 'five']]

### Training a feed-forward network using randomized embeddings.

In [43]:
output_size = 163
num_inputs = 3
num_layers = 2
embed_dim = 5

mod = TorchDeepNeuralEmbeddingClassifier(vals, output_size, num_inputs,
                            num_layers, embed_dim, None, False)

In [44]:
mod.fit(X_train, y_train)

Finished epoch 1000 of 1000; error is 0.11316589266061783

TorchDeepNeuralEmbeddingClassifier(
	batch_size=1028,
	max_iter=1000,
	eta=0.001,
	optimizer_class=<class 'torch.optim.adam.Adam'>,
	l2_strength=0,
	gradient_accumulation_steps=1,
	max_grad_norm=None,
	validation_fraction=0.1,
	early_stopping=False,
	n_iter_no_change=10,
	warm_start=False,
	tol=1e-05,
	hidden_dim=50,
	hidden_activation=Tanh(),
	num_layers=2)

In [46]:
torch.save(mod.model.state_dict(), './model')

In [9]:
preds = mod.predict(X_test)

print("\nClassification report:")
print(classification_report(y_test, preds))


Classification report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.67      1.00      0.80         2
           8       0.67      0.50      0.57         4
          10       0.67      0.67      0.67         3
          12       0.50      1.00      0.67         2
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          15       0.67      1.00      0.80         2
          16       1.00      0.67      0.80         3
          18       1.00      1.00      1.00         2
          20       1.00      1.00      1.00         2
          21       1.00      1.00      1.00         2
          22       1.00      1.00      1.00         1
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Training a feed-forward network using BERT embeddings.

In [10]:
bert_weights_name = 'bert-base-uncased'
# Initialize a BERT tokenizer and BERT model based on
# `bert_weights_name`:
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
bert_embed = vsm.create_subword_pooling_vsm(
    vals, bert_tokenizer, bert_model, layer=1, pool_func=vsm.mean_pooling)

In [12]:
bert_embed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
zero,0.368413,0.714821,-0.532399,-0.153238,-0.184203,0.009934,0.028352,-0.162773,0.163669,-0.471809,...,0.63745,-0.782,0.181008,-1.160265,0.005396,-0.899586,0.350256,-0.099035,-0.274523,0.308869
one,0.213226,0.484864,-0.032716,-0.026842,0.090642,-0.086201,0.195947,-0.16962,0.540456,-0.261407,...,0.455036,-0.426076,0.282586,-0.676856,-0.045337,-0.42813,0.227807,0.216206,0.112196,-0.128516
two,-0.121083,0.16106,-0.549375,-0.472711,-0.146909,0.155344,-0.13319,-0.483241,0.173656,-0.328344,...,0.30386,0.224042,0.113168,-0.807975,-0.281597,-0.575383,0.138091,0.198803,-0.091571,-0.442494
three,0.104971,0.313852,-0.34021,-0.434407,-0.049186,0.154321,-0.099751,-0.506876,0.389955,-0.244507,...,0.379173,0.134788,0.23369,-0.651362,-0.219017,-0.658988,0.203868,0.215337,-0.093137,-0.477617
four,0.039704,0.210273,-0.526674,-0.221241,-0.102211,0.203096,-0.147871,-0.331833,0.267574,-0.48117,...,0.240441,-0.040645,0.122894,-0.991067,0.013572,-0.848153,0.368776,0.249214,-0.265129,-0.095733
five,-0.039133,0.148628,-0.443105,-0.602812,0.024338,-0.077868,-0.239623,-0.467639,0.117604,-0.661229,...,0.410685,-0.126665,0.254384,-0.834398,-0.037767,-0.695847,0.18872,0.23778,-0.017827,-0.366614
six,0.304316,0.202454,-0.50693,-0.336193,0.042346,-0.201237,-0.326339,-0.235801,0.244673,-0.600611,...,0.417288,-0.329488,0.127992,-0.932242,-0.041394,-0.668947,-0.093982,0.385106,0.048364,-0.338787
seven,0.092242,0.176112,-0.475537,-0.412275,0.071916,-0.310987,-0.04849,-0.409864,0.006404,-0.807861,...,0.095583,-0.178779,0.421195,-0.693458,0.123755,-0.667253,0.076955,-0.13888,-0.079268,-0.244648
eight,-0.033224,0.192466,-0.507554,-0.419275,0.235861,0.105365,-0.25565,-0.2007,-0.053316,-0.733084,...,0.2336,-0.17198,0.28649,-0.620115,-0.104281,-0.58495,0.222393,0.159457,-0.355556,-0.570065
nine,0.105645,0.060501,-0.367164,-0.43336,0.425262,-0.032186,-0.136358,-0.358969,0.12983,-0.574778,...,0.081581,-0.280892,0.540993,-0.752147,-0.059858,-0.446215,0.033923,-0.036796,-0.296937,-0.314274


In [13]:
output_size = 163
num_inputs = 3
num_layers = 2
embed_dim = bert_embed.shape[1]
freeze_embedding = True

mod_bert_embed = TorchDeepNeuralClassifier(vals, output_size, num_inputs,
                            num_layers, embed_dim, bert_embed,
                            freeze_embedding)

In [15]:
mod_bert_embed.fit(X_train, y_train)

Finished epoch 1000 of 1000; error is 0.04465832561254501

TorchDeepNeuralClassifier(
	batch_size=1028,
	max_iter=1000,
	eta=0.001,
	optimizer_class=<class 'torch.optim.adam.Adam'>,
	l2_strength=0,
	gradient_accumulation_steps=1,
	max_grad_norm=None,
	validation_fraction=0.1,
	early_stopping=False,
	n_iter_no_change=10,
	warm_start=False,
	tol=1e-05,
	hidden_dim=50,
	hidden_activation=Tanh(),
	num_layers=2)

In [16]:
preds = mod_bert_embed.predict(X_test)

print("\nClassification report:")
print(classification_report(y_test, preds))


Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.25      1.00      0.40         1
           5       1.00      1.00      1.00         1
           6       0.33      0.50      0.40         2
           8       1.00      0.50      0.67         4
          10       1.00      1.00      1.00         3
          12       1.00      0.50      0.67         2
          13       1.00      1.00      1.00         1
          14       0.33      1.00      0.50         1
          15       1.00      1.00      1.00         2
          16       0.67      0.67      0.67         3
          18       1.00      0.50      0.67         2
          20       1.00      1.00      1.00         2
          21       1.00      1.00      1.00         2
          22       1.00      1.00      1.00         1
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Train a BERT model by tokenizing inputs 

In [3]:
class HfBertClassifierModel(nn.Module):
    def __init__(self, n_classes, weights_name='bert-base-uncased'):
        super().__init__()
        self.n_classes = n_classes
        self.weights_name = weights_name
        self.bert = BertModel.from_pretrained(self.weights_name)
        self.bert.train()
        self.hidden_dim = self.bert.embeddings.word_embeddings.embedding_dim
        # The only new parameters -- the classifier:
        self.classifier_layer = nn.Linear(
            self.hidden_dim, self.n_classes)

    def forward(self, indices, mask):
        reps = self.bert(
            indices, attention_mask=mask)
        return self.classifier_layer(reps.pooler_output)


In [4]:
class HfBertClassifier(TorchShallowNeuralClassifier):
    def __init__(self, weights_name, *args, **kwargs):
        self.weights_name = weights_name
        self.tokenizer = BertTokenizer.from_pretrained(self.weights_name)
        super().__init__(*args, **kwargs)
        self.params += ['weights_name']

    def build_graph(self):
        return HfBertClassifierModel(self.n_classes_, self.weights_name)

    def build_dataset(self, X, y=None):
        data = self.tokenizer.batch_encode_plus(
            X,
            max_length=None,
            add_special_tokens=True,
            padding='longest',
            return_attention_mask=True)
        indices = torch.tensor(data['input_ids'])
        mask = torch.tensor(data['attention_mask'])
        if y is None:
            dataset = torch.utils.data.TensorDataset(indices, mask)
        else:
            self.classes_ = sorted(set(y))
            self.n_classes_ = len(self.classes_)
            class2index = dict(zip(self.classes_, range(self.n_classes_)))
            y = [class2index[label] for label in y]
            y = torch.tensor(y)
            dataset = torch.utils.data.TensorDataset(indices, mask, y)
        return dataset

In [5]:
def conversion(X):
    new_X = []
    for inpts in X:
        new_X.append(' '.join(inpts))
    return new_X

In [13]:
mod_bert = HfBertClassifier('bert-base-uncased', max_iter=5, batch_size=8, n_iter_no_change=5, early_stopping=True, hidden_dim=100, eta=0.00005)

In [7]:
X_train_bert = conversion(X_train)
X_test_bert = conversion(X_test)
X_train_bert[:5]

['seven seven four',
 'seven nine nine',
 'four eight five',
 'eight nine two',
 'one five zero']

In [14]:
mod_bert.fit(X_train_bert, y_train)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Finished epoch 5 of 5; error is 250.60060513019562

HfBertClassifier(
	batch_size=8,
	max_iter=5,
	eta=5e-05,
	optimizer_class=<class 'torch.optim.adam.Adam'>,
	l2_strength=0,
	gradient_accumulation_steps=1,
	max_grad_norm=None,
	validation_fraction=0.1,
	early_stopping=True,
	n_iter_no_change=5,
	warm_start=False,
	tol=1e-05,
	hidden_dim=100,
	hidden_activation=Tanh(),
	weights_name=bert-base-uncased)

In [15]:
preds = mod_bert.predict(X_test_bert)

print("\nClassification report:")
print(classification_report(y_test, preds))


Classification report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.00      0.00      0.00         1
           4       0.50      0.50      0.50         2
           6       0.50      1.00      0.67         1
           8       0.00      0.00      0.00         5
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          12       0.50      0.20      0.29         5
          15       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         2
          18       0.07      1.00      0.13         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         0
          22       0.00      0.00      0.00         3
          24       0.25      0.60      0.35         5
          26       0.00      0.00      0.00         1
          27       0.00      0.00      0.00         2
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Create IIT Dataset that computes xy and yz then xy + yz

In [47]:
V1 = 1
V2 = 2

train_data_V1, test_data_V1 = get_iit_distribution_dataset_both(V1, vals)
train_data_V2, test_data_V2 = get_iit_distribution_dataset_both(V2, vals)

In [48]:
x_base_train_V1, y_base_train_V1, x_source_train_V1, y_source_train_V1 = train_data_V1
x_base_test_V1, y_base_test_V1, x_source_test_V1, y_source_test_V1 = test_data_V1

x_base_train_V2, y_base_train_V2, x_source_train_V2, y_source_train_V2 = train_data_V2
x_base_test_V2, y_base_test_V2, x_source_test_V2, y_source_test_V2 = test_data_V2

In [49]:
train_size = len(x_base_train_V1)
interventions_train_V1 = torch.zeros(train_size)
interventions_train_V2 = torch.ones(train_size)

In [50]:
test_size = len(x_base_test_V1)
interventions_test_V1 = torch.zeros(test_size)
interventions_test_V2 = torch.ones(test_size)

In [7]:
x_base_train_V1[:5]

[['three', 'one', 'six'],
 ['seven', 'zero', 'zero'],
 ['zero', 'four', 'two'],
 ['two', 'nine', 'two'],
 ['six', 'three', 'nine'],
 ['zero', 'zero', 'five'],
 ['three', 'two', 'three'],
 ['nine', 'one', 'eight'],
 ['four', 'zero', 'four'],
 ['six', 'two', 'zero'],
 ['six', 'five', 'three'],
 ['three', 'one', 'zero'],
 ['five', 'two', 'four'],
 ['six', 'one', 'two'],
 ['six', 'three', 'one'],
 ['one', 'five', 'nine'],
 ['one', 'nine', 'two'],
 ['nine', 'nine', 'five'],
 ['three', 'five', 'three'],
 ['eight', 'nine', 'zero'],
 ['eight', 'one', 'four'],
 ['zero', 'seven', 'three'],
 ['zero', 'one', 'three'],
 ['three', 'five', 'eight'],
 ['four', 'nine', 'eight'],
 ['seven', 'nine', 'three'],
 ['five', 'seven', 'one'],
 ['six', 'seven', 'six'],
 ['zero', 'five', 'zero'],
 ['three', 'eight', 'three'],
 ['three', 'two', 'five'],
 ['three', 'zero', 'seven'],
 ['seven', 'nine', 'seven'],
 ['five', 'one', 'nine'],
 ['eight', 'five', 'eight'],
 ['five', 'six', 'zero'],
 ['six', 'two', 'two'],


In [51]:
X_base_train_both = np.concatenate([np.array(x_base_train_V1),
                                np.array(x_base_train_V2)], axis=0)

X_sources_train_both = [np.concatenate([np.array(x_source_train_V1),
                            np.array(x_source_train_V2)], axis=0)]

y_base_train_both = np.concatenate([np.array(y_base_train_V1),
                            np.array(y_base_train_V2)], axis=0)

y_source_train_both = np.concatenate([np.array(y_source_train_V1),
                            np.array(y_source_train_V2)])

interventions_train_both = np.concatenate([np.array(interventions_train_V1),
                            np.array(interventions_train_V2)])

In [52]:
X_base_test_both = np.concatenate([np.array(x_base_test_V1),
                                np.array(x_base_test_V2)], axis=0)

X_sources_test_both = [np.concatenate([np.array(x_source_test_V1),
                            np.array(x_source_test_V2)], axis=0)]

y_base_test_both = np.concatenate([np.array(y_base_test_V1),
                            np.array(y_base_test_V2)], axis=0)

y_source_test_both = np.concatenate([np.array(y_source_test_V1),
                            np.array(y_source_test_V2)])

interventions_test_both = np.concatenate([np.array(interventions_test_V1),
                            np.array(interventions_test_V2)])

In [53]:
V1 = 0
V2 = 1
embedding_dim = 5
max_iter = 2 

id_to_coords = {
    V1: [{"layer": 1, "start": 0, "end": embedding_dim}], 
    V2: [{"layer": 1, "start": embedding_dim, "end": embedding_dim*2}]    
}

both_model = TorchDeepNeuralClassifierIIT(
    hidden_dim=embedding_dim*4, 
    hidden_activation=torch.nn.ReLU(), 
    num_layers=2, 
    id_to_coords=id_to_coords, 
    vocab=vals,
    output_size=output_size,
    num_inputs=num_inputs,
    max_iter=max_iter)

In [54]:
_ = both_model.fit(
    X_base_train_both, 
    X_sources_train_both, 
    y_base_train_both, 
    y_source_train_both, 
    interventions_train_both)

Finished epoch 2 of 2; error is 20.539315223693848

In [11]:
import importlib
importlib.reload(model_iit)

<module 'model_iit' from '/home/jamesflemings/Documents/work/CSLI/iit-distribution-rule/model_iit.py'>

In [31]:
IIT_preds_V1, base_preds_V1 = both_model.iit_predict(
    x_base_test_V1, [x_source_test_V1], interventions_test_V1
)

print('Standard Evaluation')
print(classification_report(y_base_test_V1, base_preds_V1))
      
print("V1 counterfactual evaluation")
print(classification_report(y_source_test_V1, IIT_preds_V1))

Standard Evaluation
              precision    recall  f1-score   support

           0       0.24      0.50      0.32         8
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         5
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         4
          20       0.00      0.00      0.00         2
          22       0.00      0.00      0.00         2
          24       0.00      0.00      0.00         4
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
IIT_preds_V2, base_preds_V2 = both_model.iit_predict(
    x_base_test_V2, x_source_test_V2, interventions_train_both
)

print("V2 counterfactual evaluation")
print(classification_report(y_source_test_V2, IIT_preds_V2))