# Download FEVER

In [1]:
!wget https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip -O /tmp/wiki-pages.zip
!unzip -q /tmp/wiki-pages.zip -d ./data

--2021-04-29 20:05:07--  https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.102.11
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.102.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1713485474 (1.6G) [application/zip]
Saving to: ‘/tmp/wiki-pages.zip’


2021-04-29 20:09:27 (6.30 MB/s) - ‘/tmp/wiki-pages.zip’ saved [1713485474/1713485474]



In [2]:
!wget https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl -O ./data/train.jsonl
!wget https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl -O ./data/share_task_dev.jsonl
!wget https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_test.jsonl -O ./data/share_task_test.jsonl

--2021-04-29 20:13:53--  https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.106.10
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.106.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33024303 (31M) [application/x-www-form-urlencoded]
Saving to: ‘./data/train.jsonl’


2021-04-29 20:13:55 (17.3 MB/s) - ‘./data/train.jsonl’ saved [33024303/33024303]

--2021-04-29 20:13:55--  https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.106.10
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.106.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4349935 (4.1M) [binary/octet-stream]
Saving to: ‘./da

# Training a Custom Entailment Model

In [100]:
import numpy as np
from typing import Tuple, Dict

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import History

from allennlp.predictors.predictor import Predictor
from models.textual_entailment import TextualEntailment
from datasets.arrow_dataset import Dataset


class Entailment:
    
    def __init__(self):
        self._frozen = Predictor.from_path("./models/textual_entailment.tar.gz")
        self._model = self._model()
        
    def _map(self, label: int) -> str:
        """ Maps the label to the value """
        switcher = {
            0: 'entailment',
            1: 'contradiction'
        }
        return switcher[label]
        
    def _entail_texts(self, premises: List[str], hypotheses: List[str]) -> np.ndarray:
        """ Entails the texts """
        batch_json = [dict(premise=v[0], hypothesis=v[1]) for v in zip(premises, hypotheses)]
        entailment = self._frozen.predict_batch_json(batch_json)
        return np.array([e['aggregate_input'] for e in entailment])
        
    def _model(self) -> Sequential:
        """ Builds the MLP for the classification """
        model = Sequential()
        model.add(Dense(100, activation='relu', input_shape=(400,)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(lr=1E-3), loss='binary_crossentropy', 
                      metrics=['acc'])
        return model
    
    def prepare(self, data: Dataset) -> Dataset:
        """ Prepares the data by removing neutral statements and splitting the data into data and labels """
        data = data.filter(lambda x: x['label'] != 1)
        
        def change_label(x):
            if x['label'] == 2:
                x['label'] = 1
            return x
        
        data = data.map(change_label)
        return data
        
    
    def forward(self, premises: List[str], hypotheses: List[str]) -> List[Tuple[str, str]]:
        """ Predicts the next word for the text """
        entailments = self._entail_texts(texts, claims)
        preds = self.model.predict(entailments)
        return preds
    
    def fit(self, train: Dataset, validation: Dataset, batch_size: int = 32, epochs: int = 200) -> History:
        """ Fits the model on the training and validation sets """
        
        def fit_generator(data: Dataset, batch_size: int = 32, epochs: int = 200):
            n, d = data.shape
            iters = int(n / batch_size)
            
            for i in range(epochs):
                start = 0
                for j in range(iters):
                    end = start + batch_size
                    batch = data[start:end]
                    premises, hypotheses, labels = (batch['premise'], batch['hypothesis'], batch['label'])
                    x = self._entail_texts(premises, hypotheses)
                    y = np.array(labels)
                    yield x, y
                    start = end
        
        tr_epoch_steps = int(train.shape[0] / batch_size)
        tr_generator = fit_generator(train, batch_size, epochs)
        
        val_epoch_steps = int(validation.shape[0] / batch_size)
        val_generator = fit_generator(validation, batch_size, epochs)
        history = self._model.fit(tr_generator, steps_per_epoch=tr_epoch_steps, epochs=epochs,
                                  validation_data=val_generator, validation_steps=val_epoch_steps)
        return history
    

In [89]:
from datasets import load_dataset

snli_data = load_dataset('snli')

Reusing dataset snli (/home/jmack/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


In [101]:
entailment = Entailment()

In [98]:
data = entailment.prepare(snli_data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=551.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6781.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=367388.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6765.0), HTML(value='')))




In [102]:
entailment.fit(data['train'], data['validation'])

Epoch 1/200
   50/11480 [..............................] - ETA: 3:26:40 - loss: 0.3935 - acc: 0.8472

KeyboardInterrupt: 

# The Facebook Experiment

1. Implement automatic masking
2. Get the top 1 prediction from the LM 
3. Fill in the mask
4. Use the claim and filled in sentence and input into an entailment model
5. Input entailment into MLP for final fact-verification prediction

In [37]:
import spacy
from typing import Iterable, List, Tuple
import tensorflow as tf
import numpy as np

from keras.models import Sequential
from keras.layers import Dense

class Pipeline:
    
    def __init__(self, tokenizer, unmasker, predictor, mask_token: str = "[MASK]"):
        self.nlp = spacy.load("en_core_web_trf")
        self.mask_token = mask_token
        self.tokenizer = tokenizer
        self.unmasker = unmasker
        self.predictor = predictor
        self.model = self._model()
    
    def _mask(self, texts: List[str]) -> List[str]:
        """ Masks the last named entity in the string """
        masked_sents = list()
        for doc in self.nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
            ents = doc.ents
            target = ents[-1].text.split()[-1]
            masked = doc.text.replace(target, mask_token)
            masked_sents.append(masked)
        return masked_sents
    
    def _fill_mask(self, texts: Iterable) -> List[str]:
        """ Fills the masked token with the  top-1 predicted value """
        preds = list()
        for text in texts:
            tokens = self.tokenizer(text, return_tensors='tf')['input_ids']
            masked_index = tf.where(tokens[0] == self.tokenizer.mask_token_id).numpy()

            outputs = self.unmasker(tokens)
            logits = outputs.logits[0, masked_index.item(), :]
            probs = tf.nn.softmax(logits)
            topk = tf.math.top_k(probs, k=1)
            values, predictions = topk.values.numpy(), topk.indices.numpy()
            
            pred = tokenizer.decode(predictions)
            preds.append(text.replace(self.mask_token, pred))
        return preds
    
    def _entail_texts(self, premises: List[str], hypotheses: List[str]) -> np.ndarray:
        """ Entails the texts """
        batch_json = [dict(premise=v[0], hypothesis=v[1]) for v in zip(premises, hypotheses)]
        print(batch_json)
        entailment = self.predictor.predict_batch_json(batch_json)
        return np.array([e['aggregate_input'] for e in entailment])
        
    
    def _model(self) -> Sequential:
        """ Builds the MLP for the classification """
        model = Sequential()
        model.add(Dense(100, activation='relu', input_shape=(400,)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(lr=1E-3), loss='binary_crossentropy', 
                      metrics=['acc'])
        return model
    
    def forward(self, texts: List[str]) -> List[Tuple[str, str]]:
        """ Predicts the next word for the text """
        masked = self._mask(texts)
        claims = self._fill_mask(masked)
        entailments = self._entail_texts(texts, claims)
        preds = self.model.predict(entailments)
        return preds
    
    def fit()
            

In [38]:
from allennlp.predictors.predictor import Predictor
from models.textual_entailment import TextualEntailment

predictor = Predictor.from_path("./models/textual_entailment.tar.gz")

In [34]:
from transformers import BertTokenizer, TFBertForMaskedLM
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
unmasker = TFBertForMaskedLM.from_pretrained('bert-large-cased')

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-large-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [35]:
test1 = "Thomas Jefferson founded the University of Virginia after retiring."
test2 = "Microsoft's headquarters are in Redmond."
test3 = "Tim Roth is an English actor."
texts = [test1,test2,test3]
mask_token = "[MASK]"

In [39]:
pipe = Pipeline(tokenizer, unmasker, predictor)
pipe.forward(texts)

[{'premise': 'Thomas Jefferson founded the University of Virginia after retiring.', 'hypothesis': 'Thomas Jefferson founded the University of Virginia after retiring.'}, {'premise': "Microsoft's headquarters are in Redmond.", 'hypothesis': "Microsoft's headquarters are in Atlanta."}, {'premise': 'Tim Roth is an English actor.', 'hypothesis': 'Tim Roth is an American actor.'}]


array([[0.6196344],
       [0.5279083],
       [0.5934663]], dtype=float32)

In [22]:
tokenizer.vocab['Virginia']

2550

In [22]:
import spacy
from typing import Iterable

NLP = spacy.load("en_core_web_trf")

def mask_last_named_entity(texts: Iterable, mask_token: str = "[MASK]") -> str:
    """ Masks the last named entity in the string """
    masked_sents = list()
    for doc in NLP.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
        ents = doc.ents
        target = ents[-1].text.split()[-1]
        masked = doc.text.replace(target, mask_token)
        masked_sents.append(masked)
    return masked_sents

In [64]:
masked = mask_last_named_entity([test1, test2, test3], mask_token) 
masked

['Thomas Jefferson founded the University of [MASK] after retiring.',
 "Microsoft's headquarters are in [MASK].",
 'Tim Roth is an [MASK] actor.']

In [72]:
def predict(tokenizer, model, sentences: list) -> list:
    filled = list()
    for sent in sentences:
        tokens = tokenizer(sent, return_tensors='tf')['input_ids']
        masked_index = tf.where(tokens[0] == tokenizer.mask_token_id).numpy()
        
        outputs = model(tokens)
        logits = outputs.logits[0, masked_index.item(), :]
        probs = tf.nn.softmax(logits)
        topk = tf.math.top_k(probs, k=1)
        values, predictions = topk.values.numpy(), topk.indices.numpy()
        
        pred = tokenizer.decode(predictions)
        filled.append(sent.replace('[MASK]', pred))
    return filled

In [73]:
predict(tokenizer, model, masked)

['Thomas Jefferson founded the University of Virginia after retiring.',
 "Microsoft's headquarters are in Atlanta.",
 'Tim Roth is an American actor.']

In [5]:
outputs = predictor.predict(
        premise="Two women are wandering along the shore drinking iced tea.",
        hypothesis="Two women are sitting on a blanket near some rocks talking about politics."
    )

In [16]:
model_inputs = np.array([outputs['aggregate_input']])
model_inputs.shape

(1, 400)

In [11]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(400,)))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 100)               40100     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 40,201
Trainable params: 40,201
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(optimizer=Adam(lr=1E-3), loss='binary_crossentropy', 
                  metrics=['acc'])

In [17]:
model.predict(model_inputs)

array([[0.64905727]], dtype=float32)

In [57]:
from allennlp_models import pretrained

pretrained.get_pretrained_models()['pair-classification-decomposable-attention-elmo'].task_id

lerc is not a registered model.
roberta-rte is not a registered model.


'textual_entailment'

In [36]:
from allennlp_models.pair_classification.models import DecomposableAttention

model = DecomposableAttention()

TypeError: __init__() missing 6 required positional arguments: 'vocab', 'text_field_embedder', 'attend_feedforward', 'matrix_attention', 'compare_feedforward', and 'aggregate_feedforward'

In [21]:
pred = pretrained.load_predictor('pair-classification-decomposable-attention-elmo')

lerc is not a registered model.
roberta-rte is not a registered model.


In [58]:
outputs = pred.predict(
        premise="Two women are wandering along the shore drinking iced tea.",
        hypothesis="Two women are sitting on a blanket near some rocks talking about politics."
    )

In [59]:
outputs

{'label_logits': [-3.349437952041626, 4.429553985595703, 0.9683454036712646],
 'label_probs': [0.00040553370490670204,
  0.9691703915596008,
  0.030424002557992935],
 'h2p_attention': [[0.6542813777923584,
   0.04181642830371857,
   0.044751670211553574,
   0.04774125665426254,
   0.032621681690216064,
   0.02951720543205738,
   0.0222022645175457,
   0.02251223288476467,
   0.022536294534802437,
   0.03395495191216469,
   0.025392329320311546,
   0.02267230674624443],
  [2.8718852263409644e-05,
   0.9997602105140686,
   2.671044239832554e-05,
   3.0569222872145474e-05,
   1.630610495340079e-05,
   1.6656915249768645e-05,
   2.2438669475377537e-05,
   1.677172986092046e-05,
   1.8310134692001157e-05,
   3.356780143803917e-05,
   1.4835497495369054e-05,
   1.4929611097613815e-05],
  [0.10177356749773026,
   0.08759545534849167,
   0.1118234395980835,
   0.11648961156606674,
   0.08897500485181808,
   0.10866256058216095,
   0.06278194487094879,
   0.06688541173934937,
   0.0630134493112

# LAMA Probe into BART

The goal of this section is to understand the amount of knowledge stored in BART using the [LAMA probe](https://github.com/facebookresearch/LAMA).

# From BERT to BART

Insert the BART model in lieu of BERT to see if performance increases

# Inserting a Context Layer into the Pipeline

- We will try two methods, one using DrQA and one using an autoregressive language model GPT2.

# Results

Did it work?