# Download FEVER

In [1]:
!wget https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip -O /tmp/wiki-pages.zip
!unzip -q /tmp/wiki-pages.zip -d ./data

--2021-04-29 20:05:07--  https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.102.11
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.102.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1713485474 (1.6G) [application/zip]
Saving to: ‘/tmp/wiki-pages.zip’


2021-04-29 20:09:27 (6.30 MB/s) - ‘/tmp/wiki-pages.zip’ saved [1713485474/1713485474]



In [2]:
!wget https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl -O ./data/train.jsonl
!wget https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl -O ./data/share_task_dev.jsonl
!wget https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_test.jsonl -O ./data/share_task_test.jsonl

--2021-04-29 20:13:53--  https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.106.10
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.106.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33024303 (31M) [application/x-www-form-urlencoded]
Saving to: ‘./data/train.jsonl’


2021-04-29 20:13:55 (17.3 MB/s) - ‘./data/train.jsonl’ saved [33024303/33024303]

--2021-04-29 20:13:55--  https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.106.10
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.106.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4349935 (4.1M) [binary/octet-stream]
Saving to: ‘./da

# The Facebook Experiment

1. Implement automatic masking
2. Get the top 1 prediction from the LM 
3. Fill in the mask
4. Use the claim and filled in sentence and input into an entailment model
5. Input entailment into MLP for final fact-verification prediction

In [94]:
import spacy
from typing import Iterable, List, Tuple
import tensorflow as tf
import numpy as np

class Pipeline:
    
    def __init__(self, tokenizer, model, predictor, mask_token: str = "[MASK]"):
        self.nlp = spacy.load("en_core_web_trf")
        self.mask_token = mask_token
        self.tokenizer = tokenizer
        self.model = model
        self.predictor = predictor
    
    def _mask(self, texts: List[str]) -> List[str]:
        """ Masks the last named entity in the string """
        masked_sents = list()
        for doc in self.nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
            ents = doc.ents
            target = ents[-1].text.split()[-1]
            masked = doc.text.replace(target, mask_token)
            masked_sents.append(masked)
        return masked_sents
    
    def _fill_mask(self, texts: Iterable) -> List[str]:
        """ Fills the masked token with the  top-1 predicted value """
        preds = list()
        for text in texts:
            tokens = self.tokenizer(text, return_tensors='tf')['input_ids']
            masked_index = tf.where(tokens[0] == self.tokenizer.mask_token_id).numpy()

            outputs = self.model(tokens)
            logits = outputs.logits[0, masked_index.item(), :]
            probs = tf.nn.softmax(logits)
            topk = tf.math.top_k(probs, k=1)
            values, predictions = topk.values.numpy(), topk.indices.numpy()
            
            pred = tokenizer.decode(predictions)
            preds.append(text.replace(self.mask_token, pred))
        return preds
    
    def forward(self, texts: List[str]) -> List[Tuple[str, str]]:
        """ Predicts the next word for the text """
        masked = self._mask(texts)
        claims = self._fill_mask(masked)
        labels = np.zeros(len(texts))
        for indx, premise in enumerate(texts):
            hyp = claims[indx]
            entailment = self.predictor.predict(premise=premise, hypothesis=hyp)['label']
            print(entailment)
            if entailment == 'entailment':
                label = 0
            elif entailment == 'contradiction':
                label = 1
            else:
                label = 2
            labels[indx] = label
        return labels
            

In [88]:
test1 = "Thomas Jefferson founded the University of Virginia after retiring."
test2 = "Microsoft's headquarters are in Redmond."
test3 = "Tim Roth is an English actor."
texts = [test1,test2,test3]
mask_token = "[MASK]"

In [66]:
from transformers import BertTokenizer, TFBertForMaskedLM
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
model = TFBertForMaskedLM.from_pretrained('bert-large-cased')

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-large-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [32]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/decomposable-attention-elmo-2020.04.09.tar.gz")

AttributeError: 'TextualEntailmentPredictor' object has no attribute 'forward'

In [95]:
pipe = Pipeline(tokenizer, model, predictor)
pipe.forward(texts)

entailment
neutral
contradiction


array([0., 2., 1.])

In [22]:
import spacy
from typing import Iterable

NLP = spacy.load("en_core_web_trf")

def mask_last_named_entity(texts: Iterable, mask_token: str = "[MASK]") -> str:
    """ Masks the last named entity in the string """
    masked_sents = list()
    for doc in NLP.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
        ents = doc.ents
        target = ents[-1].text.split()[-1]
        masked = doc.text.replace(target, mask_token)
        masked_sents.append(masked)
    return masked_sents

In [64]:
masked = mask_last_named_entity([test1, test2, test3], mask_token) 
masked

['Thomas Jefferson founded the University of [MASK] after retiring.',
 "Microsoft's headquarters are in [MASK].",
 'Tim Roth is an [MASK] actor.']

In [72]:
def predict(tokenizer, model, sentences: list) -> list:
    filled = list()
    for sent in sentences:
        tokens = tokenizer(sent, return_tensors='tf')['input_ids']
        masked_index = tf.where(tokens[0] == tokenizer.mask_token_id).numpy()
        
        outputs = model(tokens)
        logits = outputs.logits[0, masked_index.item(), :]
        probs = tf.nn.softmax(logits)
        topk = tf.math.top_k(probs, k=1)
        values, predictions = topk.values.numpy(), topk.indices.numpy()
        
        pred = tokenizer.decode(predictions)
        filled.append(sent.replace('[MASK]', pred))
    return filled

In [73]:
predict(tokenizer, model, masked)

['Thomas Jefferson founded the University of Virginia after retiring.',
 "Microsoft's headquarters are in Atlanta.",
 'Tim Roth is an American actor.']

In [53]:
outputs = predictor.predict(
    premise="Two women are wandering along the shore drinking iced tea.",
    hypothesis="Two women are sitting on a blanket near some rocks talking about politics."
)
outputs

{'label_logits': [-3.349426507949829, 4.429577350616455, 0.9683250188827515],
 'label_probs': [0.00040552925202064216,
  0.9691717624664307,
  0.030422717332839966],
 'h2p_attention': [[0.6542736887931824,
   0.04181644320487976,
   0.04475216567516327,
   0.047742560505867004,
   0.03262277692556381,
   0.02951790951192379,
   0.022202881053090096,
   0.02251291088759899,
   0.02253689430654049,
   0.033956028521060944,
   0.02539297193288803,
   0.02267284318804741],
  [2.872008553822525e-05,
   0.9997602105140686,
   2.671237598406151e-05,
   3.057234061998315e-05,
   1.630776750971563e-05,
   1.6658552340231836e-05,
   2.244083043478895e-05,
   1.6773410607129335e-05,
   1.8311986423213966e-05,
   3.357100285938941e-05,
   1.4836954505881295e-05,
   1.4931149053154513e-05],
  [0.1017719954252243,
   0.08759496361017227,
   0.11182094365358353,
   0.1164901927113533,
   0.08897576481103897,
   0.10866396129131317,
   0.06278226524591446,
   0.06688568741083145,
   0.0630138665437698

# LAMA Probe into BART

The goal of this section is to understand the amount of knowledge stored in BART using the [LAMA probe](https://github.com/facebookresearch/LAMA).

# From BERT to BART

Insert the BART model in lieu of BERT to see if performance increases

# Inserting a Context Layer into the Pipeline

- We will try two methods, one using DrQA and one using an autoregressive language model GPT2.

# Results

Did it work?