In [1]:
# Optional setup, only needed when running in Google Colaboratory:
# uncomment the line below to run colab_module_imports.py first.
# !python colab_module_imports.py

In [2]:
from revllm.preprocess_distilbert import PreprocessQAndA
from revllm.analyze_distilbert import AnalyzeQAndA 
from revllm.visualize_distilbert import VisualizeQAndA

# Question and Answer, or "qanda"
* DistilBERT base model
* fine-tuned on SQuAD, a standard question answering dataset

## The user provides:

* A question
* A context, from which the model will extract the answer
* A ground truth answer, which is what the model will try to predict

In [3]:
# Checkpoint identifier: the same model the huggingface baseline uses for qanda.
model = "distilbert-base-uncased-distilled-squad"

# Demo inputs: the question, the context the answer is extracted from,
# and the ground truth answer the prediction is compared against.
question = "What is important to us?"
context = "It is important to us to include, empower and support humans of all kinds."
ground_truth = "to include, empower and support humans of all kinds"

In [4]:
# Define the objects.
# The preprocessor is built from the model name; the analyzer and the visualizer
# each receive the model name plus that same preprocessor instance.
preprocessor = PreprocessQAndA(model)
analyzer = AnalyzeQAndA(model,preprocessor)
visualizer = VisualizeQAndA(model,preprocessor)

In [5]:
# Pass the question, context and ground truth to the preprocessor.
# The preprocessor instance is called directly; the analyzer and visualizer were
# constructed with this same instance (presumably they read the prepared inputs
# from it -- TODO confirm in revllm).
preprocessor(question, context, ground_truth)

## .predict() method
* Two predictions are made:
    * A predicted start to the answer (token number)
    * A predicted end to the answer (token number)
* The predicted answer as shown is the text between the predicted start and end tokens

In [6]:
# Run the model; the saved output lists the question, predicted answer and actual answer.
# NOTE(review): the saved output under this cell shows a different question
# ("What is the name of the virus?") than the one defined above -- re-run to refresh.
analyzer.predict()

        Question:  What is the name of the virus?
Predicted Answer:  co ##vid - 19
   Actual Answer:  COVID-19


## "lig" methods
* "lig" is a reference to the internal method used, "layer integrated gradients"

### .lig_color_map() method
* Because the prediction consists of a start token and an end token, the following is shown for each of them:
    * The predicted vs actual token
    * The entire text is printed with question and context concatenated.  
        * Above the text for each token, its importance to the model's prediction ("attribution score") is shown in color.  
        * A legend is included.

In [14]:
# Show the layer-integrated-gradients color map described above,
# once for the start token prediction and once for the end token prediction.
analyzer.lig_color_map()

### .lig_top_k_tokens() method
* Prints the entire list of tokens
* For each of the start and end token predictions, prints the top k attributed tokens, for a choice of k.

In [None]:
# k is the number of top-attributed tokens listed for each of the
# start and end token predictions.
analyzer.lig_top_k_tokens(k=5)

## "lc" methods
* "lc" is a reference to the internal method used, "layer conductance"

### .lc_visualize_layers() method
* Creates 2 plots: one for the predicted start token, one for the predicted end token
* For each token, a color tile representing the attribution score is given for each layer of the model (there are 6 layers in distilbert).

In [None]:
# Draw the two layer-conductance plots (start and end token predictions),
# with one attribution tile per token per model layer.
visualizer.lc_visualize_layers()

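### .lc_visualize_token_*() methods
* For a chosen token (for example "humans"), each of `lc_visualize_token_boxes`, `lc_visualize_token_pdfs` and `lc_visualize_token_entropies` produces some plots.
* These methods are unfinished; an explanation is to come.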

In [None]:
# Token inspected by the lc_visualize_token_* cells below; 'humans' is a word from the context.
# NOTE(review): presumably this must match a token as produced by the tokenizer -- confirm.
token_to_analyze = 'humans'

In [None]:
# Plots for the chosen token -- presumably box plots of its layer-conductance
# attributions; TODO confirm once the method is documented.
visualizer.lc_visualize_token_boxes(token_to_analyze)

In [None]:
# Plots for the chosen token -- presumably probability density functions of its
# layer-conductance attributions; TODO confirm once the method is documented.
visualizer.lc_visualize_token_pdfs(token_to_analyze)

In [None]:
# Plots for the chosen token -- presumably entropies of its attribution
# distributions; TODO confirm once the method is documented.
visualizer.lc_visualize_token_entropies(token_to_analyze)