# Setup
Download the code from https://github.com/dmis-lab/bioasq-biobert. Also download the [BioBERT weights](https://drive.google.com/open?id=1rXFQRcV69QHAxghQ3NeAlhkg6ykpflVK), the [config file](https://drive.google.com/open?id=17fX1-oChZ5rxu-e-JuaZl2I96q1dGJO4), and the [vocab file](https://drive.google.com/open?id=1GQUvBbXvlI_PeUPsZTqh7xQDZMOXh7ko), which are not included in the repository. The prediction command later in this notebook expects all three inside the `BERT-pubmed-1000000-SQuAD/` folder of the repository directory.

In [1]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Work from the repository directory on Drive; requirements.txt and run_factoid.py
# are referenced relative to it in later cells.
%cd '/content/drive/My Drive/bioasq-biobert-1.0'

/content/drive/My Drive/bioasq-biobert-1.0


In [3]:
# Install the dependencies listed in requirements.txt (resolved against the current directory).
!pip install -r requirements.txt


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-gpu==1.15.2
  Downloading tensorflow_gpu-1.15.2-cp37-cp37m-manylinux2010_x86_64.whl (410.9 MB)
[K     |████████████████████████████████| 410.9 MB 34 kB/s 
[?25hCollecting pandas==0.23
  Downloading pandas-0.23.0.tar.gz (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 26.6 MB/s 
Collecting tensorboard<1.16.0,>=1.15.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 54.2 MB/s 
[?25hCollecting keras-applications>=1.0.8
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.7 MB/s 
[?25hCollecting tensorflow-estimator==1.15.1
  Downloading tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 72.3 MB/s 
Collecting gast==0.2.2
  Downloading gast-0.2.2.tar.gz (10 kB)
Building wheels for

In [3]:
# Pin numpy to 1.19.5.
# NOTE(review): presumably required for compatibility with the tensorflow-gpu 1.15.2
# install from requirements.txt — confirm.
!pip install numpy==1.19.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Steps to pre-process data

In [17]:
import json
from collections import namedtuple


In [10]:
# Pre-process data
# Load the raw BioASQ training file; its 'questions' list is consumed when the dataset is built.
# The encoding is passed explicitly so the read does not depend on the platform's
# locale default (JSON text is UTF-8).
with open('/content/drive/My Drive/bioasq-biobert-1.0/data/BioASQ-training10b/training10b.json', 'r', encoding='utf-8') as f:
    bioasq_json = json.load(f)

In [57]:
### Construct dataset
# Convert the factoid and list questions in `bioasq_json` into SQuAD-like records,
# counting how many questions of each type are seen along the way.
count_factoid = 0
count_list = 0
count_summary = 0
count_yesno = 0

bioasq_list = []

for i, sample in enumerate(bioasq_json['questions']):

    # Summary and yes/no questions are only counted, never converted.
    if sample['type'] == 'summary':
        count_summary += 1
    if sample['type'] == 'yesno':
        count_yesno += 1

    if sample['type'] in ['factoid', 'list']:

        # Context: strip every snippet, append a space to each, concatenate them,
        # then flatten newlines so the context is a single line of text.
        context = ''.join(ele['text'].strip() + ' ' for ele in sample['snippets'])
        context = context.replace('\n', ' ')

        ## Limit the length of the context (measured in characters).
        ### Max: 4096 (for eleuther model)
        context = context[:1024]

        # Question, with newlines flattened. This normalised form is what gets
        # stored in the record (previously it was computed and then discarded
        # in favour of the raw sample['body']).
        question = sample['body'].replace('\n', ' ')

        # Answer: factoid and list questions are handled differently.
        if sample['type'] == 'factoid':
            # Factoid answers are kept exactly as they appear in the file.
            answer = sample['exact_answer']
            count_factoid += 1
        else:
            # List answers: flatten one level of nesting into a single flat list.
            answer = [x for y in sample['exact_answer'] for x in y]
            count_list += 1

        # Construct a QA record similar to SQuAD. The question's position in
        # the file doubles as its id.
        bioasq_list.append({
            'id': i,
            'context': context,
            'qas': [{'question': question, 'id': i}],  # Added this to match expected input format for BioBERT
            'question': question,
            'answers': answer,
            'type': sample['type']
        })

print(f'we have {count_factoid} factoid questions, {count_list} list questions, {count_summary} summary questions, {count_yesno} yesno qquestions')

print(f'total is {count_factoid + count_list + count_summary + count_yesno}')

we have 1252 factoid questions, 816 list questions, 1018 summary questions, 1148 yesno qquestions
total is 4234


In [67]:
from sklearn.model_selection import train_test_split
def get_bioasq_split(bioasq_list, random_state):
    """Split the BioASQ records into train, dev and test subsets.

    Two successive `train_test_split` calls are made, both seeded with
    `random_state`. The first uses `test_size=0.9`, so 10% of `bioasq_list`
    becomes the training subset. The second uses `test_size=0.8888` on the
    remaining 90%, sending that share to test and the rest to dev.

    Parameters
    ----------
    bioasq_list : list
        Records to split.
    random_state
        Seed forwarded unchanged to both `train_test_split` calls.

    Returns
    -------
    tuple
        `(bioasq_train, bioasq_dev, bioasq_test)`.
    """
    train_part, held_out = train_test_split(
        bioasq_list, test_size=0.9, random_state=random_state
    )
    dev_part, test_part = train_test_split(
        held_out, test_size=0.8888, random_state=random_state
    )
    return train_part, dev_part, test_part

# Split with a fixed random_state so the same partition is produced on every run.
bioasq_train, bioasq_dev, bioasq_test = get_bioasq_split(bioasq_list, random_state=40)
# Sizes are printed in the order train, dev, test.
print(f"{len(bioasq_train)}, {len(bioasq_dev)}, {len(bioasq_test)} ")

206, 207, 1655 


In [83]:
# Record type exposing the fields of a test example as attributes.
BioasqExample = namedtuple("BioasqExample", ["id", "context", "question", "answers"])

# Wrap every record of the test split in a BioasqExample.
bioasq_data = [
    BioasqExample(
        id=record['id'],
        context=record['context'],
        question=record['question'],
        answers=record['answers'],
    )
    for record in bioasq_test
]

In [73]:
# Manual export step: the JSON string displayed by this cell is pasted by hand into a
# file, wrapped in the header/footer described in the note below.
# NOTE(review): presumably that file is the processed-factoid-list.json passed as
# --predict_file to run_factoid.py — confirm.
''' And then I copied and pasted the output of this into a JSON file, and put 
{
   "version":"BioASQ10b",
   "data":[
      {
         "title":"BioASQ10b",
         "paragraphs":


  at the beginning and also put

  }]}

  at the end. Kind of weird but whatever. Check the JSON is well formatted here if you like: https://jsonformatter.curiousconcept.com/#
         '''
json.dumps(bioasq_test)



# Generate predictions

In [74]:
# Run the repository's factoid script in prediction-only mode (do_train=False,
# do_predict=True) on the processed file; results are written to factoid_output/.
# NOTE(review): the training flags (batch size, learning rate, epochs) are presumably
# ignored when do_train=False — confirm.
!python3 run_factoid.py \
     --do_train=False \
     --do_predict=True \
     --vocab_file="/content/drive/My Drive/bioasq-biobert-1.0/BERT-pubmed-1000000-SQuAD/vocab.txt" \
     --bert_config_file="/content/drive/My Drive/bioasq-biobert-1.0/BERT-pubmed-1000000-SQuAD/bert_config.json" \
     --init_checkpoint="/content/drive/My Drive/bioasq-biobert-1.0/BERT-pubmed-1000000-SQuAD/model.ckpt-14599" \
     --max_seq_length=384 \
     --train_batch_size=12 \
     --learning_rate=5e-6 \
     --doc_stride=128 \
     --num_train_epochs=5.0 \
     --do_lower_case=False \
     --predict_file="/content/drive/My Drive/bioasq-biobert-1.0/data/BioASQ-training10b/processed-factoid-list.json"     --output_dir=factoid_output/




W0605 03:25:08.032666 140405812086656 module_wrapper.py:139] From run_factoid.py:1134: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0605 03:25:08.032885 140405812086656 module_wrapper.py:139] From run_factoid.py:1134: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0605 03:25:08.033055 140405812086656 module_wrapper.py:139] From /content/drive/MyDrive/bioasq-biobert-1.0/modeling.py:92: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W0605 03:25:08.035178 140405812086656 module_wrapper.py:139] From run_factoid.py:1140: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com

In [75]:
# Read output
# predictions.json is loaded into a dict; later cells iterate its keys and values.
# The encoding is passed explicitly so the read does not depend on the platform's
# locale default.
with open('/content/drive/My Drive/bioasq-biobert-1.0/factoid_output/predictions.json', 'r', encoding='utf-8') as f:
    factoid_predictions = json.load(f)

In [88]:
# Collect the predicted answers in the order they appear in `factoid_predictions`:
# once as plain values, and once wrapped in {'generated_answer': ...} records.
# Note that, despite its name, `factoid_predictions_dict` is a list of dicts.
factoid_predictions_list = list(factoid_predictions.values())
factoid_predictions_dict = [
    {'generated_answer': predicted} for predicted in factoid_predictions_list
]

In [78]:
# Peek at the first prediction record.
# NOTE(review): the recorded output is a bare string, whereas the cell that builds
# factoid_predictions_dict stores {'generated_answer': ...} dicts — the saved output
# looks stale (cells executed out of order); re-run to confirm.
factoid_predictions_dict[0]

'deubiquitinase'

# Run evaluation

In [79]:
import collections
import re
import string
from typing import List, Tuple

import numpy as np

In [80]:
def normalize_answer(s: str) -> str:
    """Canonicalise an answer string so two answers can be compared.

    Steps, applied in this order: lowercase the text, drop every character
    in `string.punctuation`, replace the standalone words "a", "an" and
    "the" with a space, then collapse whitespace runs into single spaces
    (which also trims both ends).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punct = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct, flags=re.UNICODE)
    return ' '.join(without_articles.split())


def get_tokens(s: str) -> List[str]:
    """Return the whitespace-separated tokens of the normalized string.

    Falsy input (for example an empty string or None) yields an empty list
    without calling `normalize_answer`.
    """
    return normalize_answer(s).split() if s else []


def compute_exact(a_gold: str, a_pred: str) -> int:
    """Return 1 if gold and predicted answers are equal after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0


def compute_f1_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> Tuple[float, float, float]:
    """Compute token-overlap F1, precision and recall for one gold/prediction pair.

    Overlap is counted as a multiset intersection, so a token repeated in
    both lists counts as many times as it appears in both.

    Parameters
    ----------
    gold_toks : list of str
        Tokens of the gold answer.
    pred_toks : list of str
        Tokens of the predicted answer.

    Returns
    -------
    tuple of float
        `(f1, precision, recall)`. When the lists share no token, all three
        are 0.0, except that two empty lists agree and score 1.0.
    """
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())

    if num_same == 0:
        # Covers both "no shared token" and "one or both sides empty": an
        # empty side always gives an empty intersection. The lists can only be
        # equal here when both are empty, which counts as full agreement.
        agreement = float(gold_toks == pred_toks)
        return agreement, agreement, agreement

    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def compute_f1(a_gold: str, a_pred: str) -> Tuple[float, float, float]:
    """Score a predicted answer string against a single gold answer string.

    Both strings are tokenized with `get_tokens` and the token lists are
    compared with `compute_f1_from_tokens`.

    Returns
    -------
    tuple
        `(f1, precision, recall)`, exactly as returned by
        `compute_f1_from_tokens` (not a single float).
    """
    return compute_f1_from_tokens(get_tokens(a_gold), get_tokens(a_pred))

In [81]:
def evaluate(examples, prompts, gens):
    """Score generated answers against gold answers and aggregate the metrics.

    Parameters
    ----------
    examples : iterable
        Objects exposing `id`, `question` and `answers` attributes.
        `answers` holds the gold answer strings; a single bare string is
        treated as one answer rather than being iterated character by
        character.
    prompts : list of str
        Only takes part in the positional pairing; its values are not read.
    gens : list of dict
        One dict per example, holding the generated text under the key
        "generated_answer". These dicts are not modified.

    Returns
    -------
    dict
        Keys "macro_f1", "macro_precision", "macro_recall" (means over all
        scored examples), "em_per" (fraction of examples with an exact
        match) and "examples" (one dict per example: the fields of its
        `gens` entry plus "id", "question", "prediction", "answers", "em",
        "f1", "precision" and "recall").

    Notes
    -----
    The three inputs are paired by position with `zip`, so scoring stops at
    the shortest of them. For each example, EM, F1, precision and recall are
    each the maximum over the gold answers, taken independently; an example
    with no gold answers scores 0 on every metric. At least one example must
    be scored, because "em_per" divides by the number of results.
    """
    results = []
    for ex, _prompt, gen in zip(examples, prompts, gens):
        answers = ex.answers
        if isinstance(answers, str):
            # Iterating a str would compare the prediction to single characters.
            answers = [answers]
        pred = gen['generated_answer']

        # The result is the highest EM from the available answer strings.
        em = max((compute_exact(ans, pred) for ans in answers), default=0)

        # Score each gold answer once and reuse the (f1, precision, recall) tuples.
        scores = [compute_f1(ans, pred) for ans in answers]
        f1 = max((score[0] for score in scores), default=0)
        precision = max((score[1] for score in scores), default=0)
        recall = max((score[2] for score in scores), default=0)

        # Build a new record instead of updating `gen` in place, so the
        # caller's input dicts are left untouched.
        results.append({
            **gen,
            "id": ex.id,
            "question": ex.question,
            "prediction": pred,
            "answers": answers,
            "em": em,
            "f1": f1,
            "precision": precision,
            "recall": recall,
        })

    data = {}
    data["macro_f1"] = np.mean([d['f1'] for d in results])
    data["macro_precision"] = np.mean([d['precision'] for d in results])
    data["macro_recall"] = np.mean([d['recall'] for d in results])
    data["em_per"] = sum([d['em'] for d in results]) / len(results)
    data["examples"] = results
    return data

In [89]:
# Score the test-split examples against the BioBERT predictions.
# NOTE(review): evaluate() pairs its arguments by position (zip), so this assumes the
# entries of predictions.json are in the same order as bioasq_test — confirm.
evaluate(bioasq_data, factoid_predictions_list, factoid_predictions_dict)

{'em_per': 0.259214501510574,
 'examples': [{'answers': ['deubiquitination'],
   'em': 0,
   'f1': 0,
   'generated_answer': 'deubiquitinase',
   'id': 1869,
   'precision': 0,
   'prediction': 'deubiquitinase',
   'question': 'Which is the enzymatic activity of OTULIN?',
   'recall': 0},
  {'answers': ['Glybera', 'Alipogene tiparvovec'],
   'em': 1,
   'f1': 1.0,
   'generated_answer': 'Alipogene tiparvovec',
   'id': 3079,
   'precision': 1.0,
   'prediction': 'Alipogene tiparvovec',
   'question': 'Which was the first gene therapy to receive marketing authorization in the European Union?',
   'recall': 1.0},
  {'answers': ['SMARCAL1 (SWI/SNF Related, Matrix Associated, Actin Dependent Regulator Of Chromatin, Subfamily A-Like 1)',
    'HARP'],
   'em': 0,
   'f1': 0.14285714285714288,
   'generated_answer': 'SMARCAL1',
   'id': 2375,
   'precision': 1.0,
   'prediction': 'SMARCAL1',
   'question': 'Mutations in which gene cause Schimke immune-osseous dysplasia?',
   'recall': 0.07692