In [None]:
!nvidia-smi

Fri Jul  9 06:37:52 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

**Mounting Drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!gdown --id 1mxVUywvKzvA9bvrUc11RYuOTy7MYcXHF 

Downloading...
From: https://drive.google.com/uc?id=1mxVUywvKzvA9bvrUc11RYuOTy7MYcXHF
To: /content/bio-QA.zip
0.00B [00:00, ?B/s]5.48MB [00:00, 86.0MB/s]


In [None]:
!unzip -q bio-QA.zip

**Installing Dependencies**

In [None]:
!pip install --quiet transformers==4.1.1
!pip install --quiet tokenizers==0.9.4
!pip install --quiet sentencepiece==0.1.94
!pip install torchtext==0.8.0 torch==1.7.1 pytorch-lightning==1.2.2

[K     |████████████████████████████████| 1.5MB 4.3MB/s 
[K     |████████████████████████████████| 2.9MB 64.3MB/s 
[K     |████████████████████████████████| 901kB 67.4MB/s 
[K     |████████████████████████████████| 1.1MB 4.1MB/s 
[?25hCollecting torchtext==0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/26/8a/e09b9b82d4dd676f17aa681003a7533765346744391966dec0d5dba03ee4/torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9MB)
[K     |████████████████████████████████| 7.0MB 4.3MB/s 
[?25hCollecting torch==1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/90/5d/095ddddc91c8a769a68c791c019c5793f9c4456a688ddd235d6670924ecb/torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8MB)
[K     |████████████████████████████████| 776.8MB 20kB/s 
[?25hCollecting pytorch-lightning==1.2.2
[?25l  Downloading https://files.pythonhosted.org/packages/57/d7/52a8cf5ed43c5e0e6a2193d0b2e14f27054513000b4a46b61146c9eabbf5/pytorch_lightning-1.2.2-py3-none-any.whl (816kB)


**Importing Libraries**

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from pytorch_lightning.callbacks import ModelCheckpoint

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

**Data Preprocessing**

In [None]:
with Path("BioASQ/BioASQ-train-factoid-4b.json").open() as json_file:
  data = json.load(json_file)                                            

In [None]:
questions = data['data'][0]['paragraphs']

In [None]:
questions[0]

{'context': 'Balanced t(11;15)(q23;q15) in a TP53+/+ breast cancer patient from a Li-Fraumeni syndrome family. Li-Fraumeni Syndrome (LFS) is characterized by early-onset carcinogenesis involving multiple tumor types and shows autosomal dominant inheritance. Approximately 70% of LFS cases are due to germline mutations in the TP53 gene on chromosome 17p13.1. Mutations have also been found in the CHEK2 gene on chromosome 22q11, and others have been mapped to chromosome 11q23. While characterizing an LFS family with a documented defect in TP53, we found one family member who developed bilateral breast cancer at age 37 yet was homozygous for wild-type TP53. Her mother also developed early-onset primary bilateral breast cancer, and a sister had unilateral breast cancer and a soft tissue sarcoma. Cytogenetic analysis using fluorescence in situ hybridization of a primary skin fibroblast cell line revealed that the patient had a novel balanced reciprocal translocation between the long arms of c

**Data Extraction Class**

In [None]:
def extract_questions_and_answers(path: Path):
  with path.open() as json_file:
    data = json.load(json_file)

  questions = data['data'][0]['paragraphs']

  data_rows = []

  for question in questions:
    context = question['context']
    for question_and_answer in question['qas']:
      question = question_and_answer['question']
      answers = question_and_answer['answers']

      for answer in answers:
        answer_text = answer['text']
        answer_start = answer['answer_start']
        answer_end = answer_start + len(answer_text)

        data_rows.append({
            "question": question,
            "context": context,
            "answer_text": answer_text,
            "answer_start": answer_start,
            "answer_end": answer_end        
            })
  
  return pd.DataFrame(data_rows)

In [None]:
extract_questions_and_answers(Path("BioASQ/BioASQ-train-factoid-4b.json")).head()      

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [None]:
paths = sorted(list(Path("BioASQ/").glob('BioASQ-train-*')))
paths

[PosixPath('BioASQ/BioASQ-train-factoid-4b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-5b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-6b.json')]

In [None]:
dfs = []
for path in paths:
  df = extract_questions_and_answers(path)
  dfs.append(df)

df = pd.concat(dfs)

In [None]:
df = df.drop_duplicates(subset=["context"]).reset_index(drop=True)

In [None]:
df.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [None]:
sample_question = df.iloc[20]
sample_question

question        Which hormone abnormalities are characteristic...
context         Mutations in the SLC26A4 (pendrin) gene in pat...
answer_text                                               thyroid
answer_start                                                  351
answer_end                                                    358
Name: 20, dtype: object

**Tokenization**

In [None]:
MODEL_NAME ='t5-base'

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




**Creating Dataset Model**

In [None]:
class QADataset(Dataset):
  def __init__(
      self,
      data:pd.DataFrame,
      tokenizer:T5Tokenizer,
      source_max_token_len: int = 512,
      target_max_token_len: int = 32,

      ):
    
    self.data =  data
    self.tokenizer =  tokenizer
    self.source_max_token_len =  source_max_token_len
    self.target_max_token_len =  target_max_token_len


  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]

    source_encoding = tokenizer(
      data_row['question'],
      data_row['context'],
      max_length=self.source_max_token_len,
      padding='max_length',
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )
    
    target_encoding = tokenizer(
      data_row['answer_text'],
      max_length=self.target_max_token_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )
    
    labels = target_encoding['input_ids']
    labels[labels==0] = -100

    return dict(
        question=data_row['question'],
        context=data_row['context'],
        answer_text=data_row['answer_text'],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten()
    )

In [None]:
sample_dataset = QADataset(df, tokenizer)

**Sample Check**

In [None]:
for data in sample_dataset:
  print("Question: ", data['question'])
  print("Answer text: ", data['answer_text'])
  print("Input_ids: ", data['input_ids'][:10])
  print("Labels: ", data['labels'][:10])
  break

Question:  What is the inheritance pattern of Li–Fraumeni syndrome?
Answer text:  autosomal dominant
Input_ids:  tensor([  363,    19,     8, 28915,  3275,    13,  1414,   104,   371,  6340])
Labels:  tensor([ 1510, 10348,   138, 12613,     1,  -100,  -100,  -100,  -100,  -100])


**Splitting the Dataset**

In [None]:
train_df, val_df = train_test_split(df, test_size=0.05)

In [None]:
train_df.shape,  val_df.shape

((2452, 5), (130, 5))

**Making our Module**

In [None]:
class QA_DataModule(pl.LightningDataModule):
  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer:T5Tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 512,
      target_max_token_len: int = 32,
      ):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def setup(self):
    self.train_dataset = QADataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

    self.test_dataset = QADataset(
    self.test_df,
    self.tokenizer,
    self.source_max_token_len,
    self.target_max_token_len
    )
 
  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4
        )
  def val_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size=self.batch_size,
        num_workers=4
        )

  def test_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=4
        )

In [None]:
BATCH_SIZE = 12
N_EPOCHS = 6

data_module = QA_DataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [None]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": tru

In [None]:
class Our_QAModel(pl.LightningModule):
  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)


  def forward(self, input_ids, attention_mask, labels=None):
    output = self.model(
        input_ids, 
        attention_mask=attention_mask,
        labels=labels)

    return output.loss, output.logits

  def training_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask=batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions":outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask=batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask=batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=10e-5)
    return optimizer


In [None]:
model = Our_QAModel()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
 checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

In [None]:
trainer = pl.Trainer(
    checkpoint_callback=checkpoint_callback,
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate = 30
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
trainer.fit(model, data_module)


  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0, global step 204: val_loss reached 0.28756 (best 0.28756), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1, global step 409: val_loss reached 0.23524 (best 0.23524), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2, global step 614: val_loss reached 0.23504 (best 0.23504), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3, global step 819: val_loss reached 0.21662 (best 0.21662), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 4, global step 1024: val_loss reached 0.20322 (best 0.20322), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 5, step 1229: val_loss was not in top 1





1

In [None]:
trainer.test() 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.21211868524551392}
--------------------------------------------------------------------------------


[{'test_loss': 0.21211868524551392}]

**Prediction**

In [None]:
trained_model = Our_QAModel.load_from_checkpoint("checkpoints/best-checkpoint.ckpt")
trained_model.freeze()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**Generating Answers**

In [None]:
def generate_answer(question):
  source_encoding=tokenizer(
      question["question"],
      question['context'],
      max_length = 396,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"

  )
  generated_ids = trained_model.model.generate(
      input_ids=source_encoding["input_ids"],
      attention_mask=source_encoding["attention_mask"],
      num_beams=1,
      max_length=80,
      repetition_penalty=2.5,
      early_stopping=True,
      use_cache=True)
  
  preds =[
          tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
          for generated_id in generated_ids
  ]

  return "".join(preds)

**Testing**

In [None]:
sample_question = val_df.iloc[2]

In [None]:
sample_question["context"]

'A phenome-based functional analysis of transcription factors in the cereal head blight fungus, Fusarium graminearum. Fusarium graminearum is an important plant pathogen that causes head blight of major cereal crops. The fungus produces mycotoxins that are harmful to animal and human. In this study, a systematic analysis of 17 phenotypes of the mutants in 657 Fusarium graminearum genes encoding putative transcription factors (TFs) resulted in a database of over 11,000 phenotypes (phenome). This database provides comprehensive insights into how this cereal pathogen of global significance regulates traits important for growth, development, stress response, pathogenesis, and toxin production and how transcriptional regulations of these traits are interconnected. In-depth analysis of TFs involved in sexual development revealed that mutations causing defects in perithecia development frequently affect multiple other phenotypes, and the TFs associated with sexual development tend to be highl

In [None]:
sample_question["question"]

'The pathogen Fusarium graminearum affects what type of plant species?'

In [None]:
sample_question["answer_text"]

'cereal crops'

In [None]:
generate_answer(sample_question)

'cereal crops'

**Calculating F1 Score of our model**

In [None]:
from sklearn.metrics import f1_score

In [None]:
true_answer = []
pred_answer = []

In [None]:
len(val_df)

130

In [None]:
for i in range(len(val_df)):
  sample_question = val_df.iloc[i]
  true_answer.append(str(sample_question["answer_text"]))
  pred_answer.append(str(generate_answer(sample_question)))

In [None]:
true_answer

['factor Xa',
 'SAM',
 'cereal crops',
 'Thyroid',
 'CYP17A1',
 'LQVVR',
 'XK',
 'AUS',
 'diabetes mellitus',
 "Gaucher's disease type 1",
 'better understand the transcriptional network that regulates macrophage differentiation',
 'Caspases are intracellular proteases that propagate programmed cell death, proliferation, and inflammation.',
 'MethPed',
 '7',
 'ZEB2',
 'thyroid',
 'overactive bladder syndrome',
 'MITF',
 'tofacitinib',
 'AD',
 'autosomal dominant',
 'hypertension',
 'SECIS',
 'meningioma',
 'Factor Xa',
 'L-Dopa',
 'malaria',
 'Inhibitor 1',
 'malaria',
 'intermediate filaments',
 'oxytocin',
 'Dax1 associates with Esrrb and regulates its function in embryonic stem cells.',
 'NSD1 gene',
 'aryl hydrocarbon receptor interacting protein',
 'orexin',
 'p16Ink4',
 'proprotein convertase subtilisin/kexin type 9',
 'XK',
 'spleen tyrosine kinase',
 'autosomal dominant',
 'Xist',
 'interleukin-6',
 'thyroid',
 'Gaucher disease',
 'SWR1',
 'inhibits',
 'Mitochondrial fission',


In [None]:
pred_answer

['factor Xa',
 'SAM',
 'cereal crops',
 'thyroid',
 'CYP17A1',
 'The pentapeptide LQVVR',
 'XK',
 'The bone-anchor sub urethral',
 'diabetes mellitus',
 "Gaucher's disease type 1",
 'The international Functional Annotation Of the Mammalian Genomes 4 (FANTOM4) research collaboration set out to better understand',
 'Activation of caspases occurs by an conserved mechanism subject to strict cell regulation.',
 'MethPed',
 '7',
 'ZEB2',
 'thyroid',
 'overactive bladder syndrome',
 'MITF',
 'CP-690,550',
 'ad',
 'autosomal dominant',
 'hypertension',
 'SECIS',
 'meningioma',
 'Factor Xa',
 'L-Dopa',
 'malaria',
 'Inhibitor 1',
 'malaria',
 'Corneodesmosomes',
 'oxytocin',
 'LXxLL',
 'NSD1 gene',
 'aryl hydrocarbon receptor',
 'orexin',
 'p53',
 'proprotein convertase subtilisin/kexi',
 'XK',
 'spleen',
 'autosomal dominant',
 'Xist',
 'interleukin-6',
 'thyroid',
 'Gaucher disease',
 'SWR1',
 'inhibits',
 'Mitochondrial fission',
 'calcitonin gene-related protein',
 'frataxin',
 '7',
 '5-HT2

In [None]:
f1_score(true_answer, pred_answer, average='micro')

0.7461538461538462

**Uploading my model to HuggingFace**

In [1]:
import joblib