# Imports

In [2]:
from transformers import (
    T5Tokenizer,
    TFT5Model,
    TFT5ForConditionalGeneration,
    # TFAutoModel,
    AutoTokenizer,
    TFBertModel,
    AutoModel,
    BertTokenizer,
)
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import zipfile as zf
from glob import glob
import sentencepiece
from metapub import PubMedFetcher
from semanticscholar import SemanticScholar
from metapub import FindIt
import requests
import urllib
import json

# from keras.saving.hdf5_format import save_attributes_to_hdf5_group
# Set CUDA_VISIBLE_DEVICES to "-1" — presumably to hide every GPU so the models
# below run on CPU. NOTE(review): this is assigned after `import tensorflow`
# above; confirm it still takes effect, or move it before the import.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

## Tokenizer & Model Imports

In [3]:
# The encoder and its tokenizer are loaded from the same Hugging Face checkpoint.
_BIOBERT_CHECKPOINT = "gsarti/biobert-nli"

bio_bert_tokenizer = AutoTokenizer.from_pretrained(_BIOBERT_CHECKPOINT)
# NOTE(review): `AutoModel` returns the PyTorch `BertModel` (see the module
# repr printed further down), while `biobert_classifier` builds a Keras graph.
bio_bert_model = AutoModel.from_pretrained(_BIOBERT_CHECKPOINT)

Downloading pytorch_model.bin: 100%|██████████| 433M/433M [00:11<00:00, 37.0MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 136/136 [00:00<00:00, 639kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 39.7MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 563kB/s]


In [4]:
# Checkpoint shared by the T5 tokenizer and the T5 generation model.
_T5_CHECKPOINT = "t5-base"

# Plain (non-biomedical) BERT tokenizer.
original_bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# T5 tokenizer and TensorFlow conditional-generation model used for summaries.
T5tokens = T5Tokenizer.from_pretrained(_T5_CHECKPOINT)
T5Abstract_model = TFT5ForConditionalGeneration.from_pretrained(_T5_CHECKPOINT)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 107MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 60.0kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 783kB/s]
Downloading model.safetensors: 100%|██████████| 892M/892M [00:05<00:00, 159MB/s] 
2023-07-12 02:41:46.221066: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 98697216 exceeds 10% of free system memory.
2023-07-12 02:41:46.778897: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 98697216 exceeds 10% of free system memory.
2023-07-12 02:41:46.822624: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 98697216 exceeds 10% of free system memory.
2023-07-12 02:41:52.458014: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 98697216 exceeds 10% of free system memory.
2023-07-12 02:41:55.165300: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 9869721

## Data Import

In [5]:
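# Get the abstracts from Postgres.
# Run the project's base script first; it is expected to define `engine`, the
# database connection used by `pd.read_sql` below (TODO confirm).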
%run /home/ubuntu/work/therapeutic_accelerator/scripts/base.py

In [8]:
# Table that holds the paper abstracts.
table_name = "abstracts"
# Select every column of that table (the leading space is kept as in the original string).
query = " SELECT * FROM {};".format(table_name)

def query_to_df(query, con=None):
    """Run a SQL query and return the result as a pandas DataFrame.

    Args:
        query: SQL text (or any query object accepted by ``pd.read_sql``).
        con: Optional database connection/engine. When omitted, the
            module-level ``engine`` is used, so existing ``query_to_df(query)``
            calls behave exactly as before.

    Returns:
        pandas.DataFrame with one row per record returned by the query.
    """
    # Resolve the global lazily so callers that pass `con` do not need `engine`.
    connection = engine if con is None else con
    return pd.read_sql(query, connection)


# Load the result of `query` (every row and column of the abstracts table) into a DataFrame.
abstracts = query_to_df(query)

## Model Testing

In [9]:
# Take the "abstract" value of the row labelled 0 as the running example for the cells below.
text_example = abstracts.loc[0, "abstract"]

In [10]:
# Notebook-level sizes; they carry the same values as the defaults of
# `biobert_classifier` (max_sequence_length=512, embedding_size=200).
max_sequence_length, embedding_size = 512, 200

In [11]:
# Tokenize the example abstract with the T5 tokenizer to inspect the resulting `input_ids`.
T5tokens(text_example)

{'input_ids': [22636, 3268, 446, 12396, 87, 4209, 5767, 519, 3240, 53, 4139, 7, 12, 1994, 13324, 16, 186, 25049, 32, 1409, 4597, 52, 1528, 10461, 5, 17716, 6472, 15282, 43, 5111, 2485, 18, 858, 18, 23993, 5097, 5767, 519, 17324, 7, 16, 25049, 32, 23, 26, 1874, 7, 1374, 12, 6676, 8463, 257, 13, 5097, 5767, 6355, 84, 164, 4221, 3, 9, 1055, 12206, 2387, 5, 3, 3626, 3, 9, 5014, 17021, 38, 8735, 6, 62, 3, 22529, 604, 948, 18042, 28, 20459, 1756, 581, 796, 2387, 19166, 12, 2862, 4845, 3919, 13, 19921, 53, 8, 3, 10791, 1756, 13, 5097, 5767, 5787, 2224, 8046, 130, 856, 16742, 26, 28, 1151, 2250, 379, 5097, 5767, 519, 18, 4246, 920, 793, 14804, 41, 17345, 61, 18, 8725, 90, 1598, 11658, 87, 120, 1167, 10207, 9, 2358, 2356, 11, 2329, 508, 3, 7662, 4885, 25049, 32, 75, 63, 1225, 41, 434, 13011, 61, 90, 1598, 11658, 2640, 12, 6570, 70, 1418, 12, 19921, 5097, 5767, 519, 3, 19787, 12973, 257, 11, 5097, 5767, 519, 8976, 2358, 5931, 2020, 5, 101, 4313, 446, 12396, 6, 3, 51, 16442, 6, 454, 7, 102, 2394,

In [12]:
# Display the loaded BioBERT model so its layer structure is printed below.
bio_bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [13]:
# BioBERT can also be used to classify papers instead of summarizing them.
def biobert_classifier(
    embedding_size=200,
    input_dimensions=3,
    hidden_layers=0,
    max_sequence_length=512,
    learning_rate=0.01,
    encoder=None,
):
    """Build and compile a binary paper classifier on top of a BERT encoder.

    The model takes the three standard BERT inputs (``input_ids``,
    ``token_type_ids``, ``attention_mask``), feeds them through the encoder,
    and puts a small dense head with a sigmoid output on the pooled
    sentence representation.

    Args:
        embedding_size: Number of token positions per example. Despite the
            name, it is used as the length of each input tensor.
        input_dimensions: Unused; kept so existing calls keep working.
        hidden_layers: Number of additional ``Dense(100, relu)`` +
            ``Dropout(0.3)`` blocks stacked after the first one.
        max_sequence_length: Upper bound enforced on ``embedding_size``.
        learning_rate: Learning rate for the Adam optimizer.
        encoder: Optional Keras-compatible BERT encoder. When omitted, the
            notebook-level ``bio_bert_model`` is used if it is a Keras model;
            otherwise the TensorFlow weights of ``gsarti/biobert-nli`` are
            loaded, because a PyTorch module cannot consume Keras inputs.

    Returns:
        A compiled ``tf.keras.Model`` producing one probability per example.

    Raises:
        ValueError: If ``embedding_size`` is not in ``1..max_sequence_length``.
    """
    if not 0 < embedding_size <= max_sequence_length:
        raise ValueError(
            "embedding_size must be between 1 and max_sequence_length "
            f"({max_sequence_length}), got {embedding_size}"
        )

    if encoder is None:
        encoder = bio_bert_model
        if not isinstance(encoder, tf.keras.Model):
            # The notebook loads BioBERT with the PyTorch `AutoModel`; the
            # functional Keras graph below needs the TensorFlow variant.
            encoder = TFBertModel.from_pretrained(
                "gsarti/biobert-nli", from_pt=True
            )

    # Token ids and masks are integer tensors of shape (batch, sequence).
    sequence_shape = (embedding_size,)
    input_ids = tf.keras.layers.Input(
        shape=sequence_shape, dtype=tf.int32, name="input_ids")
    token_type_ids = tf.keras.layers.Input(
        shape=sequence_shape, dtype=tf.int32, name="token_type_ids")
    attention_mask = tf.keras.layers.Input(
        shape=sequence_shape, dtype=tf.int32, name="attention_mask")

    model_inputs = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask,
    }

    # Index 1 is the pooled representation (one vector per example);
    # index 0 would be the per-token hidden states.
    pooled_output = encoder(model_inputs)[1]

    features = tf.keras.layers.Dense(100, activation="relu")(pooled_output)
    features = tf.keras.layers.Dropout(0.3)(features)

    for _ in range(hidden_layers):
        features = tf.keras.layers.Dense(100, activation="relu")(features)
        features = tf.keras.layers.Dropout(0.3)(features)

    # The sigmoid reads the hidden features directly: a single-unit ReLU in
    # front of it can get stuck at zero and freeze the output.
    classification_layer = tf.keras.layers.Dense(
        1, activation="sigmoid")(features)

    model = tf.keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[classification_layer],
    )

    # Keras expects loss/metric instances; BinaryAccuracy thresholds the
    # probabilities, whereas Accuracy would require exact equality.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name="accuracy"),
            tf.keras.metrics.Precision(name="precision"),
        ],
    )

    return model

In [14]:
# Task prefixes prepended to the input text for T5. They use the lowercase
# "<task>: " form, the same form `t5summary_model` uses for "summarize: ".
summary_task_prefix = "summarize: "
qa_task_prefix = "question: "

In [23]:
# T5 Abstractive Text Summarization Model
def t5summary_model(
    tokenizer,
    text,
    t5model,
    num_beams=3,
    no_repeat_ngram_size=2,
    max_length=50,
    min_length=30,
    max_input_length=512,
):
    """Summarize ``text`` with a T5 model using beam search.

    Args:
        tokenizer: Tokenizer matching ``t5model``; called on a list of strings
            and used to decode the generated ids.
        text: The document to summarize.
        t5model: Model exposing ``generate`` (e.g. a T5 conditional-generation
            model).
        num_beams: Beam width for the search.
        no_repeat_ngram_size: N-gram size that may not repeat in the output.
        max_length: Maximum length of the generated summary, in tokens.
        min_length: Minimum length of the generated summary, in tokens.
        max_input_length: Inputs longer than this many tokens are truncated.

    Returns:
        A list with one decoded summary per generated sequence, or an empty
        list when ``text`` is missing (``None``/NaN) or blank.
    """
    # Missing abstracts would otherwise fail on string concatenation, and a
    # blank one would ask the model to summarize the bare prefix.
    if not isinstance(text, str) or not text.strip():
        return []

    prompt = "summarize: " + text
    encoding = tokenizer(
        [prompt],
        return_tensors="tf",
        truncation=True,
        max_length=max_input_length,
    )

    # The former `top_k=10, top_p=80` arguments are dropped: they only apply
    # when sampling is enabled, and `top_p` must lie in (0, 1].
    generated = t5model.generate(
        encoding.input_ids,
        attention_mask=encoding.attention_mask,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        max_length=max_length,
        min_length=min_length,
    )

    return tokenizer.batch_decode(
        generated, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

In [24]:
# Summarize the example abstract with the T5 tokenizer/model and display the returned list of summaries.
results = t5summary_model(T5tokens, text_example, T5Abstract_model)
results

['constitutive JAK/STAT3 signaling contributes to disease progression. gain-of-function mutations in lymphoid cancers lead to hyperactivation of STAT3, a study has shown.']

In [25]:
# Check that the summary is shorter than the abstract (compares character counts, not tokens).
len(text_example) > len(results[0])

True

## Training New Models

In [None]:
from transformers import BioGptModel, BioGptConfig, BioGptTokenizer

# BioGPT tokenizer and base model are loaded from the same checkpoint.
_BIOGPT_CHECKPOINT = "microsoft/biogpt"

biogptmodel = BioGptModel.from_pretrained(_BIOGPT_CHECKPOINT)
biogpttokenizer = BioGptTokenizer.from_pretrained(_BIOGPT_CHECKPOINT)

### Q&A Model

In [None]:
# BioBERT or BERT Q&A or Clinical-T5-Large

### Extractive Summary Model

In [None]:
# T5 or T5v1 or Clinical-T5-Large or Bio-GPT

### Classification Model

In [None]:
# BERT or Bio-GPT

### NER Model

In [None]:
# BioELECTRA