# Imports

In [1]:
from transformers import (
    T5Tokenizer,
    TFT5Model,
    TFT5ForConditionalGeneration,
    # TFAutoModel,
    AutoTokenizer,
    TFBertModel,
    AutoModel,
    BertTokenizer,
)
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import zipfile as zf
from glob import glob
import sentencepiece
from metapub import PubMedFetcher
from semanticscholar import SemanticScholar
from metapub import FindIt
import requests
import urllib
import json

# from keras.saving.hdf5_format import save_attributes_to_hdf5_group
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

2023-07-18 23:19:23.658197: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-18 23:19:23.706303: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-18 23:19:23.707607: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Tokenizer & Model Imports

In [2]:
bio_bert_model = AutoModel.from_pretrained("gsarti/biobert-nli")
bio_bert_tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")

In [3]:
original_bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
T5Abstract_model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
T5tokens = T5Tokenizer.from_pretrained("t5-base")

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
#! usr/bin/env python

# setup

# Base
import pandas as pd
import numpy as np
import re

# LLM packages
from transformers import pipeline, set_seed
from transformers import AutoTokenizer, AutoModelWithLMHead, BioGptTokenizer, BioGptForCausalLM

# Chunk context into 512  tokens
from langchain.text_splitter import RecursiveCharacterTextSplitter
# import tiktoken

# @dask.delayed
def token_len(text):
    """Return the number of token ids the notebook-level ``tokenizer`` yields for ``text``.

    This is used as the ``length_function`` of the text splitter, so it has to
    report the real length of the text it is given.

    Args:
        text: the string to measure.

    Returns:
        int: length of the ``input_ids`` produced for ``text`` (the tokenizer's
        default handling of special tokens is left unchanged).
    """
    # No truncation: a truncated encoding can never be longer than max_length,
    # which would hide over-long pieces from the splitter.
    # Plain Python lists are enough for counting, so no padding/tensors either.
    encoded = tokenizer(text, truncation=False)
    return len(encoded["input_ids"])
    
# Upper bound on a chunk's length, measured in whatever units `token_len` returns.
# NOTE(review): the comment above the langchain import mentions 512 tokens, but
# this value is 1024 — confirm which limit is intended.
chunk_size = 1024

# create text splitters for processing the texts
# Lengths are measured with `token_len`; consecutive chunks overlap by up to 20 units.
text_splitter = RecursiveCharacterTextSplitter(
    # separator = ["\n\n", "\n", ". ", "? ", "! ", "; "],
    chunk_size = chunk_size,
    chunk_overlap  = 20,
    length_function = token_len
)


# Create embeddings function with specter model
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

class specter_ef(EmbeddingFunction):
    """Embed texts by taking the first-token hidden state of a transformer model.

    Args:
        model: callable model whose output exposes ``last_hidden_state``.
        tokenizer: callable tokenizer producing the keyword inputs for ``model``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def embed_documents(self, texts: Documents) -> Embeddings:
        """Return one embedding per input text.

        Each text has newlines replaced by spaces and runs of two or more
        whitespace characters collapsed to a single space, and is then
        tokenized with truncation at 512 tokens.

        Args:
            texts: iterable of strings to embed.

        Returns:
            A list with one entry per text. Each entry is
            ``last_hidden_state[:, 0, :]`` for that text, detached from the
            autograd graph; it still supports indexing and ``.tolist()``.
        """
        embeddings = []

        for raw_text in texts:
            without_newlines = re.sub("\n", " ", raw_text)
            cleaned = re.sub(r"\s\s+", " ", without_newlines)

            # Use the model/tokenizer given to the constructor rather than the
            # notebook-level names, which other cells rebind to different models.
            inputs = self.tokenizer(cleaned, padding=True, truncation=True, return_tensors="pt", max_length=512)
            result = self.model(**inputs)

            # Only the values are needed downstream, so drop the autograd graph.
            embeddings.append(result.last_hidden_state[:, 0, :].detach())

        return embeddings
    
specter_embeder = specter_ef(model, tokenizer)


import chromadb
from chromadb.config import Settings

# Create chroma client
# Uses the "rest" API implementation against a server at a fixed host and port.
# NOTE(review): the address is hard-coded in the notebook; consider reading it
# from configuration so the notebook is not tied to one machine.
chroma = chromadb.Client(Settings(chroma_api_impl="rest",
                                  chroma_server_host="34.238.51.66", # EC2 instance public IPv4
                                  chroma_server_http_port=8000))

print("Nanosecond heartbeat on server", chroma.heartbeat()) # returns a nanosecond heartbeat. Useful for making sure the client remains connected.

# Check Existing connections
display(chroma.list_collections())

collection = chroma.get_or_create_collection("specter_abstracts")


2023-07-18 23:19:48 ip-172-31-64-95 chromadb.telemetry.posthog[31048] INFO Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


Nanosecond heartbeat on server 1689722388499249179000


[Collection(name=langchain_store),
 Collection(name=abstracts),
 Collection(name=fulltext),
 Collection(name=specter_abstracts)]

## Model Testing

In [5]:
# text_example = abstracts.loc[0, "abstract"]

In [6]:
max_sequence_length = 512
embedding_size = 200

In [None]:
prompts = pd.read_csv('/home/ubuntu/work/therapeutic_accelerator/data/prompts.csv')
# testing prompt one
question = prompts.loc[0, "Prompt"]

Unnamed: 0,User,Task,Prompt
0,General,QA,What is the most current research on pancreati...
1,General,QA,What recent therapeutics have come out for lun...
2,General,QA,How does ELISA assays work?
3,General,QA,What are the common use cases for flow cytometry?
4,General,QA,How does lentivirus transductions work?


In [9]:
def get_question_embeddings(question):
    """Embed a single question with the notebook-level ``specter_embeder``.

    Args:
        question: the question text.

    Returns:
        The result of ``.tolist()`` on the first row of the first embedding
        returned for the one-element batch ``[question]``.
    """
    per_document = specter_embeder.embed_documents([question])
    first_row = per_document[0][0]
    return first_row.tolist()

def query_chroma(question_embeddings, n_results=10):
    """Query the notebook-level Chroma ``collection`` with one embedding.

    Args:
        question_embeddings: a single embedding vector; it is wrapped in a
            list before being passed as ``query_embeddings``.
        n_results: number of matches to request. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        Whatever ``collection.query`` returns for the request.
    """
    return collection.query(
        query_embeddings=[question_embeddings],
        n_results=n_results,
        # where={"metadata_field": "is_equal_to_this"},
        # where_document={"$contains":"search_string"}
    )

In [10]:
question

'What is the most current research on pancreatic cancer?'

In [11]:
question_embeddings = get_question_embeddings(question)
results = query_chroma(question_embeddings)
results

{'ids': [['38374595-0',
   '203622768-0',
   '234597674-0',
   '211474643-0',
   '11181159-0',
   '10984456-0',
   '232429176-0',
   '246997767-0',
   '240425531-0',
   '38325820-0']],
 'distances': [[196.53672790527344,
   210.16175842285156,
   224.75006103515625,
   240.17074584960938,
   246.17657470703125,
   247.79898071289062,
   263.77459716796875,
   273.4192199707031,
   283.67694091796875,
   284.9610290527344]],
 'embeddings': None,
 'metadatas': [[{'corpusid': 38374595, 'chunk': 0},
   {'corpusid': 203622768, 'chunk': 0},
   {'corpusid': 234597674, 'chunk': 0},
   {'corpusid': 211474643, 'chunk': 0},
   {'corpusid': 11181159, 'chunk': 0},
   {'corpusid': 10984456, 'chunk': 0},
   {'corpusid': 232429176, 'chunk': 0},
   {'corpusid': 246997767, 'chunk': 0},
   {'corpusid': 240425531, 'chunk': 0},
   {'corpusid': 38325820, 'chunk': 0}]],
 'documents': [['Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in vari

In [None]:
results

In [None]:
# for k, v in results.items():
#     try: 
#         results[k] = v[0]
#     except: 
#         pass
# pd.DataFrame.from_dict(results).to_csv("/home/ubuntu/work/therapeutic_accelerator/data/outputs/chroma_results.csv", index=False)

# Setting Up Prompts

In [None]:
template = """Answer the question based on the context below. If the
question cannot be answered using the information provided answer
with "I don't know".

Question: {question}

Context: {context}

Answer: """

In [None]:
# Imported here so this cell runs on a fresh kernel without relying on a later cell.
from pydantic import BaseModel, Field, validator

# Define your desired data structure.
# (Documented with comments rather than a class docstring so the model's
# generated schema is left unchanged.)
class qa(BaseModel):
    setup: str = Field(description="question")
    punchline: str = Field(description="answer")

    # You can add custom validation logic easily with Pydantic.
    @validator('setup')
    def question_ends_with_question_mark(cls, field):
        """Reject a ``setup`` value that does not end with a question mark.

        Raises:
            ValueError: if ``field`` does not end with '?'. This includes the
                empty string, which previously failed with an IndexError.
        """
        if not field.endswith('?'):
            raise ValueError("Badly formed question!")
        return field

In [None]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List

parser = PydanticOutputParser(pydantic_object=qa)

In [None]:
help(LLMChain)

In [None]:
from langchain.chains import LLMChain, TransformChain
from langchain.chains import SequentialChain

from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

prompt_template = PromptTemplate(
    template= template,
    input_variables=["question", "context"]
    # partial_variables={"format_instructions": parser.get_format_instructions()}
)



In [14]:
context = ' '.join(results['documents'][0])

In [None]:
print(
    prompt_template.format(
        question=question,
        context = context,
    )
)

In [None]:
# The import was commented out, which made this cell fail with a NameError.
from transformers import AutoModelForQuestionAnswering

# NOTE: this rebinds the notebook-level name `model`, which an earlier cell
# assigned to the 'allenai/specter' model.
model = AutoModelForQuestionAnswering.from_pretrained('gsarti/biobert-nli')

In [None]:
import yaml

# Load the project's main configuration file into `config`.
# NOTE(review): yaml.load with FullLoader is used here; if these files only
# hold plain data, yaml.safe_load would be the more conservative choice — confirm.
with open("/home/ubuntu/work/therapeutic_accelerator/config/main.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
    
# Load the keys file into `keys`; a later cell reads keys['huggingface'] from it.
with open("/home/ubuntu/work/therapeutic_accelerator/config/keys.yaml", "r") as f:
    keys = yaml.load(f, Loader=yaml.FullLoader)

In [None]:
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = keys['huggingface']

In [None]:
from langchain import HuggingFaceHub

repo_id = 'microsoft/biogpt'

llm = HuggingFaceHub(
    repo_id=repo_id
    
)

In [None]:
# Combine the question/context prompt template with the hub-hosted LLM.
# The template object in this notebook is named `prompt_template`; the name
# `prompt` used here before is never defined.
llm_chain = LLMChain(
    prompt=prompt_template,
    llm=llm,
    output_key="json_string",
)

In [None]:
print(llm_chain.run({
    'question': question,
    'context': context
    }))

In [None]:
test_text = question + test_text

In [None]:
# # split text
text_chunks = text_splitter.split_text(test_text)

In [None]:
text_chunks

In [None]:
inputs = bio_bert_tokenizer.batch_encode_plus(text_chunks, padding=True, truncation=True, return_tensors="pt", max_length=512)

In [None]:
# input_ids = inputs['input_ids'].numpy()

In [None]:
outputs = bio_bert_model(inputs['input_ids'])

In [None]:
dir(bio_bert_model)

In [None]:
outputs

In [None]:
answer_start = tf.argmax(
          outputs.start_logits, axis=1
      ).numpy()  # Get the most likely beginning of each answer with the argmax of the score


In [None]:
answer_end = (
          tf.argmax(output.end_logits, axis=1) + 1
      ).numpy()

input_text = tokenizer.decode(input_ids[i, :], clean_up_tokenization_spaces=True)
input_text = input_text.split('[SEP] ', 2)[1]
answer = tokenizer.decode(
    input_ids[i, answer_start[i]:answer_end[i]], clean_up_tokenization_spaces=True)

In [None]:
outputs['last_hidden_state'][0][0]

In [None]:
bio_bert_tokenizer.decode(outputs['last_hidden_state'][0][0])

In [None]:
# Instead of summarization can be used for classification of papers
def biobert_classifier(
    embedding_size=200,
    input_dimensions=3,
    hidden_layers=0,
    max_sequence_length=512,
    learning_rate=0.01,
    encoder=None,
):
    """Build and compile a binary classifier on top of a BERT-style encoder.

    Args:
        embedding_size: accepted for backward compatibility; not used. The
            inputs are token ids, so their shape is set by
            ``max_sequence_length`` instead.
        input_dimensions: accepted for backward compatibility; not used.
        hidden_layers: accepted for backward compatibility; not used.
        max_sequence_length: number of token positions per example; sets the
            shape of the three input layers.
        learning_rate: learning rate for the Adam optimizer.
        encoder: optional encoder to call on the dict of input tensors.
            Defaults to the notebook-level ``bio_bert_model``.
            NOTE(review): ``bio_bert_model`` is created with ``AutoModel`` in
            this notebook; a TensorFlow encoder (e.g. ``TFBertModel``) is
            presumably required for use with Keras inputs — confirm.

    Returns:
        A compiled ``tf.keras.Model`` with inputs
        ``[input_ids, token_type_ids, attention_mask]`` and a single sigmoid
        output.
    """
    if encoder is None:
        encoder = bio_bert_model

    # Token ids and masks are integer sequences of length max_sequence_length.
    sequence_shape = (max_sequence_length,)
    input_ids = tf.keras.layers.Input(
        shape=sequence_shape, dtype=tf.int32, name="input_ids")
    token_type_ids = tf.keras.layers.Input(
        shape=sequence_shape, dtype=tf.int32, name="token_type_id")
    attention_mask = tf.keras.layers.Input(
        shape=sequence_shape, dtype=tf.int32, name="attention_mask")

    model_inputs = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask,
    }

    # Index 1 of a BERT-style output is the pooled (one vector per example)
    # representation; index 0 is the per-token sequence output, which would
    # give one prediction per token instead of one per example.
    pooler_layer = encoder(model_inputs)[1]

    dense_layer = tf.keras.layers.Dense(100, activation="relu")(pooler_layer)

    dropout_layer = tf.keras.layers.Dropout(0.3)(dense_layer)

    final_layer = tf.keras.layers.Dense(1, activation="relu")(dropout_layer)

    classification_layer = tf.keras.layers.Dense(
        1, activation="sigmoid")(final_layer)

    model = tf.keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[classification_layer],
    )

    # Loss and metrics must be instances, not classes. BinaryAccuracy
    # thresholds the sigmoid output, whereas Accuracy checks exact equality.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.Precision()],
    )

    return model

In [None]:
inputs = bio_bert_tokenizer.batch_encode_plus(
    [(question, text) for text in text_list], add_special_tokens=True, return_tensors='tf',
    max_length=max_tokens, truncation_strategy='only_second', pad_to_max_length=True)
input_ids = inputs['input_ids'].numpy()

In [None]:
# The import was commented out, which made this cell fail with a NameError.
from transformers import AutoModelForQuestionAnswering

# NOTE: this rebinds the notebook-level name `model`, which an earlier cell
# assigned to the 'allenai/specter' model.
model = AutoModelForQuestionAnswering.from_pretrained('gsarti/biobert-nli')

# # Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at gsarti/biobert-nli and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
# # You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [None]:
outputs = model(inputs['input_ids'])

In [None]:
import tensorflow as tf

In [None]:
answer_start = tf.argmax(
          outputs.start_logits.detach().numpy(), axis=1
      ).numpy()  # Get the most likely beginning of each answer with the argmax of the score


answer_end = (
          tf.argmax(outputs.end_logits.detach().numpy(), axis=1) + 1
      ).numpy()

In [None]:
answer_start_scores, answer_end_scores = ouputs

In [None]:
ouputs

In [None]:
input_text.split('[SEP] ', 2)[0]

In [None]:
answers = []

for i, text in enumerate(text_chunks):
    input_text = bio_bert_tokenizer.decode(input_ids[i, :], clean_up_tokenization_spaces=True)
    input_text = input_text.split('[SEP] ', 2)[0]
    answer = bio_bert_tokenizer.decode(
        input_ids[i, answer_start[i]:answer_end[i]], clean_up_tokenization_spaces=True)
    # score_start = answer_start_scores.numpy()[i][answer_start[i]]
    # score_end = answer_end_scores.numpy()[i][answer_end[i]-1]
    if answer and not '[CLS]' in answer:
        answers.append([answer, input_text])
    else:
        answers.append(None)

In [None]:
answers

In [None]:
input_text = input_text.split('[SEP] ', 2)[1]
answer = tokenizer.decode(
    input_ids[i, answer_start[i]:answer_end[i]], clean_up_tokenization_spaces=True)

In [None]:
answer_end

In [None]:
bio_bert_tokenizer.decode(ouputs['last_hidden_state'][0])

In [None]:
dir(model)

In [None]:
model.generate(token_chunks[0])

In [None]:
summary_task_prefix = "Summarize :"
qa_task_prefix = "Question :"

In [12]:
# T5 Abstractive Text Summarization Model
def t5summary_model(tokenizer, text, t5model):
    """Summarize ``text`` with a T5-style model using beam search.

    Args:
        tokenizer: tokenizer used both to encode the prompt and to decode the
            generated ids.
        text: the text to summarize; it is prefixed with ``"summarize: "``.
        t5model: model exposing ``generate``.

    Returns:
        list[str]: one decoded string per sequence returned by ``generate``.
    """
    prompt = "summarize: " + text
    encoding = tokenizer([prompt], return_tensors="tf")

    # top_k / top_p are no longer passed: they are sampling options that have
    # no effect on beam search without do_sample, and top_p=80 was outside the
    # valid (0, 1] range.
    generated = t5model.generate(
        encoding.input_ids,
        num_beams=3,
        no_repeat_ngram_size=2,
        max_length=500,
        min_length=50,
    )

    return [
        tokenizer.decode(
            sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        for sequence in generated
    ]

In [16]:
temp = text_splitter.split_text(context)

In [17]:
temp

['Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in various malignancies. Previous studies have demonstrated that MVD is an independent prognostic factor in pancreatic adenocarcinoma and that longer survival is associated with hypovascular tumors. The prognostic importance of MVD in pancreatic neuroendocrine tumor (NET) has not been documented. We evaluated MVD in pancreatic NET and correlated it with clinicopathologic features and patient outcome to determine whether MVD is a useful prognostic indicator for these patients. Twenty-five pancreatic NETs from our archival files resected between 1981 and 2000 were identified. The mean MVD was determined for each tumor from the 3 most vascularized 200 × fields. Clinical follow-up ranged from 1 to 19 years, with a mean of 4.9 years. At last follow-up, 6 patients were dead of disease, 10 patients were alive without disease, 4 patients were alive with disease, and 5 patients 

In [18]:
results = []

for t in temp: 
    results.append(t5summary_model(T5tokens, t, T5Abstract_model))
    
results


[['microvascular density (MVD) has been shown to have prognostic significance. MVd was determined for each tumor from the 3 most vascularized 200  fields.'],
 ['recurrence rate of patients with stage III colorectal cancer was 27.7% (64/231) in the study group. the univariate analysis identified five risk factors: site of primary tumor (rectal'],
 ['rectal cancer, preoperative serum CEA level >5.0 ng/ml 95% risk factors. results: miR-510 significantly upregulated in colon cancer tissues and cell lines relative to adjacent normal tissues, colonic cells'],
 ['miR-510 was involved in the disease progression and clinical prognosis of colon cancer. a knockdown of the gene significantly inhibited these cellular processes, says dr robert mcd'],
 ['two PDX models showed different outcomes after castration or docetaxel treatment. the hormone-nave model displayed a range of responses from complete tumor regression to overt tumor progression.'],
 ['a 9 cm liver lesion was initially suspicious for 

In [None]:
# Check if summary is less than abstract
len(text_example) > len(results[0])

## Training New Models

In [None]:
from transformers import BioGptModel, BioGptConfig, BioGptTokenizer

biogpttokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
biogptmodel = BioGptModel.from_pretrained("microsoft/biogpt")

### Q&A Model

In [None]:
# BioBERT or BERT Q&A or Clinical-T5-Large

### Extractive Summary Model

In [None]:
# T5 or T5v1 or Clinical-T5-Large or Bio-GPT

### Classification Model

In [None]:
# BERT or Bio-GPT

### NER Model

In [None]:
# BioELECTRA