# BioGpt
- Create connection between chroma and BioGPT
- Create connection between postgresql DB and BioGPT

## Setup

In [111]:
# setup

# Base
import pandas as pd
import numpy as np
import re

# LLM packages
from transformers import pipeline, set_seed
from transformers import AutoTokenizer, AutoModelWithLMHead, BioGptTokenizer, BioGptForCausalLM

In [2]:
# Chunk context into 512  tokens
from langchain.text_splitter import RecursiveCharacterTextSplitter
# import tiktoken

# @dask.delayed
def token_len(text): 
    """ Get the length of tokens from text"""
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)['input_ids'][0]
    return len(tokens)
    
chunk_size = 1024

# create text splitters for processing the texts
text_splitter = RecursiveCharacterTextSplitter(
    # separator = ["\n\n", "\n", ". ", "? ", "! ", "; "],
    chunk_size = chunk_size,
    chunk_overlap  = 20,
    length_function = token_len
)

Specter Embedder

In [3]:
# Create embeddings function with specter model
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

class specter_ef(EmbeddingFunction):
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
    
    def embed_documents(self, texts: Documents) -> Embeddings:
        
        text_list = [re.sub("\n", " ", p) for p in texts]
        texts = [re.sub("\s\s+", " ", t) for t in text_list]
        
        # embed the documents somehow
        embeddings = []
        
        for text in texts:
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
            result = model(**inputs)
            embeddings.append(result.last_hidden_state[:, 0, :])
        
        return embeddings
    
specter_embeder = specter_ef(model, tokenizer)

# Q&A

In [4]:
# # Define your desired data structure.
# class qa(BaseModel):
#     setup: str = Field(description="Ask a question")
#     answer: str = Field(description="Answer to the question")
    
#     # You can add custom validation logic easily with Pydantic.
#     @validator('setup')
#     def question_ends_with_question_mark(cls, field):
#         if field[-1] != '?':
#             raise ValueError("Badly formed question!")
#         return field

## Chroma Connection

In [5]:
import chromadb
from chromadb.config import Settings

# Create chroma client
chroma = chromadb.Client(Settings(chroma_api_impl="rest",
                                  chroma_server_host="34.238.51.66", # EC2 instance public IPv4
                                  chroma_server_http_port=8000))

print("Nanosecond heartbeat on server", chroma.heartbeat()) # returns a nanosecond heartbeat. Useful for making sure the client remains connected.

# Check Existing connections
display(chroma.list_collections())

collection = chroma.get_or_create_collection("specter_abstracts")

Nanosecond heartbeat on server 1689689383721754531000


[Collection(name=langchain_store),
 Collection(name=abstracts),
 Collection(name=fulltext),
 Collection(name=specter_abstracts)]

In [6]:
# # ChromaDB Query

# collection.query(
#     query_texts=["doc10", "thus spake zarathustra", ...],
#     n_results=10,
#     where={"metadata_field": "is_equal_to_this"},
#     where_document={"$contains":"search_string"}
# )

# Model Work

In [112]:
biogpttokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
biogptmodel = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
generator = pipeline('text-generation', model=biogptmodel, tokenizer=biogpttokenizer)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


# User Input

In [8]:
prompts = pd.read_csv('/home/ubuntu/work/therapeutic_accelerator/data/prompts.csv')
# testing prompt one
prompts.head()

Unnamed: 0,User,Task,Prompt
0,General,QA,What is the most current research on pancreati...
1,General,QA,What recent therapeutics have come out for lun...
2,General,QA,How does ELISA assays work?
3,General,QA,What are the common use cases for flow cytometry?
4,General,QA,How does lentivirus transductions work?


In [None]:
biogpt_tests = [
    'How the ELISA assay works', 
    'Effective treatments of pancreatic cancer are',
    'Effective treatments of alzheizmers are',
    'Current treatments for parkinson\'s disease are'
]

In [120]:
generator("How the ELISA assay works", max_length=150, num_return_sequences=5, do_sample=True)

[{'generated_text': 'How the ELISA assay works for the detection of all forms of the human immunodeficiency virus (HIV) using a single antibody-capture technique.'},
 {'generated_text': 'How the ELISA assay works with serum, we then examined the relationship between serum IGF-I and the duration of exposure to lead.'},
 {'generated_text': 'How the ELISA assay works best for detection of IgG.'},
 {'generated_text': 'How the ELISA assay works for the detection ofF.F.n. with different species might be a good strategy to overcome some of the current problems, such as the lack of an appropriate reference sera.'},
 {'generated_text': 'How the ELISA assay works reliably does not suggest that the IgG-isotype response is a specific biomarker for disease status of HCV infection and has a poor correlation with the levels of IgM or IgA.'}]

In [None]:
generator("COVID-19 is", max_length=20, num_return_sequences=5, do_sample=True)

In [10]:
# User input
# question = input("What is your question? ")

In [106]:
def get_question_embeddings(question): 
    # Embed question
    question_embeddings = specter_embeder.embed_documents([question])[0][0].tolist()
    
    return question_embeddings

def query_chroma(question_embeddings):
    # Query ChromaDB with Embeddings
    results = collection.query(
        query_embeddings=[question_embeddings],
        n_results=10
        # where={"metadata_field": "is_equal_to_this"},
        # where_document={"$contains":"search_string"}
    )
    
    return results

In [None]:
question_embeddings = get_question_embeddings(question)
results = query_chroma(question_embeddings)
results

In [33]:
results['documents'][0]

['Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in various malignancies. Previous studies have demonstrated that MVD is an independent prognostic factor in pancreatic adenocarcinoma and that longer survival is associated with hypovascular tumors. The prognostic importance of MVD in pancreatic neuroendocrine tumor (NET) has not been documented. We evaluated MVD in pancreatic NET and correlated it with clinicopathologic features and patient outcome to determine whether MVD is a useful prognostic indicator for these patients. Twenty-five pancreatic NETs from our archival files resected between 1981 and 2000 were identified. The mean MVD was determined for each tumor from the 3 most vascularized 200 × fields. Clinical follow-up ranged from 1 to 19 years, with a mean of 4.9 years. At last follow-up, 6 patients were dead of disease, 10 patients were alive without disease, 4 patients were alive with disease, and 5 patients 

In [31]:
def process_large_text(texts, max_chunk_tokens, tokenizer, large_language_model):
    
    # Recursive text spliter
    texts = text_splitter.split_texts(texts)
    
    token_chunks = [] 
    
    for text in texts: 
        context = "context: " + text
        token_chunks.append(tokenizer(context, return_tensors='tf'))
    
    # Process each chunk through the model
    results = []
    for chunk in token_chunks:
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        result = large_language_model.generate(chunk_text)
        results.append(result)

    # Combine results if needed
    final_result = combine_results(results)

    return final_result


In [36]:
context = results['documents'][0]

new_texts = []

for text in texts: 
    new_texts.append(text_splitter.split_text(text))


In [53]:
token_chunks = []

for text in new_texts:
    for t in text:
        context = "context: " + t
        token_chunks.append(biogpttokenizer(context, return_tensors='pt'))

In [54]:
token_chunks

[{'input_ids': tensor([[    2,  1544,    20, 34175,   654,    12, 20364,    11,     7,    14,
           1400,    16,   186,  3688,     7,    57,    58,   301,    13,    47,
           1428,  1347,    10,   376,  3836,     4,  2994,   100,    47,   301,
             22, 20364,    21,    32,   646,  1428,   189,    10,  1414,  2700,
              8,    22,  1348,   294,    21,    73,    15,  2917,   647,   538,
              4,    18,  1428,   923,     5, 20364,    10,  1414,  6492,   186,
             12, 15988,    11,    57,    41,    58,  2348,     4,    52,   330,
          20364,    10,  1414, 15988,     8,   650,   114,    15, 12744,   585,
              8,   125,   497,    13,   340,   373, 20364,    21,    14,   745,
           1428,  3595,    16,    55,    28,     4,  1723,     9,   508,  1414,
          17686,    29,   218, 28837, 11041,  6074,    45,  9901,     8,  2488,
             19,   220,     4,    18,   187, 20364,    17,   346,    16,   221,
            186,    29,   

In [114]:
biogpttokenizer(context, return_tensors='pt')

{'input_ids': tensor([[    2,  1544,    20, 10626,    20,    18,  7667,  8690,  2476,  2093,
           527,   427, 24779,    10,  4451,    14,   252, 39914,   117,  6779,
            13,   574,     6,  4792,     5,  1989,   308,  1613,   608,  1416,
          4597,     7,   284,     7,   429,    30,  3800,     4]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [98]:
test = biogptmodel.generate(**token_chunks[0], max_length = 512)[0]

In [102]:
biogpttokenizer.decode(test)

'</s>context: Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in various malignancies. Previous studies have demonstrated that MVD is an independent prognostic factor in pancreatic adenocarcinoma and that longer survival is associated with hypovascular tumors. The prognostic importance of MVD in pancreatic neuroendocrine tumor (NET) has not been documented. We evaluated MVD in pancreatic NET and correlated it with clinicopathologic features and patient outcome to determine whether MVD is a useful prognostic indicator for these patients. Twenty-five pancreatic NETs from our archival files resected between 1981 and 2000 were identified. The mean MVD was determined for each tumor from the 3 most vascularized 200 × fields. Clinical follow-up ranged from 1 to 19 years, with a mean of 4.9 years. At last follow-up, 6 patients were dead of disease, 10 patients were alive without disease, 4 patients were alive with disease, and

In [82]:
test = biogptmodel(**token_chunks[0])

In [83]:
src_text = new_texts[0][0]
src_tokens = biogptmodel.encode(src_text)
generate = biogptmodel.generate([src_tokens], beam=args.beam)[0]
output = biogptmodel.decode(generate[0]["tokens"])
print(output)

AttributeError: 'BioGptForCausalLM' object has no attribute 'encode'

In [80]:
dir(biogptmodel)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_auto_class',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_convert_head_mask_to_5d',
 '_create_repo',
 '_expand_inputs_for_generation',
 '_extract_past_from_model_output',
 '_forward_hooks',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_from_config',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_decoder_start_token_id',
 '_get_files_timestamps',
 '_get_logits_processor',
 '_get_logits_warpe

In [74]:
biogpttokenizer.convert_tokens_to_string(test.tolist()[0])

AttributeError: 'int' object has no attribute 'replace'

In [60]:
biogptmodel.convert_tokens_to_string(test[0])

AttributeError: 'BioGptForCausalLM' object has no attribute 'convert_tokens_to_string'

In [42]:
results = []
for chunk in token_chunks:
    # chunk_text = tokenizer.convert_tokens_to_string(chunk)
    result = biogptmodel.generate(chunk)
    results.append(result)

AttributeError: 

In [None]:
process_large_text()

In [25]:
results['documents'][0][0]

'Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in various malignancies. Previous studies have demonstrated that MVD is an independent prognostic factor in pancreatic adenocarcinoma and that longer survival is associated with hypovascular tumors. The prognostic importance of MVD in pancreatic neuroendocrine tumor (NET) has not been documented. We evaluated MVD in pancreatic NET and correlated it with clinicopathologic features and patient outcome to determine whether MVD is a useful prognostic indicator for these patients. Twenty-five pancreatic NETs from our archival files resected between 1981 and 2000 were identified. The mean MVD was determined for each tumor from the 3 most vascularized 200 × fields. Clinical follow-up ranged from 1 to 19 years, with a mean of 4.9 years. At last follow-up, 6 patients were dead of disease, 10 patients were alive without disease, 4 patients were alive with disease, and 5 patients w

In [26]:
biogpttokenizer.tokenize("context: " + results['documents'][0][0])

['context</w>',
 ':</w>',
 'Microvascular</w>',
 'density</w>',
 '(</w>',
 'MVD</w>',
 ')</w>',
 ',</w>',
 'a</w>',
 'marker</w>',
 'for</w>',
 'tumor</w>',
 'angiogenesis</w>',
 ',</w>',
 'has</w>',
 'been</w>',
 'demonstrated</w>',
 'to</w>',
 'have</w>',
 'prognostic</w>',
 'significance</w>',
 'in</w>',
 'various</w>',
 'malignancies</w>',
 '.</w>',
 'Previous</w>',
 'studies</w>',
 'have</w>',
 'demonstrated</w>',
 'that</w>',
 'MVD</w>',
 'is</w>',
 'an</w>',
 'independent</w>',
 'prognostic</w>',
 'factor</w>',
 'in</w>',
 'pancreatic</w>',
 'adenocarcinoma</w>',
 'and</w>',
 'that</w>',
 'longer</w>',
 'survival</w>',
 'is</w>',
 'associated</w>',
 'with</w>',
 'hypo',
 'vascular</w>',
 'tumors</w>',
 '.</w>',
 'The</w>',
 'prognostic</w>',
 'importance</w>',
 'of</w>',
 'MVD</w>',
 'in</w>',
 'pancreatic</w>',
 'neuroendocrine</w>',
 'tumor</w>',
 '(</w>',
 'NET</w>',
 ')</w>',
 'has</w>',
 'not</w>',
 'been</w>',
 'documented</w>',
 '.</w>',
 'We</w>',
 'evaluated</w>',
 'MVD

In [16]:
tokens = []

# Split text if neeeded
# chunks = text_splitter.split_text(context)

# Process large text for context
for text in results['documents'][0]: 
    context = "context: " + text
    tokens.append(biogpttokenizer(context, return_tensors='tf'))
    

2023-07-18 14:09:49.142086: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
context

'context: Aim: The Italian Piedmont region sponsored in 2005 a population‐based registry to assess the epidemiology of childhood chronic organ failure involving kidneys, liver, heart or lungs.'

In [17]:
# 
encoding_gpt = biogpttokenizer([question + context], return_tensors='tf', max_length=3000, truncation=True)

In [None]:
def process_large_text_with_questions(large_text, questions, max_chunk_tokens):
    # Tokenize the large text
    tokens = tokenizer.tokenize(large_text)

    # Split tokens into chunks
    chunks = []
    current_chunk = []
    for token in tokens:
        if len(current_chunk) + len(token) > max_chunk_tokens:
            chunks.append(current_chunk)
            current_chunk = []
        current_chunk.append(token)
    if current_chunk:
        chunks.append(current_chunk)

    # Process each question and chunk pair
    answers = []
    for question, chunk in zip(questions, chunks):
        # Concatenate question with the chunk
        context_question_input = question + " " + tokenizer.convert_tokens_to_string(chunk)
        
        # Tokenize the combined input
        context_question_tokens = tokenizer.tokenize(context_question_input)
        
        # Generate the answer from the model
        answer = large_language_model.generate(context_question_tokens)
        answers.append(answer)

    # Combine answers if needed
    final_answer = combine_answers(answers)

    return final_answer


# Answer question

In [None]:
# from langchain.output_parsers import PydanticOutputParser
# from pydantic import BaseModel, Field, validator
# from typing import List

In [1]:
template = """Answer the following question


Question:
{question}


Answers:
"""

In [2]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

template="Answer the user question.\n{format_instructions}\n{query}\n",

prompt = PromptTemplate(
    template= template,
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

NameError: name 'parser' is not defined

In [None]:
multi_template = """Answer the following questions one at a time.


Questions:
{questions}


Answers:
"""

In [None]:
from langchain.chains import LLMChain, TransformChain
from langchain.chains import SequentialChain

llm_chain = LLMChain(
    prompt=prompt,
    llm=biogptmodel,
    output_key="json_string",
)


In [None]:

def parse_output(inputs: dict) -> dict:
    text = inputs["json_string"]
    return {"result": parser.parse(text)}

transform_chain = TransformChain(
    input_variables=["json_string"],
    output_variables=["result"],
    transform=parse_output
)

chain = SequentialChain(
    input_variables=["joke_query"],
    output_variables=["result"],
    chains=[llm_chain, transform_chain],
)

chain.run(query="Tell me a joke.")

In [None]:
print(llm_chain.run(qs_str))

# Create Chain for Multiple Questions

# Extra Code

In [None]:
def process_large_text(text, max_chunk_tokens):
    # Tokenize the input text
    tokens = tokenizer.tokenize(text)

    # Split tokens into chunks
    chunks = []
    current_chunk = []
    for token in tokens:
        if len(current_chunk) + len(token) > max_chunk_tokens:
            chunks.append(current_chunk)
            current_chunk = []
        current_chunk.append(token)
    if current_chunk:
        chunks.append(current_chunk)

    # Process each chunk through the model
    results = []
    for chunk in chunks:
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        result = large_language_model.generate(chunk_text)
        results.append(result)

    # Combine results if needed
    final_result = combine_results(results)

    return final_result
