In [1]:
#! usr/bin/env python

# setup

# Base
import pandas as pd
import numpy as np
import re

# LLM packages
from transformers import pipeline, set_seed
from transformers import AutoTokenizer, AutoModelWithLMHead, BioGptTokenizer, BioGptForCausalLM

# Chunk context into 512  tokens
from langchain.text_splitter import RecursiveCharacterTextSplitter
# import tiktoken

# @dask.delayed
def token_len(text): 
    """ Get the length of tokens from text"""
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)['input_ids'][0]
    return len(tokens)
    
chunk_size = 1024

# create text splitters for processing the texts
text_splitter = RecursiveCharacterTextSplitter(
    # separator = ["\n\n", "\n", ". ", "? ", "! ", "; "],
    chunk_size = chunk_size,
    chunk_overlap  = 20,
    length_function = token_len
)


# Create embeddings function with specter model
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

class specter_ef(EmbeddingFunction):
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
    
    def embed_documents(self, texts: Documents) -> Embeddings:
        
        text_list = [re.sub("\n", " ", p) for p in texts]
        texts = [re.sub("\s\s+", " ", t) for t in text_list]
        
        # embed the documents somehow
        embeddings = []
        
        for text in texts:
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
            result = model(**inputs)
            embeddings.append(result.last_hidden_state[:, 0, :])
        
        return embeddings
    
specter_embeder = specter_ef(model, tokenizer)


import chromadb
from chromadb.config import Settings

# Create chroma client
chroma = chromadb.Client(Settings(chroma_api_impl="rest",
                                  chroma_server_host="34.238.51.66", # EC2 instance public IPv4
                                  chroma_server_http_port=8000))

print("Nanosecond heartbeat on server", chroma.heartbeat()) # returns a nanosecond heartbeat. Useful for making sure the client remains connected.

# Check Existing connections
display(chroma.list_collections())

collection = chroma.get_or_create_collection("specter_abstracts")


2023-07-20 02:57:38.887522: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Nanosecond heartbeat on server 1689821877933682809000


[Collection(name=langchain_store),
 Collection(name=abstracts),
 Collection(name=fulltext),
 Collection(name=specter_abstracts)]

In [2]:
prompts = pd.read_csv('/home/ubuntu/work/therapeutic_accelerator/data/prompts.csv')
# testing prompt one
question = prompts.loc[0, "Prompt"]

In [3]:
def get_question_embeddings(question): 
    # Embed question
    question_embeddings = specter_embeder.embed_documents([question])[0][0].tolist()
    
    return question_embeddings

def query_chroma(question_embeddings):
    # Query ChromaDB with Embeddings
    results = collection.query(
        query_embeddings=[question_embeddings],
        n_results=10
        # where={"metadata_field": "is_equal_to_this"},
        # where_document={"$contains":"search_string"}
    )
    
    return results

In [71]:
question_embeddings = get_question_embeddings(question)
results = query_chroma(question_embeddings)

for k in results.keys(): 
    try: 
        results[k] = results[k][0]
    except: 
        pass
    
results

{'ids': [['38374595-0',
   '203622768-0',
   '234597674-0',
   '211474643-0',
   '11181159-0',
   '10984456-0',
   '232429176-0',
   '246997767-0',
   '240425531-0',
   '38325820-0']],
 'distances': [[196.53672790527344,
   210.16175842285156,
   224.75006103515625,
   240.17074584960938,
   246.17657470703125,
   247.79898071289062,
   263.77459716796875,
   273.4192199707031,
   283.67694091796875,
   284.9610290527344]],
 'embeddings': None,
 'metadatas': [[{'corpusid': 38374595, 'chunk': 0},
   {'corpusid': 203622768, 'chunk': 0},
   {'corpusid': 234597674, 'chunk': 0},
   {'corpusid': 211474643, 'chunk': 0},
   {'corpusid': 11181159, 'chunk': 0},
   {'corpusid': 10984456, 'chunk': 0},
   {'corpusid': 232429176, 'chunk': 0},
   {'corpusid': 246997767, 'chunk': 0},
   {'corpusid': 240425531, 'chunk': 0},
   {'corpusid': 38325820, 'chunk': 0}]],
 'documents': [['Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in vari

# Load LED model

In [82]:
from transformers import AutoTokenizer, LongformerForQuestionAnswering
import torch

led_tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
led_model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")

In [None]:
text = ''.join(results['documents'][0])

In [None]:
encoding = led_tokenizer(question, text, return_tensors="pt")

input_ids = encoding["input_ids"]


In [None]:
# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]

outputs = led_model(input_ids, attention_mask=attention_mask)


In [None]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits

all_tokens = led_tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

answer_tokens = all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits) + 1]

answer = led_tokenizer.decode(
    led_tokenizer.convert_tokens_to_ids(answer_tokens)
)  # remove space prepending space token

# Llama Indexing

In [5]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.document_loaders import TextLoader

In [6]:
documents = text_splitter.create_documents(results['documents'][0])
documents

[Document(page_content='Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in various malignancies. Previous studies have demonstrated that MVD is an independent prognostic factor in pancreatic adenocarcinoma and that longer survival is associated with hypovascular tumors. The prognostic importance of MVD in pancreatic neuroendocrine tumor (NET) has not been documented. We evaluated MVD in pancreatic NET and correlated it with clinicopathologic features and patient outcome to determine whether MVD is a useful prognostic indicator for these patients. Twenty-five pancreatic NETs from our archival files resected between 1981 and 2000 were identified. The mean MVD was determined for each tumor from the 3 most vascularized 200 × fields. Clinical follow-up ranged from 1 to 19 years, with a mean of 4.9 years. At last follow-up, 6 patients were dead of disease, 10 patients were alive without disease, 4 patients were alive with di

In [7]:
class specter_ef(EmbeddingFunction):
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
    
    def embed_documents(self, texts: Documents) -> Embeddings:
        
        text_list = [re.sub("\n", " ", p) for p in texts]
        texts = [re.sub("\s\s+", " ", t) for t in text_list]
        
        # embed the documents somehow
        embeddings = []
        
        for text in texts:
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
            result = model(**inputs)
            embeddings.append(result.last_hidden_state[:, 0, :])
        
        return embeddings
    
specter_embeder = specter_ef(model, tokenizer)

In [18]:
# remove stop words and punctuation from a string using the nltk library
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
test_string = 'Background: Recurrent laryngeal nerve paralysis (RLNP), a severe complication of mini-invasive esophagectomy, usually occurs during lymphadenectomy adjacent to recurrent laryngeal nerve. '


In [None]:

# remove stop words from test_string
stop_words = set(stopwords.words('english'))

In [27]:
from string import punctuation
exclude_punctuation = set(punctuation)

In [32]:
# combine stop words and punctuation marks
exclude = set.union(stop_words, exclude_punctuation)

In [35]:
# use stop_words to remove stop words from test_string
test_string = ' '.join([word for word in test_string.split() if word not in exclude])

In [50]:
# import
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding
from IPython.display import Markdown, display
import chromadb

In [9]:
# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [75]:
from llama_index.schema import TextNode

In [76]:
nodes = []

for d in range(len(results['documents'])): 
    nodes.append(TextNode(
        text = results['documents'][d],
        metadata = results['metadatas'][d]
    ))

In [83]:
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

TypeError: StorageContext.from_defaults() got an unexpected keyword argument 'llm'

In [81]:
index = VectorStoreIndex(nodes, storage_context=storage_context)

RetryError: RetryError[<Future at 0x7f6e6dbae260 state=finished raised AuthenticationError>]

In [61]:
# service_context = ServiceContext.from_defaults(llm = led_model, chunk_size=512)

NameError: name 'led_model' is not defined

In [52]:
from llama_index import Document

In [59]:
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context
)

AttributeError: 'Document' object has no attribute 'get_doc_id'

In [None]:

# Query Data
query_engine = index.as_query_engine()

response = query_engine.query("What did the author do growing up?")

display(Markdown(f"<b>{response}</b>"))

In [85]:
from llama_index import download_loader

ChromaReader = download_loader("ChromaReader")


In [87]:
help(ChromaReader)

Help on class ChromaReader:

class ChromaReader(llama_index.readers.base.BaseReader)
 |  ChromaReader(collection_name: str, persist_directory: str) -> None
 |  
 |  Chroma reader.
 |  
 |  Retrieve documents from existing persisted Chroma collections.
 |  
 |  Args:
 |      collection_name: Name of the peristed collection.
 |      persist_directory: Directory where the collection is persisted.
 |  
 |  Method resolution order:
 |      ChromaReader
 |      llama_index.readers.base.BaseReader
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, collection_name: str, persist_directory: str) -> None
 |      Initialize with parameters.
 |  
 |  load_data(self, query_vector: Any, limit: int = 10) -> Any
 |      Load data from Chroma.
 |      
 |      Args:
 |          query_vector (Any): Query
 |          limit (int): Number of results to return.
 |      
 |      Returns:
 |          List[Document]: A list of documents.
 |  
 |  -------------------------------------

In [89]:
chroma

<chromadb.api.fastapi.FastAPI at 0x7f6e7113a170>

In [88]:
# The chroma reader loads data from a persisted Chroma collection.
# This requires a collection name and a persist directory.
reader = ChromaReader(
    collection_name="specter_abstracts",
    persist_directory=collection
)

ValidationError: 2 validation errors for Settings
persist_directory
  str type expected (type=type_error.str)
is_persistent
  extra fields not permitted (type=value_error.extra)

In [None]:

query_vector=[n1, n2, n3, ...]

documents = reader.load_data(collection_name="demo", query_vector=query_vector, limit=5)