In [None]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

import pandas as pd

import sqlalchemy as sa
import json
import re

In [None]:
from transformers import T5Tokenizer

# Sizing constants; `max_sequence_length` is handed to the tokenizer as its model_max_length.
embedding_size = 200
max_sequence_length = 1200

# Build the tokenizer from the 't5-base' checkpoint
T5tokens = T5Tokenizer.from_pretrained(
    't5-base',
    model_max_length=max_sequence_length,
)

In [None]:
# I got tired of copying the same code all the time
# loads config for main parts and creates engine for sqlalchemy
%run /home/ubuntu/work/therapeutic_accelerator/scripts/create_sqlalchemy_engine.py

In [None]:
# Create chroma client
import os

# Host/port can be overridden through the environment so the notebook does not need editing
# when the server address changes; the defaults are the values that were hard-coded here.
chroma_host = os.environ.get("CHROMA_SERVER_HOST", "54.175.241.78")  # EC2 instance public IPv4
chroma_port = int(os.environ.get("CHROMA_SERVER_HTTP_PORT", "8000"))

chroma = chromadb.Client(Settings(chroma_api_impl="rest",
                                  chroma_server_host=chroma_host,
                                  chroma_server_http_port=chroma_port))

print("Nanosecond heartbeat on server", chroma.heartbeat()) # returns a nanosecond heartbeat. Useful for making sure the client remains connected.

# Check existing collections
chroma.list_collections()

In [None]:
# # Embedding fuctions
# from transformers import AutoTokenizer, AutoModel

# # AllenAI Specter
# tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
# model = AutoModel.from_pretrained('allenai/specter')

# def get_embeddings(test):
#     inputs = tokenizer(test, padding=True, truncation=True, return_tensors="pt", max_length=512)
#     result = model(**inputs)
#     embeddings = result.last_hidden_state[:, 0, :]
#     print (embeddings)


In [None]:
import logging

# Sentence Transformers all-MiniLM-L6-v2 
default_ef = embedding_functions.DefaultEmbeddingFunction()


# Create collection to store embeddings (uses `default_ef` unless another function is passed)
def create_collection(chroma, name, metadata = {"hnsw:space":"cosine"}, embedding_function = default_ef):
    """Create a collection on the Chroma client, logging instead of raising on failure.

    Args:
        chroma: client object exposing ``create_collection``.
        name: name of the collection to create.
        metadata: collection metadata; the default selects cosine distance ("hnsw:space").
        embedding_function: embedding function attached to the collection.

    Returns:
        Whatever ``chroma.create_collection`` returns, or ``None`` if the call raised.
    """
    try:
        # Pass a copy so the shared default dict can never be mutated downstream.
        return chroma.create_collection(
            name=name,
            metadata=dict(metadata) if metadata is not None else None,
            embedding_function=embedding_function,
        )
    except Exception as e:
        # `logging` was previously not imported, so this handler itself raised NameError.
        logging.error(e)
        return None


collection = chroma.get_or_create_collection("fulltext")

Create embeddings table in SQL for abstracts

In [None]:
# chroma.delete_collection('abstract_sentence')

Add data to collection

In [None]:
def create_dictionary(text):
    r"""Parse a string holding a list of JSON objects into a list of dictionaries.

    The input is expected to be a brace-wrapped, comma-separated list of quoted,
    backslash-escaped JSON objects, for example
    ``{"{\"category\": \"Medicine\"}","{\"category\": \"Biology\"}"}``.

    Args:
        text: the raw string. Non-string values (``None``, NaN, e.g. NULLs produced by a
            LEFT JOIN) and blank strings are treated as "no entries".

    Returns:
        A list with one parsed object per entry; an empty list when there is nothing to parse.

    Raises:
        json.JSONDecodeError: if an entry is not valid JSON after cleaning.
    """
    if not isinstance(text, str):
        return []

    # drop square brackets, single quotes and backslashes (this un-escapes the inner quotes)
    cleaned = re.sub(r'[\[\]\'\\]', '', text)

    # remove outer braces and quotes, then split the entries on the `","` separator
    pieces = cleaned.strip('{}').strip('"').split('","')

    # blank pieces appear when the list is empty; json.loads('') would raise on them
    return [json.loads(piece) for piece in pieces if piece.strip()]

In [None]:
# Get the full texts joined with the attributes table (source of the chunk metadata)
table_name = 'fulltext'

sql = sa.text(f''' 
    SELECT * from {table_name} LEFT JOIN attributes ON ({table_name}.corpusid = CAST(attributes.corpusid as text));
''')

# Read the rows while the connection is still open: once the `with` block exits the
# connection is closed and the result can no longer be fetched.
with engine.connect() as conn: 
    query = conn.execute(sql)
    ft = pd.DataFrame(query.fetchall(), columns=list(query.keys()))

# NOTE(review): `SELECT *` over this join returns `corpusid` from both tables - confirm the
# column selections further down cope with the duplicated name.

# remove unnecessary columns
# ft.drop(columns = ['paperId', 'corpusId', 'index', 'id'], inplace = True)

# turn strings into lists of dictionaries; for fields of study keep only the unique
# categories, in order of first appearance
ft['s2fieldsofstudy'] = ft['s2fieldsofstudy'].apply(create_dictionary).apply(lambda x: list(dict.fromkeys(d['category'] for d in x)))
ft['authors'] = ft['authors'].apply(create_dictionary)

fulltext = ft.text
fulltext.head()

In [None]:
# # Get attributes table for the metadata embeddings
# table_name = 'abstracts_encodings'

# sql = sa.text(f''' 
#     SELECT * from {table_name} LEFT JOIN attributes ON ({table_name}."corpusId" = CAST(attributes.corpusid as text)) LIMIT 10;
# ''')

# with engine.connect() as conn: 
#     query = conn.execute(sql)
    
# att = pd.DataFrame(query.fetchall())

# # remove unncecessary columns
# att.drop(columns = ['paperId', 'corpusId', 'index', 'id'], inplace = True)

# # turn strings into list of dictionaries
# att['s2fieldsofstudy'] = att['s2fieldsofstudy'].apply(create_dictionary).apply(lambda x: pd.Series([d['category'] for d in x]).unique().tolist())
# att['authors'] = att['authors'].apply(create_dictionary)

# abstracts = att.abstract
# abstracts.head()

In [None]:
# # Get attributes table for the metadata embeddings
# table_name = 'abstracts_encodings'

# sql = sa.text(f''' 
#     SELECT * from {table_name} LIMIT 10;
# ''')

# with engine.connect() as conn: 
#     query = conn.execute(sql)
    
# att = pd.DataFrame(query.fetchall())

In [None]:
# from sqlalchemy import Table, Column, Integer, String, MetaData, ARRAY

# # Create a metadata object
# metadata = MetaData()

# # Define a table using the metadata object
# abstract_encodings = Table(
#     'fulltext_embeddings',
#     metadata,
#     Column('paperId', String, primary_key=True),
#     Column('corpusId', Integer),
#     Column('abstract', String),
#     Column('input_ids', ARRAY(Integer)),
#     Column('attention_mask', ARRAY(Integer))
# )

# # Create the table in the database
# metadata.create_all(engine)

In [None]:
# # # Get attributes table for the metadata embeddings
# table_name = 'abstracts_encodings'

# # sql query text that sets the creates an index column
# sql = text(''' 
#     CREATE INDEX id 
#     ON abstracts_encodings("corpusId");
# ''')

# with engine.connect() as conn: 
#     query = conn.execute(sql)

In [None]:
from sqlalchemy.ext.automap import automap_base

# Reflect the existing database schema into automapped classes.
Base = automap_base()
# `prepare(engine, reflect=True)` is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# `autoload_with` is the supported way to request reflection.
Base.prepare(autoload_with=engine)

# Collection of the automapped classes, accessed by table name
TableClass = Base.classes

In [None]:
# Automapped class taken from the reflected schema (presumably the `attributes` table - confirm)
attributes = TableClass.attributes

In [None]:
from sqlalchemy import ARRAY, Column, Integer, String, ForeignKey
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import relationship

# Declarative base for the hand-written models (this rebinds `Base` from the automap cell)
Base = declarative_base()

# The automapped `attributes` class belongs to a different MetaData than this Base, so the
# foreign key and the relationship target are given as objects rather than as strings.
_attributes_cls = attributes

# Define the models for the tables
class abstract_encodings(Base):
    """ORM model for the `abstracts_encodings` table, linked to `attributes` via corpus id.

    NOTE(review): a commented-out query in this notebook compares "corpusId" with
    CAST(attributes.corpusid as text), which suggests the stored column types may differ -
    confirm the column types before relying on this join.
    """
    __tablename__ = 'abstracts_encodings'
    id = Column(Integer, primary_key=True)
    paperId = Column(String, primary_key=True)
    # the ForeignKey belongs on the column; relationship() takes the target class
    corpusId = Column(Integer, ForeignKey(_attributes_cls.__table__.c.corpusid))
    abstract = Column(String)
    input_ids = Column(ARRAY(Integer))
    attention_mask = Column(ARRAY(Integer))
    attributes = relationship(_attributes_cls)


In [None]:
from sqlalchemy import select

# SELECT over abstract_encodings joined through its `attributes` relationship
stmt = select(abstract_encodings).join(abstract_encodings.attributes)

In [None]:
from sqlalchemy.orm import sessionmaker
# Session factory bound to the notebook's engine
Session = sessionmaker(bind=engine)
# Open a session
# NOTE(review): no visible cell closes this session - confirm it is released somewhere
session = Session()

In [None]:
# import dask.dataframe as dd
# from dask.diagnostics import ProgressBar
# import pandas as pd

# # Read the joined tables using Dask
# with ProgressBar():
#     df = dd.read_sql_query(query, database_uri, index_col='idx_corpusid')

# # Convert Dask dataframe to pandas dataframe
# # df = df.compute()

# # Print the resulting dataframe
# # print(df.head())


In [None]:
# With dask dataframe to partition the query into pieces and prevent maxing out machine
import dask.dataframe as dd

table_name = 'fulltext'

# dask's read_sql_query needs a SQLAlchemy Selectable that it can limit and filter per
# partition; a plain sa.text() clause is not supported, so the same join is expressed as a
# select() over a textual FROM clause.
sql = (
    sa.select(sa.text("*"))
    .select_from(sa.text(
        f"{table_name} LEFT JOIN attributes "
        f"ON ({table_name}.corpusid = CAST(attributes.corpusid as text))"
    ))
)

# Render the URL explicitly: depending on the SQLAlchemy version, str(engine.url) masks the
# password, which would give dask an unusable connection string.
db_uri = engine.url.render_as_string(hide_password=False)

df = dd.read_sql_query(sql, db_uri, index_col = 'index', head_rows=10)


# Testing embedding creation for abstracts

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
import tiktoken

def token_len(text): 
    """Return the number of tokens the T5 tokenizer produces for `text`."""
    return len(T5tokens.encode(text))
    
# Text splitters used to chunk the texts; chunk sizes are measured in T5 tokens via token_len

# splits on blank lines only
text_splitter = CharacterTextSplitter(
    length_function=token_len,
    chunk_size=400,
    chunk_overlap=200,
    separator="\n\n",
)

# tries each separator in turn
recursive_splitter = RecursiveCharacterTextSplitter(
    length_function=token_len,
    chunk_size=400,
    chunk_overlap=20,
    separators=["\n\n", "\n", ".", "?", "!"],
)

In [None]:
# Keep only the attribute columns used as chunk metadata
meta_df = ft[['corpusid', 'title', 'referencecount', 'citationcount', 'influentialcitationcount']]

# metadata of the first row, as a plain dict (used as a sample below)
first_row = meta_df.iloc[0]
metadata = first_row.to_dict()

In [None]:
metadata

# Create Document class for creating embeddings

In [None]:
from dataclasses import dataclass

# A Document splits its text into chunks, attaches metadata, embeds the chunks and builds ids
@dataclass
class Document:
    """One source text prepared for insertion into a Chroma collection.

    Attributes:
        text: the raw text to split.
        metadata: metadata shared by every chunk; must contain a ``corpusid`` key, which is
            used to build the chunk ids.
        embedder: callable mapping a list of strings to a list of embeddings.

    Any extra keyword argument is stored as an instance attribute; pass ``embedder=...`` to
    replace the default Chroma embedding function.
    """
    text: str
    metadata: dict

    def __init__(self, text, metadata, **kwargs):
        self.text = text
        self.metadata = metadata
        # Only build the default embedding function when the caller did not supply one.
        if "embedder" not in kwargs:
            self.embedder = embedding_functions.DefaultEmbeddingFunction()
        self.__dict__.update(kwargs)

    def __str__(self) -> str:
        """Short summary; returning a real string keeps ``str()``/``print()`` from raising."""
        n_chunks = len(getattr(self, "papers", []))
        return f"Document(corpusid={self.metadata.get('corpusid')!r}, chunks={n_chunks})"

    def create_documents(self) -> list:
        """Split the text into chunks and derive metadatas, embeddings and ids.

        Sets ``documents``, ``papers``, ``metadatas``, ``embeddings`` and ``ids`` on the
        instance.

        Returns:
            The chunk documents produced by the module-level ``text_splitter``.
        """
        self.documents = text_splitter.create_documents([self.text])

        # each chunk carries the shared metadata plus its position in the document
        for i, d in enumerate(self.documents):
            d.metadata = self.metadata | {"chunk_id": i}

        self.papers = [d.page_content for d in self.documents]
        self.metadatas = [d.metadata for d in self.documents]

        # skip the embedder entirely when the splitter produced no chunks
        self.embeddings = self.embedder(self.papers) if self.papers else []

        # ids have the form "<corpusid>_<chunk index>"
        self.ids = [
            f"{d.metadata['corpusid']}_{i}" for i, d in enumerate(self.documents)
        ]

        return self.documents

    def tokenize(self) -> list:
        """Tokenize every chunk with the module-level T5 tokenizer.

        Runs ``create_documents`` first if the chunks have not been built yet.

        Returns:
            The list of token-id lists, also stored as ``tokenized_text``.
        """
        if not hasattr(self, "papers"):
            self.create_documents()
        self.tokenized_text = [T5tokens.encode(d) for d in self.papers]
        return self.tokenized_text

    def main(self) -> dict:
        """Build the chunks and return them as keyword arguments for ``collection.add``.

        Returns:
            A dict with the keys ``documents``, ``embeddings``, ``ids`` and ``metadatas``,
            each holding one entry per chunk.
        """
        self.create_documents()

        self.rep = {
            "documents": self.papers, # list of all documents [doc1, doc2, doc3, ...]
            'embeddings': self.embeddings, # list of list for all embeddings [[emb1, emb2], [emb3, ...],...]
            'ids': self.ids, # list of all ids [id1, id2, id3, ...]
            'metadatas': self.metadatas # list of dictionaries with metadata for each document
        }

        return self.rep

In [None]:
# `att` only exists in commented-out cells; use the first full text, which is the row the
# sample `metadata` dict was built from.
test = Document(fulltext.iloc[0], metadata = metadata)

In [None]:
keep_cols = ['corpusid', 'title', 'referencecount', 'citationcount', 'influentialcitationcount']

# corpus ids whose embedding/upload failed, kept so they can be inspected or retried
failed_corpusids = []

for i, k in ft.iterrows():
    try: 
        doc = Document(k['text'], k[keep_cols].to_dict())
        collection.add(**doc.main())
    except Exception as e:
        # best-effort ingestion: report the actual error and carry on with the next row
        # (unlike a bare `except:`, this still lets KeyboardInterrupt stop the loop)
        print(f"Error with {k['corpusid']}: {e!r}")
        failed_corpusids.append(k['corpusid'])
        continue

Query the collections

In [None]:
# fetch only the metadata of every stored entry
collection.get(include=['metadatas'])

In [None]:
# example: look up the stored entries of one article through its corpus id
collection.get(
    include=["documents", "embeddings", "metadatas"],
    where={"corpusid": "237156001"},
)

# Llama index

Using LlamaIndex to build a vector index on top of the Chroma collection.

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import ChromaVectorStore
from IPython.display import Markdown, display

In [None]:
from llama_index.storage.storage_context import StorageContext

vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# `documents` is not defined by any visible cell, and the collection is already populated by
# the ingestion loop, so build the index on the existing vector store instead of re-ingesting.
index = VectorStoreIndex.from_vector_store(vector_store)

# Different Embeddings

Create embeddings to upload to chroma

In [None]:
embedding_size = 200

# T5Abstract_model = TFT5ForConditionalGeneration.from_pretrained('t5-base')
# Re-created with the same model_max_length as the first tokenizer cell, so rebinding
# `T5tokens` here does not change the tokenizer configuration seen by token_len().
T5tokens = T5Tokenizer.from_pretrained('t5-base', model_max_length = max_sequence_length)

In [None]:
# Custom embedding function
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings


class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the encoder of the "t5-small" model.

    Each text is embedded as the attention-mask-weighted mean of the encoder's final hidden
    states, which yields one fixed-size vector per input text.
    """

    _MODEL_NAME = "t5-small"

    def _load(self):
        """Load tokenizer and model on first use and cache them on the instance."""
        if getattr(self, "_model", None) is None:
            from transformers import T5Model  # not imported anywhere else in the notebook
            self._tok = T5Tokenizer.from_pretrained(self._MODEL_NAME)
            self._model = T5Model.from_pretrained(self._MODEL_NAME)
            self._model.eval()
        return self._tok, self._model

    def __call__(self, texts: Documents) -> Embeddings:
        """Embed `texts` and return one list of floats per text."""
        import torch

        texts = list(texts)
        if not texts:
            return []

        tok, model = self._load()

        # pad so texts of different lengths fit in one batch; truncate over-long inputs
        enc = tok(texts, return_tensors="pt", padding=True, truncation=True)

        # forward pass through encoder only (no gradients needed for inference)
        with torch.no_grad():
            output = model.encoder(
                input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"],
                return_dict=True
            )

        # mean-pool the final hidden states over the real (non-padding) tokens
        hidden = output.last_hidden_state
        mask = enc["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        return pooled.tolist()

In [None]:
def t5summary_model(tokenizer, text, t5model):
    """Summarize `text` with a T5 model.

    Args:
        tokenizer: tokenizer called on the prompt and used to decode the generated ids.
        text: the text to summarize; it is prefixed with "summarize: ".
        t5model: model exposing ``generate`` (tensors are requested in 'tf' format).

    Returns:
        The decoded summary string.
    """
    summarize = "summarize: "
    encoding = tokenizer([summarize+text], return_tensors='tf')

    # previously the encoding was computed and thrown away; generate and decode the summary
    summary_ids = t5model.generate(
        encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# BioBERT tokenizer


In [None]:
# Create (or reuse) the collection that stores embeddings produced with T5.
# The client object in this notebook is `chroma`; `chroma_client` and `emb_fn` were never defined.
emb_fn = MyEmbeddingFunction()

collection = chroma.get_or_create_collection(
    name="abstract_collection_t5",
    metadata={"hnsw:space":"cosine"}, #customize distance method of embedding space 
    embedding_function=emb_fn)


# Get collection (by the same name it was created under)
collection = chroma.get_collection(name="abstract_collection_t5", embedding_function=emb_fn)

In [None]:
# Sentence Transformers embedding function (all-MiniLM-L6-v2 model)
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# Embeddings with OpenAI
import os

# Read the key from the environment so it never has to be written into the notebook;
# the placeholder is only used when OPENAI_API_KEY is not set.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"),
                model_name="text-embedding-ada-002"
            )

In [None]:
# BioGPT tokenizer and model
# (both classes come from transformers and were not imported anywhere in the notebook)
from transformers import BioGptForCausalLM, BioGptTokenizer

biogpttokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
biogptmodel = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

Queries
https://docs.trychroma.com/usage-guide

In [None]:
# Query Embeddings
# Template with placeholder values: every query vector must have the same dimensionality as
# the embeddings stored in the collection, and the filters must match real metadata/text.
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1], [1.1, 2.3, 3.2]],
    n_results=10,
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains":"search_string"}
)

In [None]:
# query by ids
# Template with placeholder ids (the literal `...` was an Ellipsis object, not a valid id)
collection.get(
    ids=["id1", "id2", "id3"],
    where={"style": "style1"}
)

In [None]:
# Query by texts
# Template with placeholder values: the texts are embedded with the collection's embedding
# function before the nearest-neighbour search.
collection.query(
    query_texts=["doc10", "thus spake zarathustra"],
    n_results=10,
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains":"search_string"}
)