In [None]:
import pandas as pd
import sqlalchemy as sa

from transformers import T5Tokenizer

import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import dask.distributed as distributed

import warnings
import logging
warnings.filterwarnings('ignore')

%run /home/ubuntu/work/therapeutic_accelerator/scripts/base.py

# Module-level size settings.
embedding_size = 200
max_sequence_length = 1200

# T5 tokenizer loaded from the 't5-base' checkpoint, with its maximum length
# set to max_sequence_length.
T5tokens = T5Tokenizer.from_pretrained(
    't5-base',
    model_max_length=max_sequence_length,
)

In [None]:
# Create dask cluster
# NOTE: the earlier `dask.config.set(scheduler='processes')` call is dropped on
# purpose. A distributed Client registers itself as the default scheduler when
# it is created, so the setting had no effect on a fresh kernel; on a re-run,
# where the existing client is reused, it can route later computations to the
# multiprocessing scheduler instead of the cluster.
try:
    # Reuse the client from an earlier run of this cell so that re-running it
    # does not launch a second scheduler and another set of workers.
    client = distributed.Client.current()
    cluster = client.cluster  # may be None if that client was connected by address
except ValueError:
    # No client exists yet: launch a scheduler and workers locally, then connect.
    cluster = distributed.LocalCluster(name='local', n_workers=7, memory_limit='4GiB', threads_per_worker=4)
    client = distributed.Client(cluster)
client

# Create Embeddings

In [None]:
from langchain.text_splitter import CharacterTextSplitter
# import tiktoken

# @dask.delayed
def token_len(text):
    """Return the number of tokens the T5 tokenizer produces for `text`."""
    return len(T5tokens.encode(text))
    
chunk_size = 2000

# Splitter used to break each text into chunks: it splits on "\n\n" and
# measures length with token_len, so chunk_size and chunk_overlap are in tokens.
text_splitter = CharacterTextSplitter(
    length_function=token_len,
    chunk_overlap=20,
    chunk_size=chunk_size,
    separator="\n\n",
)

## Now with Dask

Helper functions that split text into chunks, build Chroma documents, and add them to the collection

In [None]:
# @dask.delayed
def create_doc(split_text, corpusid):
    """Build the keyword arguments for one `collection.add` call from a chunked text.

    Args:
        split_text: sequence of text chunks belonging to one document.
        corpusid: identifier of the source document; must be convertible to int.

    Returns:
        A dict with the keys "documents", "ids" and "metadatas", where entry i
        of each list describes chunk i. Returns None if the record could not be
        built; the failure is logged together with the corpusid and traceback.
    """
    try:
        chunk_numbers = range(len(split_text))
        return {
            "documents": split_text,  # list of all documents [doc1, doc2, doc3, ...]
            'ids': [f'{corpusid}-{i}' for i in chunk_numbers],  # list of all ids [id1, id2, id3, ...]
            # one metadata dict per chunk
            'metadatas': [{'corpusid': int(corpusid), 'chunk': i} for i in chunk_numbers],
        }
    except Exception:
        # Best effort: one bad row must not abort a whole partition, but the log
        # has to say which document failed and why.
        logging.exception("create_doc failed for corpusid=%r", corpusid)
        return None


def mp_create_doc(ddf):
    """Apply create_doc to every row of one partition (for use with map_partitions)."""
    def _row_to_doc(row):
        return create_doc(row['split_text'], row['corpusid'])

    return ddf.apply(_row_to_doc, axis=1)


def split_text(ddf):
    """Chunk each entry of the 'text' column with the module-level text_splitter."""
    texts = ddf['text']
    return texts.apply(text_splitter.split_text)


def add_to_collection(text, corpusid):
    """Build the Chroma record for one document and add it to `collection`.

    Args:
        text: passed to create_doc as the document's list of chunks.
        corpusid: identifier of the source document.

    Failures are logged rather than raised.
    """
    # NOTE(review): the original called `create_document`, which is not defined
    # anywhere in this notebook; `create_doc` above is assumed to be the
    # intended helper — confirm base.py does not provide a different one.
    doc = create_doc(text, corpusid)
    if doc is None:
        # create_doc logs its own failure; there is nothing to add.
        return

    try:
        # Call add directly: wrapping it in dask.delayed and discarding the
        # resulting object meant the add was never executed.
        collection.add(**doc)
    except Exception as e:
        logging.error(e)

In [None]:
# Read the full-text parquet files into a dask dataframe.
# The CSV-reader options previously passed here (sample, sample_rows,
# lineterminator, dtype) are not read_parquet parameters, so they are omitted.
ft = dd.read_parquet('/home/ubuntu/work/data/fulltext_parquets/fulltext-*.parquet')

# Clean up each partition independently: drop rows without text, drop duplicate
# texts, and renumber the rows. Duplicates that sit in different partitions are
# not removed by this step.
ft = (
    ft.map_partitions(pd.DataFrame.dropna, subset='text')
    .map_partitions(pd.DataFrame.drop_duplicates, subset='text')
    .map_partitions(pd.DataFrame.reset_index, drop=True)
)

# Start computing the cleaned frame now so the following cells reuse the result.
ft = ft.persist()


In [None]:
# Chunk every text into a new 'split_text' column, then drop the raw 'text'
# column, which is no longer needed.
ft_split = (
    ft.assign(split_text=ft.map_partitions(split_text, meta=('text', 'object')))
    .drop('text', axis=1)
)

In [None]:
# Build one Chroma record (documents / ids / metadatas) per row
ft_docs = ft_split.map_partitions(
    mp_create_doc,
    meta=('docs', 'object'),
)

In [None]:
# Write the document records out as CSV files; the '*' in the path is filled in per partition.
ft_docs.to_csv('/home/ubuntu/work/data/fulltext_docs_csvs/fulltext_docs-*.csv')

In [None]:
# Add documents to collection
def add_to_collection(docs):
    """Pass one record's fields to collection.add, logging any error instead of raising."""
    try:
        collection.add(**docs)
    except Exception as err:
        logging.error(err)

In [None]:
# ft_docs is a dask Series (built with a (name, dtype) meta), so apply() already
# works element by element; an extra keyword such as axis=1 would be forwarded
# to add_to_collection itself. apply() only builds a lazy task graph —
# compute() is what actually runs the adds.
ft_docs.apply(add_to_collection, meta=('docs', 'object')).compute()

In [None]:
# Display the collection object.
# NOTE(review): `collection` is first assigned in the Chroma client cell further
# down, so on a fresh kernel this cell runs before the name exists.
collection

In [None]:
def write_to_csv(partition, i=0):
    """Persist the dask objects in `partition` and write them to a numbered CSV file.

    Args:
        partition: iterable of dask objects (e.g. delayed values).
        i: number used in the output file name. The original body read a name
            `i` that is not defined in this notebook; it is now a parameter
            with a default so one-argument calls keep working. Pass a distinct
            value per partition to get distinct files.

    If the target file already exists, nothing is done.
    """
    import os  # local import so the function does not rely on `os` being in the notebook namespace

    path = f'/home/ubuntu/work/data/fulltext_docs/fulltext_docs-{i}.csv'
    if os.path.exists(path):
        # Already written — return before doing any work on the cluster.
        return

    cli_comp = [client.persist(x) for x in partition]  # puts the dask objects on the cluster
    result = [x.persist() for x in cli_comp]
    # NOTE(review): persist() returns dask objects rather than concrete values,
    # so the CSV may contain their reprs — confirm whether compute() was intended.
    pd.Series(result).to_csv(path)

# Llama Indexing for Chroma

In [None]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# Create chroma client
# chroma_client = chromadb.Client()
# The client used to be bound to the name `chroma` while every later line uses
# `chroma_client`, which is not defined in this notebook. It is now bound to
# `chroma_client`; `chroma` is kept as an alias for any code that used it.
chroma_client = chromadb.Client(Settings(chroma_api_impl="rest",
                                         chroma_server_host="44.204.90.95",  # EC2 instance public IPv4
                                         chroma_server_http_port=8000))
chroma = chroma_client

# returns a nanosecond heartbeat. Useful for making sure the client remains connected.
print("Nanosecond heartbeat on server", chroma_client.heartbeat())

# List the existing collections (the return value is not displayed, because
# this is not the last expression of the cell)
chroma_client.list_collections()

default_ef = embedding_functions.DefaultEmbeddingFunction()
collection = chroma_client.get_or_create_collection("fulltext")


In [None]:
    # chroma = chromadb.Client(Settings(chroma_api_impl="rest",
    #                                 chroma_server_host="44.204.90.95",  # EC2 instance public IPv4
    #                                 chroma_server_http_port=8000))

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import ChromaVectorStore
from IPython.display import Markdown, display
from llama_index.storage.storage_context import StorageContext

# Wrap the Chroma collection as a llama_index vector store and create the
# storage context that refers to it.
vector_store = ChromaVectorStore(
    chroma_collection=collection,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)
# index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

In [None]:
# Read the files under the fulltext_docs directory into llama_index documents
documents = SimpleDirectoryReader("/home/ubuntu/work/data/fulltext_docs/").load_data()

In [None]:
# Inspect what load_data() returned
type(documents)

In [None]:
# set up ChromaVectorStore and load in data
# `chroma_collection` is not defined in this notebook; the collection created in
# the Chroma client cell is named `collection`, so that name is used here.
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# NOTE(review): `embed_model` is not defined in any visible cell — confirm it is
# provided elsewhere (e.g. by base.py) before running this cell.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)

# Query Data
query_engine = index.as_query_engine(chroma_collection=collection)
response = query_engine.query("What did the author do growing up?")
display(Markdown(f"<b>{response}</b>"))

Check Work

In [None]:
# Number of records currently stored in the collection
collection.count()

In [None]:
# Look up stored chunks by metadata only. `query` is a similarity search and
# needs query texts or embeddings, so `get` is used for a pure metadata filter.
# create_doc stores corpusid as an int, so the filter value is an int as well.
collection.get(
    where={"corpusid": 1353942},
    limit=10,
)

# Embed the text

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the 'allenai/specter' checkpoint: the model and its matching tokenizer.
model = AutoModel.from_pretrained('allenai/specter')
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')

# @dask.delayed
def tokenize(text):
    """Tokenize `text` into PyTorch tensors, padded and truncated to at most 512 tokens."""
    return tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

# @dask.delayed
def get_embeddings(inputs):
    """Run the model on tokenized inputs and return one embedding per sequence.

    Args:
        inputs: mapping of keyword arguments for the model, as returned by tokenize().

    Returns:
        A nested list: for each input sequence, the last hidden state at
        position 0.
    """
    import torch  # local import; tokenize() already produces PyTorch tensors

    # Inference only: skip autograd bookkeeping so no gradient graph is kept alive.
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    return hidden[:, 0, :].tolist()

In [None]:
# Tokenize the first entry of `res`.
# NOTE(review): `res` is not defined in any visible cell — confirm where it comes from.
inputs = tokenize(res[0])

In [None]:
# Embed the tokenized sample from the previous cell
embed = get_embeddings(inputs)