# Setup

## Packages

In [None]:
# Run the shared project bootstrap script inside this kernel's namespace.
# NOTE(review): presumably this is what defines `engine`, `keys` and `config`,
# which later cells use without defining them — confirm against base.py.
%run /home/ubuntu/work/therapeutic_accelerator/scripts/base.py

In [None]:
import pandas as pd
import numpy as np

from transformers import T5Tokenizer # AutoModel, AutoTokenizer, BertTokenizer,BioGptModel, BioGptConfig, BioGptTokenizer

import torch

## Tokenizers and Models

In [None]:
# Upper bound on sequence length; handed to the tokenizer as model_max_length.
max_sequence_length = 1200
# NOTE(review): not referenced anywhere else in the visible cells — confirm it is still needed.
embedding_size = 200
# Pretrained 't5-base' tokenizer; also used by token_len and the chunk-encoding cell further down.
T5tokens = T5Tokenizer.from_pretrained('t5-base', model_max_length = max_sequence_length)

In [None]:
# bio_bert_model = AutoModel.from_pretrained("gsarti/biobert-nli")
# bio_bert_tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
# original_bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# T5Abstract_model = TFT5ForConditionalGeneration.from_pretrained('t5-base')
# biogpttokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
# biogptmodel = BioGptModel.from_pretrained("microsoft/biogpt")

Custom embeddings function

In [None]:
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that embeds each document with a T5 model.

    NOTE(review): relies on the notebook-level globals ``T5tokens`` and
    ``T5Abstract_model``. The latter only appears in a commented-out line in
    the visible cells, so it must be defined before this is called — confirm.
    """

    def __call__(self, texts: Documents) -> Embeddings:
        """Return one embedding per input text, each as a plain list of floats.

        Every text is tokenized on its own and passed to the model as a batch
        of one with gradients disabled. The vector kept is
        ``output[0][0][0]`` — presumably the hidden state at the first
        position of the first output tensor; TODO confirm for the model used.
        """
        embeddings = []
        for text in texts:
            token_ids = T5tokens.encode(text)
            # unsqueeze(0) adds the leading batch dimension
            batch = torch.tensor(token_ids).unsqueeze(0)
            with torch.no_grad():
                output = T5Abstract_model(batch)
            # Convert to a list of floats so the result matches the declared
            # Embeddings return type instead of handing back numpy arrays.
            embeddings.append(output[0][0][0].tolist())
        return embeddings

# Langchain Sentence Embeddings

In [None]:
from abc import ABC
from typing import List, Optional, Any

import chromadb
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings

In [None]:
from abc import ABC
from typing import List, Optional, Any


class CachedChroma(Chroma, ABC):
    """
    Wrapper around Chroma to make caching embeddings easier.

    It automatically uses a cached version of a specified collection, if available.

    Example:
        .. code-block:: python

            from langchain.embeddings.openai import OpenAIEmbeddings

            embeddings = OpenAIEmbeddings()
            vectorstore = CachedChroma.from_documents_with_cache(
                ".persisted_data", texts, embeddings, collection_name="fun_experiment"
            )
    """

    @classmethod
    def from_documents_with_cache(
            cls,
            persist_directory: str,
            documents: List[Document],
            embedding: Optional[Embeddings] = None,
            ids: Optional[List[str]] = None,
            collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
            client_settings: Optional[chromadb.config.Settings] = None,
            **kwargs: Any,
    ) -> Chroma:
        """Open ``collection_name`` if it already exists, otherwise build it.

        Args:
            persist_directory: Directory holding the persisted Chroma data.
            documents: Documents to embed when the collection does not exist yet.
            embedding: Embedding function handed to Chroma.
            ids: Optional document ids; only used when a new collection is built.
            collection_name: Name of the collection to look up or create.
            client_settings: Optional chromadb settings. When given they are
                also used to probe for existing collections, so the lookup
                inspects the same store that Chroma is then opened with.
            **kwargs: Forwarded to ``Chroma.from_documents`` only when a new
                collection is built.

        Returns:
            A ``Chroma`` store bound to ``collection_name``.
        """
        # Probe with the caller's settings when provided; otherwise fall back
        # to a duckdb+parquet store rooted at persist_directory.
        if client_settings is not None:
            probe_settings = client_settings
        else:
            probe_settings = chromadb.config.Settings(
                chroma_db_impl="duckdb+parquet",
                persist_directory=persist_directory,
            )
        client = chromadb.Client(probe_settings)
        existing_names = {c.name for c in client.list_collections()}

        if collection_name in existing_names:
            # Cached path: reuse the stored collection; `documents`, `ids`
            # and `kwargs` are not used here.
            return Chroma(
                collection_name=collection_name,
                embedding_function=embedding,
                persist_directory=persist_directory,
                client_settings=client_settings,
            )

        return Chroma.from_documents(
            documents=documents,
            embedding=embedding,
            ids=ids,
            collection_name=collection_name,
            persist_directory=persist_directory,
            client_settings=client_settings,
            **kwargs
        )

In [None]:
# openai embeddings model
# NOTE(review): `keys` is not defined in the visible cells; presumably it comes
# from the %run base.py cell at the top — confirm.
embeddings_model = OpenAIEmbeddings(openai_api_key=keys['openai_api_key'])

In [None]:
# vectorstore = CachedChroma.from_documents_with_cache(
#     ".persisted_data", texts, embeddings, collection_name="fun_experiement"
# )

In [None]:
# embed a batch of sentences with a tokenizer/model pair
def embed_sentences(sentences, tokenizer, model, max_sequence_length):
    """Return the first-position output vector for each sentence as a numpy array.

    ``tokenizer`` is called on ``sentences`` with padding and truncation to
    ``max_sequence_length`` and must return a mapping of keyword arguments for
    ``model``. The model runs with gradients disabled; the result is
    ``model_output[0][:, 0, :]`` converted with ``.numpy()``.
    """
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_sequence_length,
    )

    # inference only, so skip gradient tracking
    with torch.no_grad():
        outputs = model(**encoded)

    # first element of the model output, position 0 of every sequence
    first_position = outputs[0][:, 0, :]
    return first_position.numpy()

# Create Encodings using Dask

## Full Text

In [None]:
# import dask
import sqlalchemy as sa
from dask import dataframe as dd
from dask.delayed import delayed
from dask.diagnostics import ProgressBar
import json
import re
# from glob import glob

In [None]:
# Table queried by the SQL cell below (interpolated into the statement text).
table_name = 'fulltext'

In [None]:
def create_dictionary(text):
    """Parse a string holding a brace-wrapped list of JSON objects into a list.

    The input is a single string in which each JSON object is wrapped in
    double quotes, the objects are separated by ``","`` and the whole list is
    wrapped in braces. Square brackets, single quotes and backslashes are
    dropped before parsing.

    Args:
        text: The raw string, or ``None``.

    Returns:
        A list with one parsed object per entry. ``None``, an empty string
        and an empty list (``'{}'``) all give ``[]``.

    Raises:
        json.JSONDecodeError: If an entry is not valid JSON after cleaning.
    """
    if text is None:
        return []

    # drop square brackets, single quotes and backslashes (escape residue)
    cleaned = re.sub(r'[\[\]\'\\]', '', text)

    # remove outer braces and quotes
    cleaned = cleaned.strip('{}').strip('"')
    if not cleaned:
        # nothing left to parse; json.loads('') would raise
        return []

    # entries are separated by "," — parse each one as JSON
    return [json.loads(entry) for entry in cleaned.split('","')]

In [None]:
# Pull a 10-row sample of the fulltext table joined to the attributes table for metadata embeddings
sql = sa.text(f''' 
    SELECT * FROM {table_name} LEFT JOIN attributes ON ({table_name}.corpusid = CAST(attributes.corpusid as text)) LIMIT 10;
''')

with engine.connect() as conn:
    query = conn.execute(sql)
    # Consume the result while the connection is still open; reading rows
    # from a result after its connection has been released is not reliable.
    ft = pd.DataFrame(query.fetchall())

In [None]:
# Get locations of figure captions to remove from full text
ft['annotations.figurecaption'] = ft['annotations.figurecaption'].apply(json.loads)

# turn strings into lists of dictionaries; for fields of study keep only the unique category values
ft['s2fieldsofstudy'] = (
    ft['s2fieldsofstudy']
    .apply(create_dictionary)
    .apply(lambda entries: pd.Series([entry['category'] for entry in entries]).unique().tolist())
)
ft['authors'] = ft['authors'].apply(create_dictionary)

In [None]:
def get_text_from_index(text, indexes):
    """Slice ``text['text']`` by the start/end offsets in ``indexes``.

    ``indexes`` is an iterable of dicts with ``start`` and ``end`` keys. A
    single result dict is reused for every entry, so the returned dict
    (keys ``name``, ``start``, ``end``) describes the LAST entry only; an
    empty ``indexes`` gives an empty dict.
    """
    section = {}
    for span in indexes:
        begin, finish = span['start'], span['end']
        # each pass overwrites the values written by the previous one
        section.update(
            name=text['text'][begin:finish],
            start=begin,
            end=finish,
        )
    return section

In [None]:
# Keep relevant columns ("text" stays first; each column is listed once so the
# selection does not produce a duplicated "s2fieldsofstudy" column)
ft = ft[
    [
        "text",
        "corpusid",
        "title",
        "s2fieldsofstudy",
        "authors",
        "venue",
        "year",
        "referencecount",
        "citationcount",
        "influentialcitationcount",
        "isopenaccess",
        "publicationtypes",
        "publicationdate",
        "journal",
    ]
]

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
import tiktoken

def token_len(text):
    """Return how many tokens the T5 tokenizer (``T5tokens``) produces for ``text``."""
    return len(T5tokens.encode(text))
    
# create text splitters for processing the texts
# Both splitters measure length with token_len (T5 token counts), so
# chunk_size / chunk_overlap are expressed in that unit.
# Splits on blank lines only.
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 400,
    chunk_overlap  = 200,
    length_function = token_len,
)

# Takes a list of separators (blank line, newline, sentence punctuation)
# and uses a much smaller overlap (20) than text_splitter (200).
recursive_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", ".", "?", "!"],
    chunk_size = 400,
    chunk_overlap  = 20,
    length_function = token_len,
)

In [None]:
# from langchain.document_loaders.csv_loader import CSVLoader
# loader = CSVLoader(file_path='./example_data/mlb_teams_2012.csv')
# data = loader.load()

In [None]:
# split the text into chunks
# Only the first row's text (ft.loc[0, 'text']) is split here.
documents = text_splitter.create_documents([ft.loc[0, 'text']])
documents

In [None]:
# # Create unique id for each chunk
# import hashlib
# m = hashlib.md5()
# uid = m.hexdigest()[:12]

# data = [{
#     'id': f'{uid}-{i}',
#     'text': chunk,
#     'source': 'prompt',
# } for i, chunk in enumerate(chunks)
# ]

In [None]:
# create data for chroma: one record per chunk of the first row
# The row-level values are the same for every chunk, so look them up once
# instead of once per chunk.
first_corpusid = ft.loc[0, "corpusid"]
first_row_metadata = ft.iloc[0, 1:].to_dict()

# NOTE(review): `chunk` is an element of `documents` (the encoding cell reads
# its .page_content), not a plain string — confirm 'text' should hold the
# object rather than chunk.page_content.
data = [
    {
        'id': f'{first_corpusid}-{i}',
        'text': chunk,
        # fresh dict per record, as before, so records do not share one object
        'metadata': dict(first_row_metadata),
    } for i, chunk in enumerate(documents)
]

In [None]:
# create metadata dictionary from the first row, skipping its first column
metadata = ft.iloc[0, 1:].to_dict()

# NOTE: every document is given the same dict object, so mutating one
# document's metadata is visible through all of them.
for d in documents:
    d.metadata = metadata

documents

In [None]:
# tokenize the documents (this produces token ids, not embeddings):
# each chunk's page_content is encoded with the T5 tokenizer, truncated to 512 tokens
encoded_documents = [
    T5tokens.encode(d.page_content, return_tensors="pt", max_length=512, truncation=True) for d in documents
]

In [None]:
# create me a function that will preprocess text to prepare to be used in a nlp model
import unidecode

def remove_accented_chars(text):
    """Return ``text`` passed through ``unidecode.unidecode`` (used here to strip accents, e.g. café)."""
    return unidecode.unidecode(text)

def preprocess_text(text):
    """Flatten ``text`` to a single line, collapse whitespace runs and strip accents.

    Newlines become spaces, every run of whitespace is reduced to one space,
    and the result is passed through ``remove_accented_chars``.
    """
    single_line = text.replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', single_line)
    return remove_accented_chars(collapsed)

In [None]:
# Wrap the first three items of `text` in Document objects.
# NOTE(review): `text` is not defined as a collection in the visible cells
# (later cells import sqlalchemy's `text` function under that name) — confirm
# what this is meant to slice.
docs = [Document(page_content=t) for t in text[:3]]

In [None]:
# Create connection to postgres db
# from sqlalchemy.engine.url import URL

# postgres_db = {'drivername': 'postgres',
#                'database': 'postgres',
#                'username': 'postgres',
#                'password': keys["postgres"],
#                'host': config["database"]["host"],
#                'port': 5432}
# print(URL(**postgres_db))
# postgres = URL(**postgres_db)

In [None]:
# Read the 'fulltext' table into a pandas DataFrame (no row limit is given).
# NOTE(review): the connection URL, including the password from `keys`, is
# built inline here and again in the dask cell below.
df = pd.read_sql_table('fulltext', con = f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres')

In [None]:
# Read the same 'fulltext' table as a dask dataframe, indexed on 'id' and
# split into 100 partitions.
# NOTE(review): head_rows is presumably the number of rows sampled to infer
# column dtypes — confirm against the dask docs.
ddf = dd.read_sql_table('fulltext', 
                        con = f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres',
                        index_col = 'id',
                        head_rows = 10,
                        npartitions = 100)

# Remove unnecessary columns
# ddf = ddf.drop(columns = ['index'])

# Remove empty abstract rows
# ddf = ddf.dropna(how = 'all', subset='abstract').reset_index(drop = True)

In [None]:

# Queue one delayed read per input file and collect the delayed results.
# NOTE(review): `xl_files`, `try_to_read`, `output` and the bare name `dask`
# are not defined in the visible cells (`import dask` is commented out above;
# only `delayed` is imported from dask.delayed) — confirm where they come from.
for x in xl_files:
    parts = dask.delayed(try_to_read)(x)
    # filter_df = dask.delayed(get_techniques)(parts)
    output.append(parts)

# convert to a single dataframe
df_total = dd.from_delayed(output)

# df_total.visualize()

# compute() runs under a progress bar; its result is bound to `ddf`, replacing
# the frame created by dd.read_sql_table in the previous cell.
with ProgressBar():
    ddf = df_total.compute()

# Upload to Postgresql DB

In [None]:
# sql = text(''' 
#     SELECT EXISTS (
#         SELECT FROM information_schema.tables 
#         WHERE    table_name   = 'abstracts'
#     );
# ''')

# with engine.connect() as conn: 
#     conn.execute(sql)

Create array columns to store encoding and mask

In [None]:
# Target table for the drop / create / upload cells that follow.
table_name = 'abstracts_encodings'

In [None]:
# Safety switch: set to True to drop the target table before recreating it.
delete_table = False

if delete_table:
    # The stray closing brace after {table_name} is removed: it made the
    # f-string invalid. sa.text is used because `sa` is imported in an earlier
    # cell, whereas the bare `text` import only appears further down.
    sql = sa.text(f''' 
        DROP TABLE IF EXISTS {table_name};
    ''')

    # engine.begin() commits on successful exit, so the DROP is not rolled
    # back when the connection is released.
    with engine.begin() as conn:
        conn.execute(sql)

In [None]:
# create table
# Create Table in DB first before uploading
from sqlalchemy import MetaData, Table, Column, Integer, String, ARRAY

# Registry that collects the table definitions created below.
metadata_obj = MetaData()

# Create abstracts metadata
# Table named by `table_name`: three nullable string columns plus two nullable
# integer-array columns.
abstracts = Table(
    table_name,
    metadata_obj,
    Column("paperId", String, nullable = True),
    Column("corpusId", String, nullable=True),
    Column("abstract", String, nullable = True),
    # NOTE(review): "input_ds" may be a typo for "input_ids" (it sits next to
    # "attention_mask") — confirm before relying on this column name.
    Column("input_ds", ARRAY(Integer), nullable=True),
    Column("attention_mask", ARRAY(Integer), nullable=True),
)

# Create every table registered on metadata_obj using `engine`.
metadata_obj.create_all(engine)

In [None]:
# Upload dask dataframe to psql
# to_sql is called for its side effect only; its return value is no longer
# assigned back to `ddf`, so the dataframe stays usable after the upload.
# NOTE(review): if_exists='replace' replaces any existing table of this name,
# including one created by the table-definition cell above; `url_object` is not
# defined in the visible cells — confirm both.
ddf.to_sql(
    name=table_name,
    uri=str(url_object),
    if_exists='replace',
    index=False,
    chunksize=10000,
    method='multi',
)

In [None]:
# Check if it worked
import pandas as pd
from sqlalchemy import text

table_name = 'abstracts_encodings'

# Sample five rows to confirm the upload landed.
sql = text(f''' 
    SELECT * FROM {table_name} LIMIT 5;
''')

with engine.connect() as conn:
    query = conn.execute(sql)
    # Fetch the rows before the connection is released; a result should be
    # consumed inside the block that owns its connection.
    test = pd.DataFrame(query.fetchall())

test.head()

In [None]:
# Check if it worked
import pandas as pd
from sqlalchemy import text

table_name = 'fulltext'

# Sample five rows of the source table as a sanity check.
sql = text(f''' 
    SELECT * FROM {table_name} LIMIT 5;
''')

with engine.connect() as conn:
    query = conn.execute(sql)
    # Fetch the rows before the connection is released; a result should be
    # consumed inside the block that owns its connection.
    test = pd.DataFrame(query.fetchall())

test.head()

# Create Vector Store

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# OpenAI LLM with temperature 0.
# NOTE(review): unlike embeddings_model earlier in the notebook, no
# openai_api_key is passed here — presumably it is read from the environment;
# confirm.
llm = OpenAI(temperature=0)