# Setup

## Packages

In [1]:
# Execute the shared project bootstrap script inside this kernel's namespace.
# NOTE(review): later cells use `keys`, `config` and `engine` without defining them, so they are
# presumably created by this script — confirm. The absolute path ties the notebook to one machine.
%run /home/ubuntu/work/therapeutic_accelerator/scripts/base.py

In [2]:
import pandas as pd
import numpy as np

from transformers import T5Tokenizer # AutoModel, AutoTokenizer, BertTokenizer,BioGptModel, BioGptConfig, BioGptTokenizer
import torch

## Tokenizers and Models

In [2]:
# Sizing constants. `max_sequence_length` is handed to the tokenizer below;
# `embedding_size` is not referenced by any cell visible in this notebook.
max_sequence_length, embedding_size = 1200, 200

# Pretrained T5 tokenizer configured with the maximum length defined above.
T5tokens = T5Tokenizer.from_pretrained(
    "t5-base",
    model_max_length=max_sequence_length,
)

In [None]:
# bio_bert_model = AutoModel.from_pretrained("gsarti/biobert-nli")
# bio_bert_tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
# original_bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# T5Abstract_model = TFT5ForConditionalGeneration.from_pretrained('t5-base')
# biogpttokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
# biogptmodel = BioGptModel.from_pretrained("microsoft/biogpt")

Custom embeddings function

In [None]:
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that turns each document into one vector with a T5 model.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a list of token ids.
            When omitted, the notebook-level ``T5tokens`` is used.
        model: Callable that accepts a ``(1, seq_len)`` tensor of token ids.
            When omitted, the notebook-level ``T5Abstract_model`` is used.

    NOTE(review): no visible cell defines ``T5Abstract_model`` (its only assignment is
    commented out), so either pass ``model=`` or define that global before embedding.
    """

    def __init__(self, tokenizer=None, model=None):
        self._tokenizer = tokenizer
        self._model = model

    def __call__(self, texts: Documents) -> Embeddings:
        """Return one embedding (a plain list of numbers) per entry of ``texts``, in order."""
        # Resolve lazily so the notebook globals only have to exist when embedding runs.
        # getattr() keeps this working even if __init__ was bypassed.
        tokenizer = getattr(self, "_tokenizer", None)
        if tokenizer is None:
            tokenizer = T5tokens
        model = getattr(self, "_model", None)
        if model is None:
            model = T5Abstract_model

        embeddings = []
        for text in texts:
            # Wrap the token ids in a leading batch dimension of size 1.
            input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
            with torch.no_grad():
                output = model(input_ids)
            # First output, first batch item, first position.
            # NOTE(review): whether this position is a meaningful document vector depends on
            # the model that is supplied — confirm.
            # A plain list is emitted (not a NumPy array) to match the declared return type.
            embeddings.append(output[0][0][0].tolist())
        return embeddings

# Langchain Sentence Embeddings

In [None]:
from abc import ABC
from typing import List, Optional, Any

import chromadb
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings

In [3]:
# OpenAI embeddings client, authenticated with the key stored under `keys['openai_api_key']`.
embeddings_model = OpenAIEmbeddings(
    openai_api_key=keys["openai_api_key"],
)

In [9]:
# Load the `abstracts` table into a pandas DataFrame (nothing is embedded in this cell).
# The connection is opened in a context manager so it is released as soon as the read finishes,
# instead of leaving an unclosed connection behind on every run of the cell.
with engine.connect() as conn:
    abstracts = pd.read_sql_table("abstracts", conn)

In [None]:
# Build a vector store from `texts` via `CachedChroma`, passing ".persisted_data" and the
# collection name "fun_experiement" (spelled exactly as written here).
# NOTE(review): neither `CachedChroma` nor `texts` is defined or imported in the visible cells —
# confirm where they come from before running.
# NOTE(review): this client is built without an explicit API key, unlike `embeddings_model`;
# presumably the key is picked up from the environment — confirm.
embeddings = OpenAIEmbeddings()
vectorstore = CachedChroma.from_documents_with_cache(
    ".persisted_data", texts, embeddings, collection_name="fun_experiement"
)

In [None]:
# Embed a batch of sentences with a caller-supplied tokenizer and model.
def embed_sentences(sentences, tokenizer, model, max_sequence_length):
    """Return one embedding vector per sentence as a NumPy array.

    Args:
        sentences: Input passed straight to ``tokenizer`` (typically a list of strings).
        tokenizer: Callable invoked as ``tokenizer(sentences, padding=True, truncation=True,
            return_tensors="pt", max_length=max_sequence_length)``; must return a mapping
            that can be unpacked into ``model`` as keyword arguments.
        model: Callable whose result is indexable; element ``0`` must be a 3-D tensor.
        max_sequence_length: Truncation length forwarded to the tokenizer as ``max_length``.

    Returns:
        ``model_output[0][:, 0, :]`` converted to a NumPy array, i.e. position 0 along the
        second axis for every item in the batch.
        Assumes ``model_output[0]`` is (batch, sequence, hidden) — TODO confirm for the
        model in use.
    """
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_sequence_length,
    )

    # Run the model and take the slice with gradient tracking disabled, so the slice never
    # carries autograd state even when the model hands back a tensor that requires grad.
    with torch.no_grad():
        model_output = model(**encoded)
        first_position = model_output[0][:, 0, :]

    # detach() + cpu() make the NumPy conversion valid for grad-tracking and non-CPU tensors.
    return first_position.detach().cpu().numpy()


# Create Encodings using Dask

## Abstracts

In [None]:
# Read the `abstracts` table into a Dask DataFrame, using the `id` column as the index.
ddf = dd.read_sql_table(
    'abstracts',
    con=f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres',
    index_col='id',
    head_rows=5,
    npartitions=700,
)

# Remove unnecessary columns
ddf = ddf.drop(columns=['index'])

# Remove rows that have no abstract. `subset` is passed as a list of labels, which is the
# form accepted everywhere; a bare string is not handled uniformly across versions.
# NOTE(review): reset_index(drop=True) discards the `id` index read above — confirm that is
# intended, since a later cell concatenates column-wise against this frame.
ddf = ddf.dropna(how='all', subset=['abstract']).reset_index(drop=True)

In [None]:
# Tokenize every abstract. Each element of the intermediate series is whatever `T5tokens`
# returns for one abstract, not a string, so its meta dtype is declared as `object`.
# Expanding each element with pd.Series yields the `input_ids` / `attention_mask` columns.
ddf2 = ddf.abstract.apply(T5tokens, meta=('abstract', 'object')).apply(
    pd.Series,
    meta={'input_ids': 'object', 'attention_mask': 'object'},
)

# Put the token columns next to the original columns.
# NOTE(review): the original author asked whether divisions need to be created first;
# column-wise concat relies on both frames being row-aligned — confirm.
ddf = dd.concat([ddf, ddf2], axis=1)
# TODO: QA check of the results

In [None]:
# Display the current column names (no column is renamed in this cell).
ddf.columns 

In [None]:
# Save backup
# name_function = lambda x: f"abstracts-{x}.parquet"
# ddf3.to_parquet('/home/ubuntu/work/backup/', name_function = name_function)

## Full Text

In [3]:
# import dask
from dask import dataframe as dd
from dask.delayed import delayed
from dask.diagnostics import ProgressBar
# from glob import glob

In [None]:
# Create connection to postgres db
# from sqlalchemy.engine.url import URL

# postgres_db = {'drivername': 'postgres',
#                'database': 'postgres',
#                'username': 'postgres',
#                'password': keys["postgres"],
#                'host': config["database"]["host"],
#                'port': 5432}
# print(URL(**postgres_db))
# postgres = URL(**postgres_db)

In [4]:
# Eagerly load the whole `fulltext` table into a pandas DataFrame.
df = pd.read_sql_table(
    "fulltext",
    con=f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres',
)

: 

: 

In [18]:
# Read the `fulltext` table into a Dask DataFrame, using the `id` column as the index.
# NOTE(review): the output recorded for this cell is
# "AttributeError: 'OptionEngine' object has no attribute 'execute'". That message looks like
# the known incompatibility between pandas < 2.0 and SQLAlchemy 2.x — confirm the installed
# versions before changing any code here.
ddf = dd.read_sql_table('fulltext', 
                        con = f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres',
                        index_col = 'id',
                        head_rows = 10,
                        npartitions = 100)

# Disabled clean-up steps carried over from the abstracts pipeline:
# Remove unnecessary columns
# ddf = ddf.drop(columns = ['index'])

# Remove empty abstract rows
# ddf = ddf.dropna(how = 'all', subset='abstract').reset_index(drop = True)

AttributeError: 'OptionEngine' object has no attribute 'execute'

In [None]:

# Wrap each file read in a lazy task. `delayed` is the name this notebook imports
# (`import dask` is commented out, so `dask.delayed` is not available), and the list is
# rebuilt on every run so re-executing the cell cannot append duplicate tasks.
# NOTE(review): `xl_files` and `try_to_read` are not defined in the visible cells.
output = [delayed(try_to_read)(x) for x in xl_files]
# Optional extra step that was left disabled: delayed(get_techniques)(parts)

# convert to a single dataframe
df_total = dd.from_delayed(output)

# df_total.visualize()

# Execute the task graph with a progress bar.
# NOTE(review): the computed result is stored under the name `ddf`, which other cells use
# for a lazy Dask frame — confirm which object downstream cells expect.
with ProgressBar():
    ddf = df_total.compute()

# Upload to Postgresql DB

In [None]:
# sql = text(''' 
#     SELECT EXISTS (
#         SELECT FROM information_schema.tables 
#         WHERE    table_name   = 'abstracts'
#     );
# ''')

# with engine.connect() as conn: 
#     conn.execute(sql)

Create array columns to store encoding and mask

In [None]:
# Name of the PostgreSQL table that the following cells drop, create and upload into.
table_name = 'abstracts_encodings'

In [None]:
# `text` is imported here because the notebook otherwise only imports it in a later cell.
from sqlalchemy import text

# Safety switch: set to True to drop the table named by `table_name`.
delete_table = False

if delete_table:
    # The stray closing brace that made this f-string a SyntaxError has been removed.
    # `table_name` is a notebook constant; identifiers cannot be sent as bound parameters.
    sql = text(f'''
        DROP TABLE IF EXISTS {table_name};
    ''')

    # engine.begin() commits when the block succeeds and rolls back on error, so the DROP
    # is persisted without relying on driver-level autocommit.
    with engine.begin() as conn:
        query = conn.execute(sql)

In [None]:
# Create the table in the database before uploading.
from sqlalchemy import MetaData, Table, Column, Integer, String, ARRAY

metadata_obj = MetaData()

# Schema for the tokenized abstracts. The token-id column is named `input_ids` so that it
# matches the column produced by the tokenization step (it was misspelled `input_ds`).
# NOTE(review): this rebinds `abstracts`, which an earlier cell uses for a DataFrame.
abstracts = Table(
    table_name,
    metadata_obj,
    Column("paperId", String, nullable=True),
    Column("corpusId", String, nullable=True),
    Column("abstract", String, nullable=True),
    Column("input_ids", ARRAY(Integer), nullable=True),
    Column("attention_mask", ARRAY(Integer), nullable=True),
)

# Issue CREATE TABLE for every table registered on `metadata_obj`.
metadata_obj.create_all(engine)

In [None]:
# Upload the Dask DataFrame to PostgreSQL.
# NOTE(review): `url_object` is not defined in the visible cells — confirm where it is built.
# str() on a SQLAlchemy URL masks the password as "***", which produces an unusable
# connection string, so render the URL with its password whenever the object supports it.
if hasattr(url_object, "render_as_string"):
    db_uri = url_object.render_as_string(hide_password=False)
else:
    db_uri = str(url_object)

# The call's return value is deliberately not assigned back to `ddf`; doing so replaced the
# dataframe with the return value of `to_sql` and made the data unavailable to later cells.
# NOTE(review): if_exists='replace' replaces any table already created under this name —
# confirm that is intended.
ddf.to_sql(name=table_name, uri=db_uri, if_exists='replace', index=False, chunksize=10000, method='multi')

In [4]:
# Check if it worked: preview the first five rows of the encodings table.
import pandas as pd
from sqlalchemy import text

table_name = 'abstracts_encodings'

sql = text(f''' 
    SELECT * FROM {table_name} LIMIT 5;
''')

# Fetch the rows while the connection is still open; reading a result after its connection
# has been closed is not guaranteed to work.
with engine.connect() as conn:
    query = conn.execute(sql)
    rows = query.fetchall()

test = pd.DataFrame(rows)
test.head()

Unnamed: 0,paperId,corpusId,abstract,input_ids,attention_mask
0,f674f1fa1bcccc7c4072aed1eca9cc3c6f773539,9445537,Introduction Meta-epidemiological studies exam...,"[18921, 14204, 18, 15, 102, 23, 1778, 23, 4478...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,c0bbe6967f8c4c9223112bb9fe384f1af2d28339,216030749,Purpose: To assess whether treatment with the ...,"[7333, 2748, 15, 10, 304, 6570, 823, 1058, 28,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,4e112781c353ff25ea93a796f51e6a71ab7a52af,43026158,The aim of the study is to examine whether bas...,"[37, 2674, 13, 8, 810, 19, 12, 5443, 823, 2072...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,d79b0987af87e240fcfd1d86a7ecf0b16638139f,231849048,The scale of the SARS-CoV-2 pandemic has thrus...,"[37, 2643, 13, 8, 180, 25210, 18, 3881, 553, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,6644258fd62e695b15ef6825099a8dd4e0710334,3701623,Group 2 innate lymphoid cells (ILC2s) are impo...,"[1531, 204, 3, 28538, 25049, 32, 23, 26, 2640,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [17]:
# Check if it worked: preview the first five rows of the fulltext table.
import pandas as pd
from sqlalchemy import text

table_name = 'fulltext'

sql = text(f''' 
    SELECT * FROM {table_name} LIMIT 5;
''')

# Fetch the rows while the connection is still open; reading a result after its connection
# has been closed is not guaranteed to work.
with engine.connect() as conn:
    query = conn.execute(sql)
    rows = query.fetchall()

test = pd.DataFrame(rows)
test.head()

Unnamed: 0.1,empty,Unnamed: 0,corpusid,text,source.pdfurls,source.pdfsha,source.oainfo,annotations.abstract,annotations.author,annotations.authoraffiliation,...,annotations.publisher,annotations.sectionheader,annotations.table,annotations.tableref,annotations.title,annotations.venue,source.oainfo.license,source.oainfo.openaccessurl,source.oainfo.status,id
0,1410,1410,250929149,\nOPEN ACCESS EDITED BY\n\n\nAlexander Nikolae...,,57f2173016853ed6f6406a962adb11da43917165,,,"[{""end"":55,""start"":25},{""end"":69,""start"":56},{...","[{""end"":455,""start"":306},{""end"":1102,""start"":4...",...,,"[{""end"":3001,""start"":2989},{""end"":6895,""start""...","[{""end"":35560,""start"":34552},{""end"":38099,""sta...","[{""end"":18195,""start"":18188},{""end"":18790,""sta...","[{""end"":22,""start"":1},{""end"":1721,""start"":1700}]",,,,,1
1,1411,1411,215793097,\n\n\n\nCi Song \nState Key Laboratory of Repr...,,be45f9e9d6d1f4f713546b943cb2ff455989adc2,,,"[{""end"":228,""start"":4},{""end"":380,""start"":229}...","[{""end"":104,""start"":13},{""end"":227,""start"":106...",...,,"[{""end"":8781,""start"":8771}]",,"[{""end"":4906,""start"":4898}]",,,,https://academic.oup.com/biolreprod/article-pd...,BRONZE,2
2,1412,1412,246020601,\nCOVID-19 and the Otolaryngology Residency Ma...,,b7decbb79e55bb3c9b5430bef4323775813c39e0,,"[{""end"":2210,""start"":391}]","[{""end"":109,""start"":85},{""end"":127,""start"":110...",,...,,"[{""end"":2224,""start"":2212},{""end"":4135,""start""...","[{""end"":16114,""start"":15573},{""end"":16650,""sta...","[{""attributes"":{""ref_id"":""tab_0""},""end"":7027,""...","[{""end"":82,""start"":1},{""end"":227,""start"":146}]",,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9...,GREEN,3
3,1413,1413,18163935,\nAnalysis of biophysical and functional conse...,,895fdf69460791863817c037332c0403c3e917b1,,"[{""end"":1608,""start"":861}]","[{""end"":217,""start"":201},{""end"":235,""start"":21...","[{""end"":363,""start"":257},{""end"":410,""start"":366}]",...,,"[{""end"":7790,""start"":7769},{""end"":7810,""start""...","[{""end"":32871,""start"":32129}]","[{""attributes"":{""ref_id"":""tab_0""},""end"":12397,...","[{""end"":95,""start"":1},{""end"":506,""start"":412}]",,CCBY,https://doi.org/10.1002/1873-3468.12346,HYBRID,4
4,1414,1414,237935836,\nImpact of Plant-Based Meat Alternatives on t...,,acf534c72039d2b2c10544b12f7cc872c2ba1b2a,,,"[{""end"":289,""start"":123},{""end"":375,""start"":29...","[{""end"":219,""start"":148},{""end"":288,""start"":22...",...,,"[{""attributes"":{""n"":""1.""},""end"":1115,""start"":1...","[{""end"":60480,""start"":59841},{""end"":61332,""sta...","[{""attributes"":{""ref_id"":""tab_1""},""end"":8301,""...","[{""end"":95,""start"":1},{""end"":755,""start"":661}]",,CCBY,https://www.mdpi.com/2304-8158/10/9/2040/pdf,GOLD,5


# Create Vector Store

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# Completion LLM client, constructed with temperature=0.
# NOTE(review): no API key is passed here, unlike the `OpenAIEmbeddings(openai_api_key=...)`
# call earlier in this notebook — presumably it is read from the environment; confirm.
llm = OpenAI(temperature=0)