In [3]:
import sys
sys.path.append('/workspace/src/')

In [4]:
# imports and environment setup
import os
import pickle

import langchain_core.documents
import nltk
from dotenv import dotenv_values, load_dotenv
from langchain_community.retrievers import BM25Retriever
from nltk.tokenize import word_tokenize
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from sqlalchemy.orm import Session
from tqdm import tqdm

from database.model import Base, Document, Table
from database.chunk_model import Chunk_Base, Chunk
from preprocessing.utils import create_vectorstore, load_vectorstore

# Database connection settings are read from the project's .env file into a
# plain dict (nothing is exported into os.environ by dotenv_values).
db_vals = dotenv_values("/workspace/src/.env")

# Fetch the NLTK "punkt_tab" resource; word_tokenize is used further down as
# the BM25 preprocessing function.
nltk.download("punkt_tab")


In [5]:
# Connect to the Postgres database "cord19chunks" that holds the chunks to index.
#
# The URL is assembled with URL.create instead of an f-string so that every
# component is escaped properly: a user name or password containing
# characters such as "@", ":" or "/" would otherwise corrupt the connection
# string.
port_text = db_vals["PORT"]
chunk_db_url = URL.create(
    drivername="postgresql+psycopg2",
    username=db_vals["USER"],
    password=db_vals["PASSWORD"],
    host=db_vals["ADDRESS"],
    # An empty PORT entry falls back to the driver's default port.
    port=int(port_text) if port_text else None,
    database="cord19chunks",
)
chunk_engine = create_engine(chunk_db_url, echo=False)

# One long-lived session shared by all following cells.
chunk_session = Session(chunk_engine)

In [7]:
# Collect the distinct chunk ids once; later cells slice this list to pull the
# chunks from the database batch by batch.
#
# For a quick experiment on a subset, query the chunks directly instead, e.g.
#   chunk_session.query(Chunk).filter(Chunk.modality_type == "text").limit(1000).all()
# (modality_type values used previously: "text", "table").
chunk_ids = [
    chunk_id
    for (chunk_id,) in chunk_session.query(Chunk.id).distinct().all()
]

In [None]:
# Number of chunk ids fetched from the database (and added to the store) per step.
BATCH_SIZE = 10000
# Location of the persisted vector store; checked first so an existing store is reused.
VECTORSTORE_PATH = "/workspace/src/preprocessing/vectorstores/chromadb_store"
# Not referenced by any cell visible in this notebook; kept in case other code reads it.
doi_full_text = {}

if os.path.exists(VECTORSTORE_PATH):
    # Skip indexing when a store is already on disk.
    # NOTE(review): a store left behind by an interrupted build would also be
    # picked up here -- confirm it is complete before relying on it.
    vectorstore = load_vectorstore(VECTORSTORE_PATH)
else:
    # Seed the store with the first batch; create_vectorstore receives the
    # Chunk rows directly.
    first_chunks = chunk_session.query(Chunk).filter(Chunk.id.in_(chunk_ids[:BATCH_SIZE])).all()
    vectorstore = create_vectorstore(first_chunks)

    # Add the remaining batches. Progress is reported by tqdm alone: the former
    # manual print with end="\r" wrote over the progress bar on every step.
    for start in tqdm(range(BATCH_SIZE, len(chunk_ids), BATCH_SIZE), desc="Indexing chunk batches"):
        batch_ids = chunk_ids[start:start + BATCH_SIZE]
        batch_chunks = chunk_session.query(Chunk).filter(Chunk.id.in_(batch_ids)).all()

        # Wrap each chunk in a LangChain Document carrying its id, DOI and chunk type.
        lang_docs = [
            langchain_core.documents.Document(
                page_content=chunk.chunk_text,
                metadata={"id": chunk.id, "doi": chunk.doi, "chunk_type": chunk.chunk_type},
            )
            for chunk in batch_chunks
        ]
        vectorstore.add_documents(lang_docs)

In [None]:
#table_chunks = chunk_session.query(Chunk).filter(Chunk.modality_type == "table").all()

In [20]:
# Load the retrieval topics. Later cells iterate this object with .items() and
# read the "title" and "description" entries of every value.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("/workspace/src/data/topics.pkl", "rb") as topics_file:
    topics = pickle.load(topics_file)

In [None]:
# Generate the dense-retrieval pool: for every topic, query the vector store
# with "<title> <description>" and keep the 100 most similar documents.
pooling_results = {}
for topic_id, topic in tqdm(topics.items()):
    dense_query = " ".join((topic["title"], topic["description"]))
    pooling_results[topic_id] = vectorstore.similarity_search(dense_query, k=100)

In [9]:
# Persist the vector-store pool. The context manager guarantees the file is
# flushed and closed, which the bare open() call did not.
with open("/workspace/src/data/pooling_results_cosine.pkl", "wb") as pool_file:
    pickle.dump(pooling_results, pool_file)

In [1]:
#generate bm25 pool

In [9]:
# Build the BM25 corpus from the same chunk ids that feed the vector store.
# The previous version read a `chunks` variable that no live cell defines, so
# this cell raised NameError after a kernel restart.
# NOTE(review): confirm BM25 is meant to cover every chunk rather than a sample.
lang_docs = []
for start in tqdm(range(0, len(chunk_ids), BATCH_SIZE), desc="Loading chunks for BM25"):
    batch_ids = chunk_ids[start:start + BATCH_SIZE]
    batch_chunks = chunk_session.query(Chunk).filter(Chunk.id.in_(batch_ids)).all()
    # Only the LangChain Documents are retained; the ORM rows of each batch
    # can be released once converted.
    lang_docs.extend(
        langchain_core.documents.Document(
            page_content=chunk.chunk_text,
            metadata={"id": chunk.id, "doi": chunk.doi, "chunk_type": chunk.chunk_type},
        )
        for chunk in batch_chunks
    )

In [18]:
# BM25 retriever over the in-memory documents. NLTK's word_tokenize is passed
# as the preprocessing function, and k=100 sets the retriever's result count.
retriever_bm25 = BM25Retriever.from_documents(
    lang_docs,
    preprocess_func=word_tokenize,
    k=100,
)

In [None]:
# Generate the BM25 pool: one query per topic, built as "<title> <description>".
# The result count (k=100) is fixed when retriever_bm25 is constructed; the
# extra k=100 formerly passed to invoke() did not change it and has been
# dropped to avoid suggesting a per-call override.
pooling_results_bm25 = {}
for query_id, query_info in tqdm(topics.items(), desc="BM25 pooling"):
    query_text = query_info["title"] + " " + query_info["description"]
    pooling_results_bm25[query_id] = retriever_bm25.invoke(query_text)

In [23]:
# Persist the BM25 pool. The context manager guarantees the file is flushed
# and closed, which the bare open() call did not.
with open("/workspace/src/data/pooling_results_bm25.pkl", "wb") as pool_file:
    pickle.dump(pooling_results_bm25, pool_file)