In [None]:
import sys

# Directory that holds the project packages (`database`, `preprocessing`)
# imported further down in this notebook.
SRC_DIR = '/workspace/src/'

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [None]:
# Imports and environment setup for indexing the text chunks
import os
import pickle

import langchain_core.documents
import nltk
import pandas as pd
from dotenv import dotenv_values, load_dotenv
from langchain_community.retrievers import BM25Retriever
from nltk.tokenize import word_tokenize
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from sqlalchemy.orm import Session
from tqdm import tqdm

from database.chunk_model import Chunk_Base, Chunk
from database.model import Base, Document, Table
from preprocessing.utils import create_vectorstore, load_vectorstore, process_pickle_embeddings_in_chunks

# Read the key/value pairs from the project's .env file into a dict; the
# database connection cell below looks its settings up in `db_vals`.
db_vals = dotenv_values(dotenv_path="/workspace/src/.env")

# Fetch the "punkt_tab" NLTK resource; `word_tokenize` is used later as the
# BM25 preprocessing function.
nltk.download("punkt_tab")


In [None]:
# Assemble the connection URL from its components instead of interpolating
# them into a string: a username or password containing characters such as
# '@', '/', ':' or '%' would otherwise corrupt the URL. URL.create takes the
# raw values and handles any escaping itself.
chunk_db_url = URL.create(
    drivername="postgresql+psycopg2",
    username=db_vals["USER"],
    password=db_vals["PASSWORD"],
    host=db_vals["ADDRESS"],
    port=int(db_vals["PORT"]),
    database="cord19chunks",
)

# Engine and session for the "cord19chunks" database; SQL echo is disabled.
chunk_engine = create_engine(chunk_db_url, echo=False)
chunk_session = Session(chunk_engine)

In [None]:
# Build the vector store. It is created with an empty chunk list here; the
# embeddings are added by the following cell.
# To reuse a previously built store instead, replace the call below with:
#   vectorstore = load_vectorstore("/workspace/src/preprocessing/vectorstores/chromadb_store")
vectorstore = create_vectorstore(
    save_path="vectorstores",
    chunks=[],
)

In [None]:
# Feed the pickled embeddings found under `directory` into the vector store.
# NOTE(review): presumably `chunk_session` is used to look up the chunk rows
# that belong to each embedding — confirm in preprocessing.utils.
process_pickle_embeddings_in_chunks(
    chunk_size=100,  # Adjust based on memory constraints and performance
    chunk_session=chunk_session,
    directory="/workspace/embeddings/embeddings/",
    vectorstore=vectorstore,
)


In [None]:
# Build the documents for the BM25 retriever/store.
#
# `chunks` was never defined anywhere in this notebook, so on a fresh kernel
# this cell failed with a NameError. Load the rows explicitly from the chunk
# database.
# NOTE(review): assumes every row of the Chunk table should be indexed —
# confirm, and add a filter here if only a subset is wanted.
chunks = chunk_session.query(Chunk).all()

# Wrap each chunk in a LangChain Document: the chunk text becomes the page
# content, and the id / DOI / chunk type are carried along as metadata.
lang_docs = [
    langchain_core.documents.Document(
        page_content=chunk.chunk_text,
        metadata={"id": chunk.id, "doi": chunk.doi, "chunk_type": chunk.chunk_type},
    )
    for chunk in chunks
]

In [None]:
# Build the BM25 retriever over the prepared documents, using NLTK's
# `word_tokenize` as the preprocessing function and k=100.
retriever_bm25 = BM25Retriever.from_documents(
    documents=lang_docs,
    k=100,
    preprocess_func=word_tokenize,
)

In [None]:
# Persist the BM25 retriever to disk.
bm25_pickle_path = "/workspace/src/preprocessing/vectorstores/bm25/retriever_bm25.pkl"

# Make sure the target directory exists so the write does not fail on a
# fresh checkout.
os.makedirs(os.path.dirname(bm25_pickle_path), exist_ok=True)

# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the original left the handle returned by open() unclosed.
with open(bm25_pickle_path, "wb") as bm25_file:
    pickle.dump(retriever_bm25, bm25_file)