In [1]:
import sys

# Make the project source directory importable for the project-local imports below.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
SRC_DIR = '/workspace/src/'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
#get text chunks to index
from dotenv import dotenv_values, load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

import pickle
from tqdm.auto import tqdm
import os
import json
import pandas as pd

import langchain_core.documents
from langchain_community.retrievers import BM25Retriever
from database.model import Base, Document, Table
from database.chunk_model import Chunk_Base, Chunk

from preprocessing.utils import create_vectorstore, load_vectorstore
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK "punkt_tab" resource.
nltk.download("punkt_tab")

# Read the key/value pairs stored in the project's .env file.
ENV_FILE = "/workspace/src/.env"
db_vals = dotenv_values(ENV_FILE)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Load the evaluation inputs. Context managers close the file handles deterministically
# instead of leaving them to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open("/workspace/src/data/topics.pkl", "rb") as topics_file:
    topics = pickle.load(topics_file)

# The pooling loop indexes this as query_variations[query_id][variation_key] -> query text.
with open("/workspace/src/data/query_variations.json", "r") as variations_file:
    query_variations = json.load(variations_file)

In [4]:
# Retrieval configurations that are pooled together: every combination of
# retrieval model, query variation and modality gets its own (initially empty) pool.
retrieval_models = ["cosine", "bm25"]
query_variation_keys = ["original", "variant1", "variant2", "variant3", "variant4", "variant5"]
modalities = ["table", "passage"]

# Pool name format: "<model>_<variation>_<modality>"; each pool is its own fresh dict.
rankings = {
    f"{model}_{variation}_{modality}": {}
    for model in retrieval_models
    for variation in query_variation_keys
    for modality in modalities
}

In [5]:
# Sanity check: 2 retrieval models x 6 query variations x 2 modalities = 24 pools.
len(rankings)

24

In [6]:
# Load the persisted retrieval back-ends: the vector store used for cosine search
# and the two pickled BM25 retrievers (one for passages, one for tables).
vectorstore = load_vectorstore("/workspace/src/preprocessing/vectorstores/chromadb_store")

# Context managers close the pickle files deterministically.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open("/workspace/src/preprocessing/vectorstores/bm25/retriever_bm25_passage.pkl", "rb") as passage_file:
    retriever_bm25_passage = pickle.load(passage_file)

with open("/workspace/src/preprocessing/vectorstores/bm25/retriever_bm25_table.pkl", "rb") as table_file:
    retriever_bm25_table = pickle.load(table_file)

  embedding = OpenAIEmbeddings(model=embedding_model)
  vectorstore = Chroma(


In [7]:
# Resume from previously pooled results; this replaces the empty `rankings` skeleton built earlier.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open("/workspace/src/data/rankings_new.pkl", "rb") as rankings_file:
    rankings = pickle.load(rankings_file)

In [20]:
# Pool the top-k hits of every retrieval model for every query variation.
# Results land in rankings[f"{model}_{variation}_{modality}"][query_id].
TOP_K = 100
TABLE_CHUNK_TYPE = "RCTS_8192_1000"
PASSAGE_CHUNK_TYPE = "RCTS_512_100"

# BM25Retriever takes its result depth from its `k` attribute; the `k=` keyword given to
# invoke() below is presumably not forwarded to the search (verify against the installed
# LangChain version), so pin the depth on the retrievers themselves.
# NOTE(review): entries already present in `rankings` are skipped below and keep the
# depth they were originally retrieved with.
retriever_bm25_table.k = TOP_K
retriever_bm25_passage.k = TOP_K

# (query_id, variation key) of every table search that raised and was stored as [].
# Such entries count as "done" on a later resume, so this list is the place to look
# when deciding what has to be retried.
failed_table_searches = []

outer_progress = tqdm(query_variations.keys(), desc="Processing Queries", position=0, leave=True)

for query_id in outer_progress:
    # Kept in a separate name so the global `query_variation_keys` list is not rebound.
    variations = query_variations.get(query_id, {})

    # Inner progress bar (for variations of the current query)
    inner_progress = tqdm(variations.keys(), desc=f"Query {query_id}", position=1, leave=False)

    for query_variation_key in inner_progress:
        query_text = variations[query_variation_key]

        # Ensure rankings structure exists, so a variation key that is missing from
        # the loaded rankings cannot raise KeyError.
        pool_keys = [
            f"{model}_{query_variation_key}_{modality}"
            for model in ("cosine", "bm25")
            for modality in ("table", "passage")
        ]
        for pool_key in pool_keys:
            rankings.setdefault(pool_key, {})

        # Resume support: skip a variation only when all four pools already hold this query.
        if all(query_id in rankings[key] for key in pool_keys):
            continue

        # Cosine search
        try:
            results_table_cosine = vectorstore.similarity_search(query_text, k=TOP_K, filter={"chunk_type": TABLE_CHUNK_TYPE})
        except Exception as e:
            # Best effort: keep the run going, store an empty result and remember the failure.
            print(e, f"Error for query: {query_text}")
            failed_table_searches.append((query_id, query_variation_key))
            results_table_cosine = []

        # NOTE(review): unlike the table search, a failure here aborts the loop - confirm this is intended.
        results_passage_cosine = vectorstore.similarity_search(query_text, k=TOP_K, filter={"chunk_type": PASSAGE_CHUNK_TYPE})

        # Bm25 search
        results_table_bm25 = retriever_bm25_table.invoke(query_text, k=TOP_K)
        results_passage_bm25 = retriever_bm25_passage.invoke(query_text, k=TOP_K)

        rankings[f"cosine_{query_variation_key}_table"][query_id] = results_table_cosine
        rankings[f"cosine_{query_variation_key}_passage"][query_id] = results_passage_cosine

        rankings[f"bm25_{query_variation_key}_table"][query_id] = results_table_bm25
        rankings[f"bm25_{query_variation_key}_passage"][query_id] = results_passage_bm25

    inner_progress.close()  # Explicitly close the inner progress bar to force updates

if failed_table_searches:
    print(f"{len(failed_table_searches)} table search(es) failed and were stored as empty lists: {failed_table_searches}")


Processing Queries:   0%|          | 0/50 [00:00<?, ?it/s]

Query 1:   0%|          | 0/6 [00:00<?, ?it/s]

Query 2:   0%|          | 0/6 [00:00<?, ?it/s]

Query 3:   0%|          | 0/6 [00:00<?, ?it/s]

Query 4:   0%|          | 0/6 [00:00<?, ?it/s]

Query 5:   0%|          | 0/6 [00:00<?, ?it/s]

Query 6:   0%|          | 0/6 [00:00<?, ?it/s]

Query 7:   0%|          | 0/6 [00:00<?, ?it/s]

Query 8:   0%|          | 0/6 [00:00<?, ?it/s]

Query 9:   0%|          | 0/6 [00:00<?, ?it/s]

Query 10:   0%|          | 0/6 [00:00<?, ?it/s]

Query 11:   0%|          | 0/6 [00:00<?, ?it/s]

Query 12:   0%|          | 0/6 [00:00<?, ?it/s]

Query 13:   0%|          | 0/6 [00:00<?, ?it/s]

Query 14:   0%|          | 0/6 [00:00<?, ?it/s]

Query 15:   0%|          | 0/6 [00:00<?, ?it/s]

Query 16:   0%|          | 0/6 [00:00<?, ?it/s]

Query 17:   0%|          | 0/6 [00:00<?, ?it/s]

Query 18:   0%|          | 0/6 [00:00<?, ?it/s]

Query 19:   0%|          | 0/6 [00:00<?, ?it/s]

Query 20:   0%|          | 0/6 [00:00<?, ?it/s]

Query 21:   0%|          | 0/6 [00:00<?, ?it/s]

Query 22:   0%|          | 0/6 [00:00<?, ?it/s]

Query 23:   0%|          | 0/6 [00:00<?, ?it/s]

Query 24:   0%|          | 0/6 [00:00<?, ?it/s]

Query 25:   0%|          | 0/6 [00:00<?, ?it/s]

Query 26:   0%|          | 0/6 [00:00<?, ?it/s]

Query 27:   0%|          | 0/6 [00:00<?, ?it/s]

Query 28:   0%|          | 0/6 [00:00<?, ?it/s]

Query 29:   0%|          | 0/6 [00:00<?, ?it/s]

Query 30:   0%|          | 0/6 [00:00<?, ?it/s]

Query 31:   0%|          | 0/6 [00:00<?, ?it/s]

Query 32:   0%|          | 0/6 [00:00<?, ?it/s]

Query 33:   0%|          | 0/6 [00:00<?, ?it/s]

Query 34:   0%|          | 0/6 [00:00<?, ?it/s]

Query 35:   0%|          | 0/6 [00:00<?, ?it/s]

Cannot return the results in a contigious 2D array. Probably ef or M is too small Error for query: Are there any new open-access datasets focused on COVID-19 research?


Query 36:   0%|          | 0/6 [00:00<?, ?it/s]

Query 37:   0%|          | 0/6 [00:00<?, ?it/s]

Query 38:   0%|          | 0/6 [00:00<?, ?it/s]

Query 39:   0%|          | 0/6 [00:00<?, ?it/s]

Query 40:   0%|          | 0/6 [00:00<?, ?it/s]

Query 41:   0%|          | 0/6 [00:00<?, ?it/s]

Query 42:   0%|          | 0/6 [00:00<?, ?it/s]

Query 43:   0%|          | 0/6 [00:00<?, ?it/s]

Query 44:   0%|          | 0/6 [00:00<?, ?it/s]

Query 45:   0%|          | 0/6 [00:00<?, ?it/s]

Query 46:   0%|          | 0/6 [00:00<?, ?it/s]

Query 47:   0%|          | 0/6 [00:00<?, ?it/s]

Query 48:   0%|          | 0/6 [00:00<?, ?it/s]

Query 49:   0%|          | 0/6 [00:00<?, ?it/s]

Query 50:   0%|          | 0/6 [00:00<?, ?it/s]

In [21]:
# Persist the pooled rankings; the context manager makes sure the file is flushed and closed.
with open("/workspace/src/data/rankings_new2.pkl", "wb") as rankings_file:
    pickle.dump(rankings, rankings_file)

In [None]:
# Debug leftover: displays the variation key last assigned by the pooling loop.
# This cell has no execution count, i.e. it was not run in the saved notebook.
query_variation_key

In [9]:
# Pick query 35 / variant2, whose text matches the error printed by the pooling loop,
# to reproduce that failure in isolation.
variations_of_query_35 = query_variations['35']
search_query = variations_of_query_35['variant2']

In [10]:
# Display the query text that is about to be re-run.
search_query

'Are there any new open-access datasets focused on COVID-19 research?'

In [19]:
# Re-run only the table-chunk cosine search for the selected query; on any error,
# print it and fall back to an empty result list.
table_filter = {"chunk_type": "RCTS_8192_1000"}
try:
    results_table_cosine = vectorstore.similarity_search(search_query, k=100, filter=table_filter)
except Exception as err:
    print(err, f"Error for query: {search_query}")
    results_table_cosine = []

Cannot return the results in a contigious 2D array. Probably ef or M is too small Error for query: Are there any new open-access datasets focused on COVID-19 research?


In [17]:
# Inspect the cosine table hits for the selected query.
# NOTE(review): this cell carries execution count 17 while the search cell above carries 19,
# so the output shown here predates that rerun - re-execute in order before relying on it.
results_table_cosine

[Document(metadata={'chunk_type': 'RCTS_8192_1000', 'doi': '10.1111/eci.13323#2', 'id': 8411576}, page_content="Table Name: TABLE 2 \nHeader: ['0', '1'] \nContent: [['Open data (shared datasets)', 'Fulfilled by 0.7% of the research items  (n = 1 out of 140)$^{a}$'], ['Patient data', 'Included in 22.9% of the research items  (n = 32 out of 140), with the remaining  being mainly opinion papers$^{c}$'], ['Expedited peer review policy', 'Fulfilled by all five b journals'], ['Elimination of embargo policy (open access  for COVID-19-related manuscripts)', 'Fulfilled by all five b journals (open access  to all items)'], ['Fit-for-purpose platforms to present COVID- 19 research', 'Fulfilled by all five b journals']] \nCaption: TABLE 2 Degree of adherence to the WHO call for open data in PHEIC in five milestone Medical journals b \nReferences: []"),
 Document(metadata={'chunk_type': 'RCTS_8192_1000', 'doi': '10.7759/cureus.7422#0', 'id': 8342006}, page_content="Table Name: TABLE 1 \nHeader: ['I