In [1]:
import sys
# Make the project packages under /workspace/src/ importable. The membership
# check keeps the cell idempotent: re-running it no longer stacks duplicate
# entries onto sys.path.
if '/workspace/src/' not in sys.path:
    sys.path.append('/workspace/src/')

In [2]:
#get text chunks to index
from dotenv import dotenv_values, load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from typing import List

from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field

import pickle
from tqdm import tqdm
import os
import pandas as pd
import json

import langchain_core.documents
from langchain_community.retrievers import BM25Retriever
from database.model import Base, Document, Table
from database.chunk_model import Chunk_Base, Chunk

from langchain_openai import ChatOpenAI

from preprocessing.utils import create_vectorstore, load_vectorstore, remove_enumeration
import nltk
from nltk.tokenize import word_tokenize

from langchain_community.retrievers import (
    ElasticSearchBM25Retriever,
)

# Read the key/value pairs from the project's .env file into db_vals.
db_vals = dotenv_values("/workspace/src/.env")

# Fetch the "punkt_tab" NLTK resource (presumably needed by word_tokenize,
# imported above -- confirm against the NLTK version in use).
nltk.download("punkt_tab")

# Called without a path, so python-dotenv locates the .env file on its own.
# Kept as the last expression so the cell still displays its return value.
load_dotenv()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load the topics mapping from disk. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load trusted files here.
with open("/workspace/src/data/topics.pkl", "rb") as topics_file:
    topics = pickle.load(topics_file)

In [4]:
# Output parser will split the LLM result into a list of queries
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser that turns a multi-line LLM reply into a list of lines.

    Each line is stripped of surrounding whitespace, and lines that are empty
    after stripping are discarded.
    """

    def parse(self, text: str) -> List[str]:
        """Split ``text`` on newlines and return the non-blank, stripped lines.

        Args:
            text: Raw text produced by the model.

        Returns:
            The lines of ``text`` in their original order, without blank or
            whitespace-only entries.
        """
        # Strip every line individually so whitespace-only lines and stray
        # "\r" characters do not end up as bogus queries.
        stripped_lines = (line.strip() for line in text.strip().split("\n"))
        return [line for line in stripped_lines if line]


# Chat model used for query rewriting: temperature 0, pinned model snapshot.
llm = ChatOpenAI(model="gpt-4o-2024-11-20", temperature=0)

# Prompt asking the model for five rephrasings of the question, one per line.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI assistant specialized in retrieving scientific information. 
Your task is to generate five distinct rephrasings of the user question so they can be 
effectively used with both sparse (e.g., BM25) and dense (e.g., cosine similarity) retrieval methods. 
Make sure each rephrasing captures different potential keywords, synonyms, or contexts 
specific to scientific research. Provide the five versions separated by newlines.

Original question: {question}""",
    input_variables=["question"],
)

# Parser that converts the newline-separated reply into a list of strings.
output_parser = LineListOutputParser()

# Pipeline: fill the prompt, call the model, parse the reply into lines.
llm_chain = QUERY_PROMPT | llm | output_parser

In [5]:
#generate query variations
# Map every topic id to the list of LLM-generated rephrasings of its query.
query_variations = {}
for query_id, query_info in tqdm(topics.items()):
    # The text sent to the LLM is the topic title followed by its description.
    query_text = query_info["title"] + " " + query_info["description"]
    rephrasings = llm_chain.invoke(query_text)

    # remove_enumeration comes from preprocessing.utils; presumably it strips
    # leading list numbering from each line -- confirm against that module.
    # Only the rephrasings are stored (the original query text is not part of
    # the list). Assigning directly avoids the KeyError that appending to a
    # not-yet-existing key of the empty dict would raise.
    query_variations[query_id] = remove_enumeration(rephrasings)

# Persist the query variations as JSON for later use.
with open("/workspace/src/data/query_variations.json", "w") as f:
    json.dump(query_variations, f)


100%|██████████| 50/50 [02:14<00:00,  2.68s/it]


In [6]:
# Display the generated variations (topic id -> list of rephrasings) as a quick visual check.
query_variations

{'1': ['What is the source or origin of the coronavirus responsible for COVID-19?',
  'How did the COVID-19 pandemic begin, and what is the origin of the SARS-CoV-2 virus?',
  'Where did the coronavirus that causes COVID-19 originate, and what are its origins?',
  'What is the evolutionary or biological origin of the virus SARS-CoV-2 linked to COVID-19?',
  'Can you explain the origins and initial emergence of the coronavirus behind COVID-19?'],
 '2': ['How does the behavior of the coronavirus vary with different weather conditions?',
  'What is the impact of seasonal weather changes on the transmission and activity of the coronavirus?',
  'In what ways do temperature and humidity influence the spread of the coronavirus?',
  'How are coronavirus infection rates affected by fluctuations in climate and weather patterns?',
  'What role do environmental factors like weather play in the adaptability and spread of the coronavirus?'],
 '3': ['Do individuals infected with SARS-CoV-2 develop im