In [1]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
# Relative path, so it resolves against the notebook's working directory.
# NOTE(review): presumably consumed by the Gemini embedding backend — confirm.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gemini.json"

# Fail fast, before any RAG system is built, when the OpenAI key is missing.
# A blank value (e.g. `OPENAI_API_KEY=` in .env) is treated as missing too.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Please set the OPENAI_API_KEY environment variable (e.g. in a .env file)."
    )

In [2]:
from scirag import SciRagHybrid, SciRagDataSet

In [3]:
import pandas as pd
import time
import datetime

In [4]:
def parse_response(response):
    """Split a formatted RAG response into its answer and sources parts.

    The recognised layout is ``**Answer**: <text> **Sources**: <text>``.

    Args:
        response: Response payload returned by a RAG backend, normally a string.

    Returns:
        tuple: ``(answer, sources)``
            * ``(None, None)`` when ``response`` is falsy (``None``, ``""``).
            * ``(answer, sources)`` with the markers removed and surrounding
              whitespace stripped when both markers are present.
            * ``(response, "")`` otherwise; the payload is returned unchanged,
              which also covers non-string values.
    """
    if not response:
        return None, None

    # Non-string payloads cannot be searched for the markers; hand them back
    # untouched instead of relying on a blanket ``except`` to absorb the error.
    if not isinstance(response, str):
        return response, ""

    answer_marker = "**Answer**:"
    sources_marker = "**Sources**:"
    if answer_marker not in response or sources_marker not in response:
        return response, ""

    # Cut at the FIRST sources marker only, so that text following a repeated
    # marker stays in the sources part instead of being silently dropped.
    answer_part, _, sources_part = response.partition(sources_marker)
    return answer_part.replace(answer_marker, "").strip(), sources_part.strip()

## Two embedding types: OpenAI vs. Gemini (SciRagHybrid)

In [6]:
def get_openAI_embedding_response(num_questions=5):
    """Run the hybrid RAG with OpenAI embeddings over the first dataset questions.

    Questions are loaded through ``SciRagDataSet`` and answered by a
    ``SciRagHybrid`` instance configured with the OpenAI
    ``text-embedding-3-large`` model and the ChromaDB backend. A failing
    question does not abort the run; the failure is recorded in its row.

    Args:
        num_questions: Number of leading dataset rows to use as questions.

    Returns:
        pandas.DataFrame: One row per processed question with the columns
        ``question_id``, ``question``, ``response``, ``answer``, ``sources``,
        ``processing_time`` (seconds), ``success``, ``error`` and
        ``embedding_system`` (always ``'OpenAI'``).
    """
    # Load dataset
    dataset = SciRagDataSet()
    df = dataset.load_dataset()

    # Prefer a conventionally named question column; otherwise fall back to
    # the first object-dtype column, and finally to the first column.
    question_columns = ['question', 'query', 'text', 'prompt']
    question_col = next((col for col in question_columns if col in df.columns), None)
    if question_col is None:
        text_cols = df.select_dtypes(include=['object']).columns
        question_col = text_cols[0] if len(text_cols) > 0 else df.columns[0]

    questions = df[question_col].head(num_questions).tolist()
    # The dataset may hold fewer rows than requested; report the real total.
    total_questions = len(questions)

    # Initialize RAG systems
    print("Initializing RAG systems...")
    openai_rag = SciRagHybrid(
        embedding_provider="openai",
        openai_embedding_model="text-embedding-3-large",
        vector_db_backend="chromadb",
    )

    # Process questions
    openai_results = []

    for i, question in enumerate(questions, 1):
        print(f"Processing question {i}/{total_questions}")

        # OpenAI processing; the elapsed time is recorded for failures as well.
        start_time = time.time()
        try:
            openai_response = openai_rag.get_response(question)
            openai_time = time.time() - start_time
            openai_answer, openai_sources = parse_response(openai_response)
            openai_success = True
            openai_error = None
        except Exception as e:
            openai_response = None
            openai_time = time.time() - start_time
            openai_answer = None
            openai_sources = None
            openai_success = False
            openai_error = str(e)

        openai_results.append({
            'question_id': i,
            'question': question,
            'response': openai_response,
            'answer': openai_answer,
            'sources': openai_sources,
            'processing_time': openai_time,
            'success': openai_success,
            'error': openai_error,
            'embedding_system': 'OpenAI'
        })

    # Build the frame once from the collected records.
    openai_df = pd.DataFrame(openai_results)
    return openai_df

In [None]:
# Run the hybrid RAG with OpenAI embeddings on the first 5 questions and
# persist the per-question results (without the index) to a CSV file.
openai_df = get_openAI_embedding_response(num_questions=5)
openai_df.to_csv("openai_embedding_results.csv", index=False)

Initializing RAG systems...
  Loaded 2503.14454v1.md
  Loaded 2010.00619v2.md
  Loaded 2201.02202v1.md
  Loaded 1807.06209v4.md
  Loaded 1604.01424v3.md
Total markdown documents loaded: 5
Splitting documents into chunks...
Created 305 chunks from 5 documents
Building contextual retrieval index...
Processed 305 chunks
Initializing embeddings and vector database...
Embeddings not found. Generating embeddings...
Processed batch 1/4
Processed batch 2/4
Processed batch 3/4
Processed batch 4/4
Embeddings saved to: /Users/apple/Documents/GitHub/scirag_paperqa/scirag/embeddings/openai_text-embedding-3-large_embeddings.npy
Creating vector database...
Loaded existing ChromaDB collection: sci_rag_chunks_openai_text_embedding_3_large
Loaded existing ChromaDB collection
Processing question 1/5
Processing question 2/5
Processing question 3/5
Processing question 4/5
Processing question 5/5


In [None]:
def get_Gemini_embedding_response(num_questions=5):
    """Run the hybrid RAG with Gemini embeddings over the first dataset questions.

    Questions are loaded through ``SciRagDataSet`` and answered by a
    ``SciRagHybrid`` instance configured with the Gemini embedding provider
    and the ChromaDB backend. A failing question does not abort the run; the
    failure is recorded in its row.

    Args:
        num_questions: Number of leading dataset rows to use as questions.

    Returns:
        pandas.DataFrame: One row per processed question with the columns
        ``question_id``, ``question``, ``response``, ``answer``, ``sources``,
        ``processing_time`` (seconds), ``success``, ``error`` and
        ``embedding_system`` (always ``'Gemini'``).
    """
    # Load dataset
    dataset = SciRagDataSet()
    df = dataset.load_dataset()

    # Prefer a conventionally named question column; otherwise fall back to
    # the first object-dtype column, and finally to the first column.
    question_columns = ['question', 'query', 'text', 'prompt']
    question_col = next((col for col in question_columns if col in df.columns), None)
    if question_col is None:
        text_cols = df.select_dtypes(include=['object']).columns
        question_col = text_cols[0] if len(text_cols) > 0 else df.columns[0]

    questions = df[question_col].head(num_questions).tolist()
    # The dataset may hold fewer rows than requested; report the real total.
    total_questions = len(questions)

    # Initialize RAG systems
    print("Initializing RAG systems...")
    # No embedding model is passed explicitly, so the default from the scirag
    # config file is used; check that file for the model actually applied.
    gemini_rag = SciRagHybrid(embedding_provider="gemini", vector_db_backend="chromadb")

    # Process questions
    gemini_results = []

    for i, question in enumerate(questions, 1):
        print(f"Processing question {i}/{total_questions}")

        # Gemini processing; the elapsed time is recorded for failures as well.
        start_time = time.time()
        try:
            gemini_response = gemini_rag.get_response(question)
            gemini_time = time.time() - start_time
            gemini_answer, gemini_sources = parse_response(gemini_response)
            gemini_success = True
            gemini_error = None
        except Exception as e:
            gemini_response = None
            gemini_time = time.time() - start_time
            gemini_answer = None
            gemini_sources = None
            gemini_success = False
            gemini_error = str(e)

        gemini_results.append({
            'question_id': i,
            'question': question,
            'response': gemini_response,
            'answer': gemini_answer,
            'sources': gemini_sources,
            'processing_time': gemini_time,
            'success': gemini_success,
            'error': gemini_error,
            'embedding_system': 'Gemini'
        })

    # Build the frame once from the collected records.
    gemini_df = pd.DataFrame(gemini_results)
    return gemini_df

In [11]:
# Run the hybrid RAG with Gemini embeddings on the first 5 questions.
gemini_df= get_Gemini_embedding_response(num_questions=5)

Initializing RAG systems...
  Loaded 2503.14454v1.md
  Loaded 2010.00619v2.md
  Loaded 2201.02202v1.md
  Loaded 1807.06209v4.md
  Loaded 1604.01424v3.md
Total markdown documents loaded: 5
Splitting documents into chunks...
Created 305 chunks from 5 documents
Building contextual retrieval index...
Processed 305 chunks
Initializing embeddings and vector database...
Embeddings not found. Generating embeddings...


Embedding texts (Gemini):   0%|          | 0/305 [00:00<?, ?doc/s]

Embeddings saved to: /Users/apple/Documents/GitHub/scirag_paperqa/scirag/embeddings/gemini_gemini-embedding-001_embeddings.npy
Creating vector database...
Collection sci_rag_chunks_gemini_gemini_embedding_001 not found: Collection [sci_rag_chunks_gemini_gemini_embedding_001] does not exists
No existing collection found: Collection [sci_rag_chunks] does not exists
Creating new ChromaDB collection...
Creating new collection: sci_rag_chunks_gemini_gemini_embedding_001
Stored 305 chunks in ChromaDB collection 'sci_rag_chunks_gemini_gemini_embedding_001' at /Users/apple/Documents/GitHub/scirag_paperqa/scirag/embeddings/chromadb
Collection sci_rag_chunks_gemini_gemini_embedding_001_gemini_gemini_embedding_001 not found: Collection [sci_rag_chunks_gemini_gemini_embedding_001_gemini_gemini_embedding_001] does not exists
Loaded fallback ChromaDB collection: sci_rag_chunks_gemini_gemini_embedding_001
ChromaDB vector DB built successfully
Processing question 1/5
Processing question 2/5
Processing

In [12]:
# Inspect the OpenAI-embedding results (one row per processed question).
openai_df

Unnamed: 0,question_id,question,response,answer,sources,processing_time,success,error,embedding_system
0,1,How is the standard recombination history test...,**Answer**:\n\nThe Planck 2018 analysis tests ...,The Planck 2018 analysis tests the standard re...,"1807.06209v4.md, Section 7.7, 1807.06209v4.md,...",9.771257,True,,OpenAI
1,2,Which corrections in polarization spectra were...,"**Answer**:\n\nIn the 2018 Planck analysis, co...","In the 2018 Planck analysis, corrections were ...",1807.06209v4.md,7.833613,True,,OpenAI
2,3,What multipole cuts were applied in the Camspe...,"**Answer**:\n\nFor the Planck 2018 analysis, t...","For the Planck 2018 analysis, the Camspec temp...",1807.06209v4.md,3.523303,True,,OpenAI
3,4,What is the effective sky-fraction of the apod...,"**Answer**:\n\nFor the Planck 2018 analysis, t...","For the Planck 2018 analysis, the default CamS...",1807.06209v4.md,3.361629,True,,OpenAI
4,5,How large is the impact of beam window functio...,"**Answer**:\n\nIn the 2018 release, the applic...","In the 2018 release, the application of beam w...",1807.06209v4.md,2.974444,True,,OpenAI


## OpenAI RAG (SciRagOpenAI)

In [16]:
from scirag import SciRagOpenAI

In [17]:
def get_OpenAI_rag_response(num_questions=5):
    """Run the OpenAI RAG system over the first dataset questions.

    Questions are loaded through ``SciRagDataSet`` and answered by a
    ``SciRagOpenAI`` instance created with ``vector_db_backend="openai"``.
    A failing question does not abort the run; the failure is recorded in
    its row.

    Args:
        num_questions: Number of leading dataset rows to use as questions.

    Returns:
        pandas.DataFrame: One row per processed question with the columns
        ``question_id``, ``question``, ``response``, ``answer``, ``sources``,
        ``processing_time`` (seconds), ``success``, ``error`` and
        ``embedding_system`` (always ``'OpenAI'``).
    """
    # Load dataset
    dataset = SciRagDataSet()
    df = dataset.load_dataset()

    # Prefer a conventionally named question column; otherwise fall back to
    # the first object-dtype column, and finally to the first column.
    question_columns = ['question', 'query', 'text', 'prompt']
    question_col = next((col for col in question_columns if col in df.columns), None)
    if question_col is None:
        text_cols = df.select_dtypes(include=['object']).columns
        question_col = text_cols[0] if len(text_cols) > 0 else df.columns[0]

    questions = df[question_col].head(num_questions).tolist()
    # The dataset may hold fewer rows than requested; report the real total.
    total_questions = len(questions)

    # Initialize RAG system
    print("Initializing OpenAI RAG system...")
    scirag = SciRagOpenAI(vector_db_backend="openai")

    openai_results = []

    for i, question in enumerate(questions, 1):
        print(f"Processing question {i}/{total_questions}")

        # OpenAI processing; the elapsed time is recorded for failures as well.
        start_time = time.time()
        try:
            openai_response = scirag.get_response(question)
            openai_time = time.time() - start_time
            openai_answer, openai_sources = parse_response(openai_response)
            openai_success = True
            openai_error = None
        except Exception as e:
            openai_response = None
            openai_time = time.time() - start_time
            openai_answer = None
            openai_sources = None
            openai_success = False
            openai_error = str(e)

        openai_results.append({
            'question_id': i,
            'question': question,
            'response': openai_response,
            'answer': openai_answer,
            'sources': openai_sources,
            'processing_time': openai_time,
            'success': openai_success,
            'error': openai_error,
            # NOTE(review): same label as the hybrid OpenAI-embedding run, so the
            # two result sets cannot be told apart by this column if combined.
            'embedding_system': 'OpenAI'
        })

    # Build the frame once from the collected records.
    openai_df = pd.DataFrame(openai_results)
    return openai_df

In [18]:
# Run the SciRagOpenAI system on the first 5 questions.
# NOTE: this rebinds `openai_df`, replacing the hybrid OpenAI-embedding results
# assigned by the earlier cell (those were written to openai_embedding_results.csv).
openai_df = get_OpenAI_rag_response(num_questions=5)

Initializing OpenAI RAG system...
Listing existing RAG Corpora:
--- Found existing corpus: corpus ---
Creating assistant for existing vector store...
Assistant created for existing corpus with ID: asst_qVUaKGC1GDjm6UX0lPsQUinu
Processing question 1/5
Processing question 2/5
Processing question 3/5
Processing question 4/5
Processing question 5/5


In [19]:
# Inspect the SciRagOpenAI results now bound to `openai_df`.
openai_df

Unnamed: 0,question_id,question,response,answer,sources,processing_time,success,error,embedding_system
0,1,How is the standard recombination history test...,**Answer**:\n\nThe standard recombination hist...,The standard recombination history in the Plan...,1807.06209v4.md,12.331657,True,,OpenAI
1,2,Which corrections in polarization spectra were...,**Answer**:\n\nThe 2018 Planck analysis implem...,The 2018 Planck analysis implemented several k...,1807.06209v4.md section: Correction of systema...,13.407457,True,,OpenAI
2,3,What multipole cuts were applied in the Camspe...,"**Answer**:\n\nFor the Planck 2018 analysis, t...","For the Planck 2018 analysis, the Camspec temp...","1807.06209v4.md, Section discussing multipole ...",9.53364,True,,OpenAI
3,4,What is the effective sky-fraction of the apod...,**Answer**:\n\nThe effective sky-fraction of t...,The effective sky-fraction of the apodized Cam...,"1807.06209v4.md, see: 'The default CamSpec pol...",7.378245,True,,OpenAI
4,5,How large is the impact of beam window functio...,**Answer**:\n\nThe impact of beam window funct...,The impact of beam window functions on the 201...,"1807.06209v4.md, Section: Major changes with r...",13.333454,True,,OpenAI


## PaperQA2 (SciRagPaperQA2)

In [20]:
from scirag import SciRagPaperQA2

In [23]:

def get_PaperQA2_rag_response(num_questions=5):
    """Run the PaperQA2 RAG system over the first dataset questions.

    Questions are loaded through ``SciRagDataSet`` and answered by a
    ``SciRagPaperQA2`` instance. Each successful answer is rendered as
    Markdown and the per-question processing time is printed. A failing
    question does not abort the run; the failure is recorded in its row.

    Args:
        num_questions: Number of leading dataset rows to use as questions.

    Returns:
        pandas.DataFrame: One row per processed question with the columns
        ``question_id``, ``question``, ``response``, ``answer``, ``sources``,
        ``processing_time`` (seconds), ``success``, ``error`` and
        ``embedding_system`` (always ``'PaperQA2'``). ``response`` holds the
        object returned by ``get_response`` (its ``.answer`` attribute carries
        the text), not a plain string.
    """
    # Imported here so the function works on a fresh kernel: no other cell of
    # the notebook brings `Markdown` into scope, and a NameError raised inside
    # the try block below would be recorded as a failed query for every question.
    from IPython.display import Markdown, display

    # Load dataset
    dataset = SciRagDataSet()
    df = dataset.load_dataset()

    # Prefer a conventionally named question column; otherwise fall back to
    # the first object-dtype column, and finally to the first column.
    question_columns = ['question', 'query', 'text', 'prompt']
    question_col = next((col for col in question_columns if col in df.columns), None)
    if question_col is None:
        text_cols = df.select_dtypes(include=['object']).columns
        question_col = text_cols[0] if len(text_cols) > 0 else df.columns[0]

    questions = df[question_col].head(num_questions).tolist()
    # The dataset may hold fewer rows than requested; report the real total.
    total_questions = len(questions)

    # Initialize PaperQA2 RAG system
    print("Initializing PaperQA2 RAG system...")
    scirag = SciRagPaperQA2()

    paperqa2_results = []

    for i, question in enumerate(questions, 1):
        print(f"Processing question {i}/{total_questions}")
        print(f"Question: {question}")

        # PaperQA2 processing with timing; the elapsed time is recorded for
        # failures as well.
        start_time = time.time()
        try:
            paperqa2_response = scirag.get_response(question)
            paperqa2_time = time.time() - start_time
            paperqa2_answer, paperqa2_sources = parse_response(paperqa2_response.answer)
            paperqa2_success = True
            paperqa2_error = None
        except Exception as e:
            paperqa2_response = None
            paperqa2_time = time.time() - start_time
            paperqa2_answer = None
            paperqa2_sources = None
            paperqa2_success = False
            paperqa2_error = str(e)
        else:
            # Rendering is kept out of the try block so that a display problem
            # is never mistaken for a failed query.
            display(Markdown(f"**Response:**\n{paperqa2_response.answer}"))
        print(f"Processing Time: {paperqa2_time:.2f} seconds")

        paperqa2_results.append({
            'question_id': i,
            'question': question,
            'response': paperqa2_response,
            'answer': paperqa2_answer,
            'sources': paperqa2_sources,
            'processing_time': paperqa2_time,
            'success': paperqa2_success,
            'error': paperqa2_error,
            'embedding_system': 'PaperQA2'
        })

    # Build the frame once from the collected records.
    paperqa2_df = pd.DataFrame(paperqa2_results)

    return paperqa2_df

In [24]:
# Run the PaperQA2 RAG system on the first 5 questions.
paperqa2_df = get_PaperQA2_rag_response(num_questions=5)

Initializing PaperQA2 RAG system...
[SciRagPaperQA2] Building index on initialization...
[SciRagPaperQA2] Checking for paper directory: /Users/apple/Documents/GitHub/scirag_paperqa/scirag/txt_files
[SciRagPaperQA2] Building PaperQA2 document index (only happens once)...
Using index: pqa_index_969208873c16842f6232653a9eca7e7a
Index files: {'2201.02202v1.txt': 'a569bb587a9091300ba2f479aba3efc8', '1604.01424v3.txt': '70196f1a55258dc0d6ece83887dec601', '1807.06209v4.txt': '61f1a51953008a179d876dded8a7f3a1', '2010.00619v2.txt': 'cc0b6c6c76fe9a1b9132c0dd9de161bf', '2503.14454v1.txt': 'cc44e168db0f207acfe5c6a6f185e266'}
[SciRagPaperQA2] Index built successfully.
Processing question 1/5
Question: How is the standard recombination history tested in the Planck 2018 analysis?


**Response:**
**Answer**:

The standard recombination history is tested in the Planck 2018 analysis by utilizing precision data from the Planck satellite to examine the decoupling of cosmic microwave background (CMB) photons from baryons around redshift z ≈ 10^3, employing advanced recombination codes like CosmoRec and HyRec, and performing a semi-blind eigen-analysis on deviations of the free-electron fraction . Additionally, a non-parametric reconstruction using the ModRec model is applied to parametrize deviations in the ionization fraction, with results influenced by baryon acoustic oscillation (BAO) data .

**Sources**:

aghanim2020<i>planck<i>2018results chunk 63, calabrese2025theatacamacosmology chunk 22


Processing Time: 12.38 seconds
Processing question 2/5
Question: Which corrections in polarization spectra were implemented in the 2018 Planck analysis?


**Response:**
**Answer**:

In the 2018 Planck analysis, significant corrections to the polarization spectra were implemented, primarily addressing polarization efficiencies and beam leakage. Two approaches were utilized: a 'map-based' approach, which used estimates from the EE spectrum for both TE and EE spectra, and a 'spectrum-based' approach, which applied different efficiencies to TE and EE spectra based on their respective fits .

**Sources**:

aghanim2020<i>planck<i>2018results chunk 9


Processing Time: 10.74 seconds
Processing question 3/5
Question: What multipole cuts were applied in the Camspec temperature likelihood for the 143x217 spectrum for the Planck 2018 analysis?


**Response:**
**Answer**:

In the Camspec temperature likelihood for the Planck 2018 analysis, the multipole cuts applied to the 143 × 217 spectrum were set at \( \ell_{\min} = 500 \) and \( \ell_{\max} = 2500 \) to mitigate potential systematic biases arising from Galactic dust .

**Sources**:

aghanim2020<planck<2018results chunk 10, aghanim2020<planck<2018results chunk 11


Processing Time: 11.04 seconds
Processing question 4/5
Question: What is the effective sky-fraction of the apodized Camspec polarization mask for the Planck 2018 analysis?


**Response:**
**Answer**:

The effective sky-fraction of the apodized CamSpec polarization mask used for the Planck 2018 analysis is \( f_{\mathrm{sky}}^{\mathrm{W}} = 47.7\% \), while it preserves a fraction of \( f_{\mathrm{sky}} = 57.7\% \) to account for polarized Galactic dust emission .

**Sources**:

aghanim2020<planck<2018results chunk 11


Processing Time: 35.74 seconds
Processing question 5/5
Question: How large is the impact of beam window functions on the 2018 spectra in the baseline Plik likelihood?


**Response:**
**Answer**:

The impact of beam window functions on the 2018 spectra in the baseline Plik likelihood is small, approximately 0.1% at ℓ=2000, due to a new approach in calculating effective beam window functions tailored to specific sky fractions at each frequency .

**Sources**:

aghanim2020<i>planck<i>2018results chunk 8


Processing Time: 12.11 seconds


In [25]:
# Inspect the PaperQA2 results (one row per processed question).
paperqa2_df

Unnamed: 0,question_id,question,response,answer,sources,processing_time,success,error,embedding_system
0,1,How is the standard recombination history test...,answer='**Answer**:\n\nThe standard recombinat...,The standard recombination history is tested i...,"aghanim2020<i>planck<i>2018results chunk 63, c...",12.382422,True,,PaperQA2
1,2,Which corrections in polarization spectra were...,"answer=""**Answer**:\n\nIn the 2018 Planck anal...","In the 2018 Planck analysis, significant corre...",aghanim2020<i>planck<i>2018results chunk 9,10.743154,True,,PaperQA2
2,3,What multipole cuts were applied in the Camspe...,answer='**Answer**:\n\nIn the Camspec temperat...,In the Camspec temperature likelihood for the ...,"aghanim2020<planck<2018results chunk 10, aghan...",11.040152,True,,PaperQA2
3,4,What is the effective sky-fraction of the apod...,answer='**Answer**:\n\nThe effective sky-fract...,The effective sky-fraction of the apodized Cam...,aghanim2020<planck<2018results chunk 11,35.736266,True,,PaperQA2
4,5,How large is the impact of beam window functio...,answer='**Answer**:\n\nThe impact of beam wind...,The impact of beam window functions on the 201...,aghanim2020<i>planck<i>2018results chunk 8,12.105115,True,,PaperQA2
