In [1]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if one exists) into os.environ.
load_dotenv()
# GOOGLE_APPLICATION_CREDENTIALS is the variable Google's client libraries
# consult for a service-account key file; the path here is relative to the
# notebook's working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gemini.json"

# Fail fast when the OpenAI key is unusable. An empty value (e.g. a blank
# `OPENAI_API_KEY=` line in .env) is treated the same as a missing one, since
# it would only surface later as an opaque authentication error.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Please set the OPENAI_API_KEY environment variable (for example in your .env file)."
    )

In [2]:
from scirag import SciRagHybrid, SciRagDataSet

In [3]:
import pandas as pd
import time
import datetime

In [4]:
def parse_response(response):
    """Split a formatted RAG response into its answer and sources parts.

    The expected layout is ``**Answer**: <text> **Sources**: <text>``.

    Args:
        response: The raw response. Normally a string; any other value is
            passed through untouched.

    Returns:
        tuple: ``(answer, sources)``.
            * ``(None, None)`` when ``response`` is falsy (``None``, ``""``, ...).
            * ``(response, "")`` when ``response`` is not a string or does not
              contain both the ``**Answer**:`` and ``**Sources**:`` markers.
            * Otherwise the stripped answer text (with the ``**Answer**:``
              marker removed) and the stripped text following the first
              ``**Sources**:`` marker.
    """
    answer_marker = "**Answer**:"
    sources_marker = "**Sources**:"

    if not response:
        return None, None

    # Non-string payloads cannot be parsed; hand them back unchanged instead
    # of relying on a catch-all ``except`` to hide the resulting error.
    if not isinstance(response, str):
        return response, ""

    if answer_marker not in response or sources_marker not in response:
        return response, ""

    # Split on the FIRST sources marker only, so that a sources section which
    # happens to repeat the marker is kept whole rather than truncated.
    answer_part, _, sources_part = response.partition(sources_marker)
    answer = answer_part.replace(answer_marker, "").strip()
    return answer, sources_part.strip()

In [5]:
def save_dataframes_separately(gemini_df, openai_df, base_name=None):
    """Write the Gemini and OpenAI result frames to two separate CSV files.

    Args:
        gemini_df: DataFrame holding the Gemini results.
        openai_df: DataFrame holding the OpenAI results.
        base_name: Path prefix shared by both output files. When ``None`` a
            prefix of the form ``embedding_comparison_YYYYMMDD_HHMMSS`` is
            generated from the current local time.

    Returns:
        dict: Maps ``'gemini'`` and ``'openai'`` to the path of the CSV file
        written for that frame (``<base_name>_gemini.csv`` and
        ``<base_name>_openai.csv``).
    """
    if base_name is None:
        # The notebook imports the ``datetime`` *module*, so the previous
        # ``datetime.now()`` call raised AttributeError. Importing the class
        # locally makes this work regardless of how the file-level import is
        # spelled.
        from datetime import datetime as _datetime

        timestamp = _datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = f"embedding_comparison_{timestamp}"

    files = {}
    for label, frame in (("gemini", gemini_df), ("openai", openai_df)):
        files[label] = f"{base_name}_{label}.csv"
        # index=False: the positional index carries no information worth saving.
        frame.to_csv(files[label], index=False)
    return files

In [None]:
# Column order shared by every result record and by the returned DataFrames.
_RESULT_COLUMNS = [
    'question_id',
    'question',
    'response',
    'answer',
    'sources',
    'processing_time',
    'success',
    'error',
    'embedding_system',
]


def _run_timed_query(rag, question, question_id, system_label):
    """Ask one RAG system one question and package the outcome as a record.

    Args:
        rag: Object exposing ``get_response(question)``.
        question: The question text to send.
        question_id: 1-based position of the question in the run.
        system_label: Value stored in the ``embedding_system`` field.

    Returns:
        dict: One record keyed by ``_RESULT_COLUMNS``. On failure the
        response, answer and sources are ``None``, ``success`` is ``False``
        and ``error`` holds ``str(exception)``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    try:
        response = rag.get_response(question)
        # Stop the clock before parsing so only the RAG call is timed.
        elapsed = time.perf_counter() - started
        answer, sources = parse_response(response)
        success, error = True, None
    except Exception as exc:
        # Deliberately best-effort: one failing question is recorded in the
        # results rather than aborting the whole comparison.
        elapsed = time.perf_counter() - started
        response = answer = sources = None
        success, error = False, str(exc)

    return {
        'question_id': question_id,
        'question': question,
        'response': response,
        'answer': answer,
        'sources': sources,
        'processing_time': elapsed,
        'success': success,
        'error': error,
        'embedding_system': system_label,
    }


def get_separate_dataframes(num_questions=5):
    """
    Run the first questions of the dataset through both RAG systems.

    Each question is sent to a Gemini-embedding ``SciRagHybrid`` and an
    OpenAI-embedding ``SciRagHybrid`` (both on the ChromaDB backend), and the
    outcome of every call is recorded, including failures.

    Args:
        num_questions: Maximum number of questions to take from the top of
            the dataset. Fewer are processed if the dataset is shorter.

    Returns:
        tuple: ``(gemini_df, openai_df)`` - one DataFrame per embedding
        system with the columns ``question_id``, ``question``, ``response``,
        ``answer``, ``sources``, ``processing_time`` (seconds), ``success``,
        ``error`` and ``embedding_system``.
    """

    # Load dataset (load_dataset() is assumed to return a pandas DataFrame;
    # the code below relies on .columns, .select_dtypes and .head).
    dataset = SciRagDataSet()
    df = dataset.load_dataset()

    # Pick the question column: first well-known name that exists, otherwise
    # the first object-dtype column, otherwise simply the first column.
    candidate_columns = ('question', 'query', 'text', 'prompt')
    question_col = next((col for col in candidate_columns if col in df.columns), None)
    if question_col is None:
        text_cols = df.select_dtypes(include=['object']).columns
        question_col = text_cols[0] if len(text_cols) > 0 else df.columns[0]

    questions = df[question_col].head(num_questions).tolist()
    # Report progress against what will actually be processed, which can be
    # fewer than num_questions for a short dataset.
    total = len(questions)

    # Initialize RAG systems
    print("Initializing RAG systems...")
    gemini_rag = SciRagHybrid(embedding_provider="gemini", vector_db_backend="chromadb")
    openai_rag = SciRagHybrid(
        embedding_provider="openai",
        openai_embedding_model="text-embedding-3-large",
        vector_db_backend="chromadb",
    )

    # Process questions, Gemini first and then OpenAI for each one.
    gemini_results = []
    openai_results = []
    for i, question in enumerate(questions, 1):
        print(f"Processing question {i}/{total}")
        gemini_results.append(_run_timed_query(gemini_rag, question, i, 'Gemini'))
        openai_results.append(_run_timed_query(openai_rag, question, i, 'OpenAI'))

    # Passing the columns explicitly keeps the schema stable even when no
    # questions were processed (an empty list would otherwise give a frame
    # with no columns at all).
    gemini_df = pd.DataFrame(gemini_results, columns=_RESULT_COLUMNS)
    openai_df = pd.DataFrame(openai_results, columns=_RESULT_COLUMNS)
    return gemini_df, openai_df

In [7]:
# Run the comparison on the first 5 dataset questions. Each question is sent
# to both the Gemini-embedding and the OpenAI-embedding RAG system.
gemini_df, openai_df = get_separate_dataframes(5)

Initializing RAG systems...
  Loaded 2503.14454v1.md
  Loaded 2010.00619v2.md
  Loaded 1807.06209v4.md
  Loaded 1604.01424v3.md
Total markdown documents loaded: 4
Splitting documents into chunks...
Created 282 chunks from 4 documents
Building contextual retrieval index...
Processed 282 chunks
Initializing embeddings and vector database...
Embeddings not found. Generating embeddings...


Embedding texts (Gemini):   0%|          | 0/282 [00:00<?, ?doc/s]

Embeddings saved to: /Users/apple/Documents/GitHub/scirag_paperqa/scirag/embeddings/gemini_gemini-embedding-001_embeddings.npy
Creating vector database...
Collection sci_gemini_gemini_embedding_001 not found: Collection [sci_gemini_gemini_embedding_001] does not exists
No existing collection found: Collection [sci_rag_chunks] does not exists
Creating new ChromaDB collection...
Stored 282 chunks in ChromaDB collection 'sci_rag_chunks_gemini_gemini_embedding_001' at /Users/apple/Documents/GitHub/scirag_paperqa/scirag/embeddings/chromadb
Collection sci_gemini_gemini_embedding_001 not found: Collection [sci_gemini_gemini_embedding_001] does not exists
Loaded fallback ChromaDB collection: sci_rag_chunks_gemini_gemini_embedding_001
ChromaDB vector DB built successfully with 282 chunks
  Loaded 2503.14454v1.md
  Loaded 2010.00619v2.md
  Loaded 1807.06209v4.md
  Loaded 1604.01424v3.md
Total markdown documents loaded: 4
Splitting documents into chunks...
Created 282 chunks from 4 documents
Buil

In [8]:
# Display the per-question results recorded for the Gemini embedding system.
gemini_df

Unnamed: 0,question_id,question,response,answer,sources,processing_time,success,error,embedding_system
0,1,How is the standard recombination history test...,**Answer**:\n\nThe Planck 2018 analysis tests ...,The Planck 2018 analysis tests the standard re...,1807.06209v4.md,6.900003,True,,Gemini
1,2,Which corrections in polarization spectra were...,**Answer**:\n\nThe 2018 Planck analysis applie...,The 2018 Planck analysis applied corrections t...,"1807.06209v4.md, ""Correction of systematic eff...",10.739286,True,,Gemini
2,3,What multipole cuts were applied in the Camspe...,"**Answer**:\n\nFor the Planck 2018 analysis, t...","For the Planck 2018 analysis, the Camspec temp...",1807.06209v4.md,3.451063,True,,Gemini
3,4,What is the effective sky-fraction of the apod...,"**Answer**:\n\nFor the Planck 2018 analysis, t...","For the Planck 2018 analysis, the apodized Cam...",1807.06209v4.md,2.661637,True,,Gemini
4,5,How large is the impact of beam window functio...,"**Answer**:\n\nIn the 2018 release, the applic...","In the 2018 release, the application of beam w...",1807.06209v4.md,3.468771,True,,Gemini


In [9]:
# Display the per-question results recorded for the OpenAI embedding system.
openai_df

Unnamed: 0,question_id,question,response,answer,sources,processing_time,success,error,embedding_system
0,1,How is the standard recombination history test...,**Answer**:\n\nThe Planck 2018 analysis tests ...,The Planck 2018 analysis tests the standard re...,"1807.06209v4.md, Section 7.7, 2503.14454v1.md,...",6.377028,True,,OpenAI
1,2,Which corrections in polarization spectra were...,**Answer**:\n\nThe 2018 Planck analysis implem...,The 2018 Planck analysis implemented correctio...,1807.06209v4.md: Correction of systematic effe...,6.933423,True,,OpenAI
2,3,What multipole cuts were applied in the Camspe...,"**Answer**:\n\nFor the Planck 2018 analysis, t...","For the Planck 2018 analysis, the Camspec temp...","1807.06209v4.md, Section 2.2.2, Context 3",3.384928,True,,OpenAI
3,4,What is the effective sky-fraction of the apod...,"**Answer**:\n\nFor the Planck 2018 analysis, t...","For the Planck 2018 analysis, the default CamS...",1807.06209v4.md,2.882507,True,,OpenAI
4,5,How large is the impact of beam window functio...,**Answer**:\n\nThe impact of beam window funct...,The impact of beam window functions on the 201...,1807.06209v4.md,4.404723,True,,OpenAI
