In [1]:
import os
from dotenv import load_dotenv
# Read variables from a local .env file, then set the service-account key
# file path in GOOGLE_APPLICATION_CREDENTIALS.
load_dotenv()
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gemini.json"
# Drop any API key from the environment (presumably so the service-account
# credentials are the ones used - confirm) and report when one was removed.
if os.environ.pop("GOOGLE_API_KEY", None) is not None:
    print("Removed GOOGLE_API_KEY from environment")



Removed GOOGLE_API_KEY from environment


In [2]:
from scirag import SingleRAGEvaluationSystem,GeminiEvaluator

  import pkg_resources


In [3]:
import pandas as pd

# Cost Performance Analysis

In [53]:
import tiktoken
import os
import glob
from typing import List, Dict, Union

def count_tokens_in_markdown_files(file_paths: Union[str, List[str]], encoding_name: str = "cl100k_base") -> Dict[str, int]:
    """
    Count tokens in markdown files using tiktoken.

    Args:
        file_paths: Either a glob pattern (str) or a list of file paths.
        encoding_name: Tiktoken encoding to use (default: "cl100k_base" for GPT-4).

    Returns:
        Dictionary keyed by file base name (not the full path) holding the
        token count of each file, plus a 'TOTAL' entry with the sum. A file
        that cannot be read or encoded is reported and stored with a count
        of 0. When a glob pattern matches no file, an empty dictionary is
        returned.
    """
    if isinstance(file_paths, str):
        # A string is treated as a glob pattern. glob returns matches in
        # arbitrary order, so sort them to get the same report on every run.
        file_list = sorted(glob.glob(file_paths))
        if not file_list:
            print(f"No files found matching pattern: {file_paths}")
            return {}
    else:
        # Anything else is taken as an explicit collection of paths.
        file_list = list(file_paths)

    encoding = tiktoken.get_encoding(encoding_name)

    token_counts = {}
    total_tokens = 0

    for file_path in file_list:
        # Results are keyed by base name, so two inputs sharing a base name
        # overwrite each other in the dictionary.
        filename = os.path.basename(file_path)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # By default tiktoken raises ValueError when the text contains a
            # special-token string such as "<|endoftext|>"; that error would
            # be swallowed below and the file silently counted as 0 tokens.
            # disallowed_special=() encodes such strings as ordinary text.
            token_count = len(encoding.encode(content, disallowed_special=()))

            token_counts[filename] = token_count
            total_tokens += token_count

            print(f"{filename}: {token_count:,} tokens")

        except FileNotFoundError:
            print(f"Error: File not found - {file_path}")
            token_counts[filename] = 0
        except Exception as e:
            # Best effort: report the failure and keep going with the rest.
            print(f"Error reading {file_path}: {e}")
            token_counts[filename] = 0

    # The grand total is stored next to the per-file counts.
    token_counts['TOTAL'] = total_tokens
    print(f"\nTotal across all files: {total_tokens:,} tokens")

    return token_counts

In [54]:
# Count tokens for every .md file matched under ../markdowns and display the result.
count_tokens_in_markdown_files(file_paths="../markdowns/*.md")

2503.14454v1.md: 111,530 tokens
2010.00619v2.md: 42,730 tokens
2201.02202v1.md: 24,819 tokens
1807.06209v4.md: 133,523 tokens
1604.01424v3.md: 62,766 tokens

Total across all files: 375,368 tokens


{'2503.14454v1.md': 111530,
 '2010.00619v2.md': 42730,
 '2201.02202v1.md': 24819,
 '1807.06209v4.md': 133523,
 '1604.01424v3.md': 62766,
 'TOTAL': 375368}

In [None]:
# Embedding cost (USD):
# Vertex AI embedding    -> $0.004692
# text-embedding-3-large -> $0.048798
# gemini-embedding-001   -> $0.004692

In [None]:
# PaperQA2: 0.0045 + $0.048798

# Modified PaperQA2: 0.1604 + $0.048798

# OpenAI PDF/MD: 0.03859 + $0.048798

#Vertex MD

#perplexity

#hybrid

#gemini




# Perplexity Result

In [4]:
def missing_question(qids, responses_df, gemini_evaluator=None):
    """
    Evaluate selected questions from a pandas DataFrame, one at a time.

    Args:
        qids: List of question IDs to evaluate
        responses_df: pandas DataFrame with columns: question_id, question, answer, ideal_solution
        gemini_evaluator: Optional evaluator object exposing
            ``evaluate_single_response(question=..., generated_answer=..., ideal_answer=...)``.
            When omitted, a new ``GeminiEvaluator()`` is created for this call.

    Returns:
        Dictionary mapping question IDs to evaluation results. Entries built
        here for questions that could not be evaluated carry the keys
        ``eval_accuracy_score`` (None), ``eval_rationale``,
        ``eval_successful`` (False) and ``eval_error``; all other entries are
        whatever the evaluator returned.
    """
    def _failure(rationale, error):
        # Uniform record for a question that could not be evaluated.
        return {
            "eval_accuracy_score": None,
            "eval_rationale": rationale,
            "eval_successful": False,
            "eval_error": error
        }

    results = {}

    print(f"Evaluating {len(qids)} questions...")
    print("=" * 50)
    if gemini_evaluator is None:
        gemini_evaluator = GeminiEvaluator()

    required_cols = ["question", "answer", "ideal_solution"]

    for i, qid in enumerate(qids, 1):
        print(f"\n[{i}/{len(qids)}] Evaluating Question ID: {qid}")

        try:
            # Filter DataFrame for this question ID
            question_rows = responses_df[responses_df["question_id"] == qid]

            if question_rows.empty:
                print(f"  Question ID {qid} not found in DataFrame")
                results[qid] = _failure(f"Question ID {qid} not found", "Question not found")
                continue

            # Only the first matching row is used.
            row = question_rows.iloc[0]

            missing_cols = [col for col in required_cols if col not in responses_df.columns]
            if missing_cols:
                print(f"  ❌ Missing columns: {missing_cols}")
                results[qid] = _failure(f"Missing columns: {missing_cols}", f"Missing columns: {missing_cols}")
                continue

            null_fields = [col for col in required_cols if pd.isna(row[col])]
            if null_fields:
                print(f"  ❌ Null values in: {null_fields}")
                results[qid] = _failure(f"Null values in: {null_fields}", f"Null values: {null_fields}")
                continue

            result = gemini_evaluator.evaluate_single_response(
                question=row["question"],
                generated_answer=row["answer"],
                ideal_answer=row["ideal_solution"],
            )

            results[qid] = result

            # Print progress
            if result['eval_successful']:
                print(f"  Score: {result['eval_accuracy_score']}/100")
                print(f"  Rationale: {result['eval_rationale'][:100]}...")
            else:
                print(f"  Failed: {result['eval_error']}")

        except Exception as e:
            # Any error for one question is recorded and the loop continues
            # with the remaining questions.
            print(f"  Exception occurred: {str(e)}")
            results[qid] = _failure(f"Exception: {str(e)}", str(e))

    # The original built this dictionary but never handed it back.
    return results

In [5]:
# Load the stored Perplexity results. Pickle files can execute code on load,
# so only read files produced by this project.
perplexity = pd.read_pickle("results/perplexity_results_final.pkl")


In [4]:
# Evaluation system on the gemini backend; the batch cells in this notebook
# call its evaluate_single_dataframe method.
gemini_evaluator = SingleRAGEvaluationSystem(
    evaluator_backend="gemini",
)

[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json
Initialized evaluation system with gemini backend
Using model: gemini-2.5-pro-preview-06-05


In [10]:
# Run the batch evaluation over the Perplexity results frame.
perplexity_evaluated = gemini_evaluator.evaluate_single_dataframe(df=perplexity, system_name="Perplexity")


EVALUATING: Perplexity (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'response', 'answer', 'sources', 'ideal_solution', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 0/150 requests, 0/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:0
  Time: 11.27s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 15.74s

Evaluating 3/105 - Question ID: 3
Accuracy:0
  Time: 9.19s

Evaluating 4/105 - Question ID: 4
Accuracy:0
  Time: 11.03s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 11.78s

Evaluating 6/105 - Question ID: 6
  ✗ Failed: Unknown error after 1 attempts: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.

Evaluating 7/105 - Question ID: 7
Accuracy:0
 

In [6]:
# Question IDs to re-evaluate one by one - presumably the ones the batch run
# failed on (question 6 is shown failing in the log above); confirm.
qids = [6, 14, 18, 41, 69, 71, 81]

In [7]:
# Re-evaluate the listed Perplexity questions individually.
missing_question(qids=qids, responses_df=perplexity)

Evaluating 7 questions...
[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json

[1/7] Evaluating Question ID: 6
[Backoff] Attempt 1/3 failed. Unknown error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.
[Backoff] Sleeping for 24.6 seconds (base: 30s, multiplier: 2.0)
[Backoff] Attempt 2/3 failed. Unknown error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.
[Backoff] Sleeping for 70.7 seconds (base: 30s, multiplier: 2.0)
  Score: 0/100
  Rationale: The generated answer provides a value for a different, though related, parameter. The question asks ...

[2/7] Evaluating Question ID: 14
  Score: 0/100
 

In [8]:
# Manually combined score for the Perplexity run over all 105 questions:
# 98 questions weighted at 31.63 plus seven individually entered scores.
# NOTE(review): 31.63 is presumably the mean accuracy reported by the batch
# run, and the seven terms presumably are the re-run scores for
# qids [6, 14, 18, 41, 69, 71, 81] in that order - confirm against the full logs.
(98*31.63+0+0+100+0+0+100+0)/105

31.426095238095236

# Vertex RAG

In [9]:
# Load the stored Vertex AI RAG results. Pickle files can execute code on
# load, so only read files produced by this project.
# NOTE(review): the variable is spelled "Vetex"; later cells use that spelling.
Vetex_RAG = pd.read_pickle("results/vertexai_rag_results_final.pkl")

In [13]:

# Run the batch evaluation over the Vertex RAG results frame.
# The label is echoed in the run log ("EVALUATING: <system_name>"). It used to
# be "Gemini_embedding", which is also the label of the separate Gemini
# embedding run, so the two runs could not be told apart.
Vetex_RAG_evaluated = gemini_evaluator.evaluate_single_dataframe(
    df=Vetex_RAG,
    system_name="Vertex_RAG"
)


EVALUATING: Gemini_embedding (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'response', 'answer', 'sources', 'ideal_solution', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 1/150 requests, 207/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:100
  Time: 7.54s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 11.56s

Evaluating 3/105 - Question ID: 3
Accuracy:100
  Time: 9.42s

Evaluating 4/105 - Question ID: 4
Accuracy:100
  Time: 12.28s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 9.01s

Evaluating 6/105 - Question ID: 6
Accuracy:100
  Time: 9.11s

Evaluating 7/105 - Question ID: 7
Accuracy:0
  Time: 8.91s

Evaluating 8/105 - Question ID: 8
Accuracy:0
  Time: 15.97s

Evaluating 9/105 - Question ID: 9
Accuracy:100
  Time: 14.44s

Evaluating 10/105 - Question ID: 10
Accuracy:100
  Time: 9.23s

Evaluating 11/105 - Quest

In [10]:
# Question IDs to re-evaluate one by one for the Vertex RAG results
# (presumably the ones the batch run failed on - confirm).
qids = [48, 77, 90]

In [11]:
# Re-evaluate the listed Vertex RAG questions individually.
missing_question(qids=qids, responses_df=Vetex_RAG)

Evaluating 3 questions...
[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json

[1/3] Evaluating Question ID: 48
  Score: 0/100
  Rationale: The generated answer is fundamentally incorrect because it provides a list of tracked properties tha...

[2/3] Evaluating Question ID: 77
  Score: 100/100
  Rationale: The generated answer provides a value for the Hubble constant ($73.24 \pm 1.74 \mathrm{~km} \mathrm{...

[3/3] Evaluating Question ID: 90
[Backoff] Attempt 1/3 failed. Unknown error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.
[Backoff] Sleeping for 25.3 seconds (base: 30s, multiplier: 2.0)
  Score: 100/100
  Rationale: The generated answer correctly identifies one of the two primary strategies mentioned in the ideal a...


In [12]:
# Manually combined score for the Vertex RAG run over all 105 questions:
# 102 questions weighted at 86.27 plus 200 points. The 200 matches the three
# re-run scores printed above (0 + 100 + 100).
# NOTE(review): 86.27 is presumably the mean accuracy reported by the batch run - confirm.
(86.27*102+200)/105

85.70990476190475

# Evaluate Gemini Embedding

In [13]:
# Load the stored Gemini embedding results. Pickle files can execute code on
# load, so only read files produced by this project.
gemini_embedding_df = pd.read_pickle("results/gemini_embedding_results.pkl")

In [15]:

# Run the batch evaluation over the Gemini embedding results frame.
gemini_embedding_evaluated = gemini_evaluator.evaluate_single_dataframe(df=gemini_embedding_df, system_name="Gemini_embedding")


EVALUATING: Gemini_embedding (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'ideal_solution', 'response', 'answer', 'sources', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 6/150 requests, 1237/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:100
  Time: 6.97s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 17.73s

Evaluating 3/105 - Question ID: 3
Accuracy:100
  Time: 8.79s

Evaluating 4/105 - Question ID: 4
Accuracy:100
  Time: 6.48s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 7.76s

Evaluating 6/105 - Question ID: 6
Accuracy:100
  Time: 7.06s

Evaluating 7/105 - Question ID: 7
Accuracy:100
  Time: 11.27s

Evaluating 8/105 - Question ID: 8
Accuracy:100
  Time: 10.65s

Evaluating 9/105 - Question ID: 9
Accuracy:100
  Time: 12.16s

Evaluating 10/105 - Question ID: 10
Accuracy:100
  Time: 6.61s

Evaluating 11/105 - 

In [14]:
# Question IDs to re-evaluate one by one for the Gemini embedding results
# (presumably the ones the batch run failed on - confirm).
qids = [26, 45]

In [15]:
# Re-evaluate the listed Gemini embedding questions individually.
missing_question(qids=qids, responses_df=gemini_embedding_df)

Evaluating 2 questions...
[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json

[1/2] Evaluating Question ID: 26
  Score: 0/100
  Rationale: The generated answer correctly identifies the parameters that are varied (cosmological and astrophys...

[2/2] Evaluating Question ID: 45
Failed to parse Gemini JSON response: Invalid \escape: line 3 column 243 (char 269)
Raw response: {
  "accuracy_score": 100,
  "rationale": "The generated answer correctly identifies all the fixed initial parameters and their corresponding values as listed in the ideal answer. It includes all the core factual content required. The additional information provided ($\Omega_{K}=0$) is also correct ...
  Failed: JSON parse error: Invalid \escape: line 3 column 243 (char 269)


In [17]:
# Manually combined score for the Gemini embedding run over all 105 questions:
# 103 questions weighted at 86.41 plus 100 points.
# NOTE(review): in the re-run above question 26 scored 0 and question 45 failed
# JSON parsing, though its raw response shows accuracy_score 100. The +100 was
# presumably taken by hand from that raw response - confirm.
(86.41*103+100)/105

85.71647619047619

# gemini_no_rag

In [19]:
# Load the stored no-RAG Gemini results. Pickle files can execute code on
# load, so only read files produced by this project.
gemini_no_rag = pd.read_pickle("results/gemini_norag_results.pkl")

In [17]:

# Run the batch evaluation over the no-RAG Gemini results frame.
gemini_no_rag_eval = gemini_evaluator.evaluate_single_dataframe(df=gemini_no_rag, system_name="Gemini_no_rag")


EVALUATING: Gemini_no_rag (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'response', 'answer', 'sources', 'ideal_solution', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 6/150 requests, 1238/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:0
  Time: 9.16s

Evaluating 2/105 - Question ID: 2
Accuracy:0
  Time: 9.92s

Evaluating 3/105 - Question ID: 3
Accuracy:0
  Time: 6.27s

Evaluating 4/105 - Question ID: 4
Accuracy:0
  Time: 8.07s

Evaluating 5/105 - Question ID: 5
Accuracy:0
  Time: 12.16s

Evaluating 6/105 - Question ID: 6
Accuracy:0
  Time: 16.82s

Evaluating 7/105 - Question ID: 7
Failed to parse Gemini JSON response: Unterminated string starting at: line 3 column 16 (char 40)
Raw response: {
  "accuracy_score": 0,
  "rationale": "The generated answer is fundamentally incorrect. It states that the best-fit value for 10^9 A_s is 3

In [20]:
# Question IDs to re-evaluate one by one for the no-RAG Gemini results -
# presumably the ones the batch run failed on (question 7 is shown failing
# to parse in the log above); confirm.
qids = [7, 12, 68, 81, 93]

In [21]:
# Re-evaluate the listed no-RAG Gemini questions individually.
missing_question(qids=qids, responses_df=gemini_no_rag)

Evaluating 5 questions...
[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json

[1/5] Evaluating Question ID: 7
  Score: 0/100
  Rationale: The generated answer provides a best-fit value for 10^9 A_s as 3.044 and a 1-sigma constraint of 0.0...

[2/5] Evaluating Question ID: 12
  Score: 0/100
  Rationale: The generated answer is factually correct in its own right; BAO is indeed used as a standard ruler t...

[3/5] Evaluating Question ID: 68
Failed to parse Gemini JSON response: Unterminated string starting at: line 3 column 16 (char 40)
Raw response: {
  "accuracy_score": 0,
  "rationale": "The user's question asks for the *leading* source of scatter in the Period-Luminosity relations for HST observations of SN hosts. The ideal answer identifies this as 'Uncertainty in the Cephid background,' which refers to photometric crowding. The generated a...
  Failed: JSON parse error: Unterminated string starting at: line 3 column 16 (char 40)

[4/5] Eva

In [22]:
# Manually combined score for the no-RAG Gemini run over all 105 questions:
# 100 questions weighted at 27 plus 200 points for the five re-run questions.
# NOTE(review): 27 is presumably the mean accuracy reported by the batch run;
# the re-run log above is truncated, so the 200 cannot be checked here - confirm.
(27*100+200)/105

27.61904761904762

# Evaluate Vanilla PaperQA

In [4]:
# Build the evaluation system on the gemini backend.
# NOTE(review): the same construction appears earlier in the notebook; the
# execution counts suggest this cell was run in a fresh kernel.
gemini_evaluator = SingleRAGEvaluationSystem(
    evaluator_backend="gemini",
)

[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json
Initialized evaluation system with gemini backend
Using model: gemini-2.5-pro-preview-06-05


In [40]:
# Second evaluation system, on the autogen backend.
# NOTE(review): no cell visible in this notebook uses `evaluator` - confirm it is still needed.
evaluator = SingleRAGEvaluationSystem(
    evaluator_backend="autogen",
)

Initialized evaluation system with autogen backend
Using model: o3-mini


In [5]:
import pandas as pd

In [6]:
# Load the stored vanilla PaperQA2 results. Pickle files can execute code on
# load, so only read files produced by this project.
paperqa = pd.read_pickle("results/paperqa2_valina_gpt4.1_results_final.pkl")

In [7]:
# Evaluate in smaller batches
paperqa_eval = gemini_evaluator.evaluate_single_dataframe(
    df=paperqa,
    system_name="paperqa_valina_gpt4.1"
)


EVALUATING: paperqa_valina_gpt4.1 (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'response', 'answer', 'sources', 'ideal_solution', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 0/150 requests, 0/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:100
  Time: 7.77s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 14.64s

Evaluating 3/105 - Question ID: 3
Accuracy:100
  Time: 7.78s

Evaluating 4/105 - Question ID: 4
Accuracy:100
  Time: 98.79s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 9.51s

Evaluating 6/105 - Question ID: 6
Accuracy:100
  Time: 9.94s

Evaluating 7/105 - Question ID: 7
Accuracy:100
  Time: 10.75s

Evaluating 8/105 - Question ID: 8
Accuracy:0
  Time: 20.40s

Evaluating 9/105 - Question ID: 9
Accuracy:0
  Time: 8.66s

Evaluating 10/105 - Question ID: 10
Accuracy:100
  Time: 16.67s

Evaluating 11/105 - Q

# Evaluate Modified PaperQA

In [19]:
# Load the stored modified-PaperQA2 results, then run the batch evaluation.
# Pickle files can execute code on load, so only read files produced by this project.
paperqa_modified = pd.read_pickle("results/paperqa2_gpt4.1_results.pkl")
paperqa_modified_eval = gemini_evaluator.evaluate_single_dataframe(df=paperqa_modified, system_name="paperqa_modified_gpt4.1")


EVALUATING: paperqa_modified_gpt4.1 (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'response', 'answer', 'sources', 'ideal_solution', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 6/150 requests, 851/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:100
  Time: 9.96s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 9.01s

Evaluating 3/105 - Question ID: 3
Accuracy:100
  Time: 8.60s

Evaluating 4/105 - Question ID: 4
Accuracy:100
  Time: 13.54s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 7.43s

Evaluating 6/105 - Question ID: 6
Accuracy:100
  Time: 9.48s

Evaluating 7/105 - Question ID: 7
Accuracy:100
  Time: 12.54s

Evaluating 8/105 - Question ID: 8
Accuracy:0
  Time: 18.23s

Evaluating 9/105 - Question ID: 9
Accuracy:0
  Time: 18.02s

Evaluating 10/105 - Question ID: 10
Accuracy:100
  Time: 11.05s

Evaluating 11/105

In [26]:
# Manually combined score over all 105 questions: 102 questions weighted at
# 82.35 plus 200 points for the remaining three.
# NOTE(review): presumably the modified-PaperQA2 batch mean plus re-run
# scores; the re-run is not visible in this notebook - confirm the inputs.
(102*82.35+200)/105

81.90190476190475

# OpenAI_Vector_Store_no_pdf

In [27]:
# Load the stored OpenAI vector-store results. Pickle files can execute code
# on load, so only read files produced by this project.
openai_vector_no_pdf = pd.read_pickle("results/openai_vector_store_results.pkl")

In [21]:
# Run the batch evaluation over the OpenAI vector-store results frame.
openai_vector_no_pdf_eval = gemini_evaluator.evaluate_single_dataframe(df=openai_vector_no_pdf, system_name="OpenAI_Vector_Store_md")


EVALUATING: OpenAI_Vector_Store_md (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'ideal_solution', 'response', 'answer', 'sources', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 4/150 requests, 924/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:100
  Time: 8.36s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 8.36s

Evaluating 3/105 - Question ID: 3
Accuracy:100
  Time: 13.01s

Evaluating 4/105 - Question ID: 4
Accuracy:100
  Time: 11.25s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 9.87s

Evaluating 6/105 - Question ID: 6
Accuracy:100
  Time: 9.96s

Evaluating 7/105 - Question ID: 7
Accuracy:100
  Time: 15.08s

Evaluating 8/105 - Question ID: 8
  ✗ Failed: Unknown error after 1 attempts: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned.

In [28]:
# Re-evaluate question 8 individually; the batch log above shows it failing.
qids = [8]
missing_question(qids=qids, responses_df=openai_vector_no_pdf)

Evaluating 1 questions...
[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json

[1/1] Evaluating Question ID: 8
[Backoff] Attempt 1/3 failed. Unknown error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.
[Backoff] Sleeping for 25.5 seconds (base: 30s, multiplier: 2.0)
  Score: 0/100
  Rationale: The generated answer provides a different and less fundamental explanation than the ideal answer. Th...


In [29]:
# Manually combined score over all 105 questions: 104 questions weighted at
# 89.42; the re-run of question 8 above scored 0, so it adds nothing to the numerator.
# NOTE(review): 89.42 is presumably the mean accuracy reported by the batch run - confirm.
89.42*104/105

88.56838095238095

# OpenAI_Vector_Store_pdf


In [30]:
# Load the stored OpenAI PDF vector-store results. Pickle files can execute
# code on load, so only read files produced by this project.
openai_vector_df = pd.read_pickle("results/openai_pdf_vector_store_results.pkl")

In [23]:
# Run the batch evaluation over the OpenAI PDF vector-store results frame.
openai_vector_df_eval = gemini_evaluator.evaluate_single_dataframe(df=openai_vector_df, system_name="OpenAI_Vector_Store")


EVALUATING: OpenAI_Vector_Store (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'ideal_solution', 'response', 'answer', 'sources', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 1/150 requests, 162/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:100
  Time: 8.55s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 7.88s

Evaluating 3/105 - Question ID: 3
Accuracy:100
  Time: 8.80s

Evaluating 4/105 - Question ID: 4
Accuracy:100
  Time: 14.16s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 9.59s

Evaluating 6/105 - Question ID: 6
Accuracy:100
  Time: 8.80s

Evaluating 7/105 - Question ID: 7
  ✗ Failed: Unknown error after 1 attempts: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate

In [32]:
# Re-evaluate question 22 individually.
# NOTE(review): the batch log above shows question 7 failing, yet only 22 is
# listed here - confirm whether question 7 was re-run elsewhere.
qids = [22]
missing_question(qids=qids, responses_df=openai_vector_df)

Evaluating 1 questions...
[Init] Initialized gemini-2.5-pro-preview-06-05 successfully with service account: gemini.json

[1/1] Evaluating Question ID: 22
  Score: 0/100
  Rationale: The generated answer directly contradicts the ideal answer. The ideal answer states that 'Only the d...


In [33]:
# Manually combined score over all 105 questions: 103 questions weighted at
# 91.26 plus 100 points.
# NOTE(review): 103 + the single re-run question (22, which scored 0 above)
# only accounts for 104 questions. The +100 presumably comes from one more
# question graded elsewhere - confirm the inputs.
(91.26*103+100)/105

90.47409523809525

# OpenAI Embedding

In [24]:
# Load the stored OpenAI embedding results. Pickle files can execute code on
# load, so only read files produced by this project.
openai_embedding = pd.read_pickle("results/openai_embedding_results_final.pkl")

In [25]:

# Run the batch evaluation over the OpenAI embedding results frame.
gemini_no_openai_embedding_eval = gemini_evaluator.evaluate_single_dataframe(df=openai_embedding, system_name="OpenAI_embedding")


EVALUATING: OpenAI_embedding (using gemini backend)
All required columns found
Available columns: ['question_id', 'question', 'response', 'ideal_solution', 'answer', 'sources', 'processing_time', 'success', 'error', 'embedding_system']
Filtering by success column: 105 successful out of 105 total
Rate limit status: 1/150 requests, 153/2000000 tokens

Evaluating 1/105 - Question ID: 1
Accuracy:100
  Time: 7.74s

Evaluating 2/105 - Question ID: 2
Accuracy:100
  Time: 7.86s

Evaluating 3/105 - Question ID: 3
Accuracy:100
  Time: 7.61s

Evaluating 4/105 - Question ID: 4
Accuracy:100
  Time: 8.29s

Evaluating 5/105 - Question ID: 5
Accuracy:100
  Time: 8.76s

Evaluating 6/105 - Question ID: 6
Accuracy:100
  Time: 10.28s

Evaluating 7/105 - Question ID: 7
Accuracy:100
  Time: 11.98s

Evaluating 8/105 - Question ID: 8
Accuracy:100
  Time: 17.63s

Evaluating 9/105 - Question ID: 9
Accuracy:100
  Time: 13.68s

Evaluating 10/105 - Question ID: 10
Accuracy:100
  Time: 28.90s

Evaluating 11/105 - 

In [16]:
# Manually combined score over all 105 questions: 104 questions weighted at
# 91.35, with nothing added for the remaining question.
# NOTE(review): presumably the OpenAI embedding batch mean with one question
# scored 0; no re-run is visible in this notebook - confirm the inputs.
91.35*104/105

90.47999999999999