In [1]:
import concurrent.futures
import json
import os
import re
from datetime import datetime

import pandas as pd
from sparql_eval_module import SingleGraphCQEvaluator

In [None]:
# ============================================================
# 1. Notebook-level placeholders and endpoint lookup table
# ============================================================
# Every placeholder starts out unset (None).
PREFIX_TTL_FILE = END_POINT = evaluator = graph_id = input_file = None

# SPARQL endpoint address for each graph-size label.
END_POINT_DICT = dict(
    Small="localhost:3030/temp_chris_thesis1/sparql",
    Big="localhost:3030/temp_chris_thesis2/sparql",
)

In [3]:
# ============================================================
# Helpers: load TTL prefixes, extract SPARQL from LLM analysis
# text, and prepend missing PREFIX declarations
#    - Extraction handles ```sparql ...``` blocks
#    - Extraction falls back to first query-like substring
# ============================================================

# Per-path memo of parsed prefix maps: { ttl_path: { prefix: uri, ... } }
cache_prefixes = dict()

def load_prefixes_from_ttl(ttl_path: str) -> dict:
    """
    Load prefixes from a TTL file with lines like:
      @prefix ex: <http://example.com/> .

    Results are memoised per path in ``cache_prefixes``; the cached dict
    object itself is returned on later calls, so callers should not mutate it.

    Args:
        ttl_path: path of the Turtle file to scan.

    Returns:
        { 'ex': 'http://example.com/' , ... }. Empty when the file is
        missing (a notice is printed once) or declares no prefixes.
    """
    # Membership test rather than truthiness: an empty result (missing file,
    # or a file without @prefix lines) must also count as cached, otherwise
    # the file is re-opened and the warning re-printed on every call.
    if ttl_path in cache_prefixes:
        return cache_prefixes[ttl_path]

    # match lines: @prefix foo: <URI> .
    prefix_line = re.compile(
        r"\s*@prefix\s+([A-Za-z][A-Za-z0-9_-]*):\s*<([^>]+)>\s*\."
    )

    prefixes = {}
    try:
        with open(ttl_path, "r", encoding="utf-8") as f:
            for line in f:
                m = prefix_line.match(line)
                if m:
                    pfx, uri = m.group(1), m.group(2)
                    prefixes[pfx] = uri
    except FileNotFoundError:
        print(f"TTL prefix file not found: {ttl_path} (continuing without extra prefixes)")
    cache_prefixes[ttl_path] = prefixes
    return prefixes

def extract_sparql(text: str) -> str:
    r"""
    Extract SPARQL query from LLM output:
      - remove <think>...</think> blocks entirely
      - unescape literal \n, \t, \r sequences
      - prefer fenced ```sparql ...``` or ``` ...``` blocks
      - otherwise take earliest of PREFIX/SELECT/ASK/CONSTRUCT/DESCRIBE
        (matched as whole words, case-insensitively)
      - preserve PREFIX declarations

    Args:
        text: raw model output; any non-string value (e.g. NaN from an
            empty spreadsheet cell) yields "".

    Returns:
        The extracted query text, or the trimmed input when nothing
        query-like is found.
    """
    if not isinstance(text, str):
        return ""

    # -----------------------------------------------------------
    # 1. Remove <think>...</think> blocks (case-insensitive)
    # -----------------------------------------------------------
    text = re.sub(
        r"<think>[\s\S]*?</think>",
        "",
        text,
        flags=re.IGNORECASE
    ).strip()

    # -----------------------------------------------------------
    # 2. Unescape literal \n, \t, \r if present
    # -----------------------------------------------------------
    if "\\n" in text or "\\t" in text or "\\r" in text:
        text = (
            text
            .replace("\\r", "\r")
            .replace("\\n", "\n")
            .replace("\\t", "\t")
        )

    # -----------------------------------------------------------
    # 3. Fenced code blocks: ```sparql ...``` or ``` ...```
    # -----------------------------------------------------------
    fenced = re.findall(
        r"```(?:sparql)?\s*(.*?)```",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced:
        return fenced[0].strip()

    # -----------------------------------------------------------
    # 4. Prefix-aware extraction:
    #    earliest of PREFIX/SELECT/ASK/CONSTRUCT/DESCRIBE
    # -----------------------------------------------------------
    # Word boundaries stop the keywords from matching inside ordinary
    # words ("ask" in "task", "select" in "preselected"), which would
    # make the returned query start mid-word. Searching the original
    # text directly also avoids index drift from lower-casing.
    first_keyword = re.search(
        r"\b(?:prefix|select|ask|construct|describe)\b",
        text,
        flags=re.IGNORECASE,
    )
    if first_keyword:
        return text[first_keyword.start():].strip()

    # -----------------------------------------------------------
    # 5. Fallback: trimmed text
    # -----------------------------------------------------------
    return text.strip()

def add_missing_prefixes(query: str, ttl_prefixes: dict) -> str:
    """
    Prepend a PREFIX line for every entry of ttl_prefixes whose prefix
    name the query does not declare itself. Declarations already in the
    query are left untouched and always win.

    ttl_prefixes: dict { 'ex': 'http://example.com/' , ... }

    Returns the query unchanged when either argument is empty or when
    nothing needs to be added.
    """
    if not (query and ttl_prefixes):
        return query

    # Prefix names the query already declares (PREFIX or @prefix form)
    declared = {
        match.group(1)
        for match in re.finditer(
            r"(?i)(?:@prefix|prefix)\s+([A-Za-z][A-Za-z0-9_-]*):\s*<([^>]+)>",
            query,
        )
    }

    header = [
        f"PREFIX {name}: <{iri}>"
        for name, iri in ttl_prefixes.items()
        if name not in declared
    ]

    if header:
        # New declarations go first, ahead of all existing content
        return "\n".join(header) + "\n" + query.lstrip()
    return query

In [4]:
# Columns that perform_eval adds to the spreadsheet, in output order.
_EVAL_OUTPUT_COLUMNS = (
    # Basic info
    "sparql_query",
    "eval_json",
    # Top-level booleans/ints
    "syntax_ok",
    "satisfiable",
    "deterministic",
    "rows",
    "vars",
    # Latency metrics
    "latency_p50_ms",
    "latency_p95_ms",
    "latency_mean_ms",
    # Additional metrics from sample JSON
    "lexical_query_overlap",
    "semantic_similarity_to_CQ",
    "semantic_soft_coverage_to_CQ",
    "tuple_cohesion",
    "always_unbound_vars",
    "variables",  # variables list stored as JSON string
)


def _prepare_output_columns(df, resuming):
    """Create the output columns, keeping values already present when resuming."""
    for col in _EVAL_OUTPUT_COLUMNS:
        if resuming and col in df.columns:
            # Blank cells come back from Excel as NaN; turn them into "" so
            # the "already processed" check sees them as empty. Casting to
            # object lets the column hold the mixed str/bool/number values
            # written below regardless of the dtype inferred on reload.
            restored = df[col].astype(object)
            df[col] = restored.where(restored.notna(), "")
        else:
            df[col] = ""


def _record_result_fields(df, idx, result):
    """Copy the metrics of one evaluation result into row idx; a non-dict result yields defaults."""
    fields = result if isinstance(result, dict) else {}

    # ---- Top-level fields ----
    df.at[idx, "syntax_ok"] = fields.get("syntax_ok", False)
    df.at[idx, "satisfiable"] = fields.get("satisfiable", False)
    df.at[idx, "deterministic"] = fields.get("deterministic", False)
    df.at[idx, "rows"] = fields.get("rows", 0)
    df.at[idx, "vars"] = fields.get("vars", 0)

    # ---- Latency ----
    latency = fields.get("latency", {}) or {}
    for stat in ("p50_ms", "p95_ms", "mean_ms"):
        df.at[idx, f"latency_{stat}"] = latency.get(stat)

    # ---- Additional numeric metrics ----
    for metric in (
        "lexical_query_overlap",
        "semantic_similarity_to_CQ",
        "semantic_soft_coverage_to_CQ",
        "tuple_cohesion",
    ):
        df.at[idx, metric] = fields.get(metric, 0)

    # ---- Variables-related info (stored as JSON strings) ----
    df.at[idx, "always_unbound_vars"] = json.dumps(fields.get("always_unbound_vars", []))
    df.at[idx, "variables"] = json.dumps(fields.get("variables", []))


def perform_eval(input_file, timeout_s=1800):
    """
    Evaluate every generated SPARQL query in one results spreadsheet.

    The graph ("Small" or "Big") is chosen from the file name: names starting
    with "small_" (case-insensitive) use the Small endpoint, everything else
    the Big one. For each row the query is extracted from the "*_Result"
    column (falling back to "*_Raw"), missing prefixes are added from
    "<graph>_schema_for_prefix.ttl", and the query is passed to
    SingleGraphCQEvaluator.evaluate together with the row's CQ text.

    Progress is written to "temp_eval_<input base name>.xlsx" after every row.
    If that file exists at start-up, the run resumes from it and rows whose
    "eval_json" cell is non-empty are skipped. On completion the results are
    saved to "evaluation_results_<model>_<graph>.xlsx" and the temp file is
    removed. All of these files are created in the current working directory.

    Args:
        input_file: path of the .xlsx file holding "CQ", "Prompt" and the
            model's "*_Raw" / "*_Result" columns.
        timeout_s: maximum number of seconds to wait for one evaluation
            (default 1800, i.e. 30 minutes).

    Raises:
        ValueError: if a required column is missing.
    """
    graph_id = "Small" if os.path.basename(input_file).lower().startswith("small_") else "Big"
    END_POINT = END_POINT_DICT[graph_id]
    evaluator = SingleGraphCQEvaluator(endpoint=END_POINT)

    # TTL file with @prefix declarations for this graph
    PREFIX_TTL_FILE = f"{graph_id.lower()}_schema_for_prefix.ttl"
    ttl_prefixes = load_prefixes_from_ttl(PREFIX_TTL_FILE)

    # ------------------------------------------------
    # Temp filename (progress checkpoint)
    # ------------------------------------------------
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    temp_file = f"temp_eval_{base_name}.xlsx"

    # -------------------------
    # 1. Read input spreadsheet
    #    (resume from temp if it exists)
    # -------------------------
    resuming = os.path.exists(temp_file)
    if resuming:
        print(f"Resuming from temp file: {temp_file}")
        df = pd.read_excel(temp_file)
    else:
        print(f"Starting new eval from original file: {input_file}")
        df = pd.read_excel(input_file)

    # Expect at least these stable column names
    for col in ("CQ", "Prompt"):
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # ---------------------------------------------
    # 2. Detect model-specific analysis columns
    #    (suffix-based, model prefix can change)
    # ---------------------------------------------
    raw_col = None
    result_col = None
    for col in df.columns:
        lc = col.lower()
        if lc.endswith("_raw"):
            raw_col = col
        elif lc.endswith("_result"):
            result_col = col

    if raw_col is None or result_col is None:
        raise ValueError(
            "Could not locate columns ending with '_Raw' and "
            "'_Result'.\nColumns available: "
            f"{list(df.columns)}"
        )

    print(f"Detected analysis columns:")
    print(f"  RAW:    {raw_col}")
    print(f"  RESULT: {result_col}")

    # Derive model name from the result column (prefix before suffix)
    model_name_suffix = "_Result"
    if result_col.endswith(model_name_suffix):
        model_name = result_col[: -len(model_name_suffix)]
    else:
        model_name = "model"

    output_file = f"evaluation_results_{model_name}_{graph_id}.xlsx"

    # ------------------------------------------------
    # 3. Initialize output columns
    # ------------------------------------------------
    # Only a fresh run blanks the columns. Resetting them after loading the
    # temp file would erase the saved progress and make resuming a no-op.
    _prepare_output_columns(df, resuming)

    # ------------------------------------------
    # 4. Iterate through rows and run evaluation
    # ------------------------------------------
    rows_done = (df["eval_json"].astype(str).str.strip() != "").sum()
    print(f"Rows already processed: {rows_done}")

    for idx, row in df.iterrows():
        # Skip rows that already hold a result (resumable part)
        if str(df.at[idx, "eval_json"]).strip():
            continue

        cq_text = row["CQ"]

        # Extract SPARQL query from the Result column, else the Raw column
        sparql_query = extract_sparql(row[result_col]) or extract_sparql(row[raw_col])

        # Add prefixes from TTL, unless already present
        sparql_query = add_missing_prefixes(sparql_query, ttl_prefixes)
        df.at[idx, "sparql_query"] = sparql_query

        if not sparql_query:
            df.at[idx, "eval_json"] = "NO SPARQL FOUND"
        else:
            # The executor is deliberately not used as a context manager:
            # leaving a `with` block calls shutdown(wait=True), which blocks
            # until the worker finishes and so defeats the timeout.
            executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(evaluator.evaluate, cq_text, sparql_query)
                # raises concurrent.futures.TimeoutError after timeout_s seconds
                result = future.result(timeout=timeout_s)
            except concurrent.futures.TimeoutError:
                # Treat as timeout + invalid query
                df.at[idx, "eval_json"] = (
                    f"ERROR: Timeout after {timeout_s / 60:g} minutes: invalid query"
                )
            except Exception as e:
                df.at[idx, "eval_json"] = f"ERROR: {e}"
            else:
                # If the evaluator returns a JSON string, parse it
                if isinstance(result, str):
                    try:
                        result = json.loads(result)
                    except ValueError:
                        # Keep raw string if not valid JSON
                        df.at[idx, "eval_json"] = result
                    else:
                        df.at[idx, "eval_json"] = json.dumps(result, indent=2)
                else:
                    df.at[idx, "eval_json"] = json.dumps(result, indent=2)

                _record_result_fields(df, idx, result)
            finally:
                # Do not wait for a timed-out worker. Its thread cannot be
                # killed and keeps running in the background.
                # NOTE(review): that thread still uses the shared evaluator;
                # confirm the evaluator tolerates an overlapping call.
                executor.shutdown(wait=False)

        # ------------------------------------------
        # Checkpoint after every processed row
        # ------------------------------------------
        df.to_excel(temp_file, index=False)
        print(f"Temp saved at row {idx} -> {temp_file}")

    # ------------------------------------------
    # 5. Save new spreadsheet with model in name
    # ------------------------------------------
    df.to_excel(output_file, index=False)
    print(f"Saved: {output_file}")

    # ------------------------------------------
    # 6. Remove temp file when complete
    # ------------------------------------------
    if os.path.exists(temp_file):
        os.remove(temp_file)
        print(f"Removed temp: {temp_file}")

In [5]:
import os

def get_files_with_extension(directory, extension):
    """Return the entry names in `directory` whose name ends with `extension`.

    Names are returned without the directory part, in the order that
    os.listdir yields them.
    """
    matching = []
    for entry in os.listdir(directory):
        if entry.endswith(extension):
            matching.append(entry)
    return matching

# Example usage: collect the spreadsheets to evaluate
directory = "temp_set_2"  # folder scanned for input workbooks
extension = ".xlsx"  # only names ending with this suffix are kept
files = get_files_with_extension(directory=directory, extension=extension)

In [6]:
# Evaluate, in reverse alphabetical order, only the workbooks whose
# file name contains "gemma".
for file in sorted(files, reverse=True):
    if "gemma" in file:
        print(file)
        input_file = os.path.join(directory, file)
        perform_eval(input_file)

small_schema_trim_cq_gemma3-27b_results.xlsx


  from .autonotebook import tqdm as notebook_tqdm


Starting new eval from original file: temp_set_2/small_schema_trim_cq_gemma3-27b_results.xlsx
Detected analysis columns:
  RAW:    gemma3:27b_Analysis_Raw
  RESULT: gemma3:27b_Analysis_Result
Rows already processed: 0

Temp saved at row 0 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 1 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 2 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 3 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 4 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 5 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 6 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 7 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 8 -> temp_eval_small_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 9 -> temp_eval_small_schema_trim_cq_gemma3-27b_resul



Starting new eval from original file: temp_set_2/big_schema_trim_cq_gemma3-27b_results.xlsx
Detected analysis columns:
  RAW:    gemma3:27b_Analysis_Raw
  RESULT: gemma3:27b_Analysis_Result
Rows already processed: 0

Temp saved at row 0 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 1 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 2 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 3 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 4 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 5 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 6 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 7 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 8 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 9 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx





Temp saved at row 10 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 11 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 12 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 13 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 14 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 15 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 16 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 17 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 18 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 19 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 20 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 21 -> temp_eval_big_schema_trim_cq_gemma3-27b_results.xlsx

Temp saved at row 22 -> temp_eval_big_schema_trim_cq_gemma3-27b_

In [7]:
from hypothesis_specific_eval import add_hyp_scores
# Add hypothesis-specific scores to every evaluation workbook found in
# the working folder, newest name first (reverse alphabetical order).
directory = "temp_set_2"
files = [
    name
    for name in get_files_with_extension(directory, extension)
    if "evaluation" in name and name.endswith("xlsx")
]

for file in sorted(files, reverse=True):
    print(file)
    input_file = file
    add_hyp_scores(f"{directory}/{input_file}")

evaluation_results_gemma3:27b_Analysis_Small.xlsx

=== Per-difficulty summary (H1 + semantic_diversity + latency, etc.) ===
difficulty  latency_p50_ms  latency_p95_ms  latency_mean_ms  rows     vars  lexical_query_overlap  semantic_similarity_to_CQ  semantic_soft_coverage_to_CQ  tuple_cohesion  determinism_score  satisfiability_binding_score  h1_overall  semantic_diversity_score  syntax_ok_rate  satisfiable_rate  deterministic_rate
   complex            7.62           17.14            10.74   0.0 0.125000               0.000000                   0.081250                      0.087500             0.0           0.300000                      0.200000    0.250000                  0.133333        0.300000          0.200000            0.300000
  moderate            8.08           16.39            10.81   0.0 0.111111               0.000000                   0.076667                      0.076667             0.0           0.272727                      0.181818    0.227273                  0.1