# In [None]:


import os, re, sys, json, csv, pandas as pd
from typing import List, Tuple, Optional
from openai import OpenAI, APIConnectionError, RateLimitError, OpenAIError

# ------------ Config ------------
# Chat model name; override it with the OPENAI_MODEL environment variable.
MODEL = os.environ.get("OPENAI_MODEL", "gpt-5")  # or "gpt-5-chat-latest"
# Upper bound on the number of matched rows collected by apply_regex_capture.
MAX_CTX_ROWS   = 25
# Upper bound on the characters kept from a matched line when the regex has no "value" capture.
MAX_ROW_CHARS  = 400

# Read the key from OPENAI_API_KEY explicitly. Passing a hard-coded api_key=""
# supplies an explicit (empty) key, so the environment variable would never be
# consulted. The "" fallback keeps the module importable when no key is set;
# the API call itself then fails and is reported by the callers' error handling.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# ------------ Prompts ------------
# Prompt template for the regex-generation step. rag_csv_answer fills in the
# {question}, {column_context} and {csv_context} placeholders and sends the
# result as the user message. The braces on the template's last line are a
# literal JSON example, not a placeholder.
SYS_INSTRUCT_REGEX = """You are a world-class assistant that writes correct, specific Regular Expressions for CSV retrieval.

You must output exactly one JSON object and nothing else.

Goal:
Given a natural-language question about a CSV and minimal schema context, produce ONE Python 're' compatible regex that:
1) Matches the CSV row(s) that contain the data needed to answer the question, and
2) Captures the specific value(s) needed in a single named group: (?P<value>...).

Context:
- Question: {question}
- Column names: {column_context}
- CSV sample (header + first 2 rows, raw text):
{csv_context}

Key requirements:
- Use (?P<value>...) to capture only the value(s) needed to answer the question.
- Handle CSV properly (quoted/unquoted, commas). Use a tolerant CSV cell atom: (?:"[^"]*"|[^,]*)
- For quoted-or-not capture, use: "?(?P<value>[^",]+)"?
- Prefer literal tokens from the question (e.g., exact date/ID). If a precise row key exists, anchor with ^KEY, at line start; otherwise avoid over-anchoring.
- One pattern only. No flags in-pattern (caller sets flags).
- Engine: Python 're' only. Do NOT use \\K, atomic groups, or variable-length lookbehind.
- Escape backslashes as required by JSON.

Return ONLY:
{"regex":"YOUR_PATTERN_HERE"}"""

# System prompt for the answering step; rag_csv_answer passes it to openai_chat
# together with the matched rows built by build_answer_prompt.
SYS_INSTRUCT_ANSWER = """You are a data assistant. You will receive a user question and a small set of matched CSV rows/cells.
Use ONLY the provided rows/cells as ground truth. If the answer is not present, say you cannot find it.
"""


# ------------ Helpers ------------
def first_n_raw_lines(path: str, n: int = 3) -> List[str]:
    """Return at most the first *n* raw lines of a text file.

    The file is decoded as UTF-8 with undecodable bytes replaced, and the
    trailing newline of each line is stripped. No CSV parsing is done.

    Args:
        path: Path of the file to read.
        n: Maximum number of lines to return; values <= 0 yield an empty list.

    Returns:
        The leading lines in file order (fewer than *n* if the file is shorter).
    """
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        # zip() stops as soon as range(n) is exhausted, so no line beyond the
        # n-th is read and n <= 0 returns [] instead of the first line.
        return [line.rstrip("\n") for _, line in zip(range(n), f)]

def header_columns(path: str) -> List[str]:
    """Return the column names from the first CSV record of *path*.

    The first record is parsed with ``csv.reader``, so quoted names that
    contain commas stay intact. Undecodable bytes are replaced rather than
    raising, matching how the other file readers in this module open the CSV.

    Args:
        path: Path of the CSV file.

    Returns:
        The header fields, or an empty list if the file has no records.
    """
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        return next(csv.reader(f), [])

def extract_regex(text: str) -> Optional[str]:
    """Pull a usable regex out of a model reply.

    The span from the first ``{`` to the last ``}`` in *text* is parsed as
    JSON and its ``"regex"`` entry is returned when it is a non-blank string
    that compiles with Python's ``re`` module.

    Returns:
        The pattern string, or None if any step fails (no braces, invalid
        JSON, missing/blank/non-string entry, or a pattern that does not compile).
    """
    try:
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            return None

        payload = json.loads(text[start:end + 1])
        candidate = payload.get("regex")
        if isinstance(candidate, str) and candidate.strip():
            # Compile only to validate; the caller recompiles with its own flags.
            re.compile(candidate)
            return candidate
        return None
    except Exception:
        # Any failure along the way means "no usable regex".
        return None

def apply_regex_capture(csv_path: str, pattern: str) -> List[Tuple[int, str]]:
    """Search every line of the CSV with *pattern* and collect the hits.

    The pattern is compiled case-insensitively with MULTILINE and applied to
    each physical line of the file (the file is read as text, not parsed as
    CSV). For each matching line the named group ``value`` is recorded; when
    that group is absent or did not participate, the first MAX_ROW_CHARS
    characters of the line are recorded instead. Collection stops once
    MAX_CTX_ROWS hits have been gathered.

    Returns:
        ``(line_index, text)`` pairs, where ``line_index`` is the zero-based
        index of the line in the file (the header line is 0) and ``text`` has
        trailing whitespace removed.
    """
    compiled = re.compile(pattern, re.IGNORECASE | re.MULTILINE)
    hits: List[Tuple[int, str]] = []
    with open(csv_path, "r", encoding="utf-8", errors="replace") as handle:
        for line_index, raw_line in enumerate(handle):
            found = compiled.search(raw_line)
            if found is None:
                continue

            captured = found.groupdict().get("value")
            if captured is None:
                # No usable capture: fall back to a bounded slice of the line.
                snippet = raw_line[:MAX_ROW_CHARS]
            else:
                snippet = captured
            hits.append((line_index, snippet.rstrip()))

            if len(hits) >= MAX_CTX_ROWS:
                break
    return hits

def build_answer_prompt(question: str, matches: List[Tuple[int, str]]) -> str:
    """Compose the user message for the answering step.

    Each ``(row_index, text)`` pair in *matches* becomes one bullet line; an
    empty *matches* is rendered as the placeholder ``(no matches)``.
    """
    if not matches:
        ctx = "(no matches)"
    else:
        bullet_lines = []
        for row_no, cell in matches:
            bullet_lines.append(f"- Row#{row_no}: {cell}")
        ctx = "\n".join(bullet_lines)
    return f"User question:\n{question}\n\nMatched rows (capped):\n{ctx}\n\nAnswer the question using ONLY the matched rows."

# ------------ OpenAI wrappers ------------
def openai_chat_json(system: str, user: str, *, max_tokens: int = 300, temperature: float = 0.1) -> str:
    """Run one chat completion constrained to the ``{"regex": "..."}`` JSON schema.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        max_tokens: Forwarded as the request's ``max_tokens``.
        temperature: Forwarded as the request's ``temperature``.

    Returns:
        The content of the first choice, or "" when that content is empty/None.
    """
    # NOTE(review): some newer models may reject `max_tokens` or a non-default
    # `temperature` -- confirm against the model configured in MODEL.
    regex_schema = {
        "name": "regex_schema",
        "schema": {
            "type": "object",
            "properties": {"regex": {"type": "string"}},
            "required": ["regex"],
            "additionalProperties": False,
        },
        "strict": True,
    }
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_schema", "json_schema": regex_schema},
    )
    content = completion.choices[0].message.content
    return content or ""

def openai_chat(system: str, user: str, *, max_tokens: int = 400, temperature: float = 0.2) -> str:
    """Run one free-form chat completion with a system and a user message.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        max_tokens: Forwarded as the request's ``max_tokens``.
        temperature: Forwarded as the request's ``temperature``.

    Returns:
        The content of the first choice, or "" when that content is empty/None.
    """
    # NOTE(review): some newer models may reject `max_tokens` or a non-default
    # `temperature` -- confirm against the model configured in MODEL.
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    content = completion.choices[0].message.content
    return content or ""


# ------------ Main RAG pipeline ------------
def rag_csv_answer(question: str, csv_path: str) -> str:
    """Answer *question* about the CSV at *csv_path* via an LLM-generated regex.

    Steps:
      1. Build a small context (column names + first three raw lines).
      2. Ask the model for a regex; if none usable comes back, fall back to an
         alternation of literal tokens taken from the question.
      3. Apply the regex to the file and collect the matching rows/values.
      4. Ask the model to answer using only those matches.

    Args:
        question: Natural-language question about the CSV.
        csv_path: Path of the CSV file.

    Returns:
        A JSON string with keys ``regex_used``, ``matches_preview`` (first five
        matches) and ``answer``; or a plain error sentence if the regex request
        fails with an OpenAI error.
    """
    # Keep context small & robust (avoid pasting the full CSV into the prompt).
    cols = header_columns(csv_path)
    head = first_n_raw_lines(csv_path, 3)

    column_context = ", ".join([f'"{c}"' for c in cols]) if cols else "(unknown)"
    csv_context = "\n".join(head) if head else "(empty file)"

    # SYS_INSTRUCT_REGEX ends with a literal JSON example ({"regex": ...}).
    # str.format would parse those braces as a replacement field and raise
    # KeyError, so substitute only the three known placeholders. A single
    # re.sub pass also guarantees that text inserted from the question or the
    # CSV is never re-scanned for placeholders.
    fields = {
        "question": question,
        "column_context": column_context,
        "csv_context": csv_context,
    }
    user_msg = re.sub(
        r"\{(question|column_context|csv_context)\}",
        lambda m: fields[m.group(1)],
        SYS_INSTRUCT_REGEX,
    )

    try:
        raw = openai_chat_json("You must return a single JSON object with a regex.", user_msg,
                               max_tokens=200, temperature=0.1)
    except (APIConnectionError, RateLimitError, OpenAIError) as e:
        print(f"[OpenAI error] {e}", file=sys.stderr)
        return "Could not generate a regex due to an API error."

    rgx = extract_regex(raw)
    if not rgx:
        # very mild fallback: extract literal tokens from question
        tokens = [t for t in re.findall(r"[A-Za-z0-9_./:-]+", question) if len(t) >= 3]
        rgx = "|".join(map(re.escape, tokens)) if tokens else r".*"

    matches = apply_regex_capture(csv_path, rgx)

    # Build a short answer using ONLY matched rows (optional; comment out if you truly only want the regex)
    answer_prompt = build_answer_prompt(question, matches)
    try:
        final = openai_chat(SYS_INSTRUCT_ANSWER, answer_prompt, max_tokens=400, temperature=0.2)
    except (APIConnectionError, RateLimitError, OpenAIError) as e:
        print(f"[OpenAI error] {e}", file=sys.stderr)
        final = "Regex generated, but answering failed due to an API error."

    # Show both regex and the derived answer
    out = {
        "regex_used": rgx,
        "matches_preview": matches[:5],
        "answer": final
    }
    return json.dumps(out, ensure_ascii=False, indent=2)


def main():
    """Command-line entry point.

    Expects exactly two arguments, a question and a CSV path, in either
    order: whichever argument names an existing path is taken as the CSV.
    Prints the result of rag_csv_answer, or exits with status 1 when the
    arguments cannot be resolved to one path and one question.
    """
    if len(sys.argv) != 3:
        print('Usage: OPENAI_API_KEY=... python rag_csv_openai.py "your question" path/to/file.csv', file=sys.stderr)
        sys.exit(1)

    first, second = sys.argv[1], sys.argv[2]
    first_is_path = os.path.exists(first)
    second_is_path = os.path.exists(second)

    # Both arguments exist on disk: cannot tell which one is the question.
    if first_is_path and second_is_path:
        print("Please provide exactly one CSV path and one question.", file=sys.stderr)
        sys.exit(1)

    # Neither argument exists on disk: there is no CSV to read.
    if not first_is_path and not second_is_path:
        print("Could not find a CSV file in the two arguments.", file=sys.stderr)
        sys.exit(1)

    if first_is_path:
        csv_path, question = first, second
    else:
        csv_path, question = second, first

    print(rag_csv_answer(question, csv_path))


if __name__ == "__main__":
    main()


# Usage: OPENAI_API_KEY=... python rag_csv_openai.py "your question" path/to/file.csv
#
#
# SystemExit: 1
#
#   warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
