In [1]:
# Dependency setup (run this first)
import sys, subprocess, importlib, datetime

def install(pkg):
    """Install ``pkg`` with pip, using the interpreter that runs this code.

    Announces the package on stdout, then runs
    ``<sys.executable> -m pip install <pkg>`` through ``subprocess.check_call``,
    which raises ``subprocess.CalledProcessError`` when pip exits non-zero.
    """
    print(f"Installing {pkg} ...")
    pip_command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(pip_command)

REQUIRED = ["requests", "beautifulsoup4", "pandas"]  # base libs (pip distribution names)
for pkg in REQUIRED:
    # beautifulsoup4 is the only entry whose import name differs from its pip name.
    module_name = "bs4" if pkg == "beautifulsoup4" else pkg
    try:
        importlib.import_module(module_name)
    except ImportError:
        install(pkg)
        # pip ran in a child process after this interpreter started; refresh the
        # import system's finder caches so the freshly installed package can be
        # found by later imports in this process.
        importlib.invalidate_caches()

# Attempt lxml (preferred) with fallback to html5lib if build/wheel missing.
# PARSERS ends up holding the name of the parser package that is importable.
PARSERS = []
try:
    importlib.import_module("lxml")
    PARSERS.append("lxml")
except ImportError:
    try:
        install("lxml")
        # The package was installed by a child process after this interpreter
        # started; refresh finder caches so the import below can see it.
        importlib.invalidate_caches()
        importlib.import_module("lxml")
        PARSERS.append("lxml")
    except Exception as e:  # deliberately broad: pip failure or an unusable lxml build
        print("Could not install lxml:", e)
        # fallback html5lib
        try:
            importlib.import_module("html5lib")
        except ImportError:
            install("html5lib")
            importlib.invalidate_caches()
            # Verify the fresh install is importable before listing the parser.
            importlib.import_module("html5lib")
        PARSERS.append("html5lib")

import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
# datetime.utcnow() is deprecated since Python 3.12; take an aware UTC "now" and
# drop tzinfo so the printed timestamp keeps its original format (no "+00:00").
print("Dependencies ready at", datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None), "UTC", "Parsers available:", PARSERS)

Dependencies ready at 2025-08-11 14:46:04.834203 UTC Parsers available: ['lxml']


In [5]:
# Sample context for the extraction demo: ten short title-like lines, one per line.
context = """
Yoshihiro Nakata, President and CEO of Toyota Motor Europe
Members of the Board and Corporate Auditors | Toyota Tsusho
Executive Biographies
Toyota Motor Europe - Wikipedia
Toyota Motor Europe announces executive changes
New Chairman Hiroshi Kuriyama now based at European HQ in ...
Toyota in the World | Toyota Europe
Executive Officers｜Corporate Outline｜Corporate Information ...
Executive Bios
Automotive News Europe - Automotive News
"""

# Question whose answer should be extracted from `context`.
test_query = "Toyota European headquarters CEO"


# Use an LLM to extract the information for me and return a string.
# Its job is to pull the answer to the query out of my context.
# Help me design the system prompt and the construction of the user prompt.


### 提取答案的 Prompt 设计

目标：从给定 `context` 行内抽取 `query` 问题需要的最精确答案（通常是姓名或最短实体短语），不得臆造。

System Prompt 约束核心：
- 只能使用 CONTEXT 中出现的原文（可截取行内连续子串，不得改写）。
- 优先输出人物姓名（若问题询问的是 *CEO* 这类角色，且姓名与头衔出现在同一行）。
- 如果无法在 context 中找到明确答案，输出 `NOT_FOUND`。
- 输出只允许一个字符串，无解释、无引号、无额外标点。

User Prompt 模板：
```
CONTEXT:
{context}

QUESTION:
{query}

Answer:
```

实现函数：
1. `build_prompts(context, query)` 返回 (system, user)
2. `extract_answer(context, query, **gen_kwargs)` 调用 LLM 并返回答案。

测试：对 `test_query` 调用 `extract_answer` 后应返回 `Yoshihiro Nakata`；若上下文中没有明确答案，则返回 `NOT_FOUND`。

In [6]:
# Prompt 构建与答案抽取
from typing import Tuple
from llm import chat_once

# System prompt for the extraction model: one instruction fragment per entry,
# joined with single spaces into one line of text.
SYSTEM_PROMPT_BASE = " ".join([
    "You are an extraction engine. Given a CONTEXT (list of short lines) and a QUESTION,",
    "return the minimal exact substring from the context that answers the question. Rules:",
    "1) Only output text that appears verbatim (contiguous characters) in the context.",
    "2) Prefer a person's full name if the question is about a role/title and the name+title appears; return only the name unless title is inseparable.",
    "3) If multiple candidates, choose the most specific (full name over partial).",
    "4) If no answer is explicitly present, output NOT_FOUND.",
    "5) Output must be a single line with no explanation, no quotes, no extra punctuation.",
])

def build_prompts(context: str, query: str) -> Tuple[str, str]:
    """Build the (system, user) prompt pair for one extraction request.

    The user prompt puts the whitespace-stripped ``context`` under a
    ``CONTEXT:`` header and the stripped ``query`` under ``QUESTION:``, with
    blank lines between sections, and ends with a bare ``Answer:`` cue.
    The system prompt is always ``SYSTEM_PROMPT_BASE``.
    """
    sections = (
        "CONTEXT:\n" + context.strip(),
        "QUESTION:\n" + query.strip(),
        "Answer:",
    )
    return SYSTEM_PROMPT_BASE, "\n\n".join(sections)


def extract_answer(context: str, query: str, **gen_kwargs) -> str:
    """Ask the LLM for the answer to ``query`` and return it as a single line.

    Args:
        context: Text the answer must be taken from.
        query: Question to answer.
        **gen_kwargs: Extra keyword arguments forwarded to ``chat_once``. They
            override the defaults used here (``max_tokens=20``, ``temperature=0``).

    Returns:
        The first line of the stripped model reply when it occurs verbatim in
        ``context``; otherwise the sentinel ``"NOT_FOUND"`` (empty or
        whitespace-only reply, an explicit ``NOT_FOUND`` reply, or text that is
        not a substring of the context).
    """
    system, user = build_prompts(context, query)
    # Merge rather than pass both explicitly: a caller-supplied max_tokens or
    # temperature would otherwise raise "got multiple values for keyword argument".
    call_kwargs = {"max_tokens": 20, "temperature": 0, **gen_kwargs}
    reply = chat_once(system, user, **call_kwargs)
    # Also covers whitespace-only replies, where splitlines() is empty and
    # indexing [0] would raise IndexError.
    lines = reply.strip().splitlines() if reply else []
    if not lines:
        return "NOT_FOUND"
    ans = lines[0].strip()
    # Sanity: reject anything that is not copied verbatim from the context.
    if ans != "NOT_FOUND" and ans not in context:
        return "NOT_FOUND"
    return ans

# Demo run: extract the answer to `test_query` from the sample `context`.
# Neither name is defined in this cell; both must already exist in the notebook
# namespace, otherwise the call below fails with NameError.
demo_answer = extract_answer(context, test_query)

print('Query:', test_query)
print('Answer:', demo_answer)



Query: Toyota European headquarters CEO
Answer: Yoshihiro Nakata


In [None]:
# Query for the search pipeline; re-assigns the same string used for the static-context demo.
test_query = "Toyota European headquarters CEO"


In [7]:
# Google 搜索 -> 抽取流水线
import importlib.util, pathlib, textwrap
from pathlib import Path

# The file name contains '-', so a plain `import` statement cannot load it;
# import the module from its path instead.
search_file = Path('google-search-api.py')
if not search_file.exists():
    raise FileNotFoundError('google-search-api.py 不存在，无法执行搜索。')

spec = importlib.util.spec_from_file_location('google_search_api_mod', str(search_file))
# Fail with a clear ImportError rather than an AttributeError on None if no
# spec/loader could be created for the file.
if spec is None or spec.loader is None:
    raise ImportError(f'Cannot create an import spec for {search_file}')
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

# Use the search-result titles as the extraction context.
# NOTE(review): assumes google_search_titles returns the titles as one string,
# possibly empty (or None) on failure - confirm against google-search-api.py.
search_context = mod.google_search_titles(test_query, n=10)
print('--- Raw Search Titles Context ---')
print(search_context or '[EMPTY / FAILED]')

# Treat None like an empty result: the print above already tolerates a falsy
# value, but calling .strip() directly on None would raise AttributeError.
if not (search_context or '').strip():
    answer_from_search = 'NOT_FOUND'
else:
    answer_from_search = extract_answer(search_context, test_query)

print('\nQuery:', test_query)
print('Answer (from search):', answer_from_search)

--- Raw Search Titles Context ---
Yoshihiro Nakata, President and CEO of Toyota Motor Europe
Members of the Board and Corporate Auditors | Toyota Tsusho
Executive Biographies
Toyota Motor Europe - Wikipedia
Toyota Motor Europe announces executive changes
New Chairman Hiroshi Kuriyama now based at European HQ in ...
Toyota in the World | Toyota Europe
Executive Officers｜Corporate Outline｜Corporate Information ...
Executive Bios
Automotive News Europe - Automotive News

Query: Toyota European headquarters CEO
Answer (from search): Yoshihiro Nakata


### 基于 Google 搜索动态抽取

流程：
1. 用 `test_query` 调用 Google Custom Search (只取 title 聚合成 context)。
2. 复用前面 `extract_answer` 进行答案抽取。
3. 若 API key / cx 缺失或搜索为空，则输出 `NOT_FOUND`。

下面代码会：
- 动态加载已有 `google-search-api.py`（文件名含 `-` 用 importlib 方式）。
- 获取 context、打印 context、打印最终答案。