# Google reverse image search for AVerImaTeC claims

Small helper to pick a claim from the configured split by its numeric ID (its position in the split's JSON file), show the claim text and images, and send each image to Google Lens via [SerpAPI](https://serpapi.com/google-lens) for reverse image search. You can also switch to [Serper](https://serper.dev) by setting `SEARCH_BACKEND`, provided each image is reachable at an external URL.

Fill in the config cell below, set `SERPAPI_API_KEY` *or* `SERPER_API_KEY` in your environment (or hardcode it in the config), and run the notebook. Results are printed and also saved as JSON under `logs/ris_results/`.


In [3]:
import html
import json
import mimetypes
import os
from pathlib import Path
from typing import Dict, List
from urllib.parse import quote

from IPython.display import display, HTML
from PIL import Image
import requests

# --- configure here ---
# Root folder that holds `<split>.json` and the `images/` directory.
DATA_ROOT = Path("/mnt/data/factcheck/averimatec")
# Assigned exactly once: this used to be set to "test" and then overwritten
# with "val" a few lines later, so "val" is the value that was in effect.
SPLIT = "val"  # "dev"/"val"/"test" depending on how you name the file
# Zero-based position of the claim inside the split's JSON list.
CLAIM_ID = 0
# When True, images attached to a claim's questions are searched as well.
INCLUDE_QUESTION_IMAGES = True
SEARCH_BACKEND = "serper"  # serpapi | serper
SERPAPI_KEY = os.environ.get("SERPAPI_API_KEY", "")  # put your key here if you prefer
SERPER_KEY = os.environ.get("SERPER_API_KEY", "")    # Serper API key
SERPER_IMAGE_BASE_URL = os.environ.get("SERPER_IMAGE_BASE_URL", "https://fcheck.fel.cvut.cz/images/averimatec")  # e.g., https://your-bucket/images
# Raw search responses are written here as JSON, one file per claim.
OUTPUT_DIR = Path("logs/ris_results")
# -----------------------


In [None]:
def load_claim(split: str, claim_id: int) -> Dict:
    """Load a single claim record from ``DATA_ROOT/<split>.json``.

    Args:
        split: Base name of the split file (without the ``.json`` suffix).
        claim_id: Zero-based position of the claim in the file's top-level list.

    Returns:
        A shallow copy of the selected record with an extra ``"claim_id"`` key
        holding ``claim_id``; the loaded list itself is left untouched.

    Raises:
        FileNotFoundError: If the split file does not exist.
        IndexError: If ``claim_id`` is negative or past the end of the list.
    """
    json_path = DATA_ROOT / f"{split}.json"
    if not json_path.exists():
        raise FileNotFoundError(f"Missing split file at {json_path}")
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would garble non-ASCII claim text on some systems.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if claim_id < 0 or claim_id >= len(data):
        raise IndexError(f"Claim ID {claim_id} is out of range (0-{len(data)-1})")
    claim = data[claim_id].copy()
    claim["claim_id"] = claim_id
    return claim


def collect_images(claim: Dict, include_question_images: bool = True) -> List[Path]:
    """Return the de-duplicated image paths referenced by a claim.

    Image names are read from ``claim["claim_images"]`` and, when
    ``include_question_images`` is true, from the ``"input_images"`` list of
    every entry in ``claim["questions"]``. Each name is resolved under
    ``DATA_ROOT / "images"``.

    Args:
        claim: Claim record as returned by ``load_claim``.
        include_question_images: Also collect images attached to questions.

    Returns:
        Paths in first-seen order with repeats removed. Files are not checked
        for existence here.
    """
    # `or []` tolerates keys that are present but null in the JSON, which
    # `.get(key, [])` alone would pass through as None.
    names = list(claim.get("claim_images") or [])
    if include_question_images:
        for question in claim.get("questions") or []:
            names.extend(question.get("input_images") or [])
    # dict.fromkeys drops repeated paths while preserving insertion order.
    return list(dict.fromkeys(DATA_ROOT / "images" / name for name in names))


# Load the configured claim and gather its image paths for the cells below.
claim = load_claim(SPLIT, CLAIM_ID)
image_paths = collect_images(claim, INCLUDE_QUESTION_IMAGES)

# The claim text comes from the dataset, so escape it before embedding it in
# HTML; otherwise characters such as '<' or '&' would be rendered as markup.
# The fallback is an em dash (it was previously stored as mis-encoded bytes).
claim_text_html = html.escape(str(claim.get('claim_text', '—')))

display(HTML(f"""
<h3>Claim {claim['claim_id']}</h3>
<p><b>claim_text:</b> {claim_text_html}</p>
<p><b>images found:</b> {len(image_paths)} ({'with' if INCLUDE_QUESTION_IMAGES else 'without'} question input images)</p>
"""))


In [None]:
# Preview every collected image inline. Files that the claim references but
# that are absent on disk are reported and skipped rather than raising.
for img_path in image_paths:
    if not img_path.exists():
        print(f"Missing file: {img_path}")
        continue
    print(f"Showing {img_path.name}")
    # The context manager closes the underlying file after the image has been
    # rendered, instead of leaving one open handle per image until GC.
    with Image.open(img_path) as img:
        display(img)


In [None]:
def google_reverse_image_search(image_path: Path, api_key: str = SERPAPI_KEY, timeout: int = 60) -> Dict:
    """Send one image to Google Lens via SerpAPI and return the parsed JSON response.

    Args:
        image_path: Local image file to upload.
        api_key: SerpAPI key; defaults to the value of ``SERPAPI_KEY`` at the
            time this function was defined.
        timeout: Request timeout in seconds, passed to ``requests.post``.

    Returns:
        The response body decoded from JSON.

    Raises:
        RuntimeError: If ``api_key`` is empty.
        FileNotFoundError: If ``image_path`` does not exist.
        requests.HTTPError: If the service answers with an error status.
    """
    if not api_key:
        raise RuntimeError("Missing SERPAPI_API_KEY. Set the env var or populate SERPAPI_KEY in the config cell.")
    if not image_path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")
    # Label the upload with the type implied by the file extension; fall back
    # to the previously hard-coded JPEG when the extension is unknown.
    content_type = mimetypes.guess_type(image_path.name)[0] or "image/jpeg"
    params = {
        "engine": "google_lens",
        "api_key": api_key,
        "hl": "en",
    }
    # NOTE(review): SerpAPI's Google Lens engine appears to be documented with
    # a `url` parameter; confirm that a multipart file upload is accepted.
    # Open the file in a context manager so the handle is closed even when the
    # request raises; the response body is fully read before the block exits.
    with open(image_path, "rb") as image_file:
        files = {"file": (image_path.name, image_file, content_type)}
        resp = requests.post("https://serpapi.com/search.json", params=params, files=files, timeout=timeout)
    resp.raise_for_status()
    return resp.json()


def serper_reverse_image_search(image_url: str, api_key: str = SERPER_KEY, timeout: int = 60) -> Dict:
    """Query Serper with a publicly reachable image URL and return the decoded JSON.

    Args:
        image_url: URL of the image; it is sent to the service as ``imageUrl``.
        api_key: Serper key, sent in the ``X-API-KEY`` header; defaults to the
            value of ``SERPER_KEY`` at the time this function was defined.
        timeout: Request timeout in seconds, passed to ``requests.post``.

    Raises:
        RuntimeError: If ``api_key`` is empty.
        requests.HTTPError: If the service answers with an error status.
    """
    if api_key:
        # NOTE(review): Serper seems to expose a dedicated Lens endpoint;
        # confirm that `/images` accepts this payload as intended.
        response = requests.post(
            "https://google.serper.dev/images",
            headers={"X-API-KEY": api_key, "Content-Type": "application/json"},
            json={"engine": "google_lens", "imageUrl": image_url, "hl": "en"},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    raise RuntimeError("Missing SERPER_API_KEY. Set the env var or populate SERPER_KEY in the config cell.")


def make_serper_url(img_path: Path) -> str:
    """Build the public URL under ``SERPER_IMAGE_BASE_URL`` for a local image.

    Only the file name of ``img_path`` is used; it is appended to the base URL
    after stripping any trailing slash from the base.

    Raises:
        RuntimeError: If ``SERPER_IMAGE_BASE_URL`` is empty.
    """
    if not SERPER_IMAGE_BASE_URL:
        raise RuntimeError("Serper needs an externally reachable image URL. Set SERPER_IMAGE_BASE_URL to where the images are hosted (e.g., an S3 bucket).")
    # Percent-encode the file name so that spaces, '#', '?' and non-ASCII
    # characters yield a valid URL; plain names pass through unchanged.
    return f"{SERPER_IMAGE_BASE_URL.rstrip('/')}/{quote(img_path.name)}"


def reverse_image_search(img_path: Path) -> Dict:
    """Run a reverse image search for ``img_path`` with the configured backend.

    ``SEARCH_BACKEND`` selects the implementation: ``"serpapi"`` uploads the
    local file, while ``"serper"`` first maps the file to its public URL.

    Raises:
        ValueError: If ``SEARCH_BACKEND`` is neither of the two known values.
    """
    backend = SEARCH_BACKEND
    if backend == "serper":
        return serper_reverse_image_search(make_serper_url(img_path))
    if backend == "serpapi":
        return google_reverse_image_search(img_path)
    raise ValueError(f"Unknown SEARCH_BACKEND: {backend}")


In [None]:
# Search every image that exists on disk, print a three-item preview of each
# response, then write all raw responses for the claim to one JSON file.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
all_results = []

# Images missing on disk are skipped without a message here.
for img_path in (candidate for candidate in image_paths if candidate.exists()):
    print(f"\nRunning {SEARCH_BACKEND} reverse image search for {img_path.name}...")
    result = reverse_image_search(img_path)
    all_results.append({"image": img_path.name, "result": result})

    # Each list is looked up under two alternative key names and trimmed to
    # its first three entries.
    visuals = (result.get("visual_matches") or result.get("images_results") or [])[:3]
    organic = (result.get("organic_results") or result.get("organic") or [])[:3]
    sections = (
        (f"Top {len(visuals)} visual matches:", visuals),
        (f"Top {len(organic)} web results:", organic),
    )
    for heading, entries in sections:
        print(heading)
        for rank, entry in enumerate(entries, start=1):
            # Prefer the title, then the link, then a fixed placeholder.
            title = entry.get("title") or entry.get("link") or "(no title)"
            print(f"  {rank}. {title}")

save_path = OUTPUT_DIR / f"claim_{claim['claim_id']}_ris.json"
with save_path.open("w") as f:
    json.dump({"claim": claim, "results": all_results}, f, indent=2)
print(f"\nSaved raw responses to {save_path}")
