In [1]:
import os
import time
import requests
from firebase import db
from urllib.parse import urlparse
from dotenv import find_dotenv, load_dotenv
from datetime import datetime

In [2]:
# ——— ENV & Firebase Init ———
dotenv = find_dotenv()
if dotenv:
    load_dotenv(dotenv)

In [3]:
# Search API config
SEARXNG_URL    = os.getenv("SEARXNG_URL")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX      = os.getenv("GOOGLE_CX")

if not SEARXNG_URL or not GOOGLE_API_KEY or not GOOGLE_CX:
    raise ValueError("Missing one of SEARXNG_URL, GOOGLE_API_KEY, GOOGLE_CX in .env")

In [None]:
# ——— Search Helpers ———

def searxng_search(query, num=300):
    try:
        r = requests.get(
            f"{SEARXNG_URL}/search",
            params={"q": query, "format": "json", "categories": "general", "limit": num},
            timeout=10
        )
        r.raise_for_status()
        hits = r.json().get("results", [])[:num]
        return [
            {"title": h.get("title",""), "snippet": h.get("content",""), "url": h.get("url","")}
            for h in hits if h.get("url")
        ]
    except Exception:
        return []

def google_search(query, num=300):
    results = []
    start = 1
    while len(results) < num:
        batch = min(10, num - len(results))
        resp = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={"key": GOOGLE_API_KEY, "cx": GOOGLE_CX, "q": query, "start": start, "num": batch},
            timeout=10
        ).json()
        items = resp.get("items", [])
        if not items:
            break
        for i in items:
            results.append({"title": i["title"], "snippet": i.get("snippet",""), "url": i["link"]})
            if len(results) >= num:
                break
        start += len(items)
        time.sleep(1)  # small pause to respect rate limits
    return results

In [6]:
# ——— Parameters ———
QUERIES = [
    # Local & lifestyle
    "weather forecast for Toronto",
    "best Italian restaurants near me",
    "public library hours near me",
    "movie showtimes near me tonight",
    "electric car charging stations near me",
    "public transit schedule for Chicago",
    "flight status for AA100",
    "cheap hotel deals in London",
    "top 10 tourist attractions in Paris",
    "hiking trails in Colorado",

    # How-tos & tutorials
    "how to tie a tie",
    "how to bake sourdough bread",
    "home workout routines for beginners",
    "how to change a tire",
    "how to meditate for stress relief",
    "how to knot a bowline",
    "how to troubleshoot Wi-Fi connectivity",
    "how to start a garden",
    "how to reset a Windows 10 password",
    "how to make coffee at home",
    "how to knit a scarf",

    # Health & wellness
    "symptoms of the common cold",
    "symptoms of depression",
    "calorie count for banana",
    "healthy smoothie recipes",
    "home remedies for headache",

    # Education & careers
    "python tutorial for beginners",
    "free online courses for data science",
    "job interview tips for software engineers",
    "how to write a cover letter",
    "college scholarships for international students",
    "best coding bootcamps 2025",
    "list of universities in Canada",

    # Tech & shopping
    "latest smartphone reviews",
    "best VPN services",
    "organic vs non-GMO foods",
    "DIY home repair tips",

    # Lists & directories
    "list of hospitals in Ontario",
    "list of national parks in US",
    "list of programming languages in demand",
    "list of museums in New York City"
]
TARGET_COUNT =10000 # Total number of results to store

# ——— Main Collection Handle ———

rank_col = db.collection("ranking_training_data")

# ——— Fetch & Store Loop ———

seen_urls = set()
count     = 0

for q in QUERIES:
    if count >= TARGET_COUNT:
        break

    print(f"▶️ Querying: {q!r}")
    # combine both engines
    for fn, src in ((searxng_search, "searxng"), (google_search, "google")):
        results = fn(q, num=TARGET_COUNT//len(QUERIES))
        for r in results:
            url = r["url"].strip()
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)

            # write to Firestore
            rank_col.add({
                "query": q,
                "title": r["title"],
                "snippet": r["snippet"],
                "url": url,
                "source": src,
                "timestamp": datetime.utcnow()
            })
            count += 1

            if count % 50 == 0:
                print(f"  → Stored {count} so far…")

            if count >= TARGET_COUNT:
                break
        if count >= TARGET_COUNT:
            break

print(f"✅ Completed. Total stored: {count}")

▶️ Querying: 'weather forecast for Toronto'


  "timestamp": datetime.utcnow()


  → Stored 50 so far…
  → Stored 100 so far…
▶️ Querying: 'best Italian restaurants near me'
  → Stored 150 so far…
  → Stored 200 so far…
  → Stored 250 so far…
▶️ Querying: 'public library hours near me'
  → Stored 300 so far…
  → Stored 350 so far…
▶️ Querying: 'movie showtimes near me tonight'
  → Stored 400 so far…
  → Stored 450 so far…
  → Stored 500 so far…
▶️ Querying: 'electric car charging stations near me'
  → Stored 550 so far…
  → Stored 600 so far…
  → Stored 650 so far…
▶️ Querying: 'public transit schedule for Chicago'
  → Stored 700 so far…
  → Stored 750 so far…
▶️ Querying: 'flight status for AA100'
  → Stored 800 so far…
▶️ Querying: 'cheap hotel deals in London'
  → Stored 850 so far…
  → Stored 900 so far…
  → Stored 950 so far…
▶️ Querying: 'top 10 tourist attractions in Paris'
  → Stored 1000 so far…
  → Stored 1050 so far…
▶️ Querying: 'hiking trails in Colorado'
  → Stored 1100 so far…
  → Stored 1150 so far…
  → Stored 1200 so far…
▶️ Querying: 'how to tie a