In [24]:
import requests

# Quick smoke test: send one query to a SearXNG instance and print what comes back.

# Full search endpoint of the SearXNG instance (note: includes the '/search' path)
base_url = "http://192.168.1.63:4000/search"

# Query-string parameters sent to SearXNG
params = {
    'q': 'latest tech news',  # Example query
    'category_general': '1',  # General search category
    'language': 'en',  # Language: English
    'safesearch': '0',  # SafeSearch disabled
    'theme': 'simple',  # Use simple theme (less relevant for JSON format)
    'format': 'json',  # Request JSON format
}

# Request headers with a browser-like user-agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

print(f"Searching SearXNG for: {params['q']}...")

# Bind the name up front so the error handlers can inspect it even when
# requests.get() itself raised and never produced a response object.
response = None
try:
    # Send the GET request to SearXNG (with a timeout so a dead host cannot hang the cell)
    response = requests.get(base_url, params=params, headers=headers, timeout=15)
    response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

    # Parse the JSON response
    search_results = response.json()

    # Test for non-empty lists rather than key presence: an empty 'results'
    # list must not hide infoboxes that came back alongside it.
    if search_results.get('results'):
        print(f"\nFound {len(search_results['results'])} results.\n")
        for i, result in enumerate(search_results['results']):
            title = result.get('title', 'No title found')
            url = result.get('url', 'No URL found')
            # SearXNG delivers the descriptive snippet under the 'content' key
            content_snippet = result.get('content', 'No content snippet provided by SearXNG.')

            print(f"--- Result {i+1} ---")
            print(f"Title: {title}")
            print(f"URL: {url}")
            print(f"SearXNG Content Snippet: {content_snippet}\n")

    elif search_results.get('infoboxes'):
        print("\nFound infoboxes instead of standard results:")
        for i, box in enumerate(search_results['infoboxes']):
            print(f"--- Infobox {i+1} ---")
            print(f"Infobox Engine: {box.get('engine')}")
            print(f"Content: {box.get('content')}")  # Often contains structured HTML
            if box.get('urls'):
                print("Associated URLs:")
                for u in box.get('urls'):
                    print(f"  - {u.get('title')}: {u.get('url')}")
            print("-" * 10)

    else:
        print("No results or infoboxes found in the SearXNG JSON response.")
        # Optional: print part of the response for debugging if results are empty
        # print("Response details:", response.text[:500])

# Handle potential errors during the request to SearXNG.
# Order matters: specific exception classes must come before RequestException.
except requests.exceptions.Timeout:
    print(f"Error: Request to SearXNG instance timed out ({base_url}). Is it running and accessible?")
except requests.exceptions.ConnectionError:
    print(f"Error: Could not connect to SearXNG instance at {base_url}. Is the URL correct and the service running?")
except requests.exceptions.JSONDecodeError:
    # JSONDecodeError derives from RequestException, so it has to be handled
    # first or this branch would never run.
    print("Error: Failed to decode JSON response from SearXNG.")
    if response is not None:
        print("Response Text:", response.text[:500])  # Print first 500 chars
except requests.exceptions.RequestException as e:
    # Catches other request errors, including bad status codes raised by raise_for_status()
    print(f"Error during SearXNG request: {e}")
    # Compare against None explicitly: a Response is falsy for 4xx/5xx statuses,
    # which is exactly when the body is worth showing.
    error_response = e.response if e.response is not None else response
    if error_response is not None:
        print("Response Text:", error_response.text[:500])  # Print first 500 chars
except Exception as e:
    # Catch any other unexpected errors
    print(f"An unexpected error occurred: {e}")

Searching SearXNG for: latest tech news...

Found 33 results.

--- Result 1 ---
Title: Reuters Tech News | Today's Latest Technology News | Reuters
URL: https://www.reuters.com/technology
SearXNG Content Snippet: Technology · Musk's xAI buys Musk's X social media platform for $33 billion · Exclusive: Blackstone mulls small stake in US TikTok spinoff, sources say.

--- Result 2 ---
Title: Tech - The Verge
URL: https://www.theverge.com/tech
SearXNG Content Snippet: 4 days ago - The latest tech news about the world’s best (and sometimes worst) hardware, apps, and much more. From top companies like Google and Apple to tiny startups vying for your attention, Verge Tech has the latest in what matters in technology daily.

--- Result 3 ---
Title: WIRED - The Latest in Technology, Science, Culture and Business | WIRED
URL: https://www.wired.com
SearXNG Content Snippet: WIRED · Special Edition · Most Recent · Today's Picks · DOGE Plans to Rebuild SSA Codebase in Months, Risking Benefits and Sys

In [26]:
# Import necessary libraries
import requests
import json
from typing import List, Dict, Union, Optional

# Fallback HTTP headers: search_searxng uses these whenever the caller does not
# pass its own 'headers' mapping.
DEFAULT_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def search_searxng(
    query: str,
    base_url: str,
    language: str = 'en',
    safesearch: int = 0, # 0: none, 1: moderate, 2: strict
    max_results: Optional[int] = 10, # Optional: Limit results client-side
    timeout: int = 15,
    headers: Optional[Dict[str, str]] = None,
    categories: str = 'general' # Use 'general', 'news', 'images', etc.
) -> Dict[str, Union[List[Dict[str, str]], str]]:
    """
    Query a SearXNG instance and return its results in a compact, uniform shape.

    Intended to be exposed as a tool to an LLM agent: it never raises for
    request or parsing failures, reporting them through an 'error' entry instead.

    Args:
        query: The search query string.
        base_url: Root URL of the SearXNG instance (e.g. "http://127.0.0.1:8080").
                  '/search' is appended here, so do NOT include it yourself.
        language: Language code forwarded to SearXNG (e.g. 'en', 'de'). Defaults to 'en'.
        safesearch: SafeSearch level (0=off, 1=moderate, 2=strict). Sent as a string.
        max_results: Upper bound on the number of returned entries, applied
                     client-side to standard results and infoboxes combined.
                     None disables the cap; negative values are treated as 0.
        timeout: Timeout in seconds for the HTTP request. Defaults to 15.
        headers: Optional HTTP headers used instead of DEFAULT_HEADERS.
        categories: SearXNG category to search in (e.g. 'general', 'news').

    Returns:
        Either {'results': [...]} where every entry has 'title', 'url' and
        'snippet' keys (the list may be empty), or {'error': '<message>'}
        describing what went wrong (timeout, connection failure, HTTP error
        status, undecodable JSON, or any other unexpected exception).
    """
    if not base_url:
        # Essential configuration is missing
        return {"error": "SearXNG base URL is not configured or provided."}

    # Strip any trailing slash before appending the endpoint path
    search_endpoint = f"{base_url.rstrip('/')}/search"
    search_headers = headers if headers else DEFAULT_HEADERS

    # Query-string parameters for the GET request
    params = {
        'q': query,
        'categories': categories, # Use 'categories' (standard for SearXNG)
        'language': language,
        'safesearch': str(safesearch), # SearXNG expects safesearch level as a string
        'format': 'json', # Request JSON format for easy parsing
        # 'pageno': 1, # Could add pagination support later if needed
        # 'theme': 'simple', # Theme is irrelevant for JSON format
    }

    print(f"DEBUG: Querying SearXNG endpoint: {search_endpoint}")
    print(f"DEBUG: Using parameters: {params}")

    response = None # Pre-bound so the error handlers can always reference it
    try:
        # Normalise the cap once. Clamping negatives to 0 avoids the surprising
        # "drop the last n items" meaning a negative slice bound would have.
        result_limit = None if max_results is None else max(0, max_results)

        response = requests.get(
            search_endpoint,
            params=params,
            headers=search_headers,
            timeout=timeout
        )
        # Raise an HTTPError exception for bad status codes (4xx or 5xx)
        response.raise_for_status()

        search_data = response.json()

        formatted_results = []

        # --- Standard 'results' ---
        if 'results' in search_data and isinstance(search_data['results'], list):
            results_list = search_data['results']
            if result_limit is not None:
                results_list = results_list[:result_limit]

            for result in results_list:
                # Fall back to placeholder text when a key is missing
                formatted_results.append({
                    "title": result.get('title', 'No title provided'),
                    "url": result.get('url', 'No URL provided'),
                    # 'content' usually holds the descriptive snippet in SearXNG JSON
                    "snippet": result.get('content', 'No snippet available.')
                })

        # --- 'infoboxes' (e.g. knowledge-graph panels), shaped like standard results ---
        if 'infoboxes' in search_data and isinstance(search_data['infoboxes'], list):
            for box in search_data['infoboxes']:
                # Check the cap BEFORE appending so the combined list can never
                # exceed max_results, even when standard results already filled it.
                if result_limit is not None and len(formatted_results) >= result_limit:
                    break

                title = f"Infobox ({box.get('engine', 'Source Unknown')})"
                content = box.get('content', 'No content in infobox.') # Often HTML, LLM might handle it
                infobox_url = None
                # Use the first associated link as the infobox's primary URL, if any
                if box.get('urls') and isinstance(box['urls'], list) and box['urls']:
                    infobox_url = box['urls'][0].get('url')

                formatted_results.append({
                    "title": title,
                    "url": infobox_url or box.get('infobox_url', 'No URL for infobox'), # Fallback check
                    "snippet": content # Provide the raw content, often HTML
                })

        # --- Nothing to return ---
        if not formatted_results:
            # Unresponsive engines are a common reason for an empty page; log them
            if search_data.get('unresponsive_engines'):
                print(f"DEBUG: No results found. Unresponsive engines: {search_data['unresponsive_engines']}")
            else:
                print("DEBUG: No results or infoboxes found in the response.")
            # An empty list still counts as a successful search
            return {"results": []}

        return {"results": formatted_results}

    # --- Error Handling (most specific exception classes first) ---
    except requests.exceptions.Timeout:
        error_msg = f"Error: Request to SearXNG timed out after {timeout} seconds ({search_endpoint}). The instance might be down or slow."
        print(error_msg)
        return {"error": error_msg}
    except requests.exceptions.ConnectionError:
        error_msg = f"Error: Could not connect to SearXNG instance at {search_endpoint}. Check the URL and ensure the service is running and accessible."
        print(error_msg)
        return {"error": error_msg}
    except requests.exceptions.HTTPError as e:
        # Handle errors like 404 Not Found, 500 Internal Server Error, etc.
        error_msg = f"Error: SearXNG request failed with HTTP status code {e.response.status_code}."
        print(error_msg)
        # Include a bounded slice of the response body to aid debugging
        try:
            error_details = e.response.text[:500] # Limit details length
            error_msg += f" Response snippet: {error_details}"
            print(f"Response snippet: {error_details}")
        except Exception:
            pass # Best effort only: the status-code message above is still returned
        return {"error": error_msg}
    except requests.exceptions.JSONDecodeError:
        error_msg = "Error: Failed to decode the JSON response from SearXNG. The instance might have returned invalid JSON or HTML error page."
        print(error_msg)
        # Explicit None check: a Response object's truthiness reflects its HTTP
        # status, not whether a response exists.
        if response is not None:
            print("Response Text Snippet:", response.text[:500])
        return {"error": error_msg}
    except requests.exceptions.RequestException as e:
        # Catch any other exceptions from the 'requests' library
        error_msg = f"Error: An unexpected error occurred during the SearXNG request: {e}"
        print(error_msg)
        return {"error": error_msg}
    except Exception as e:
        # Catch any other unexpected Python errors during processing
        error_msg = f"An unexpected error occurred: {e}"
        import traceback
        print(error_msg)
        print(traceback.format_exc()) # Print stack trace for unexpected errors
        return {"error": error_msg}

# --- Example Usage (for Jupyter Notebook or manual testing) ---

# !!! IMPORTANT: Replace with the actual URL of YOUR SearXNG instance !!!
# Examples:
my_searxng_url = "http://192.168.1.63:4000" # Your local instance
# my_searxng_url = "https://searx.space/searxng-instance-url" # A public instance (check its terms)
# my_searxng_url = None # Set to None or "" to test the base_url error handling

# --- Example Tool Schema for OpenAI Agents SDK (or similar frameworks) ---
# This schema tells the LLM how to use the search function. It is defined at
# module level (outside the __main__ guard) so that code importing this module
# can register the tool as well; only the demo printing below is guarded.
tool_schema = {
    "type": "function",
    "function": {
        "name": "search_searxng", # Must match your function name
        "description": "Performs a web search using a private SearXNG instance to find up-to-date information or answer questions about recent events. Returns a list of search results including title, URL, and snippet.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The specific search query string to search the web for.",
                },
                "language": {
                    "type": "string",
                    "description": "Optional language code for the search (e.g., 'en' for English, 'de' for German). Defaults to 'en'.",
                    "default": "en"
                },
                "categories": {
                    "type": "string",
                    "description": "Optional category to search within (e.g., 'general', 'news', 'science', 'images'). Defaults to 'general'.",
                    "default": "general"
                }
                # Add other parameters like 'safesearch' or 'max_results' here
                # if you want the LLM to be able to control them.
            },
            "required": ["query"], # Only 'query' is exposed as required to the LLM
        },
    }
}

if __name__ == "__main__": # Only run the demo when executed directly (always true in a notebook)
    print("--- Testing SearXNG Search Function ---")

    test_query = "What are the latest developments in quantum computing?"
    # test_query = "python requests library documentation" # Another query example
    # test_query = "news about renewable energy" # Example for 'news' category

    if my_searxng_url:
        print(f"\nSearching for: '{test_query}' using base URL: {my_searxng_url}\n")

        # Call the function
        search_result_data = search_searxng(
            query=test_query,
            base_url=my_searxng_url,
            language='en',
            max_results=5, # Ask for fewer results for cleaner test output
            categories='general' # Or try 'news'
        )

        print("\n--- Function Return Value ---")
        # Pretty print the JSON-like dictionary output
        print(json.dumps(search_result_data, indent=2))

        # Example of how an agent might use the result:
        if "results" in search_result_data:
            print(f"\n--- Agent Processing Example (Found {len(search_result_data['results'])} results) ---")
            if search_result_data["results"]:
                for i, res in enumerate(search_result_data["results"]):
                    print(f"Result {i+1}:")
                    print(f"  Title: {res['title']}")
                    print(f"  URL: {res['url']}")
                    # 'or ""' guards against a None snippet (e.g. an infobox whose content is null)
                    print(f"  Snippet: {(res['snippet'] or '')[:150]}...") # Show truncated snippet
                    print("-" * 10)
            else:
                print("The search was successful, but no results were found for the query.")
        elif "error" in search_result_data:
            print("\n--- Agent Processing Example (Error Occurred) ---")
            print(f"Search failed: {search_result_data['error']}")
    else:
        print("\nSkipping search test because 'my_searxng_url' is not set.")
        # Test the error handling for missing URL
        print("\n--- Testing Missing URL Error Handling ---")
        error_result = search_searxng(query="test", base_url=my_searxng_url)
        print(json.dumps(error_result, indent=2))

    print("\n--- Example Tool Schema for LLM Agent ---")
    print(json.dumps(tool_schema, indent=2))

--- Testing SearXNG Search Function ---

Searching for: 'What are the latest developments in quantum computing?' using base URL: http://192.168.1.63:4000

DEBUG: Querying SearXNG endpoint: http://192.168.1.63:4000/search
DEBUG: Using parameters: {'q': 'What are the latest developments in quantum computing?', 'categories': 'general', 'language': 'en', 'safesearch': '0', 'format': 'json'}

--- Function Return Value ---
{
  "results": [
    {
      "title": "Quantum Computers News -- ScienceDaily",
      "url": "https://www.sciencedaily.com/news/computers_math/quantum_computers",
      "snippet": "Quantum Computer Research. Read the latest news in developing quantum computers. ... Major Development Successes in Diamond Spin Photon Quantum Computers; Thursday, October 24, 2024."
    },
    {
      "title": "The latest developments in quantum computing: A transformative frontier | Open Access Government",
      "url": "https://www.openaccessgovernment.org/the-latest-developments-in-quantum-