In [None]:
!pip install langchain_community

In [162]:
## SearXNG

from langchain_community.utilities import SearxSearchWrapper
import pprint

def call_searxng_search(query, num_results=5, categories=['general'],
                        searx_host="http://192.168.8.116:8877"):
    """Run a query against a SearXNG instance and return its raw results.

    Args:
        query: Search string forwarded to ``SearxSearchWrapper.results``.
        num_results: Forwarded as ``num_results``.
        categories: Forwarded as ``categories``. The default list is only
            passed along and never modified here.
        searx_host: Base URL of the SearXNG instance. Defaults to the address
            that was previously hard-coded, so existing calls are unaffected.

    Returns:
        Whatever ``SearxSearchWrapper.results`` returns for the query.
    """
    wrapper = SearxSearchWrapper(searx_host=searx_host)
    return wrapper.results(
        query=query,
        num_results=num_results,
        categories=categories,
        # time_range="day",
    )

In [94]:
# Jina AI Reader

import requests
import json 

def call_jina_reader(url, headers=None, timeout=30):
    """Fetch a page through the Jina AI Reader endpoint and return the parsed JSON.

    Args:
        url: Address of the page to read. The ``https://r.jina.ai/`` prefix is
            prepended unless the value already starts with it.
        headers: Optional mapping of extra request headers. Its entries are
            merged over the defaults below, so they win on conflicts.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled connection would block the kernel indefinitely.

    Returns:
        The response body decoded with ``json.loads``.

    Raises:
        requests.exceptions.HTTPError: Raised by ``raise_for_status`` when the
            response carries an error status.
        requests.exceptions.Timeout: The request did not complete within ``timeout``.
        json.JSONDecodeError: The response body is not valid JSON.
    """
    reader_prefix = "https://r.jina.ai/"
    if not url.startswith(reader_prefix):
        url = f"{reader_prefix}{url}"

    # Defaults sent on every request; caller-supplied headers override them.
    request_headers = {
        # "x-with-generated-alt": "true", # Enable image captioning
        "Accept": "application/json",
        # "x-respond-with": "text",
    }
    if headers:
        request_headers.update(headers)

    response = requests.get(url, headers=request_headers, timeout=timeout)

    # Surface HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    return json.loads(response.text)


In [116]:
# Search Helpers

import re
from dataclasses import dataclass

@dataclass
class SearchResult:
    """A single search hit as carried through the notebook's search helpers."""

    # Number of the hit within its result list.
    Index: int
    # Title text of the hit.
    Title: str
    # Link to the page.
    Url: str
    # Short excerpt supplied with the search result.
    Page_Snippet: str
    # Optional field; stays None unless a value is supplied.
    Page_Content: str = None

def replace_urls(text):
    """Return ``text`` with every http(s) URL replaced by the placeholder ``[*]``.

    A URL here is ``http://`` or ``https://`` followed by a run of
    non-whitespace characters, so punctuation glued to the end of a link
    (for example the closing parenthesis of a Markdown link) is masked
    together with it.
    """
    url_regex = re.compile(r'(https?://\S+)')
    return url_regex.sub('[*]', text)

def parse_results_simple(results):
    """Convert raw search hits into numbered ``SearchResult`` records.

    Args:
        results: Iterable of mappings, each providing the keys ``'title'``,
            ``'link'`` and ``'snippet'``.

    Returns:
        A list of ``SearchResult`` objects in input order, with ``Index``
        counting from 1 and ``Page_Content`` left at its default.

    Raises:
        KeyError: If a hit lacks one of the three keys.
    """
    # Read from the `results` argument, not from any notebook-level variable,
    # so the function works on a fresh kernel and for any result list.
    return [
        SearchResult(
            Index=position,
            Title=hit['title'],
            Url=hit['link'],
            Page_Snippet=hit['snippet'],
        )
        for position, hit in enumerate(results, start=1)
    ]

# def llm_query_simple(parsed_results, search_query):
    # combine parsed results

In [163]:
## Simple Search 
from openai import OpenAI

# query
search_query = "meta ai"

# Search: fetch the raw hits, then wrap them in numbered SearchResult records
search_results = call_searxng_search(search_query, num_results=10)
parsed_results = parse_results_simple(search_results)

# Prompt Generation: a header line followed by one block per hit
prompt_text = f"Query: {search_query} \n\nSearch Results:\n\n" + "".join(
    f"Result: #{result.Index}\nTitle: {result.Title}\nSnippet: {result.Page_Snippet}\n\n"
    for result in parsed_results
)

# Instructions asking the model to reply with a bare JSON object
system_message = """You are an expert assistant at reviewing search results for relevance. Below is a user query and a preliminary list of results. Your job is to give the following information: 
1. Do the search results contain the answer to the user's query?
2. What is the answer to the user's query, based on the search results?
3. Which search results contain the answer to the user's query

Respond with ONLY a JSON object, looking like this: 

{
    "has_answer": "true",
    "answer": "The answer is 42.",
    "citations": [2,3,5]
}
"""

# Query LLM through the OpenAI client pointed at the configured base URL
client = OpenAI(
    base_url='http://192.168.8.116:11434/v1/',
    api_key='ollama',
)
chat_messages = [
    {"role": "system", "content": system_message},
    {'role': 'user', 'content': prompt_text},
]
chat_completion = client.chat.completions.create(
    messages=chat_messages,
    model='llama3.1:8b',
    temperature=0,
)
# The reply text is parsed directly as JSON; anything else raises here.
result_json = json.loads(chat_completion.choices[0].message.content)

# Results: show the parsed reply, then the URL of every cited hit
pprint.pp(result_json)
for result in parsed_results:
    if result.Index not in result_json['citations']:
        continue
    print(f"{result.Index} - {result.Url}")

{'has_answer': 'true',
 'answer': 'Meta AI is an intelligent assistant that can help with various '
           "tasks, from planning to learning, across Meta's apps and the web.",
 'citations': [1, 3, 5]}
1 - https://ai.meta.com/meta-ai/
3 - https://about.fb.com/news/2024/04/meta-ai-assistant-built-with-llama-3/
5 - https://en.wikipedia.org/wiki/Meta_AI


In [None]:
# Advanced Search

# TODO: if answer more complex, investigate urls individually
# TODO: add this level as [deep] search

# Parse results
# The trailing `break` stops after the first hit, so only one page is fetched.
for rank, result in enumerate(search_results, start=1):
    print(f"Result #{rank}: {result['title']}\n")

    # Get page content via the Jina reader and take the text under data -> content
    result_page_content = call_jina_reader(result['link'])
    result_page_text = result_page_content['data']['content']

    # clean page: mask URLs before printing
    page_text_parsed = replace_urls(result_page_text)
    print(page_text_parsed)

    break

# TODO: check if user query can be answered via top search result titles and snippets
# TODO: deeper search within each link, if answer not simple 

# TODO: IDEA ... have LLM provide answer, then search for answer, then compare answers...

In [71]:
# Fetch a single article through call_jina_reader and pretty-print the parsed response.
data = call_jina_reader('https://oscarliang.com/am32-esc-firmware-an-open-source-alternative-to-blheli32/')
pprint.pp(data)

{'code': 200,
 'status': 20000,
 'data': {'title': 'AM32 ESC Firmware - An Open Source Alternative to BLHeli32 '
                   '- Oscar Liang',
          'url': 'https://oscarliang.com/am32-esc-firmware-an-open-source-alternative-to-blheli32/',
          'content': 'If the term AM32 has caught your attention recently, '
                     'you’ve landed in the right place. In this article, we’ll '
                     'dive deep into the world of AM32 ESC firmware – '
                     'understanding its core features, unique advantages, and '
                     'how you can leverage its power for your next FPV drone '
                     'build.\n'
                     '\n'
                     '[![Image 1: '
                     'sponsor-banner](https://oscarliang.com/wp-content/uploads/2024/06/banggood-banner-13-06-2024-1.jpg)](https://bit.ly/49Tvu2I)\n'
                     '\n'
                     '_Some of the links on this page are affiliate links. I '
            