# In [None]:
import logging
import asyncio
import aiohttp
from typing import List, Dict
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from newspaper import Article
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from html2text import HTML2Text
import re
import time
import random
import gzip

# Configure the root logger once: INFO level, "timestamp - level - message" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the classes in this file.
logger = logging.getLogger(__name__)

class SearchAPI:
    """Async client for a single JSON search endpoint.

    Each instance wraps one provider (identified by ``name``), keeps its
    requests at least ``MIN_REQUEST_INTERVAL`` seconds apart and counts the
    successful requests it has made in ``used``.
    """

    # Minimum number of seconds between two consecutive requests.
    MIN_REQUEST_INTERVAL = 2

    def __init__(self, name, base_url, params, user_agent_rotator, results_path, quota=None):
        """Store the endpoint configuration and reset the usage counters.

        Args:
            name: Provider label; ``'Google'`` gets special handling in
                :meth:`search`.
            base_url: Endpoint URL that is requested with GET.
            params: Base query parameters; copied for every request.
            user_agent_rotator: Object whose ``random`` attribute yields the
                User-Agent header value (presumably a
                ``fake_useragent.UserAgent`` -- confirm against the caller).
            results_path: Top-level key of the JSON response that holds the
                list of result items.
            quota: Optional maximum number of successful requests this
                instance may make; ``None`` (the default) means unlimited.
        """
        self.name = name
        self.base_url = base_url
        self.params = params
        self.user_agent_rotator = user_agent_rotator
        self.results_path = results_path
        self.quota = quota
        self.used = 0
        self.last_request_time = 0

    def is_within_quota(self) -> bool:
        """Return True while this API may still be queried.

        ``SearchManager.search`` calls this before every request; without a
        configured ``quota`` the API is always considered available.
        """
        return self.quota is None or self.used < self.quota

    async def search(self, session, query: str, num_results: int) -> List[Dict]:
        """Query the endpoint and return normalised result dicts.

        Args:
            session: Object providing an aiohttp-style ``get(...)`` async
                context manager (normally an ``aiohttp.ClientSession``).
            query: Search string, sent as the ``q`` parameter.
            num_results: Requested number of hits, sent as ``num``.

        Returns:
            A list of ``{'title', 'url', 'snippet'}`` dicts, or ``[]`` when
            the request fails, times out or the body is not valid JSON.
        """
        await self.respect_rate_limit()
        logger.info(f"Searching {self.name} for: {query}")
        params = self.params.copy()
        params['q'] = query
        # Google requests are capped at 10 results (presumably the API's
        # per-request maximum -- confirm against the provider docs).
        params['num'] = min(num_results, 10) if self.name == 'Google' else num_results
        headers = {'User-Agent': self.user_agent_rotator.random}
        try:
            async with session.get(self.base_url, params=params, headers=headers, timeout=10) as response:
                response.raise_for_status()
                self.used += 1
                data = await response.json()
        except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as e:
            # Timeouts surface as asyncio.TimeoutError and undecodable bodies
            # as ValueError; neither derives from aiohttp.ClientError.
            logger.error(f"Error during {self.name} search: {e}")
            return []
        finally:
            # Failed attempts count for rate limiting too, so a broken
            # endpoint is not hammered without any delay.
            self.last_request_time = time.time()

        return [
            {
                'title': item.get('title') or "No title",
                'url': item.get('link') or item.get('url'),
                'snippet': item.get('snippet') or "No snippet",
            }
            for item in data.get(self.results_path, [])
        ]

    async def respect_rate_limit(self):
        """Sleep until ``MIN_REQUEST_INTERVAL`` seconds have passed since the last request."""
        # Read the clock once so the check and the sleep use the same value.
        remaining = self.MIN_REQUEST_INTERVAL - (time.time() - self.last_request_time)
        if remaining > 0:
            await asyncio.sleep(remaining)

class WebContentExtractor:
    """Extract the readable text of a web page.

    ``extract_content`` tries three strategies in order -- a plain HTTP
    fetch, ``newspaper``'s ``Article`` and finally a headless Edge browser --
    and returns the first text that is at least ``MIN_CONTENT_LENGTH``
    characters long. The Selenium driver is created lazily and shared at
    class level; call ``quit_driver`` to release it.
    """

    MAX_RETRIES = 2          # attempts made by the plain HTTP strategy
    TIMEOUT = 10             # seconds, used for HTTP requests and the Selenium wait
    MIN_CONTENT_LENGTH = 200  # shorter extractions are treated as failures
    _driver = None           # lazily created, shared Selenium driver

    # Compiled once instead of on every extraction.
    _WHITESPACE_RE = re.compile(r'\s+')
    _CONTENT_DIV_CLASS_RE = re.compile(
        r'content|main-content|post-content|body|main-body|body-content|main', re.IGNORECASE)

    @classmethod
    def _initialize_driver(cls):
        """Create the shared headless Edge driver if it does not exist yet.

        NOTE(review): ``USER_AGENTS`` and ``stealth`` are not defined in this
        class; they must be provided elsewhere in the module (``stealth`` is
        presumably ``selenium_stealth.stealth`` -- confirm).
        """
        if cls._driver is not None:
            return
        edge_options = Options()
        edge_options.add_argument("--headless=new")
        edge_options.add_argument("--disable-gpu")
        edge_options.add_argument("--no-sandbox")
        user_agent = random.choice(USER_AGENTS)
        edge_options.add_argument(f"user-agent={user_agent}")
        cls._driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=edge_options)
        stealth(cls._driver, languages=["en-US", "en"], vendor="Google Inc.", platform="Win32", webgl_vendor="Intel Inc.", renderer="Angle", fix_hairline=True)

    @classmethod
    def extract_with_selenium(cls, url: str) -> str:
        """Load ``url`` in the shared browser and return its whitespace-collapsed text.

        Returns ``""`` on any failure; the error is logged.

        NOTE(review): ``CONTENT_CLASS_PATTERN`` is expected to be defined
        elsewhere in the module.
        """
        try:
            cls._initialize_driver()
            cls._driver.get(url)
            WebDriverWait(cls._driver, cls.TIMEOUT).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            html_content = cls._driver.page_source
            soup = BeautifulSoup(html_content, 'html.parser')
            main_content = soup.find(['div', 'main', 'article'], class_=CONTENT_CLASS_PATTERN) or soup.body
            main_text = main_content.get_text(separator=' ', strip=True) if main_content else ''
            return cls._WHITESPACE_RE.sub(' ', main_text)
        except Exception as e:
            # Use the module logger like every other method of this class.
            logger.error(f"Selenium extraction failed for {url}: {e}")
            return ""

    @classmethod
    def extract_content(cls, url: str) -> str:
        """Return the page text of ``url``, or ``""`` if no strategy yields enough text.

        Args:
            url: Absolute URL; anything without scheme and host is rejected.
        """
        if not cls.is_valid_url(url):
            logger.error(f"Invalid URL: {url}")
            return ""
        # Cheapest strategy first; the browser is the last resort.
        for extractor in (cls._extract_with_requests, cls._extract_with_newspaper, cls.extract_with_selenium):
            text = extractor(url) or ""  # tolerate a strategy returning None
            if len(text.strip()) >= cls.MIN_CONTENT_LENGTH:
                return text
        return ""

    @classmethod
    def _extract_with_requests(cls, url: str) -> str:
        """Fetch ``url`` over HTTP and extract text from the HTML.

        Makes up to ``MAX_RETRIES`` attempts with exponential back-off and
        returns ``""`` for non-HTML responses or when every attempt fails.

        NOTE(review): ``requests`` and ``get_headers`` are expected to be
        available from elsewhere in the module.
        """
        for attempt in range(1, cls.MAX_RETRIES + 1):
            try:
                response = requests.get(url, headers=get_headers(), timeout=cls.TIMEOUT)
                response.raise_for_status()
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    logger.warning(f"Non-HTML content returned for {url}: {content_type}")
                    return ""
                # requests already undoes gzip/deflate content encoding, so
                # response.text is the decoded page; decompressing
                # response.content again would always fail.
                soup = BeautifulSoup(response.text, 'html.parser')
                return cls._extract_content_from_soup(soup)
            except requests.exceptions.RequestException as e:
                if attempt < cls.MAX_RETRIES:
                    logger.warning(f"Error with requests for {url} (attempt {attempt}): {e}. Retrying...")
                    time.sleep(2 ** attempt)
                else:
                    logger.warning(f"Error with requests for {url} after {cls.MAX_RETRIES} attempts: {e}. Giving up.")
                    return ""
        # Reached only when MAX_RETRIES <= 0; keep the str return contract.
        return ""

    @classmethod
    def _extract_with_newspaper(cls, url: str) -> str:
        """Download and parse ``url`` with ``newspaper.Article``; ``""`` on failure."""
        try:
            article = Article(url)
            article.download()
            article.parse()
            return article.text
        except Exception as e:
            logger.warning(f"Newspaper error for {url}: {e}")
            return ""

    @staticmethod
    def _extract_content_from_soup(soup: "BeautifulSoup") -> str:
        """Strip page chrome from ``soup`` and return its main text on one line.

        The soup is modified in place. The first ``<main>``, ``<article>`` or
        content-like ``<div>`` is used, falling back to ``<body>``; ``""`` is
        returned when none exists.

        NOTE(review): ``Comment`` is expected to be ``bs4.Comment``, imported
        elsewhere in the module.
        """
        for element in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
            element.decompose()
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = (
            soup.find('main')
            or soup.find('article')
            or soup.find('div', class_=WebContentExtractor._CONTENT_DIV_CLASS_RE)
            or soup.body
        )
        if not content:
            return ""
        converter = HTML2Text()
        converter.ignore_links = True
        converter.ignore_images = True
        text = converter.handle(str(content))
        # Collapsing every whitespace run (newlines included) to one space
        # makes a separate newline-squeezing pass unnecessary.
        return WebContentExtractor._WHITESPACE_RE.sub(' ', text).strip()

    @staticmethod
    def is_valid_url(url: str) -> bool:
        """Return True if ``url`` parses with both a scheme and a network location."""
        try:
            result = urlparse(url)
        except (ValueError, AttributeError, TypeError):
            # ValueError: malformed URL; the others: non-string input.
            return False
        return bool(result.scheme and result.netloc)

    @classmethod
    def quit_driver(cls):
        """Shut down the shared Selenium driver, if one was created."""
        if cls._driver is None:
            return
        try:
            cls._driver.quit()
        finally:
            # Drop the reference even if quit() raised, so a later call
            # creates a fresh driver instead of reusing a dead one.
            cls._driver = None

class SearchManager:
    """Run a query against several search back-ends and attach page content.

    The configured ``SearchAPI`` objects are tried in a fixed preference
    order; if they do not yield enough hits, ``web_search_provider`` is used
    as a fallback. Results are cached in memory per ``(query, num_results)``.
    """

    def __init__(self, apis: "List[SearchAPI]", web_search_provider: "SearchProvider",
                 max_content_length: int = 10000, cache_size: int = 100):
        """
        Args:
            apis: Search APIs to query; only those whose ``name`` appears in
                the preference order used by :meth:`search` are ever called.
            web_search_provider: Fallback object with a synchronous
                ``search(query, num_results)`` method whose results expose
                ``title``, ``url`` and ``snippet`` attributes.
            max_content_length: Maximum number of characters of page content
                kept per result.
            cache_size: Maximum number of cached result lists.
        """
        self.apis = apis
        self.web_search_provider = web_search_provider
        self.content_extractor = WebContentExtractor()
        self.max_content_length = max_content_length
        self.cache = {}
        self.cache_size = cache_size

    async def search(self, query: str, num_results: int = 10):
        """Return up to ``num_results`` result dicts for ``query``.

        Each result has the keys ``title``, ``url``, ``snippet`` and
        ``content`` (extracted page text, truncated to
        ``max_content_length``).

        NOTE: content extraction and the fallback provider are synchronous
        calls, so they block the event loop while they run.

        Raises:
            Exception: whatever the fallback provider raises, but only when
                no results were collected from the APIs.
        """
        # The requested size is part of the key: a list cached for a small
        # request must not be served for a larger one, and vice versa.
        cache_key = (query, num_results)
        cached = self.cache.get(cache_key)
        if cached is not None:
            # Return a copy so callers cannot mutate the cached list.
            return list(cached)

        api_order = ["Google", "Brave", "DuckDuckGo"]
        results = []

        async with aiohttp.ClientSession() as session:
            for api_name in api_order:
                if len(results) >= num_results:
                    break
                api = next((api for api in self.apis if api.name == api_name), None)
                if api is None or not api.is_within_quota():
                    continue
                try:
                    logger.info(f"Trying {api_name} for query: {query}")
                    search_results = await api.search(session, query, num_results)
                    for result in search_results or []:
                        # Stop before fetching pages that would be discarded.
                        if len(results) >= num_results:
                            break
                        result['content'] = self._extract_content(result['url'])
                        results.append(result)
                except Exception as e:
                    logger.error(f"Error searching {api_name}: {e}")

        if len(results) < num_results:
            logger.info(f"Trying DuckDuckGo for query: {query}")
            try:
                duck_results = self.web_search_provider.search(query, num_results)
                for result in duck_results or []:
                    results.append({
                        'title': result.title,
                        'url': result.url,
                        'snippet': result.snippet,
                        'content': self._extract_content(result.url),
                    })
                    if len(results) >= num_results:
                        break
            except Exception as e:
                # Keep what the APIs already found; only propagate the
                # failure when there is nothing at all to return.
                if not results:
                    raise
                logger.error(f"Error searching DuckDuckGo: {e}")

        final_results = results[:num_results]
        if final_results:
            # Empty lists are not cached: they are indistinguishable from a
            # transient failure of every back-end.
            self._cache_results(cache_key, final_results)
        return list(final_results)

    def _extract_content(self, url) -> str:
        """Return the extracted page text for ``url``, truncated; ``""`` if there is none."""
        content = self.content_extractor.extract_content(url)
        return (content or "")[:self.max_content_length]

    def _cache_results(self, query, results: List[Dict]):
        """Store ``results`` under the cache key ``query`` and evict the oldest entries.

        ``query`` may be any hashable key; :meth:`search` passes a
        ``(query, num_results)`` tuple. Eviction is first-in, first-out
        (dict insertion order).
        """
        self.cache[query] = results
        while self.cache and len(self.cache) > self.cache_size:
            self.cache.pop(next(iter(self.cache)))

# Example usage
# apis = [SearchAPI(...), SearchAPI(...)]
# web_search_provider = ...
# search_manager = SearchManager(apis, web_search_provider)
# results = asyncio.run(search_manager.search("example query"))