```python
## \file /sandbox/davidka/crawler_simple_driver.py
# -*- coding: utf-8 -*-
#! .pyenv/bin/python3
```
Модуль для сбора данных со страниц товаров через SimpleDriver
=====================================================
(адаптация исходного crawler.py)
```rst
.. module:: sandbox.davidka.crawler_simple_driver
```


In [1]:
import header
from header import __root__
from src import gs

[37m🔑 Found password in password.txt (DEBUG MODE)[0m
[37m✅ Successfully opened KeePass database: C:\Users\user\Documents\repos\hypotez\secrets\credentials.kdbx[0m
[37mFailed to load GAPI credentials[0m


2025-05-10 21:38:24,075 - DEBUG - 🐛 [36m[49mНет нового релиза: https://github.com/repos/hypotez/hypo/releases/latest
 response.status_code=404 [0m


In [2]:
import os
import asyncio
import random
from types import SimpleNamespace
from typing import List, Dict, Any, Optional, Callable, Type, Tuple, AsyncIterator
from pathlib import Path
from src.webdriver.llm_driver.simple_driver import SimpleDriver
from src.logger import logger
from src.utils.jjson import j_loads_ns, j_loads, j_dumps 
from src.utils.file import get_filenames_from_directory
from src.utils.printer import pprint as print

2025-05-10 21:38:32,076 - INFO - ℹ️ [32m[49mАсинхронные контроллеры успешно импортированы из src.webdriver.llm_driver.controllers. [0m
2025-05-10 21:38:32,107 - INFO - ℹ️ [32m[49mКонфигурация успешно загружена из C:\Users\user\Documents\repos\hypotez\src\webdriver\llm_driver\use_llm.json [0m
2025-05-10 21:38:32,107 - INFO - ℹ️ [32m[49mConfig Gemini: Status=active, Model=gemini-2.5-flash-preview-04-17, Key Present=True [0m
2025-05-10 21:38:32,107 - INFO - ℹ️ [32m[49mConfig OpenAI: Status=disabled, Model=gpt-4o, Key Present=True [0m
2025-05-10 21:38:32,110 - INFO - ℹ️ [32m[49mConfig SerpAPI: Status=inactive, Key Present=True [0m
2025-05-10 21:38:32,111 - INFO - ℹ️ [32m[49mConfig DuckDuckGo: Status=active [0m
2025-05-10 21:38:32,111 - INFO - ℹ️ [32m[49mConfig Tavily: Status=inactive, Key Present=False [0m
2025-05-10 21:38:33,485 - INFO - Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.


In [3]:
class Config:
    """Static settings for the SimpleDriver crawler.

    Every attribute is computed while the class body executes, so merely
    defining the class parses ``davidka.json``, lists the mining directory
    and reads each instruction file from disk.
    """

    # Sandbox directory that holds this crawler's config and instruction files.
    ENDPOINT: Path = __root__ / 'SANDBOX' / 'davidka'
    # Parsed contents of davidka.json.
    config: SimpleNamespace = j_loads_ns(ENDPOINT.joinpath('davidka.json'))
    # Directory with the crawl files, taken from the `random_urls` config entry.
    mining_data_path: Path = Path(config.random_urls)

    # NOTE(review): the two attributes below are disabled, yet
    # fetch_categories_from_suppliers_random_urls() in this file still reads
    # Config.train_data_supplier_categories_path and Config.checked_domains.
    # train_data_supplier_categories_path: Path = ENDPOINT / 'train_data_supplier_categories'
    # checked_domains: list = read_text_file(ENDPOINT / 'checked_domains.txt', as_list=True)

    # File names found in the mining directory for the 'json' pattern.
    crawl_files_list: list = get_filenames_from_directory(mining_data_path, 'json')

    # Instruction texts, loaded eagerly as UTF-8.
    instruction_grab_product_page_simple_driver: str = ENDPOINT.joinpath(
        'instructions', 'grab_product_page_simple_driver.md'
    ).read_text(encoding='utf-8')
    instruction_get_supplier_categories: str = ENDPOINT.joinpath(
        'instructions', 'get_supplier_categories.md'
    ).read_text(encoding='utf-8')
    instruction_find_product_in_supplier_domain: str = ENDPOINT.joinpath(
        'instructions', 'find_product_in_supplier_domain.md'
    ).read_text(encoding='utf-8')
    instruction_for_products_urls_one_product: str = ENDPOINT.joinpath(
        'instructions', 'get_product_links_one_product.md'
    ).read_text(encoding='utf-8')
    instruction_links_from_search: str = ENDPOINT.joinpath(
        'instructions', 'links_from_search.md'
    ).read_text(encoding='utf-8')
    # The "searh" spelling matches the file name on disk and the attribute
    # name used by callers; do not "fix" one without the other.
    instruction_links_from_searh_page: str = ENDPOINT.joinpath(
        'instructions', 'links_from_searh_page.md'
    ).read_text(encoding='utf-8')

    # Gemini API key read from the project's credentials store.
    GEMINI_API_KEY = gs.credentials.gemini.onela.api_key
    


In [4]:
def get_products_urls_list_from_files(crawl_files_list: list = None) -> list:
    """Collect the ``product_url`` of every product listed in the crawl files.

    Args:
        crawl_files_list: File names relative to ``Config.mining_data_path``.
            When omitted or empty, ``Config.crawl_files_list`` is used.

    Returns:
        list: The collected URLs, shuffled in place with ``random.shuffle``.
        A file that raises while being loaded or walked is logged and
        skipped; URLs appended from it before the failure are kept.
    """
    filenames = crawl_files_list or Config.crawl_files_list
    urls = []
    for filename in filenames:
        try:
            products = j_loads(Config.mining_data_path / filename)['products']
            for item in products:
                urls.append(item['product_url'])
        except Exception as ex:
            logger.error(f'Ошибка при обработке файла {filename=}', ex, exc_info=True)
    random.shuffle(urls)
    return urls


def yield_product_urls_from_files(directory: Path = Config.mining_data_path, pattern: str = 'json'):
    """Lazily yield product URLs from the crawl files of a directory.

    Args:
        directory: Directory to scan. The default is bound once, when the
            function is defined, to ``Config.mining_data_path``.
        pattern: Pattern forwarded to ``get_filenames_from_directory``.

    Yields:
        The ``product_url`` value of each entry under the ``'products'`` key
        of every file, in the order the files are listed.

    A file that raises while being loaded or walked is logged and the
    generator moves on to the next file.
    """
    for filename in get_filenames_from_directory(directory, pattern):
        try:
            products = j_loads(directory / filename)['products']
            for item in products:
                yield item['product_url']
        except Exception as ex:
            logger.error(f'Ошибка при обработке файла {filename=}', ex, exc_info=True)


def get_categories_from_random_urls(crawl_files_list: list = None) -> list:
    """Gather the distinct category names mentioned in the crawl files.

    Args:
        crawl_files_list: File names relative to ``Config.mining_data_path``.
            When omitted or empty, ``Config.crawl_files_list`` is used.

    Returns:
        list: Unique, truthy ``parent_category`` / ``category_name`` values
        found under the ``'products'`` key of each file, in shuffled order.
        A file that raises is logged and skipped.
    """
    filenames = crawl_files_list or Config.crawl_files_list
    collected = []
    for filename in filenames:
        try:
            products = j_loads(Config.mining_data_path / filename).get('products', [])
            for product in products:
                # Same order as before: parent category first, then the category itself.
                for key in ('parent_category', 'category_name'):
                    if key in product:
                        collected.append(product[key])
        except Exception as ex:
            logger.error(f'Ошибка при обработке файла {filename=}', ex, exc_info=True)
    # De-duplicate, then drop empty/None values.
    unique_categories = [name for name in set(collected) if name]
    random.shuffle(unique_categories)
    return unique_categories


async def build_random_products_urls_by_category(driver: SimpleDriver, category: str, task:str = '', num_of_links: str = '10') -> Any:
    """Ask the driver for product links of a category and persist the parsed answer.

    Args:
        driver: Driver whose ``simple_process_task_async`` executes the task.
        category: Category name substituted for ``{PRODUCT_CATEGORY}``.
        task: Ready-made task text. When empty, the task is built from
            ``Config.instruction_links_from_searh_page``.
        num_of_links: Value substituted for ``{NUM_LINKS}``; it is passed
            through ``str()`` so a plain int is accepted as well.

    Returns:
        The object produced by ``j_loads`` from the driver's answer, or ``''``
        when the driver returned nothing or any step raised (the exception is
        logged, not propagated).
    """
    try:
        logger.info(f'Обработка {category=}')

        task = task or (
            Config.instruction_links_from_searh_page
            .replace('{PRODUCT_CATEGORY}', category)
            .replace('{NUM_LINKS}', str(num_of_links))
        )
        raw_answer = await driver.simple_process_task_async(task)
        if not raw_answer:
            return ''
        answer = j_loads(raw_answer)
        print('\n -------------------------------- EXTRACTED DATA \n------------------------------------------\n')
        print(answer)
        print('\n -------------------------------------------------------------------------------------------')
        # Persist with j_dumps, the JSON writer this file imports. The previous
        # call to save_text_file referenced a name that is never imported here,
        # so the NameError fell into the handler below and every successful
        # extraction was reported as '' to the caller.
        j_dumps(answer, Path(f'F:/llm/random_products_links/{gs.now}.json'))
        return answer
    except Exception as ex:
        logger.error(f'Ошибка при обработке {category=}', ex, exc_info=True)
        return ''


async def fetch_categories_from_suppliers_random_urls() -> dict:
    """Query the driver for the categories of each supplier domain not yet checked.

    For every product in every file of ``Config.crawl_files_list`` the domain
    of its ``product_url`` is looked up; domains already present in
    ``Config.checked_domains`` are skipped. For the others the
    ``instruction_get_supplier_categories`` task is run, the answer is parsed
    and written to a ``{gs.now}.json`` file, and the domain is recorded as
    checked.

    Returns:
        dict: ``categories_dict``. Nothing in this function adds entries to
        it, so as written it is always returned empty.

    NOTE(review): ``Config.driver``, ``Config.checked_domains`` and
    ``Config.train_data_supplier_categories_path`` are not assigned in the
    ``Config`` class shown in this file, and ``get_domain``,
    ``normalize_answer`` and ``save_text_file`` are not imported by the
    visible import cells - confirm they are provided elsewhere before
    relying on this function.
    """
    categories_dict = {}
    # Read outside the try block: a missing attribute here propagates to the caller.
    driver = Config.driver

    for filename in Config.crawl_files_list:
        try:
            file_path = Config.mining_data_path / filename
            crawl_data = j_loads(file_path)
            crawl_data = crawl_data.get('products', [])
            for product in crawl_data:
                domain = get_domain(product['product_url'])
                # Each domain is processed at most once.
                if domain in Config.checked_domains:
                    continue
                task = Config.instruction_get_supplier_categories.replace('{INPUT_URL}', domain)
                res = await driver.simple_process_task_async(task)
                if not res:
                    continue
                # presumably `res` is a mapping with an 'output' entry - TODO confirm
                # against SimpleDriver.simple_process_task_async.
                normalized_res = normalize_answer(res.get('output', ''))
                data = j_loads(normalized_res)
                print(data)
                j_dumps(data, Config.train_data_supplier_categories_path / f'{gs.now}.json')
                # The checked-domain list is rewritten (txt and json) after every new domain.
                Config.checked_domains.append(domain)
                save_text_file(Config.checked_domains, Config.ENDPOINT / 'checked_domains.txt')
                j_dumps(Config.checked_domains, Config.ENDPOINT / 'checked_domains.json')
        except Exception as ex:
            # Any failure abandons the rest of the current file and moves on to the next one.
            logger.error(f'Ошибка при обработке файла {filename=}', ex, exc_info=True)
    return categories_dict

In [5]:
driver: SimpleDriver = SimpleDriver(gemini_model_name='gemini-1.5-flash-8b-exp-0924', GEMINI_API_KEY = gs.credentials.gemini.onela.api_key )


# Пример: обработка товаров по категориям
for category in get_categories_from_random_urls():
    #ipdb.set_trace()
    if not await build_random_products_urls_by_category(driver = driver, category = category, num_of_links = '10'):
        break

2025-05-10 21:38:47,516 - INFO - ℹ️ [32m[49m--- Начало СИНХРОННОЙ инициализации Driver --- [0m
2025-05-10 21:38:47,516 - INFO - ℹ️ [32m[49mКлюч джемини os.environ['GEMINI_API_KEY']='<REDACTED>' [0m
2025-05-10 21:38:47,517 - INFO - ℹ️ [32m[49mИнициализация Gemini: Model=gemini-1.5-flash-8b-exp-0924 [0m
2025-05-10 21:38:47,594 - INFO - ℹ️ [32m[49mGemini LLM инициализирован. [0m
2025-05-10 21:38:47,594 - INFO - ℹ️ [32m[49m--- Синхронная инициализация Driver завершена. Вызовите async_init() --- [0m
2025-05-10 21:38:50,225 - INFO - ℹ️ [32m[49mОбработка category='Power Supplies' [0m
2025-05-10 21:38:50,308 - INFO - 🧠 Starting an agent with main_model=models/gemini-1.5-flash-8b-exp-0924 +vision, planner_model=None, extraction_model=None 
2025-05-10 21:38:50,328 - INFO - ℹ️ [32m[49mАгент начинает выполнение задачи: "
```md
**Роль:** Ты — Автоматизированный Веб-Агент для Поиска Страниц.

**Цель:**  
Найти **все ссылки** из первой страницы поиска 