# In [3]:
import requests
import time
import random
import pandas as pd
import numpy as np
import re
import logging
import configparser
import os

# Logging configuration. Values are read from the [logging] section of
# settings.ini when that file exists; otherwise built-in defaults are used.
cfg = configparser.ConfigParser()
if not os.path.exists("settings.ini"):
    # No settings file at all: use the hard-coded defaults. Note that the
    # default log file name here differs from the per-key fallback below.
    log_enabled, log_level_str, log_file = True, "INFO", "SL.log"
else:
    cfg.read("settings.ini")
    # Every option is optional; a missing key falls back individually.
    log_enabled = cfg.getboolean("logging", "enabled", fallback=True)
    log_level_str = cfg.get("logging", "level", fallback="INFO")
    log_file = cfg.get("logging", "file", fallback="app.log")

if log_enabled:
    # Look the level name up on the logging module; names that are not
    # attributes of the module resolve to INFO.
    num_level = getattr(logging, log_level_str.upper(), logging.INFO)
    logging.basicConfig(
        filename=log_file,
        filemode='a',
        level=num_level,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

def filter(items, query):
    """Return the items whose ``name`` contains *query* as a whole word.

    Both the query and each name are lower-cased before matching, and the
    query is escaped so it is matched literally rather than as a regular
    expression. Items whose ``name`` is missing or empty are skipped.

    Note: this function shadows the built-in ``filter`` in this module.

    Args:
        items: Iterable of dict-like objects supporting ``.get("name")``.
        query: Search text to look for.

    Returns:
        A new list with the matching items, in their original order.
    """
    logging.info(f"Filtering items with query: {query}")
    needle = re.escape(query.lower())
    whole_word = re.compile(rf"\b{needle}\b")

    def _is_match(entry):
        title = entry.get("name", "")
        # Check emptiness first so ``.lower()`` is never called on an
        # empty or None name.
        return bool(title) and whole_word.search(title.lower()) is not None

    matched = [entry for entry in items if _is_match(entry)]
    logging.info(f"Filtered {len(matched)} items matching query")
    return matched

def _request_trademark_name(trademark_id, token, timeout):
    """Do the HTTP lookup for one trademark; return its name or NaN on failure."""
    # Same sanitising as get_sima_land_items: drop non-ASCII characters so a
    # stray character in the token cannot break header encoding.
    token_ascii = str(token).encode("ascii", errors="ignore").decode("ascii")
    url = f"https://www.sima-land.ru/api/v5/trademark/{trademark_id}"
    headers = {
        "Authorization": f"Bearer {token_ascii}",
        "Accept": "application/json",
        "User-Agent": "SimaLandClient/1.0"
    }

    try:
        # Without a timeout a stalled connection would block the run forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        logging.error("Request for trademark id %s failed: %s", trademark_id, exc)
        return np.nan

    if resp.status_code != 200:
        logging.error(f"Failed to fetch trademark for id {trademark_id}, status code: {resp.status_code}")
        return np.nan

    try:
        data = resp.json()
    except ValueError as exc:  # body was not valid JSON
        logging.error("Invalid JSON in trademark response for id %s: %s", trademark_id, exc)
        return np.nan

    if not isinstance(data, dict):
        logging.error("Unexpected trademark payload type for id %s: %s",
                      trademark_id, type(data).__name__)
        return np.nan

    brand_name = data.get("name", "")
    logging.info("Fetched trademark name: %s", brand_name)
    return brand_name


def fetch_trademark_name(trademark_id, token, cache, timeout=10):
    """Return the trademark name for ``trademark_id``, using ``cache``.

    Args:
        trademark_id: Identifier of the trademark; a falsy value (``None``,
            ``0``, ``""``) short-circuits to NaN without any request.
        token: Bearer token sent in the ``Authorization`` header.
        cache: Mutable mapping ``trademark_id -> name``. It is consulted
            before any request and updated with the outcome, including NaN
            for failures, so a failing id is requested at most once.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        The ``name`` field of the response (``""`` when absent), or
        ``numpy.nan`` when no id was given or the lookup failed.
    """
    logging.info(f"Fetching trademark name for trademark_id: {trademark_id}")
    if not trademark_id:
        logging.warning("No trademark_id provided, returning NaN")
        return np.nan
    if trademark_id in cache:
        logging.info(f"Trademark id {trademark_id} found in cache")
        return cache[trademark_id]

    brand_name = _request_trademark_name(trademark_id, token, timeout)
    cache[trademark_id] = brand_name
    return brand_name

def get_sima_land_items(query, token, max_pages=2, per_page=50, timeout=10):
    """Fetch items whose name matches ``query`` from the Sima Land item API.

    Pages ``1..max_pages`` are requested in order. Each page's items are
    narrowed further with this module's ``filter`` helper (whole-word match
    on the item name). Paging stops at the first empty page, and also on
    the first failed request, returning whatever was collected so far.

    Args:
        query: Search text, sent as ``name_ilike`` and used for filtering.
        token: Bearer token; non-ASCII characters are dropped before use.
        max_pages: Maximum number of pages to request.
        per_page: Page size sent as the ``per-page`` parameter.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        List of item dicts that passed the name filter.
    """
    logging.info(f"Fetching Sima Land items with query: {query}")
    items_result = []
    token_ascii = token.encode("ascii", errors="ignore").decode("ascii")

    url = "https://www.sima-land.ru/api/v5/item"
    headers = {
        "Authorization": f"Bearer {token_ascii}",
        "Accept": "application/json",
        "User-Agent": "SimaLandClient/1.0"
    }

    for page_num in range(1, max_pages + 1):
        if page_num > 1:
            # Randomised pause between consecutive requests only; there is
            # nothing to wait for before the first page or after the last.
            time.sleep(random.uniform(1, 2))

        params = {
            "name_ilike": query,
            "p": page_num,
            "per-page": per_page
        }
        try:
            # Without a timeout a stalled connection would block forever.
            resp = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            logging.error("Request for page %s failed: %s", page_num, exc)
            break

        if not resp.ok:
            logging.error("Failed to fetch page %s, status code: %s",
                          page_num, resp.status_code)
            break

        try:
            data_json = resp.json()
        except ValueError as exc:  # body was not valid JSON
            logging.error("Invalid JSON on page %s: %s", page_num, exc)
            break

        # The payload is accepted either as a bare list of items or as an
        # object with an "items" key; anything else is treated as empty.
        if isinstance(data_json, list):
            items = data_json
        elif isinstance(data_json, dict):
            items = data_json.get("items", [])
        else:
            items = []

        if not items:
            logging.info(f"No items found on page {page_num}, breaking loop")
            break

        filtered = filter(items, query)
        logging.info(f"Page {page_num}: found {len(items)} items, {len(filtered)} after filtering")
        items_result.extend(filtered)

    logging.info(f"Completed fetching Sima Land items; total items fetched: {len(items_result)}")
    return items_result

def build_dataset(raw_items, token):
    """Convert raw API item dicts into flat product records.

    For each item the brand is resolved through ``fetch_trademark_name``,
    with one cache shared across the whole call. ``old_price`` and
    ``discount`` are filled only when the item is flagged ``is_markdown``,
    both prices are int/float, and ``price_max`` exceeds ``price``. In all
    other cases they are NaN. ``rating`` and ``reviews_count`` are always
    NaN and ``is_original`` is always ``None``.

    Args:
        raw_items: Sequence of item dicts.
        token: Bearer token forwarded to ``fetch_trademark_name``.

    Returns:
        List of dicts, one per input item, in input order.
    """
    logging.info(f"Building dataset from raw items; total raw items: {len(raw_items)}")
    brand_cache = {}
    rows = []
    numeric = (int, float)

    for item in raw_items:
        title = item.get("name", "")
        brand = fetch_trademark_name(item.get("trademark_id"), token, brand_cache)

        current_price = item.get("price", np.nan)
        list_price = item.get("price_max", np.nan)

        marked_down = (
            item.get("is_markdown", False)
            and isinstance(current_price, numeric)
            and isinstance(list_price, numeric)
            and list_price > current_price
        )
        if marked_down:
            previous_price = list_price
            discount_pct = 100.0 * (list_price - current_price) / list_price
            # Round to two decimals unless the computation produced NaN.
            discount = np.nan if np.isnan(discount_pct) else np.round(discount_pct, 2)
        else:
            previous_price = np.nan
            discount = np.nan

        days = item.get("supply_period")
        if days is None:
            delivery = np.nan
        else:
            delivery = f"{days} дн."

        # Key order is kept stable because it determines the column order
        # of any table built from these rows.
        row = {
            "product_url": item.get("id"),
            "product_name": title,
            "brand": brand,
            "is_original": None,
            "final_price": current_price,
            "old_price": previous_price,
            "discount": discount,
            "rating": np.nan,
            "reviews_count": np.nan,
            "delivery_time": delivery,
            "market": "SIMA"
        }
        rows.append(row)
        logging.debug(f"Processed product: {row}")

    logging.info(f"Dataset built with {len(rows)} products")
    return rows

if __name__ == "__main__":
    # Script entry point: fetch items for a fixed query, turn them into
    # product records and write them to a CSV file.
    logging.info("Script started")

    # The API token is a JWT carrying an `exp` claim, so a value baked into
    # the source stops working once it expires. Prefer the SIMA_LAND_TOKEN
    # environment variable; the literal below is kept only as a
    # backward-compatible fallback.
    # NOTE(review): rotate this token and remove it from source control.
    token = os.environ.get("SIMA_LAND_TOKEN") or (
        "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NDE1MzEwMDYsImlhdCI6MTc0MDkyNjIwNiwianRpIjo3MTE2Mzg4LCJuYmYiOjE3NDA5MjYyMDZ9.fsprwJGEdo4QKMTwG7nW6be-M1MLjwAT3lcCdKiJ1JE"
    )
    query = "кружка"
    output_path = "simaland_data_zz5.csv"

    logging.info(f"Fetching Sima Land items with query: {query}")
    raw_items = get_sima_land_items(query, token, max_pages=10, per_page=50)
    logging.info("Building dataset from fetched items")
    dataset = build_dataset(raw_items, token)

    df = pd.DataFrame(dataset)
    # Explicit UTF-8 so Cyrillic product names are written consistently
    # regardless of the platform's default encoding.
    df.to_csv(output_path, index=False, encoding="utf-8")
    logging.info("Data saved to %s", output_path)
