In [15]:
import getpass
import os
import random
import re
import time

import numpy as np
import pandas as pd
import requests

def filter(items, query):
    """Keep the items whose ``name`` contains ``query`` as a whole word.

    The comparison is case-insensitive (both sides are lower-cased) and the
    query is regex-escaped, so it is matched literally between ``\\b`` word
    boundaries. Items with a missing or empty ``name`` are dropped.

    Note: this function intentionally keeps its original name, which shadows
    the built-in ``filter`` for the rest of the notebook.

    Args:
        items: Iterable of dict-like records exposing ``.get("name", "")``.
        query: Search string to look for.

    Returns:
        A new list with the matching items, in their original order.
    """
    needle = re.compile(r"\b" + re.escape(query.lower()) + r"\b")

    def _is_hit(entry):
        # An absent/empty name can never match, so skip it before lower-casing.
        title = entry.get("name", "")
        return bool(title) and needle.search(title.lower()) is not None

    return [entry for entry in items if _is_hit(entry)]

def fetch_trademark_name(trademark_id, token, cache, timeout=10):
    """Resolve a trademark id to its name via the Sima-Land API, memoising results.

    Args:
        trademark_id: Identifier interpolated into the ``/api/v5/trademark/{id}``
            URL. Falsy values (``None``, ``0``, ``""``) mean "no trademark".
        token: Bearer token sent in the ``Authorization`` header.
        cache: Mutable mapping ``id -> name`` (or NaN) shared between calls.
            It is consulted first and updated with the outcome of every lookup,
            including failed ones, so each id is requested at most once.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``name`` field of the JSON response (``""`` if the field is
        absent), or ``np.nan`` when the id is falsy, the request fails, the
        status is not 200, or the body is not a JSON object.
    """
    if not trademark_id:
        return np.nan
    if trademark_id in cache:
        return cache[trademark_id]

    url = f"https://www.sima-land.ru/api/v5/trademark/{trademark_id}"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
        "User-Agent": "SimaLandClient/1.0"
    }

    brand_name = np.nan
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code == 200:
            data = resp.json()
            # Only a JSON object can carry a "name"; anything else counts as a miss.
            if isinstance(data, dict):
                brand_name = data.get("name", "")
    except (requests.RequestException, ValueError):
        # Connection problem, timeout or non-JSON body: treat like a non-200
        # answer so one bad lookup does not abort the whole dataset build.
        brand_name = np.nan

    cache[trademark_id] = brand_name
    return brand_name

def fetch_country_name(country_id, token, cache, timeout=10):
    """Resolve a country id to its name via the Sima-Land API, memoising results.

    Args:
        country_id: Identifier interpolated into the ``/api/v5/country/{id}``
            URL. Falsy values (``None``, ``0``, ``""``) mean "no country".
        token: Bearer token sent in the ``Authorization`` header.
        cache: Mutable mapping ``id -> name`` (or NaN) shared between calls.
            It is consulted first and updated with the outcome of every lookup,
            including failed ones, so each id is requested at most once.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``name`` field of the JSON response (``""`` if the field is
        absent), or ``np.nan`` when the id is falsy, the request fails, the
        status is not 200, or the body is not a JSON object.
    """
    if not country_id:
        return np.nan
    if country_id in cache:
        return cache[country_id]

    url = f"https://www.sima-land.ru/api/v5/country/{country_id}"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
        "User-Agent": "SimaLandClient/1.0"
    }

    country_name = np.nan
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code == 200:
            data = resp.json()
            # Only a JSON object can carry a "name"; anything else counts as a miss.
            if isinstance(data, dict):
                country_name = data.get("name", "")
    except (requests.RequestException, ValueError):
        # Connection problem, timeout or non-JSON body: treat like a non-200
        # answer so one bad lookup does not abort the whole dataset build.
        country_name = np.nan

    cache[country_id] = country_name
    return country_name

def fetch_all_item_categories(token, max_pages=5, per_page=200, timeout=10):
    """Download item-to-category links page by page from ``/api/v5/item-category``.

    Paging stops early on the first failed request, non-200 status,
    undecodable body or empty page; whatever was collected up to that point is
    returned.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        max_pages: Upper bound on the number of pages requested (pages are
            numbered from 1 and sent as the ``p`` query parameter).
        per_page: Page size sent as the ``per-page`` query parameter.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        dict mapping ``item_id`` to the set of ``category_id`` values seen for
        it. Records where either id is falsy are ignored.
    """
    item_to_cats = {}
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
        "User-Agent": "SimaLandClient/1.0"
    }
    url = "https://www.sima-land.ru/api/v5/item-category"

    for page_num in range(1, max_pages + 1):
        if page_num > 1:
            # Randomised pause between consecutive requests only; there is
            # nothing to throttle before the first page or after the last one.
            time.sleep(random.uniform(1, 2))

        params = {
            "p": page_num,
            "per-page": per_page
        }
        try:
            resp = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Не удалось получить item-category, стр. {page_num}: {exc}")
            break
        if resp.status_code != 200:
            print(f"Не удалось получить item-category, стр. {page_num}, код {resp.status_code}")
            break

        try:
            data = resp.json()
        except ValueError as exc:
            print(f"Не удалось разобрать ответ item-category, стр. {page_num}: {exc}")
            break

        # The payload is handled both as a bare list and as an object with "items".
        if isinstance(data, list):
            records = data
        elif isinstance(data, dict):
            records = data.get("items", [])
        else:
            records = []

        if not records:
            break

        for rec in records:
            item_id = rec.get("item_id")
            cat_id = rec.get("category_id")
            if item_id and cat_id:
                item_to_cats.setdefault(item_id, set()).add(cat_id)

    return item_to_cats

def fetch_category_name(cat_id, token, cache, timeout=10):
    """Resolve a category id to its name via the Sima-Land API, memoising results.

    Args:
        cat_id: Identifier interpolated into the ``/api/v5/category/{id}`` URL.
            Falsy values (``None``, ``0``, ``""``) mean "no category".
        token: Bearer token sent in the ``Authorization`` header.
        cache: Mutable mapping ``id -> name`` (or NaN) shared between calls.
            It is consulted first and updated with the outcome of every lookup,
            including failed ones, so each id is requested at most once.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``name`` field of the JSON response (``""`` if the field is
        absent), or ``np.nan`` when the id is falsy, the request fails, the
        status is not 200, or the body is not a JSON object.
    """
    if not cat_id:
        return np.nan
    if cat_id in cache:
        return cache[cat_id]

    url = f"https://www.sima-land.ru/api/v5/category/{cat_id}"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
        "User-Agent": "SimaLandClient/1.0"
    }

    cat_name = np.nan
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code == 200:
            data = resp.json()
            # Only a JSON object can carry a "name"; anything else counts as a miss.
            if isinstance(data, dict):
                cat_name = data.get("name", "")
    except (requests.RequestException, ValueError):
        # Connection problem, timeout or non-JSON body: treat like a non-200
        # answer so one bad lookup does not abort the whole dataset build.
        cat_name = np.nan

    cache[cat_id] = cat_name
    return cat_name

def get_sima_land_items(query, token, max_pages=2, per_page=50, timeout=10):
    """Search ``/api/v5/item`` by name and return the whole-word matches.

    Each page is requested with ``name_ilike=query`` and then narrowed by the
    notebook-level ``filter(items, query)`` helper. Paging stops early on the
    first failed request, non-200 status, undecodable body or empty page;
    items collected so far are still returned.

    Args:
        query: Search string.
        token: Bearer token; non-ASCII characters are stripped from it before
            it is placed into the ``Authorization`` header.
        max_pages: Upper bound on the number of pages requested (numbered from 1).
        per_page: Page size sent as the ``per-page`` query parameter.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        list of raw item records (as returned by the API) that passed the filter.
    """
    items_result = []
    # HTTP header values must be encodable; drop stray non-ASCII characters
    # (e.g. from a copy-pasted token) instead of failing inside requests.
    token_ascii = token.encode("ascii", errors="ignore").decode("ascii")

    headers = {
        "Authorization": f"Bearer {token_ascii}",
        "Accept": "application/json",
        "User-Agent": "SimaLandClient/1.0"
    }
    url = "https://www.sima-land.ru/api/v5/item"

    for page_num in range(1, max_pages + 1):
        if page_num > 1:
            # Randomised pause between consecutive requests only; there is
            # nothing to throttle before the first page or after the last one.
            time.sleep(random.uniform(1, 2))

        params = {
            "name_ilike": query,
            "p": page_num,
            "per-page": per_page
        }
        try:
            resp = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Ошибка при запросе товаров, стр. {page_num}: {exc}")
            break
        if resp.status_code != 200:
            print(f"Ошибка при запросе товаров, стр. {page_num}: {resp.status_code}")
            print(resp.text)
            break

        try:
            data_json = resp.json()
        except ValueError as exc:
            print(f"Не удалось разобрать ответ со списком товаров, стр. {page_num}: {exc}")
            break

        # The payload is handled both as a bare list and as an object with "items".
        if isinstance(data_json, list):
            items = data_json
        elif isinstance(data_json, dict):
            items = data_json.get("items", [])
        else:
            items = []

        if not items:
            print(f"Товары не найдены на странице {page_num}. Останавливаемся.")
            break

        items_result.extend(filter(items, query))

    return items_result

def _markdown_pricing(price, price_max, is_markdown):
    """Return ``(old_price, discount_percent)``; both NaN unless the item is a real markdown."""
    numeric = (int, float)
    if (is_markdown
            and isinstance(price, numeric)
            and isinstance(price_max, numeric)
            and price_max > price):
        discount = 100.0 * (price_max - price) / price_max
        return price_max, np.round(discount, 2)
    return np.nan, np.nan


def build_dataset(raw_items, token):
    """Turn raw item records into flat product rows ready for a DataFrame.

    For every item the trademark and country ids are resolved to names through
    ``fetch_trademark_name`` / ``fetch_country_name`` (one shared cache per
    kind, so each id is looked up once per call).

    Args:
        raw_items: Iterable of dict-like item records; the keys read are
            ``id``, ``name``, ``trademark_id``, ``country_id``, ``price``,
            ``price_max``, ``is_markdown`` and ``supply_period``.
        token: Bearer token forwarded to the name lookups.

    Returns:
        list of dicts, one per input item. ``old_price`` and ``discount``
        (percent, rounded to 2 decimals) are filled only when the item is
        flagged ``is_markdown`` and ``price_max`` exceeds ``price``.
        ``rating``, ``reviews_count`` and ``is_original`` are not populated by
        this function and are emitted as NaN / ``None`` placeholders.
        ``item_id`` repeats the raw ``id`` (also exposed as ``product_url``) so
        that rows can be joined by id later on; ``country`` carries the
        resolved country name.
    """
    trademark_cache = {}
    country_cache = {}

    products = []

    for it in raw_items:
        item_id = it.get("id")
        brand_name = fetch_trademark_name(it.get("trademark_id"), token, trademark_cache)
        country_name = fetch_country_name(it.get("country_id"), token, country_cache)

        price = it.get("price", np.nan)
        old_price, discount = _markdown_pricing(
            price, it.get("price_max", np.nan), it.get("is_markdown", False)
        )

        supply_period = it.get("supply_period")
        delivery_time = f"{supply_period} дн." if supply_period is not None else np.nan

        products.append({
            "item_id": item_id,
            "product_url": item_id,
            "product_name": it.get("name", ""),
            "brand": brand_name,
            "country": country_name,
            "is_original": None,
            "final_price": price,
            "old_price": old_price,
            "discount": discount,
            "rating": np.nan,
            "reviews_count": np.nan,
            "delivery_time": delivery_time,
            "market": 'SIMA'
        })

    return products

def attach_categories(dataset, token, max_pages_itemcat=2):
    """Add a comma-separated ``categories`` field to rows that have category links.

    The item-to-category map comes from ``fetch_all_item_categories`` and the
    category names from ``fetch_category_name``. Rows are mutated in place.

    NOTE(review): the map only covers the first ``max_pages_itemcat`` pages of
    the item-category listing, so dataset items may well be absent from it —
    confirm whether the endpoint can be filtered by item id.

    Args:
        dataset: list of row dicts. The item id is read from ``item_id`` and,
            when that is missing, from ``product_url``.
        token: Bearer token forwarded to the API helpers.
        max_pages_itemcat: Number of item-category pages to download.

    Returns:
        list of rows de-duplicated by item id (the last row with a given id
        wins), followed by any rows that carry no id at all. Rows without
        category links get no ``categories`` key.
    """
    itemcat_map = fetch_all_item_categories(token, max_pages=max_pages_itemcat)

    rows_by_id = {}
    rows_without_id = []
    for row in dataset:
        iid = row.get("item_id")
        if iid is None:
            iid = row.get("product_url")
        if iid is None:
            rows_without_id.append(row)
        else:
            rows_by_id[iid] = row

    cat_cache = {}
    resolved = {}
    for iid, row in rows_by_id.items():
        cat_ids = itemcat_map.get(iid)
        if not cat_ids:
            continue

        names = []
        for cid in cat_ids:
            # Look up only categories that belong to dataset items, each one once.
            if cid not in resolved:
                resolved[cid] = fetch_category_name(cid, token, cat_cache)
            name = resolved[cid]
            # A failed lookup yields NaN, which is truthy and not a string:
            # keep real, non-empty names only so the join below cannot fail.
            if isinstance(name, str) and name:
                names.append(name)

        # Sorted so the output does not depend on set iteration order.
        row["categories"] = ", ".join(sorted(names))

    return list(rows_by_id.values()) + rows_without_id


if __name__ == "__main__":
    # Credentials must not live in the notebook source: anything committed or
    # shared with the file is effectively public and should be rotated.
    # Read the token from the environment, falling back to a hidden prompt.
    token = os.environ.get("SIMA_LAND_TOKEN") or getpass.getpass("Введите API-токен Sima-Land: ")

    query = input("Введите строку поиска: ")
    raw_items = get_sima_land_items(query, token, max_pages=2, per_page=50)
    dataset = build_dataset(raw_items, token)
    dataset_with_cat = attach_categories(dataset, token, max_pages_itemcat=2)

    # Persist the collected rows; index=False keeps the row index out of the CSV.
    df = pd.DataFrame(dataset_with_cat)
    df.to_csv("simaland_dataset.csv", index=False, encoding="utf-8")

    print(f"Собрано товаров (после фильтрации): {len(dataset_with_cat)}.")
    print("Результат сохранён в 'simaland_dataset.csv'.")


Собрано товаров (после фильтрации): 3.
Результат сохранён в 'simaland_dataset.csv'.


In [16]:
# Reload the exported file to confirm that the CSV round-trips, and preview
# only the first rows instead of dumping the whole frame into the output.
df = pd.read_csv("simaland_dataset.csv")
df.head()

Unnamed: 0,item_id,product_name,brand,country,final_price,old_price,discount,rating,reviews_count,delivery_time
0,4644,"кружка ""Плетеная серия"" с металлом В3091-317А",,Китай,111.0,,,,,0 дн.
1,4642,"кружка ""21 ВЕК"" с металлом В3091-305А",,Китай,111.0,,,,,0 дн.
2,4652,"кружка ""Вы просили полчашки кофе"" MD5200",,Китай,74.37,,,,,0 дн.
