Если я правильно понимаю, при большом количестве запросов на стороне Colab возникает HTTP-ошибка 509, поэтому после ошибки нужно скачать текущий прогресс, удалить среду и запустить заново. Затем объединить (смержить) файлы прогресса.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from tqdm import tqdm
import json
import re

# HTTP request headers installed on the scraping session via
# session.headers.update(HEADERS); the values identify the client as
# desktop Chrome 137 on Windows with a Russian locale.
HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'ru-RU,ru;q=0.9',
    'cache-control': 'max-age=0',
    # <--- COOKIE ---> placeholder value: must be replaced with a real cookie
    # string before running (see the reminder in the __main__ section).
    'cookie': 'cookie',
    'priority': 'u=0, i',
    'referer': 'https://zoon.ru/msk/restaurants/', # generic referer sent with every request
    'sec-ch-ua': '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36',
}

# First listing page per city. The key is the city code passed to
# scrape_zoon_coffee (it ends up in output file names and the 'city' column);
# the value is the base URL to which "page-N/" is appended for pages > 1.
URLS = {
    'msk': 'https://zoon.ru/msk/restaurants/type/kofejni/',
    'spb': 'https://zoon.ru/spb/restaurants/type/kofejni/'
}

# Tag filters applied to each establishment's "Тип заведения" tags.
# Matching is by exact membership against the lowercased tag strings, so every
# entry here must be lowercase and spelled exactly as the tag appears.
#
# An establishment is accepted only if it has at least one positive tag ...
POSITIVE_KEYWORDS = ['кофейня', 'кофе с собой', 'кафе', 'кондитерская', 'пекарня', 'кофе на вынос']
# ... and none of the banned tags (each entry listed once).
BAN_WORDS = [
    'стейк-хаус', 'хинкальная', 'кальянная', 'паб', 'пиццерия', 'ресторан', 'гриль-бар', 'шашлычная',
]


def save_chunk_to_csv(data_list, city_code, start_page, end_page):
    """Dump one chunk of scraped records into a CSV file in the working directory.

    Nothing is written when ``data_list`` is empty.

    Args:
        data_list: list of per-establishment dicts collected by the scraper.
        city_code: city identifier; used in the file name and written into a
            ``city`` column on every row.
        start_page: first listing page covered by this chunk.
        end_page: last listing page covered by this chunk.

    The file is named ``{city_code}_coffee_{start_page - 1}_{end_page}.csv``
    and encoded as UTF-8 with BOM. Only the known columns that are actually
    present are written, in a fixed order.
    """
    if not data_list:
        return

    preferred_order = (
        'city', 'name', 'zoon_id', 'rating', 'reviews_count', 'price_category', 'tags',
        'address', 'metro_station', 'work_hours_short', 'phone', 'lat', 'lon',
        'photo_count', 'url', 'photos_urls',
    )
    out_path = f"{city_code}_coffee_{start_page - 1}_{end_page}.csv"

    frame = pd.DataFrame(data_list)
    frame['city'] = city_code

    # Keep only the expected columns that exist, in the preferred order.
    present = [column for column in preferred_order if column in frame.columns]
    frame = frame[present]
    frame.to_csv(out_path, index=False, encoding='utf-8-sig')

    tqdm.write(f"\nПромежуточное сохранение: {len(data_list)} записей за стр. {start_page}-{end_page} сохранены в: {out_path}\n")


def get_establishment_tags(detail_url, session, timeout=20):
    """Fetch an establishment's detail page and return its "Тип заведения" tags.

    Args:
        detail_url: URL of the establishment's detail page.
        session: HTTP session used for the request (a ``requests.Session`` in
            this file).
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole scrape; the
            default matches the timeout used for listing pages.

    Returns:
        A list of stripped, lowercased tag strings; an empty list when the
        page has no "Тип заведения" entry (or no ``<dd>`` following it);
        ``None`` when the request or the parsing failed (the error is logged
        through ``tqdm.write``).
    """
    try:
        # Random 0.5-1.5 s pause before every detail request (throttling).
        time.sleep(random.uniform(0.5, 1.5))
        response = session.get(detail_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # The tags live in the <dd> that follows <dt data-value="Тип заведения">.
        title_dt = soup.find('dt', attrs={'data-value': 'Тип заведения'})
        if not title_dt:
            return []
        tags_dd = title_dt.find_next_sibling('dd')
        if not tags_dd:
            return []
        return [a.text.strip().lower() for a in tags_dd.find_all('a')]
    except requests.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        tqdm.write(f"  - Ошибка при запросе тегов {detail_url}: {e}")
        return None
    except Exception as e:
        # Best-effort: any parsing problem skips this establishment instead of
        # aborting the whole run.
        tqdm.write(f"  - Не удалось обработать теги на {detail_url}: {e}")
        return None


def _first_digits(text):
    """Return the first run of decimal digits in ``text`` as a string, or None."""
    match = re.search(r'\d+', text)
    return match.group(0) if match else None


def _parse_minicard_fields(item):
    """Extract the fields available directly on a listing card (no extra requests)."""
    fields = {
        'zoon_id': item.get('data-id'),
        'lat': item.get('data-lat'),
        'lon': item.get('data-lon'),
    }

    address_container = item.find('address', class_='minicard-item__address')
    address_span = address_container.find('span', class_='address') if address_container else None
    metro_tag = address_container.find('a', class_='metro') if address_container else None
    fields['address'] = address_span.text.strip() if address_span else None
    fields['metro_station'] = metro_tag.text.strip() if metro_tag else None

    rating_tag = item.select_one('div.stars div.z-text--bold')
    # Decimal comma is normalised to a dot.
    fields['rating'] = rating_tag.text.strip().replace(',', '.') if rating_tag else None

    reviews_tag = item.select_one('div.comments')
    fields['reviews_count'] = _first_digits(reviews_tag.text) if reviews_tag else None

    work_time_tag = item.select_one('.minicard-item__work-time span:last-child')
    fields['work_hours_short'] = work_time_tag.text.strip() if work_time_tag else None

    phone = None
    phone_tag = item.find('span', class_='js-phone')
    if phone_tag and phone_tag.has_attr('data-json'):
        try:
            phone_json = json.loads(phone_tag['data-json'])
        except json.JSONDecodeError:
            phone_json = None
        # Valid JSON that is not an object has no 'formatted' key to read.
        if isinstance(phone_json, dict):
            phone = phone_json.get('formatted')
    fields['phone'] = phone

    price_tag = item.select_one('.price-category')
    if price_tag:
        all_spans = price_tag.find_all('span')
        # Spans without the '_deselected' class are counted as "filled".
        filled_count = sum('_deselected' not in s.get('class', []) for s in all_spans)
        fields['price_category'] = f"{filled_count}/{len(all_spans)}"
    else:
        fields['price_category'] = None

    photo_count_tag = item.select_one('.controls__item.count')
    fields['photo_count'] = _first_digits(photo_count_tag.text) if photo_count_tag else None

    photos = None
    photos_tag = item.find('div', class_='js-slider-block')
    if photos_tag and photos_tag.has_attr('data-photos'):
        try:
            photos = ', '.join(json.loads(photos_tag['data-photos']))
        except (json.JSONDecodeError, TypeError):
            # Malformed JSON, or JSON that is not a sequence of strings.
            photos = None
    fields['photos_urls'] = photos

    return fields


def _scrape_minicard(item, session):
    """Build the record for one listing card, or return None when the card is skipped.

    A card is skipped when it has no detail URL, when its tags could not be
    fetched, when any tag is in BAN_WORDS, or when no tag is in
    POSITIVE_KEYWORDS. Every decision is logged through ``tqdm.write``.
    """
    title_tag = item.find('a', class_='title-link')
    name = title_tag.text.strip() if title_tag else 'Без названия'
    url = title_tag['href'] if title_tag and title_tag.has_attr('href') else None
    if not url:
        tqdm.write(f"  -  Пропуск '{name}', т.к. нет URL")
        return None

    tags = get_establishment_tags(url, session)
    if tags is None:
        tqdm.write(f"  - Пропуск '{name}' из-за ошибки получения тегов.")
        return None

    # Banned tags take precedence over positive ones.
    banned_found = [word for word in tags if word in BAN_WORDS]
    if banned_found:
        reason = f"Найдены запрещенные теги: {banned_found}"
        tqdm.write(f"  - 🚫 Пропущено: '{name}' | Причина: {reason} | Все теги: {tags}")
        return None
    if not any(kw in tags for kw in POSITIVE_KEYWORDS):
        reason = "Не найден ни один из обязательных тегов"
        tqdm.write(f"  - 🚫 Пропущено: '{name}' | Причина: {reason} | Все теги: {tags}")
        return None
    tqdm.write(f"  - ✅ Принято: '{name}' | Теги: {tags}")

    data = {'name': name, 'url': url, 'tags': ', '.join(sorted(set(tags)))}
    data.update(_parse_minicard_fields(item))
    return data


def scrape_zoon_coffee(city_name, base_url, *, max_pages=50, chunk_size=5,
                       max_retries=3, retry_delay=45):
    """Scrape the coffee-shop listing of one city and save the results in CSV chunks.

    Listing pages are fetched one by one (``base_url`` for page 1,
    ``{base_url}page-N/`` afterwards). Each card is filtered by its
    establishment tags, and accepted records are flushed with
    ``save_chunk_to_csv`` every ``chunk_size`` pages. Scraping stops after
    ``max_pages`` pages, when a page reports "Ничего не найдено", or when a
    page still fails after ``max_retries`` attempts. Any records collected
    since the last flush are then saved, labelled with the last page that was
    actually scraped.

    Args:
        city_name: city code used in log messages and passed to
            ``save_chunk_to_csv``.
        base_url: URL of the first listing page (expected to end with "/").
        max_pages: maximum number of listing pages to visit.
        chunk_size: number of pages per saved CSV chunk (must be positive).
        max_retries: attempts per listing page before giving up on the city.
        retry_delay: seconds to wait between attempts.

    Returns:
        None. Results are written to CSV files; progress is printed.
    """
    page = 1
    last_scraped_page = 0  # last page whose cards were fully processed
    chunk_data = []
    chunk_start_page = 1

    print(f" Начинается сбор данных по городу: {city_name.upper()}")

    with requests.Session() as session:
        session.headers.update(HEADERS)

        while page <= max_pages:
            url = base_url if page == 1 else f"{base_url}page-{page}/"
            page_processed_successfully = False

            for attempt in range(max_retries):
                print(f"\nПарсинг страницы: {url} (Попытка {attempt + 1}/{max_retries})")
                try:
                    response = session.get(url, timeout=20)
                    response.raise_for_status()
                except requests.RequestException as e:
                    print(f"  - Ошибка при запросе страницы {page}: {e}")
                    if attempt < max_retries - 1:
                        print(f"  - Ожидание {retry_delay} секунд перед следующей попыткой...")
                        time.sleep(retry_delay)
                        continue
                    print(f"  - Не удалось получить доступ к странице {page} после {max_retries} попыток. Прерываю работу по городу.")
                    break

                soup = BeautifulSoup(response.text, 'lxml')
                items = soup.find_all('li', class_='minicard-item')

                # An empty page carrying the "nothing found" marker means the
                # listing is exhausted.
                if not items and "Ничего не найдено" in response.text:
                    break

                for item in tqdm(items, desc=f"Обработка страницы {page}", unit=" заведение"):
                    record = _scrape_minicard(item, session)
                    if record is not None:
                        chunk_data.append(record)

                page_processed_successfully = True
                break

            if not page_processed_successfully:
                # End of listing or retries exhausted: stop working on this city.
                break

            last_scraped_page = page
            if page % chunk_size == 0:
                save_chunk_to_csv(chunk_data, city_name, chunk_start_page, page)
                chunk_data = []
                chunk_start_page = page + 1
            page += 1
            # Random pause between listing pages.
            time.sleep(random.uniform(3, 5))

    # Flush the trailing partial chunk under the real last page number, not
    # max_pages, so file names reflect what was actually scraped.
    if chunk_data:
        save_chunk_to_csv(chunk_data, city_name, chunk_start_page, last_scraped_page)

    print(f"\n✅ Сбор данных по городу {city_name.upper()} завершен.")


if __name__ == "__main__":
    # IMPORTANT: refresh the 'cookie' entry in the HEADERS dict before every run!
    separator = "\n" + "=" * 70 + "\n"
    # Scrape the cities one after another, printing a divider after each.
    for code in URLS:
        scrape_zoon_coffee(code, URLS[code])
        print(separator)
