ID игр

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import random

class SteamGameParser:
    """Collect unique Steam app IDs by scraping store search pages.

    IDs are kept as strings in ``self.game_ids``. All HTTP traffic goes
    through a single ``requests.Session`` that sends a browser-like
    User-Agent header.
    """

    def __init__(self):
        """Create the HTTP session and an empty ID set."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.game_ids = set()

    @staticmethod
    def _app_id_from_href(href):
        """Return the numeric app ID embedded in *href*, or ``None``.

        The ID is the text that follows the first ``/app/`` up to the next
        ``/``, ``?`` or ``#``. It must consist of ASCII digits only:
        ``str.isdigit`` alone also accepts characters such as superscript
        digits, which are not valid IDs and cannot be parsed by ``int``.
        """
        _, marker, tail = href.partition('/app/')
        if not marker:
            return None
        for delimiter in '/?#':
            tail = tail.split(delimiter, 1)[0]
        return tail if tail.isascii() and tail.isdigit() else None

    @staticmethod
    def _id_sort_key(game_id):
        """Sort key: numeric IDs first in numeric order, anything else after."""
        text = str(game_id)
        if text.isascii() and text.isdigit():
            return (0, int(text), text)
        return (1, 0, text)

    def extract_game_ids(self, html):
        """Return the set of app IDs linked from ``<a href=...>`` tags in *html*.

        Args:
            html: HTML markup (``str`` or ``bytes``) handed to BeautifulSoup.

        Returns:
            A set of ID strings; empty when no matching link is found.
        """
        soup = BeautifulSoup(html, 'html.parser')
        found = set()
        for link in soup.find_all('a', href=True):
            app_id = self._app_id_from_href(link['href'])
            if app_id is not None:
                found.add(app_id)
        return found

    def get_game_ids_from_url(self, url, delay=True):
        """Fetch *url* and return the app IDs linked from the page.

        Args:
            url: Page to download.
            delay: When true, sleep 1-2 seconds after the request, whether
                it succeeded or failed.

        Returns:
            A set of ID strings; empty if the request or parsing failed.
        """
        try:
            response = self.session.get(url, timeout=15)
            # Do not parse an HTTP error page as if it were search results.
            response.raise_for_status()
            return self.extract_game_ids(response.content)
        except Exception as e:
            # Best effort: one failing source must not abort the whole run.
            print(f"Ошибка при парсинге {url}: {e}")
            return set()
        finally:
            # Pause after failures too, so a failed request is not
            # followed immediately by the next one.
            if delay:
                time.sleep(random.uniform(1, 2))

    def get_game_ids_from_search(self, start=0, count=100, sort_by='_ASC'):
        """Request one slice of the search results endpoint.

        Args:
            start: Offset of the first result to request.
            count: Number of results to request.
            sort_by: Value of the ``sort_by`` query parameter.

        Returns:
            A set of ID strings taken from the ``results_html`` field of the
            JSON response; empty on a non-200 status or any error.
        """
        params = {
            'start': start,
            'count': count,
            'dynamic_data': '',
            'sort_by': sort_by,
            'infinite': 1
        }

        try:
            response = self.session.get("https://store.steampowered.com/search/results/", params=params, timeout=15)
            if response.status_code == 200:
                return self.extract_game_ids(response.json().get('results_html', ''))
            # Report rejected requests instead of dropping them silently.
            print(f"Ошибка при парсинге страницы {start}: HTTP {response.status_code}")
        except Exception as e:
            print(f"Ошибка при парсинге страницы {start}: {e}")
        return set()

    def collect_ids_from_sources(self):
        """Gather IDs from a fixed list of search pages and tag pages.

        Returns:
            The union of IDs found on all pages (failed pages add nothing).
        """
        # Search listings with different sorts/filters.
        featured_urls = [
            "https://store.steampowered.com/search/?sort_by=Released_DESC&category1=998",
            "https://store.steampowered.com/search/?filter=topsellers&category1=998",
            "https://store.steampowered.com/search/?sort_by=Price_ASC&category1=998",
            "https://store.steampowered.com/search/?sort_by=Metacritic&category1=998",
            "https://store.steampowered.com/search/?category1=998&specials=1",
        ]

        # Popular tags.
        tags = ["action", "adventure", "rpg", "strategy", "simulation", "indie", "horror"]
        tag_urls = [f"https://store.steampowered.com/search/?tags={tag}" for tag in tags]

        all_ids = set()
        for url in [*featured_urls, *tag_urls]:
            all_ids.update(self.get_game_ids_from_url(url))

        return all_ids

    def parse_with_different_sorts(self, total_games=25000, max_pages=200,
                                   page_size=100, min_new_per_page=5):
        """Page through search results under several sort orders.

        New IDs are merged into ``self.game_ids`` until *total_games* IDs
        have been collected or every sort order has been walked.

        Args:
            total_games: Stop once this many unique IDs are held.
            max_pages: Maximum number of pages requested per sort order.
            page_size: Results requested per page; also the offset step.
            min_new_per_page: A sort order is abandoned as soon as a page
                contributes fewer than this many previously unseen IDs.
        """
        sort_methods = ['_ASC', 'Released_DESC', 'Price_ASC', 'Price_DESC', 'Reviews_DESC']

        for sort_method in sort_methods:
            if len(self.game_ids) >= total_games:
                break

            print(f"Парсинг с сортировкой: {sort_method}")
            for page in range(max_pages):
                if len(self.game_ids) >= total_games:
                    break

                new_ids = self.get_game_ids_from_search(page * page_size, page_size, sort_method)
                before = len(self.game_ids)
                self.game_ids.update(new_ids)

                # Heuristic end-of-listing check. A failed request yields an
                # empty set, so it also ends the current sort order.
                if len(self.game_ids) - before < min_new_per_page:
                    break

                time.sleep(random.uniform(1, 2))

    def parse_games(self, total_games=25000):
        """Run the full collection: seed pages first, then paged search.

        Args:
            total_games: Target number of unique IDs for the paged search.
        """
        print("Начинаем парсинг Steam...")

        initial_ids = self.collect_ids_from_sources()
        self.game_ids.update(initial_ids)
        print(f"Собрано начальных ID: {len(initial_ids)}")

        self.parse_with_different_sorts(total_games)

        print(f"Итоговый результат: {len(self.game_ids)} уникальных ID игр")

    def save_to_csv(self, filename='steam_game_ids.csv'):
        """Write the collected IDs to *filename*, one per row.

        The file starts with a ``Game ID`` header row. IDs are written in
        ascending numeric order so repeated runs produce the same file for
        the same set of IDs.

        Args:
            filename: Path of the CSV file to create or overwrite.
        """
        ordered_ids = sorted(self.game_ids, key=self._id_sort_key)
        with open(filename, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Game ID'])
            writer.writerows([game_id] for game_id in ordered_ids)


# Run the scrape. On any failure or manual interruption, write the IDs
# gathered so far to a separate file so a long run is not lost.
parser = SteamGameParser()
try:
    parser.parse_games(total_games=25000)
    parser.save_to_csv()
except KeyboardInterrupt:
    # KeyboardInterrupt does not derive from Exception, so the handler
    # below would not run; save the partial results, then let the
    # interrupt propagate as usual.
    print("Прервано пользователем")
    parser.save_to_csv('steam_game_ids_fail.csv')
    raise
except Exception as e:
    print(f"Ошибка: {e}")
    parser.save_to_csv('steam_game_ids_fail.csv')



Начинаем парсинг Steam для сбора 25000 игр...
1. Сбор из featured категорий...
Обработана категория: https://store.steampowered.com/search/?sort_by=Released_DESC&category1=998, найдено: 50
Обработана категория: https://store.steampowered.com/search/?filter=topsellers&category1=998, найдено: 100
Обработана категория: https://store.steampowered.com/search/?sort_by=Price_ASC&category1=998, найдено: 136
Обработана категория: https://store.steampowered.com/search/?sort_by=Metacritic&category1=998, найдено: 143
Обработана категория: https://store.steampowered.com/search/?category1=998&specials=1, найдено: 179
Обработана категория: https://store.steampowered.com/search/?category1=998&os=win&filter=popularnew, найдено: 179
Обработана категория: https://store.steampowered.com/search/?category1=998&os=win&filter=comingsoon, найдено: 229
Найдено из featured: 229
2. Сбор по тегам...
Обработан тег: action, найдено игр: 50
Обработан тег: adventure, найдено игр: 50
Обработан тег: rpg, найдено игр: 50