In [None]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import re
import random
import concurrent.futures
from threading import Lock

In [None]:
import os

# SECURITY NOTE(review): these keys are committed in plain text. They should be
# rotated and supplied through the environment instead; the literal list is kept
# only as a backward-compatible fallback so the notebook still runs unchanged.
_DEFAULT_API_KEYS = [
    "15a9fce314884498904ef3ef934e1353",
    "ee850f787d9a4874a1fc5356281f436f",
    "f1eaf679384d4933bd5494bc727ca897",
    "ac4f1afe90564180b86ed2c60ebda44d",
    "bb3212f851334e2ab5a0c4ae35079ba7",
    "fd88fcf247e449fbb1543a3215ed12fe",
]

# Optional override: RAWG_API_KEYS="key1,key2,..." (comma-separated, blanks ignored).
_env_api_keys = [
    key.strip()
    for key in os.environ.get("RAWG_API_KEYS", "").split(",")
    if key.strip()
]

# Pool of API keys that the request helpers rotate through.
API_KEYS = _env_api_keys or _DEFAULT_API_KEYS


In [None]:
import requests
import time
import random
import re
import pandas as pd
from threading import Lock

# Root URL that every endpoint path is appended to.
BASE_URL = "https://api.rawg.io/api"

# Module-level state shared by the collection helpers below.
lock = Lock()                 # guards the counters and collections in this section
success_count, error_count = 0, 0
collected_games = set()       # ids of games already gathered by the pagination helper
dataset = []                  # flattened per-game records accumulated by the collector
current_key_index = 0         # position of the next key to hand out from API_KEYS
key_usage_count = dict.fromkeys(API_KEYS, 0)
key_errors = dict.fromkeys(API_KEYS, 0)

def get_next_api_key():
    """Hand out the next key from ``API_KEYS`` in round-robin order.

    The rotation index and the per-key usage counter are updated while
    holding the module-level ``lock``.

    Returns:
        The API key string selected for this call.
    """
    global current_key_index

    with lock:
        position = current_key_index
        selected_key = API_KEYS[position]
        # Advance the cursor, wrapping back to the first key after the last one.
        current_key_index = (position + 1) % len(API_KEYS)
        key_usage_count[selected_key] += 1
    return selected_key

def get_data(endpoint, params=None, max_rate_limit_retries=5):
    """Perform one GET request against the API and return the decoded JSON.

    A fresh key is taken from ``get_next_api_key()`` for every attempt and is
    sent as the ``key`` query parameter together with ``params``.

    Args:
        endpoint: Path appended to ``BASE_URL`` (e.g. ``"games"``).
        params: Optional dict of extra query parameters.
        max_rate_limit_retries: How many times an HTTP 429 response is retried
            (after a 60 second pause) before giving up. Previously the retry
            was an unbounded recursive call.

    Returns:
        The parsed JSON body, or ``None`` on any failure (non-429 HTTP error,
        network/decoding error, or rate limit still hit after all retries).
    """
    # Without this declaration the ``error_count += 1`` below raised
    # UnboundLocalError from inside the exception handler.
    global error_count

    url = f"{BASE_URL}/{endpoint}"

    for attempt in range(max_rate_limit_retries + 1):
        api_key = get_next_api_key()
        request_params = {"key": api_key}
        if params:
            request_params.update(params)

        try:
            response = requests.get(url, params=request_params, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            with lock:
                key_errors[api_key] += 1
            if e.response.status_code != 429:
                print(f"Ошибка HTTP {e.response.status_code} для {url}")
                return None
            if attempt == max_rate_limit_retries:
                # Retry budget exhausted: no point sleeping before returning.
                break
            print(f"Лимит запросов для ключа {api_key[:8]}... Ждем 60 секунд...")
            time.sleep(60)
        except Exception as e:
            with lock:
                error_count += 1
                key_errors[api_key] += 1
            print(f"Ошибка запроса: {e}")
            return None

    return None

def get_data_with_retry(endpoint, params=None, max_retries=3):
    """Call ``get_data`` up to ``max_retries`` times with growing pauses.

    Args:
        endpoint: Endpoint path forwarded to ``get_data``.
        params: Optional query parameters forwarded to ``get_data``.
        max_retries: Maximum number of attempts.

    Returns:
        The first non-``None`` result of ``get_data``, or ``None`` if every
        attempt failed. A successful call increments ``success_count``.
    """
    global success_count

    attempt = 0
    while attempt < max_retries:
        payload = get_data(endpoint, params)
        if payload is not None:
            with lock:
                success_count += 1
            return payload

        is_last_attempt = attempt == max_retries - 1
        if not is_last_attempt:
            # Exponential backoff (1, 2, 4, ... s) plus random jitter.
            delay = (2 ** attempt) + random.uniform(0.5, 1.5)
            print(f"Повторная попытка через {delay:.1f} сек...")
            time.sleep(delay)
        attempt += 1

    return None

def clean_text(text):
    """Remove ``<...>`` tag-like spans from ``text``.

    Falsy input (``None``, ``""``) yields an empty string. Matching is
    non-greedy and does not cross line breaks.
    """
    return re.sub(r'<.*?>', '', text) if text else ""

def safe_get(data, key, default=None):
    """Look up ``key`` in ``data``, treating ``None`` like a missing value.

    Returns ``default`` when ``data`` itself is ``None``, when the key is
    absent, or when the stored value is ``None``; otherwise the stored value.
    """
    found = None if data is None else data.get(key, default)
    return default if found is None else found

def collect_games_with_pagination(base_query, max_pages=20):
    """Walk the paginated ``games`` endpoint and gather not-yet-seen games.

    Args:
        base_query: Dict of query parameters; ``page`` and ``page_size`` are
            added on a copy for each request.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        List of game dicts whose ``id`` was not already in the module-level
        ``collected_games`` set (the set is updated as games are accepted).
    """
    page_size = 40
    fresh_games = []

    for page_number in range(1, max_pages + 1):
        request_query = {**base_query, "page": page_number, "page_size": page_size}
        payload = get_data_with_retry("games", request_query)

        # Stop on a failed request or a response without a result list.
        if not (payload and 'results' in payload):
            break

        page_results = payload['results']
        for entry in page_results:
            with lock:
                if entry['id'] in collected_games:
                    continue
                collected_games.add(entry['id'])
                fresh_games.append(entry)

        print(f"  Страница {page_number}: {len(page_results)} игр")

        # A short page means there is nothing further to fetch.
        if len(page_results) < page_size:
            break

        time.sleep(0.1)

    return fresh_games

# Brand substrings used to flag console platforms (names look like
# "PlayStation 4", "Xbox One", "Nintendo Switch").
_CONSOLE_BRANDS = ('PlayStation', 'Xbox', 'Nintendo')

# Publisher names treated as "major" (exact match against the publisher name).
_MAJOR_PUBLISHERS = frozenset([
    'Electronic Arts', 'Ubisoft', 'Activision', 'Nintendo', 'Sony',
    'Microsoft', 'Square Enix', 'Capcom', 'Sega',
])


def _build_game_record(game, game_details):
    """Flatten one list entry plus its detail payload into a single flat dict."""
    released = safe_get(game, 'released')

    # Core metrics taken from the list entry.
    record = {
        'game_id': game['id'],
        'name': game['name'],
        'name_clean': game['name'].lower().strip(),
        'released': released,
        'release_year': int(released[:4]) if released else None,
        'rating': safe_get(game, 'rating', 0),
        'rating_top': safe_get(game, 'rating_top', 5),
        'ratings_count': safe_get(game, 'ratings_count', 0),
        'added': safe_get(game, 'added', 0),
        'playtime': safe_get(game, 'playtime', 0),
        'metacritic': safe_get(game, 'metacritic'),
        'reviews_count': safe_get(game, 'reviews_text_count', 0),
        'suggestions_count': safe_get(game, 'suggestions_count', 0),
    }

    # Per-category rating counts become columns named rating_<id>.
    for rating in safe_get(game, 'ratings', []):
        record[f"rating_{rating['id']}"] = rating['count']

    # Description: stored with tags stripped; the length is measured on the
    # raw (un-stripped) text, as before.
    description = safe_get(game_details, 'description', '')
    record['description'] = clean_text(description)
    record['description_length'] = len(description)

    # Genres.
    genres = [genre['name'] for genre in safe_get(game, 'genres', [])]
    record['genres'] = ', '.join(genres)
    record['primary_genre'] = genres[0] if genres else 'Unknown'
    record['genre_count'] = len(genres)

    # Platforms.
    platforms = [
        p['platform']['name']
        for p in safe_get(game, 'platforms', [])
        if p.get('platform')
    ]
    record['platforms'] = ', '.join(platforms)
    record['platform_count'] = len(platforms)
    record['is_multiplatform'] = len(platforms) > 1
    record['is_pc'] = any('pc' in p.lower() for p in platforms)
    # Substring match: the previous exact comparison never matched names such
    # as "PlayStation 4" or "Nintendo Switch".
    record['is_console'] = any(
        brand in platform for platform in platforms for brand in _CONSOLE_BRANDS
    )

    # Publishers: prefer the detail payload and fall back to the list entry.
    # NOTE(review): the list endpoint is not expected to include publishers,
    # which left these columns empty - confirm against the RAWG schema.
    publishers_data = (
        safe_get(game_details, 'publishers', []) or safe_get(game, 'publishers', [])
    )
    publishers = [p['name'] for p in publishers_data]
    record['publishers'] = ', '.join(publishers)
    record['publisher_count'] = len(publishers)
    record['major_publisher'] = any(pub in _MAJOR_PUBLISHERS for pub in publishers)

    # Additional metrics from the detail payload.
    record.update({
        'achievements_count': safe_get(game_details, 'achievements_count', 0),
        'reddit_url': safe_get(game_details, 'reddit_url', ''),
        'reddit_name': safe_get(game_details, 'reddit_name', ''),
        'reddit_description': clean_text(safe_get(game_details, 'reddit_description', '')),
        'esrb_rating': safe_get(game_details.get('esrb_rating'), 'name', 'Unknown'),
        'tba': safe_get(game_details, 'tba', False),
        'updated': safe_get(game_details, 'updated', ''),
    })

    # Stores and tags (only the first ten tag names are kept in the string).
    stores = [
        s['store']['name']
        for s in safe_get(game_details, 'stores', [])
        if s.get('store')
    ]
    record['stores'] = ', '.join(stores)
    record['store_count'] = len(stores)

    tags = [tag['name'] for tag in safe_get(game_details, 'tags', [])]
    record['tags'] = ', '.join(tags[:10])
    record['tag_count'] = len(tags)

    return record


def process_game_batch(game_batch, batch_num, total_batches):
    """Fetch details for every game in a batch and build flat records.

    Args:
        game_batch: List of game dicts as returned by the ``games`` list endpoint.
        batch_num: 1-based number of this batch (used only for progress output).
        total_batches: Total number of batches (used only for progress output).

    Returns:
        List of flat record dicts. Games whose detail request fails or whose
        payload cannot be flattened are skipped.
    """
    batch_dataset = []

    for i, game in enumerate(game_batch):
        if i % 50 == 0:
            print(f"Батч {batch_num}/{total_batches}: Обработано {i}/{len(game_batch)} игр")

        game_details = get_data_with_retry(f"games/{game['id']}")
        if not game_details:
            continue

        try:
            batch_dataset.append(_build_game_record(game, game_details))
        except Exception as e:
            # Best effort: a malformed payload must not abort the batch, but it
            # is reported instead of being dropped silently.
            print(f"Ошибка обработки игры {game['id']}: {e}")
            continue

        time.sleep(0.1)

    return batch_dataset

def collect_comprehensive_games_dataset():
    """Run every list query, de-duplicate the games, then fetch their details.

    The query set consists of four global orderings plus one query per genre
    slug, per platform id and per release-date range (all ordered by
    ``-added``). Detail records are appended to the module-level ``dataset``
    in batches of 200 games.

    Returns:
        The module-level ``dataset`` list after all batches were processed.
    """
    global dataset

    # Build the full list of list-endpoint queries.
    queries = [
        {"ordering": "-added"},
        {"ordering": "-released"},
        {"ordering": "-rating"},
        {"ordering": "-metacritic"},
    ]
    queries += [
        {"genres": slug, "ordering": "-added"}
        for slug in ("action", "adventure", "rpg", "strategy", "indie", "shooter")
    ]
    queries += [
        {"platforms": platform_id, "ordering": "-added"}
        for platform_id in ("4", "187", "186", "7", "1", "18")
    ]
    queries += [
        {"dates": f"{first_year}-01-01,{last_year}-12-31", "ordering": "-added"}
        for first_year, last_year in ((2020, 2024), (2015, 2019), (2010, 2014))
    ]

    all_games = []
    total_queries = len(queries)

    print("Начало сбора списков игр...")
    print(f"Всего запросов: {total_queries}")

    for number, query in enumerate(queries, start=1):
        # Human-readable label for the progress line.
        if 'genres' in query:
            label = f"genre: {query['genres']}"
        elif 'platforms' in query:
            label = f"platform: {query['platforms']}"
        elif 'dates' in query:
            label = f"dates: {query['dates']}"
        else:
            label = query.get('ordering', '')

        print(f"Запрос {number}/{total_queries}: {label}")

        all_games.extend(collect_games_with_pagination(query))

        print(f"Всего собрано уникальных игр: {len(all_games)}")
        time.sleep(0.3)

    # Drop duplicate ids, keeping the first occurrence of each game.
    seen_ids = set()
    unique_games = []
    for candidate in all_games:
        game_id = candidate['id']
        if game_id in seen_ids:
            continue
        seen_ids.add(game_id)
        unique_games.append(candidate)

    print(f"Уникальных игр после дедупликации: {len(unique_games)}")
    print("Начало сбора детальной информации...")

    batch_size = 200
    batch_starts = range(0, len(unique_games), batch_size)
    total_batches = len(batch_starts)

    for batch_no, start in enumerate(batch_starts, start=1):
        print(f"Обработка батча {batch_no}/{total_batches}")
        chunk = unique_games[start:start + batch_size]
        dataset.extend(process_game_batch(chunk, batch_no, total_batches))

    return dataset


import traceback

# Bind df_games up front so later cells never hit a NameError, even when the
# collection fails before the frame is built.
df_games = pd.DataFrame()

try:
    games_dataset = collect_comprehensive_games_dataset()
    print(f"Сбор завершен! Собрано данных об играх: {len(games_dataset)}")

    df_games = pd.DataFrame(games_dataset)
    filename = "rawg_games_dataset.csv"
    df_games.to_csv(filename, index=False, encoding='utf-8')
    print(f"Данные сохранены в {filename}")

except Exception as e:
    print(f"Ошибка: {e}")
    traceback.print_exc()

    # Recover whatever the collector had already appended to the module-level
    # ``dataset``. The previous check for ``df_games`` in locals() could only
    # succeed if the failure happened after the frame was built, so a failure
    # during collection never produced a partial file.
    df_games = pd.DataFrame(dataset)
    if not df_games.empty:
        filename = "rawg_games_dataset_partial.csv"
        df_games.to_csv(filename, index=False, encoding='utf-8')
        print(f"Сохранен частичный датасет: {filename}")

print("Сбор данных завершен!")

СБОР ДАННЫХ С RAWG.IO
Начало сбора списков игр...
Всего запросов: 99
Количество API ключей: 6
Теоретический максимум: 79200 игр
Запрос 1/99: -added
  Страница 1: 40 игр
  Страница 2: 40 игр
  Страница 3: 40 игр
  Страница 4: 40 игр
  Страница 5: 40 игр
  Страница 6: 40 игр
  Страница 7: 40 игр
  Страница 8: 40 игр
  Страница 9: 40 игр
  Страница 10: 40 игр
  Страница 11: 40 игр
  Страница 12: 40 игр
  Страница 13: 40 игр
  Страница 14: 40 игр
  Страница 15: 40 игр
  Страница 16: 40 игр
  Страница 17: 40 игр
  Страница 18: 40 игр
  Страница 19: 40 игр
  Страница 20: 40 игр
Всего собрано уникальных игр: 800
Запрос 2/99: -released
  Страница 1: 40 игр
  Страница 2: 40 игр
  Страница 3: 40 игр
  Страница 4: 40 игр
  Страница 5: 40 игр
  Страница 6: 40 игр
  Страница 7: 40 игр
  Страница 8: 40 игр
  Страница 9: 40 игр
  Страница 10: 40 игр
  Страница 11: 40 игр
  Страница 12: 40 игр
  Страница 13: 40 игр
  Страница 14: 40 игр
  Страница 15: 40 игр
  Страница 16: 40 игр
  Страница 17: 40 игр

In [None]:
# Persist the collected RAWG frame to CSV without the index column.
df_games.to_csv(
    "rawg_games_dataset.csv",
    index=False,
    encoding="utf-8",
)

NameError: name 'df_games' is not defined

In [None]:
# Print the column names, non-null counts and dtypes of the collected frame.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5315 entries, 0 to 5314
Data columns (total 41 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   game_id             5315 non-null   int64  
 1   name                5315 non-null   object 
 2   name_clean          5315 non-null   object 
 3   released            5256 non-null   object 
 4   release_year        5256 non-null   float64
 5   rating              5315 non-null   float64
 6   rating_top          5315 non-null   int64  
 7   ratings_count       5315 non-null   int64  
 8   added               5315 non-null   int64  
 9   playtime            5315 non-null   int64  
 10  metacritic          2601 non-null   float64
 11  reviews_count       5315 non-null   int64  
 12  suggestions_count   5315 non-null   int64  
 13  rating_5            4264 non-null   float64
 14  rating_4            4574 non-null   float64
 15  rating_3            4480 non-null   float64
 16  rating

In [None]:
# Colab-only: prompt for local files and upload them into the runtime's
# working directory; the call's result is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving steam_scraped_games.xlsx to steam_scraped_games.xlsx


In [None]:
# Load the Steam search export, decoding it as ISO-8859-1.
# NOTE(review): this CSV is not the file uploaded in the cell above - presumably
# it was placed in the working directory separately; confirm.
df_steam = pd.read_csv('steam_search_10k.csv', encoding='iso-8859-1')

In [None]:
# Titles present in both sources, compared on lower-cased, stripped names
# (RAWG's `name_clean` column versus the normalized Steam `title` column).
common_names = (
    set(df_games['name_clean'])
    & set(df_steam['title'].str.lower().str.strip())
)
print(f"Найдено {len(common_names)} общих игр по названиям")

Найдено 808 общих игр по названиям


In [None]:
# Load the uploaded Steam scrape workbook into a DataFrame.
df_masha = pd.read_excel('steam_scraped_games.xlsx')

In [None]:
df_masha  # display the frame; append .duplicated().sum() to count fully duplicated rows

Unnamed: 0,appid,title,url,release_date_card,price_initial_usd,price_final_usd,discount_percent,is_free,review_positive_pct,review_count,release_date_html,about_text,developer,publisher,genres,categories,tags,platforms
0,3443650,Spooky Night,https://store.steampowered.com/app/3443650/Spo...,"Nov 2, 2025",,,,,,,"Nov 2, 2025",About This Game Wishlist to help us release th...,Ozbem,Ozbem,"Massively Multiplayer, RPG, Simulation, Strate...",,"Multiplayer, Social Deduction, Casual, Surviva...",
1,3841250,The Adventure of Ravi 'n' Navi,https://store.steampowered.com/app/3841250/The...,"Nov 2, 2025",,19.99,,,,,"Nov 2, 2025",About This Game The famous duo appears! Their ...,HSA Entertainment,HSA Entertainment,"Action, Adventure, Indie, RPG",,"3D Platformer, Action-Adventure, Action, Dark ...",
2,3671330,Dream Spectra Soundtrack,https://store.steampowered.com/app/3671330/Dre...,"Nov 2, 2025",,4.49,10.0,,,,"Nov 2, 2025",About This Content This is the full OST for Dr...,Ozone Interactive,,,,,
3,3991860,The Forged Show of War,https://store.steampowered.com/app/3991860/The...,"Nov 2, 2025",,10.49,25.0,,,,"Nov 2, 2025",About This Game The year is the late 1980s. Th...,"Swayam Raut, Clever Apoki, Mahesh Jambagi",Forge Bridge Studios,"Action, Strategy",,"Shooter, Military, Tactical, Strategy, Realist...",
4,3382670,Polterparty,https://store.steampowered.com/app/3382670/Pol...,"Nov 2, 2025",,5.09,15.0,,,,"Nov 2, 2025","About This Game Polterparty is a 4-player, onl...",Eelsmooth,Eelsmooth,"Adventure, Indie, RPG, Early Access",,"Early Access, Horror, Team-Based, Funny, Explo...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,3687050,Tainted Grail: The Fall of Avalon - Supporters...,https://store.steampowered.com/app/3687050/Tai...,"May 15, 2025",,7.99,20.0,,92.0,67.0,"May 15, 2025",About This Content BEFORE YOU BUY: This conten...,Questline,Awaken Realms,"Action, Adventure, Indie, RPG",,"Action, Adventure, RPG, Indie",
17996,3617810,Sepulcrum Demo,https://store.steampowered.com/app/3601900/Sep...,"May 15, 2025",,,,,,,"Jun 27, 2025",About This Game Destroy the unknown evil that ...,MDH Software,MDH Software,"Indie, RPG, Strategy",,"RPG, Turn-Based Tactics, Roguelite, Turn-Based...",
17997,3599340,Warbot Engineer,https://store.steampowered.com/app/3599340/War...,"May 15, 2025",,0.99,,,,,"May 15, 2025","About This Game ""Warbot Engineer"" is a high-fr...",night walker,夜行者,"Casual, Indie, Simulation, Strategy",,"Strategy, Top-Down Shooter, Bullet Hell, Shoot...",
17998,3672900,The Honours Project,https://store.steampowered.com/app/3672900/The...,"May 15, 2025",,,,,95.0,43.0,"May 15, 2025",About This Game The Honours Project is an auto...,Scott Cambell,ScottCambellAudio,"Adventure, Casual, Indie, Simulation, Free To ...",,"Walking Simulator, Story Rich, Narration, Casu...",
