## Crawl Data from RAWG

In [None]:
# Mount Google Drive into the Colab filesystem; the crawl cells below read
# from and write to paths under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import requests
import json
import time



In [None]:
# Crawl the RAWG game list page by page and store a trimmed record per game.
import os  # local import: this cell needs it for the env lookup and output dir

# Prefer a key supplied through the environment so the real key never has to be
# saved in the notebook; the placeholder is kept as the fallback.
API_KEY = os.environ.get("RAWG_API_KEY", "YOUR_API_KEY")
BASE_URL = "https://api.rawg.io/api/games"

# Crawl configuration
START_YEAR = "2005-01-01"
END_YEAR = "2025-12-31"
PAGE_SIZE = 40
MAX_PAGES = 250  # 250 pages x 40 games per page = up to 10,000 records
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs forever
REQUEST_DELAY = 0.5  # seconds between requests, to stay under the rate limit

# Destination of the crawl result. The directory is created up front so a
# missing folder is discovered before the crawl runs, not after it.
output_file = "/content/drive/MyDrive/SIC_PROJECT/Data/Crawl/games_raw_data.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Accumulates one trimmed record per game
all_games = []

# Crawl page by page
for page in range(1, MAX_PAGES + 1):
    params = {
        "key": API_KEY,
        "dates": f"{START_YEAR},{END_YEAR}",
        "ordering": "-added",  # descending by the "added" count (popularity)
        "page_size": PAGE_SIZE,
        "page": page
    }
    print(f"Crawling page {page}/{MAX_PAGES}...")

    # A failed page (HTTP error, network error, invalid JSON) is reported and
    # skipped; it must not abort the whole crawl.
    data = None
    try:
        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            data = response.json()
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching page {page}: {e}")

    if data is not None:
        # `or []` also covers an explicit null, which .get(key, []) does not.
        results = data.get("results") or []

        for game in results:
            game_info = {
                "id": game.get("id"),
                "name": game.get("name"),
                "released": game.get("released"),
                "rating": game.get("rating"),
                "metacritic": game.get("metacritic"),
                "playtime": game.get("playtime"),
                "genres": [genre["name"] for genre in (game.get("genres") or [])],
                "platforms": [p["platform"]["name"] for p in (game.get("platforms") or [])],
                "added": game.get("added")
            }
            all_games.append(game_info)

        # Stop once the result set is exhausted instead of requesting pages
        # that cannot exist. Relies on the paginated response carrying a
        # `next` link that is empty on the last page (per the RAWG API docs).
        if not results or not data.get("next"):
            print(f"No more results after page {page}; stopping early.")
            break

    # Throttle after every request, including failed ones, so an error
    # response is never followed by an immediate retry burst.
    time.sleep(REQUEST_DELAY)

print(f"Done. Total games fetched: {len(all_games)}")

# Save the result as a JSON array
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_games, f, indent=2, ensure_ascii=False)

print(f"Data saved to {output_file}")

Crawling page 1/125...
Crawling page 2/125...
Crawling page 3/125...
Crawling page 4/125...
Crawling page 5/125...
Crawling page 6/125...
Crawling page 7/125...
Crawling page 8/125...
Crawling page 9/125...
Crawling page 10/125...
Crawling page 11/125...
Crawling page 12/125...
Crawling page 13/125...
Crawling page 14/125...
Crawling page 15/125...
Crawling page 16/125...
Crawling page 17/125...
Crawling page 18/125...
Crawling page 19/125...
Crawling page 20/125...
Crawling page 21/125...
Crawling page 22/125...
Crawling page 23/125...
Crawling page 24/125...
Crawling page 25/125...
Crawling page 26/125...
Crawling page 27/125...
Crawling page 28/125...
Crawling page 29/125...
Crawling page 30/125...
Crawling page 31/125...
Crawling page 32/125...
Crawling page 33/125...
Crawling page 34/125...
Crawling page 35/125...
Crawling page 36/125...
Crawling page 37/125...
Crawling page 38/125...
Crawling page 39/125...
Crawling page 40/125...
Crawling page 41/125...
Crawling page 42/125...
C

In [None]:
# Fetch the full RAWG detail record for every game collected by the list crawl.
import json
import os
import time

import requests

# Prefer a key supplied through the environment so the real key never has to be
# saved in the notebook; the placeholder is kept as the fallback.
API_KEY = os.environ.get("RAWG_API_KEY", "YOUR_API_KEY")
DETAIL_URL = "https://api.rawg.io/api/games/{}"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs forever
REQUEST_DELAY = 0.5  # seconds between requests, to stay under the rate limit

# Step 1: load the list of games produced by the list crawl (games_raw_data.json)
with open("/content/drive/MyDrive/SIC_PROJECT/Data/Crawl/games_raw_data.json", "r", encoding="utf-8") as f:
    game_list = json.load(f)

print(f"Total games to crawl: {len(game_list)}")

# Step 2: list that accumulates the detail payloads
all_details = []

output_file = "/content/drive/MyDrive/SIC_PROJECT/Data/Crawl/game_details.json"

# Step 3: fetch each game. The loop sits in try/finally so that whatever has
# been collected is still written out if the run is interrupted or crashes.
try:
    for idx, game in enumerate(game_list, 1):
        game_id = game.get("id")
        if game_id is None:
            # Without an id the detail URL would be ".../games/None".
            print(f"[{idx}/{len(game_list)}] Skipping entry without a game ID")
            continue

        url = DETAIL_URL.format(game_id)
        params = {"key": API_KEY}

        print(f"[{idx}/{len(game_list)}] Fetching detail for game ID {game_id}...")

        # A failed game (HTTP error, network error, invalid JSON) is reported
        # and skipped; the crawl continues with the next one.
        try:
            response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
            if response.status_code == 200:
                all_details.append(response.json())
            else:
                print(f"Failed to fetch {game_id}: HTTP {response.status_code}")
        except (requests.RequestException, ValueError) as e:
            print(f"Error fetching {game_id}: {e}")

        # Throttle after every request, including failed ones, so an error
        # response is never followed by an immediate burst of requests.
        time.sleep(REQUEST_DELAY)
finally:
    # Step 4: save everything collected so far as a single JSON array
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_details, f, ensure_ascii=False, indent=2)

    print(f"\nDone! Saved {len(all_details)} game details to {output_file}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[78/5000] Fetching detail for game ID 58812...
[79/5000] Fetching detail for game ID 257201...
[80/5000] Fetching detail for game ID 4570...
[81/5000] Fetching detail for game ID 2462...
[82/5000] Fetching detail for game ID 4806...
[83/5000] Fetching detail for game ID 3017...
[84/5000] Fetching detail for game ID 3604...
[85/5000] Fetching detail for game ID 3543...
[86/5000] Fetching detail for game ID 50738...
[87/5000] Fetching detail for game ID 3603...
[88/5000] Fetching detail for game ID 13535...
[89/5000] Fetching detail for game ID 11142...
[90/5000] Fetching detail for game ID 3747...
[91/5000] Fetching detail for game ID 108...
[92/5000] Fetching detail for game ID 3636...
[93/5000] Fetching detail for game ID 274755...
[94/5000] Fetching detail for game ID 10141...
[95/5000] Fetching detail for game ID 11147...
[96/5000] Fetching detail for game ID 4513...
[97/5000] Fetching detail for game ID 4166...
[98/50