In [None]:
!pip -q install requests pandas tqdm cloudscraper beautifulsoup4


In [None]:
import os
import time
import csv
import random
import json
import pathlib
import math
from typing import Optional, Dict, List, Tuple

import requests
import pandas as pd
from tqdm import tqdm
import cloudscraper
from bs4 import BeautifulSoup
import re

# ================== CONFIG ==================

# Two-letter country codes; each one is sent as the `cc` query parameter when
# step_fetch_regional_prices requests a price for every appid x country pair.
COUNTRIES = [
    "US", "CA", "BR", "AR", "TR", "PL", "DE", "FR", "GB", "JP", "AU", "CN",
    "MX", "RU", "KR", "ES", "IT", "NL", "SE", "NO", "CH", "NZ"
]
# Sent as the `l` query parameter on every Steam request in this notebook.
LANG = "english"       # Steam UI language
# Default `topn` for fetch_steam_topsellers.
TOPN = 500             # how many top sellers to fetch

# Output files. The steps hand data to each other only through these CSVs:
#   OUT_TOPSELLERS_CSV  written from the rows returned by fetch_steam_topsellers
#   OUT_PRICES_CSV      written (and resumed from) by step_fetch_regional_prices
#   OUT_ENRICHED_CSV    written by step_enrich_with_meta
#   OUT_SUMMARY_CSV     written by step_build_summary (pivot of raw prices)
#   OUT_DIFF_CSV        written by step_build_summary (pivot + diff vs US)
OUT_TOPSELLERS_CSV = "steam_topsellers.csv"
OUT_PRICES_CSV = "steam_prices.csv"
OUT_ENRICHED_CSV = "steam_prices_enriched.csv"
OUT_SUMMARY_CSV = "steam_prices_summary.csv"
OUT_DIFF_CSV = "steam_prices_summary_diff.csv"


# ================== UTILS ==================

def ensure_outdir(path: str) -> str:
    """Make sure the directory that will contain ``path`` exists.

    The parent directory of the absolute form of ``path`` is created together
    with any missing ancestors; an already existing directory is left alone.
    The file itself is not created.

    Args:
        path: File path (relative or absolute) about to be written.

    Returns:
        ``path`` exactly as it was passed in, so the call can be chained.
    """
    parent_dir = os.path.dirname(os.path.abspath(path))
    if not parent_dir:
        # No directory component to create.
        return path
    pathlib.Path(parent_dir).mkdir(parents=True, exist_ok=True)
    return path


def write_csv(rows: List[Dict], out_path: str) -> None:
    """Write a list of dict records to ``out_path`` as a UTF-8 CSV file.

    The header is the union of the keys of all rows, in first-seen order.
    Taking the header from the first row only made ``csv.DictWriter`` raise
    ``ValueError`` as soon as a later row carried an extra key. Keys that a
    row lacks are written as empty cells. When every row has the same keys
    the file is identical to what the first-row header would produce.

    Args:
        rows: Records to write. When empty, nothing is written and a warning
            is printed instead.
        out_path: Destination file; its parent directory is created if needed.
    """
    if not rows:
        print(f"[WARN] No rows -> {out_path}")
        return

    # A dict keeps insertion order, so it doubles as an ordered set of columns.
    columns: Dict[str, None] = {}
    for row in rows:
        columns.update(dict.fromkeys(row))

    ensure_outdir(out_path)
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=list(columns), restval="")
        w.writeheader()
        w.writerows(rows)
    print(f"[OK] {len(rows)} rows -> {out_path}")


In [None]:
# ================== STEP 1: TOP SELLERS ==================

def fetch_steam_topsellers(cc: str = "US", lang: str = LANG, topn: int = TOPN, verbose: bool = True) -> List[Dict]:
    """Scrape Steam's top-seller search listing for one country.

    Pages of ``/search/?filter=topsellers`` are fetched one after another and
    every ``.search_result_row`` element that has a ``data-ds-appid``
    attribute becomes one output row. Each page is tried up to three times
    with exponential back-off. Pagination stops when ``topn`` rows have been
    collected, when a page comes back empty, when a page fails all attempts,
    or when the page budget is used up.

    Args:
        cc: Country code sent as the ``cc`` query parameter.
        lang: Language sent as the ``l`` query parameter.
        topn: Maximum number of rows to return.
        verbose: Print progress and error messages when true.

    Returns:
        Up to ``topn`` dicts with the keys ``appid``, ``name``, ``cc``,
        ``currency``, ``initial``, ``final`` and ``discount_percent``. Only
        the first three are filled here; the price fields are always ``None``.
        An appid that shows up on more than one page is kept once.
    """
    ua = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 topseller-bot"
    )
    headers_html = {
        "User-Agent": ua,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "linux", "mobile": False}
        )
    except Exception:
        # Fall back to plain requests when cloudscraper cannot be set up.
        scraper = requests

    # The listing has been observed to return 25 rows per page. Budgeting for
    # 50 per page capped a 500-title request at 12 pages, i.e. 300 rows.
    page_size = 25
    max_attempts = 3
    max_pages = math.ceil(topn / page_size) + 2

    rows: List[Dict] = []
    seen_appids = set()
    page = 1

    while len(rows) < topn and page <= max_pages:
        url = f"https://store.steampowered.com/search/?filter=topsellers&cc={cc}&l={lang}&page={page}"
        success = False
        exhausted = False  # set when a page has no result rows at all
        for attempt in range(max_attempts):
            try:
                r = scraper.get(url, headers=headers_html, timeout=20)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                results = soup.select(".search_result_row")
                if not results:
                    if verbose:
                        print(f"[HTML] page={page} returned no results, stopping")
                    success = True
                    exhausted = True
                    break

                for row in results:
                    appid = row.get("data-ds-appid")
                    if not appid or appid in seen_appids:
                        # Skip rows without an appid and titles already
                        # collected from an earlier page.
                        continue
                    seen_appids.add(appid)
                    name_tag = row.select_one(".title")
                    name = name_tag.get_text(strip=True) if name_tag else None
                    rows.append(
                        {
                            "appid": appid,
                            "name": name,
                            "cc": cc,
                            "currency": None,
                            "initial": None,
                            "final": None,
                            "discount_percent": None,
                        }
                    )
                    if len(rows) >= topn:
                        break

                if verbose:
                    print(f"[HTML] page={page} 抓到 {len(results)} 条，累计 {len(rows)}")
                success = True
                break
            except Exception as e:
                if verbose:
                    print(f"[HTML page={page} attempt={attempt}] {e}")
                if attempt < max_attempts - 1:
                    # No point sleeping after the final attempt.
                    time.sleep(1.0 * (2 ** attempt))

        if not success:
            if verbose:
                print(f"[HTML] page={page} failed after {max_attempts} attempts, stopping")
            break

        if exhausted or len(rows) >= topn:
            break

        page += 1
        # Small randomized pause between pages.
        time.sleep(0.7 + random.uniform(0, 0.5))

    if verbose:
        print(f"[DONE] Finally {len(rows)}  top sellers (Goal {topn})")
    return rows


In [None]:
# ================== STEP 2: REGIONAL PRICES ==================

# We will reuse a single cloudscraper session for prices.
# It is created once at module level and shared by every call to
# fetch_steam_price_throttled instead of building a new scraper per request.
scraper_price = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "linux", "mobile": False}
)
# Headers sent with every price lookup. The Accept header asks for JSON
# because the response body is parsed with r.json().
UA_PRICE = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 pricebot/2.0"
    ),
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "en-US,en;q=0.9",
}


def fetch_steam_price_throttled(appid: str, country: str, lang: str = LANG) -> Optional[Dict]:
    """Fetch one app's price for one country from Steam's ``appdetails`` endpoint.

    Requests ``appdetails?appids=<APPID>&cc=<CC>&l=<LANG>`` through the shared
    ``scraper_price`` session. The call is retried, for at most 9 attempts in
    total, on HTTP 429, on HTTP 5xx, and on any exception (network errors,
    other HTTP error statuses, malformed JSON).

    Back-off sleeps for 5xx responses and for exceptions are capped at 45
    seconds each, so a long run of server errors cannot stall the caller for
    minutes per request. For HTTP 429 the server's ``Retry-After`` value is
    honoured as given, with 10 seconds used when it is missing or not a number.

    Args:
        appid: Steam application id.
        country: Country code sent as the ``cc`` query parameter.
        lang: Language sent as the ``l`` query parameter.

    Returns:
        A dict with the keys ``platform``, ``appid``, ``country``,
        ``currency``, ``initial``, ``final``, ``discount_percent`` and
        ``release_date``. ``None`` is returned both when the response carries
        no ``price_overview`` and when every attempt failed, so callers cannot
        tell these two cases apart.
    """
    url = (
        "https://store.steampowered.com/api/appdetails"
        f"?appids={appid}&cc={country}&l={lang}"
    )
    base_sleep = 1.0
    max_sleep = 45.0   # upper bound for a single back-off sleep
    max_attempts = 9

    def _to_units(value):
        """Divide a numeric API price by 100; non-numeric values become None."""
        # NOTE(review): presumably the API reports minor currency units — confirm.
        return value / 100 if isinstance(value, (int, float)) else None

    attempt = 0
    while attempt < max_attempts:
        try:
            r = scraper_price.get(url, headers=UA_PRICE, timeout=20)
            # 429: too many requests
            if r.status_code == 429:
                try:
                    extra = float(r.headers.get("Retry-After"))
                except (TypeError, ValueError):
                    # Header missing, or not a plain number of seconds.
                    extra = 10.0
                time.sleep(max(0.0, extra) + random.uniform(0, 1.0))
                attempt += 1
                continue
            # 5xx: exponential back-off with jitter, capped like the except path
            if r.status_code >= 500:
                backoff = base_sleep * (2 ** attempt) + random.uniform(0.0, 0.5)
                attempt += 1
                time.sleep(min(max_sleep, backoff))
                continue

            r.raise_for_status()
            j = r.json()
            data = j.get(str(appid), {}).get("data", {})
            pov = data.get("price_overview")
            if not pov:
                return None
            return {
                "platform": "steam",
                "appid": str(appid),
                "country": country,
                "currency": pov.get("currency"),
                "initial": _to_units(pov.get("initial")),
                "final": _to_units(pov.get("final")),
                "discount_percent": pov.get("discount_percent"),
                "release_date": (data.get("release_date") or {}).get("date"),
            }
        except Exception:
            # Best effort: any failure counts as one attempt and is retried.
            attempt += 1
            time.sleep(min(max_sleep, (2 ** attempt) * 1.2 + random.uniform(0, 1.0)))

    return None


def _write_rows_atomic(rows: List[Dict], out_path: str) -> None:
    """Write ``rows`` to ``out_path`` through a temp file and ``os.replace``."""
    # Union of keys in first-seen order, so rows resumed from an older file
    # with a different column set cannot make DictWriter raise.
    columns = list(dict.fromkeys(key for row in rows for key in row))
    tmp_path = out_path + ".tmp"
    ensure_outdir(tmp_path)
    with open(tmp_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=columns, restval="")
        w.writeheader()
        w.writerows(rows)
    # The destination is only swapped in once the temp file is fully written.
    os.replace(tmp_path, out_path)


def step_fetch_regional_prices() -> None:
    """Fetch prices for all appid × country combos into OUT_PRICES_CSV.

    App ids are read from OUT_TOPSELLERS_CSV; values that are not numeric are
    dropped. If OUT_PRICES_CSV already exists its rows are loaded and the
    (appid, country) pairs found there are skipped, so an interrupted run can
    be resumed. Pairs for which ``fetch_steam_price_throttled`` returns
    ``None`` are not recorded and are therefore retried on the next run.

    The file is checkpointed after every ``SAVE_EVERY`` newly fetched rows and
    written once more at the end. Both writes go through a temp file so an
    interruption cannot leave a half-written CSV behind.
    """
    df_top = pd.read_csv(OUT_TOPSELLERS_CSV)
    appids = (
        pd.to_numeric(df_top["appid"], errors="coerce")
        .dropna()
        .astype(int)
        .astype(str)
        .unique()
    )

    if os.path.exists(OUT_PRICES_CSV):
        df_done = pd.read_csv(OUT_PRICES_CSV)
        done_set = set(
            zip(df_done["appid"].astype(str), df_done["country"].astype(str))
        )
        out_rows: List[Dict] = df_done.to_dict("records")
        print(f"[RESUME] loaded {len(df_done)} rows from {OUT_PRICES_CSV}")
    else:
        done_set = set()
        out_rows = []

    tasks: List[Tuple[str, str]] = [
        (aid, cc)
        for aid in appids
        for cc in COUNTRIES
        if (aid, cc) not in done_set
    ]
    # Shuffled so consecutive requests do not all hit the same app.
    random.shuffle(tasks)
    print(f"[PLAN] to fetch {len(tasks)} appid×country pairs")

    CHUNK_PAUSE = 10.0   # seconds of extra pause ...
    CHUNK_EVERY = 80     # ... after this many requests
    SAVE_EVERY = 50      # checkpoint after this many newly fetched rows

    new_rows = 0
    for idx, (aid, cc) in enumerate(
        tqdm(tasks, desc="Steam regional prices"), start=1
    ):
        rec = fetch_steam_price_throttled(aid, cc)
        if rec:
            out_rows.append(rec)
            done_set.add((aid, cc))
            new_rows += 1
            # Checkpoint only when something new was added. Testing
            # len(out_rows) % SAVE_EVERY rewrote the whole file on every
            # iteration for as long as the total sat on a multiple of 50.
            if new_rows % SAVE_EVERY == 0:
                _write_rows_atomic(out_rows, OUT_PRICES_CSV)

        time.sleep(0.6 + random.uniform(0, 0.5))

        if idx % CHUNK_EVERY == 0:
            time.sleep(CHUNK_PAUSE)

    if out_rows:
        _write_rows_atomic(out_rows, OUT_PRICES_CSV)
    print(f"[DONE] wrote {len(out_rows)} rows -> {OUT_PRICES_CSV}")


In [None]:
# ================== STEP 3: ENRICH WITH APP META ==================

def fetch_app_meta(appid: str, cc: str = "US", lang: str = LANG, session: Optional[requests.Session] = None) -> Dict:
    """Look up type, free flag, genres and categories for one Steam app.

    The ``appdetails`` endpoint is queried up to three times, pausing 0.7
    seconds after each failed try. Any exception counts as a failed try.

    Args:
        appid: Steam application id.
        cc: Country code sent as the ``cc`` query parameter.
        lang: Language sent as the ``l`` query parameter.
        session: Optional session to send the request through; the
            ``requests`` module itself is used when omitted.

    Returns:
        A dict with the keys ``appid``, ``type``, ``is_free``, ``genres`` and
        ``categories``. ``genres`` and ``categories`` are comma-joined
        descriptions. When all three tries fail, every field except ``appid``
        is ``None``.
    """
    endpoint = f"https://store.steampowered.com/api/appdetails?appids={appid}&cc={cc}&l={lang}"
    client = session or requests

    tries_left = 3
    while tries_left > 0:
        tries_left -= 1
        try:
            resp = client.get(endpoint, timeout=20, headers={"User-Agent": "Mozilla/5.0 meta-bot/1.0"})
            resp.raise_for_status()
            details = resp.json().get(str(appid), {}).get("data", {})

            record: Dict = {"appid": str(appid)}
            record["type"] = details.get("type")
            record["is_free"] = details.get("is_free")
            # Both fields are lists of objects; only the description is kept.
            for field in ("genres", "categories"):
                labels = [entry.get("description") for entry in (details.get(field) or [])]
                record[field] = ",".join(labels)
            return record
        except Exception:
            time.sleep(0.7)

    # Every try failed: hand back a placeholder row keyed by the appid.
    placeholder: Dict = {"appid": str(appid)}
    placeholder.update(dict.fromkeys(("type", "is_free", "genres", "categories")))
    return placeholder


def step_enrich_with_meta() -> None:
    """Join app metadata onto the raw prices and write OUT_ENRICHED_CSV.

    Metadata is fetched once per distinct appid in OUT_TOPSELLERS_CSV and
    left-joined onto the rows of OUT_PRICES_CSV. A collection timestamp (UTC)
    is added, prices of apps flagged ``is_free`` are forced to 0.0, and only
    rows whose type is "game" or whose app is free are kept.
    """
    prices = pd.read_csv(OUT_PRICES_CSV)
    prices["appid"] = prices["appid"].astype(str)

    top_names = pd.read_csv(OUT_TOPSELLERS_CSV)[["appid", "name"]].drop_duplicates()
    unique_appids = top_names["appid"].astype(str).unique().tolist()

    # One HTTP session is shared by all metadata requests.
    with requests.Session() as http:
        meta = pd.DataFrame(
            [fetch_app_meta(app, session=http) for app in tqdm(unique_appids, desc="fetch meta")]
        )

    enriched = prices.merge(meta, on="appid", how="left")
    enriched["ts_collected_utc"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    # If Steam marks an app as free, force price to 0
    free_ids = set(meta.loc[meta["is_free"].eq(True), "appid"])
    is_free_row = enriched["appid"].isin(free_ids)
    enriched.loc[is_free_row, ["final", "initial"]] = 0.0

    # Keep only "game" type or free apps
    is_game_row = enriched["type"].fillna("").str.lower() == "game"
    enriched = enriched[is_game_row | is_free_row]

    ensure_outdir(OUT_ENRICHED_CSV)
    enriched.to_csv(OUT_ENRICHED_CSV, index=False)
    print("[OK] ->", OUT_ENRICHED_CSV, len(enriched))


In [None]:
# ================== STEP 4: SUMMARY & DIFF VS US ==================

def _pivot_final_by_country(prices: pd.DataFrame, names: pd.DataFrame) -> pd.DataFrame:
    """Return one row per (appid, name) with one ``final`` column per country."""
    merged = prices.merge(names, on="appid", how="left")
    return merged.pivot_table(
        index=["appid", "name"],
        columns="country",
        values="final",
        aggfunc="first",
    ).reset_index()


def step_build_summary() -> None:
    """Write the per-country price pivot and the diff-vs-US table.

    Two files are produced:

    * OUT_SUMMARY_CSV: pivot of ``final`` by country from the raw prices.
    * OUT_DIFF_CSV: the same pivot built from the enriched prices, plus one
      ``diff_vs_US_<CC>_%`` column per non-US country.

    ``appid`` is read as a string from every input. Letting pandas infer the
    dtype could give an integer column in one file and an object column in
    another, and merging those raises.

    Rows whose US price is 0 get NaN in every diff column instead of a
    division-by-zero ``inf``; the enrichment step forces free apps to 0.0.

    NOTE(review): ``final`` is used as stored, and each price row carries its
    own ``currency`` value. The percentages therefore look like they compare
    amounts in different currencies with no exchange-rate conversion —
    confirm whether that is intended.
    """
    names = pd.read_csv(OUT_TOPSELLERS_CSV, dtype={"appid": str})[
        ["appid", "name"]
    ].drop_duplicates()

    # base summary from raw prices
    raw_prices = pd.read_csv(OUT_PRICES_CSV, dtype={"appid": str})
    summary = _pivot_final_by_country(raw_prices, names)
    ensure_outdir(OUT_SUMMARY_CSV)
    summary.to_csv(OUT_SUMMARY_CSV, index=False)
    print(f"[OK] summary -> {OUT_SUMMARY_CSV}")

    # diff vs US using enriched prices (filters non-game etc.)
    enriched = pd.read_csv(OUT_ENRICHED_CSV, dtype={"appid": str})
    pv = _pivot_final_by_country(enriched, names)

    if "US" in pv.columns:
        # A zero baseline has no meaningful percentage difference: mask it to
        # NaN so the result is NaN rather than +/-inf.
        us_price = pv["US"].where(pv["US"] != 0)
        other_countries = [
            col for col in pv.columns if col not in ("appid", "name", "US")
        ]
        for c in other_countries:
            pv[f"diff_vs_US_{c}_%"] = (pv[c] - us_price) / us_price * 100.0

    ensure_outdir(OUT_DIFF_CSV)
    pv.to_csv(OUT_DIFF_CSV, index=False)
    print("[OK] ->", OUT_DIFF_CSV)


In [None]:
# ================== RUN FULL PIPELINE ==================

# Step 1: scrape the US top-seller listing and persist it.
# If the scrape returns no rows, write_csv only prints a warning and writes
# nothing, so the steps below read whatever OUT_TOPSELLERS_CSV is already on disk.
topsellers_rows = fetch_steam_topsellers(cc="US", lang=LANG, topn=TOPN, verbose=True)
write_csv(topsellers_rows, OUT_TOPSELLERS_CSV)

# Steps 2-4 take no arguments: each one reads the CSV files written by the
# earlier steps, so they must run in this order.
step_fetch_regional_prices()
step_enrich_with_meta()
step_build_summary()

# Sanity check: compare the row count with the number of distinct appids.
tops = pd.read_csv(OUT_TOPSELLERS_CSV)
print("topsellers rows:", len(tops), "unique appid:", tops["appid"].nunique())


[HTML] page=1 抓到 25 条，累计 25
[HTML] page=2 抓到 25 条，累计 50
[HTML] page=3 抓到 25 条，累计 75
[HTML] page=4 抓到 25 条，累计 100
[HTML] page=5 抓到 25 条，累计 125
[HTML] page=6 抓到 25 条，累计 150
[HTML] page=7 抓到 25 条，累计 175
[HTML] page=8 抓到 25 条，累计 200
[HTML] page=9 抓到 25 条，累计 225
[HTML] page=10 抓到 25 条，累计 250
[HTML] page=11 抓到 25 条，累计 275
[HTML] page=12 抓到 25 条，累计 300
[DONE] 最终抓到 300 条 top sellers 记录 (目标 500)
[OK] 300 rows -> steam_topsellers.csv
[RESUME] loaded 815 rows from steam_prices.csv
[PLAN] to fetch 5763 appid×country pairs


Steam regional prices: 100%|██████████| 5763/5763 [2:24:50<00:00,  1.51s/it]


[DONE] wrote 5425 rows -> steam_prices.csv


fetch meta: 100%|██████████| 299/299 [03:01<00:00,  1.65it/s]


[OK] -> steam_prices_enriched.csv 4130
[OK] summary -> steam_prices_summary.csv
[OK] -> steam_prices_summary_diff.csv
topsellers rows: 300 unique appid: 299


In [None]:
# pandas is already imported in the notebook's import cell; importing it again
# here lets this inspection cell be run on its own.
import pandas as pd
# Reload the top-seller file from disk and compare rows with distinct appids.
tops = pd.read_csv("steam_topsellers.csv")
print("rows:", len(tops), "unique appid:", tops["appid"].nunique())
# Bare last expression: shown as the cell's output.
tops.head()

rows: 300 unique appid: 299


Unnamed: 0,appid,name,cc,currency,initial,final,discount_percent
0,1808500,ARC Raiders,US,,,,
1,1675200,Steam Deck,US,,,,
2,730,Counter-Strike 2,US,,,,
3,2592160,Dispatch,US,,,,
4,3606480,Call of Duty®: Black Ops 7,US,,,,


In [None]:
from google.colab import files

# Download every pipeline output to the local machine (Google Colab only).
# The paths come from the OUT_* constants of the config cell, so renaming an
# output file there cannot leave this cell pointing at a stale name.
for out_path in (
    OUT_TOPSELLERS_CSV,
    OUT_PRICES_CSV,
    OUT_ENRICHED_CSV,
    OUT_SUMMARY_CSV,
    OUT_DIFF_CSV,
):
    files.download(out_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>