In [None]:
# -- Imports & Global Config (Colab friendly) --
import os, time, random, json
import pandas as pd
import numpy as np
import requests
from datetime import datetime
from tqdm.auto import tqdm

# ---- File Paths ----
# All artefacts live under /content, the working directory of a Colab runtime.
PRICES_RAW_PATH         = "/content/steam_prices.csv"                 # upload this first
PRICES_NORMALIZED_PATH  = "/content/steam_prices_normalized.csv"      # produced by Module 2
PATCH_APPDETAILS_PATH   = "/content/steam_prices_patch_appdetails.csv"  # rows fetched from the appdetails endpoint
FILLED_OUT_PATH         = "/content/steam_prices_normalized_filled.csv"  # normalized table merged with the patch
PAIRS_STILL_MISSING_PATH= "/content/pairs_still_missing.csv"             # QC export of uncovered (appid, country) pairs

# ---- Target scope (edit as needed) ----
# App ids are kept as strings because the pipeline casts every appid column to str before joining.
TARGET_APPIDS = ['1085660','1222670','230410','236390','2767030','730']
# Country codes are upper-case to match the `.str.upper()` normalization applied to the data.
TARGET_CC     = ["AR","AU","BR","CA","CN","DE","FR","GB","JP","PL","TR","US"]

# ---- HTTP (polite & age gate bypass) ----
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Safari/537.36"}
# Cookies sent with every request for the age gate bypass mentioned above.
# "birthtime" is a Unix timestamp (189302401 is one second past 1 Jan 1976 00:00 UTC),
# which agrees with the "lastagecheckage" value.
COOKIES = {
    "birthtime": "189302401",
    "lastagecheckage": "1-January-1976",
    "wants_mature_content": "1"
}
REQ_TIMEOUT = 20           # passed as `timeout=` to requests.get
SLEEP_RANGE = (0.5, 1.0)   # polite delay between calls; bounds handed to random.uniform



In [None]:
# -- Utilities: tolerant date parser & coercions --

from datetime import datetime

# Explicit formats tried in order before falling back to pandas' generic parser.
# Order matters for ambiguous numeric dates: "%m/%d/%Y" is tried before "%d/%m/%Y",
# so "03/04/2025" is read as March 4th. Month names (%b) follow the process locale.
KNOWN_PATTERNS = [
    "%d %b, %Y",   # 24 Mar, 2025
    "%b %d, %Y",   # Sep 25, 2025
    "%d-%b-%y",    # 14-Jun-16
    "%d %b %Y",    # 14 Jun 2016
    "%Y-%m-%d",    # 2025-03-24
    "%m/%d/%Y",    # 03/24/2025
    "%d/%m/%Y",    # 24/03/2025
]

def parse_release_date_to_iso(s):
    """Parse a free-form release date into a midnight, timezone-naive pandas Timestamp.

    Parameters
    ----------
    s : Any
        Raw value from a release-date column (string, NaN, None, ...).

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The calendar date at 00:00 with no timezone, or ``NaT`` when the value is
        missing, blank, or cannot be interpreted as a date.

    Notes
    -----
    The formats in ``KNOWN_PATTERNS`` are tried first, in order. Anything else goes
    through ``pd.to_datetime(..., dayfirst=True)``; that result is stripped of any
    time-of-day and timezone so both code paths return the same kind of value.
    """
    if pd.isna(s):
        return pd.NaT

    # str.split() with no argument already collapses every run of whitespace.
    text = " ".join(str(s).split())
    if not text:
        return pd.NaT

    for pat in KNOWN_PATTERNS:
        try:
            return pd.Timestamp(datetime.strptime(text, pat).date())
        except ValueError:
            # strptime signals "format does not match" with ValueError; try the next one.
            continue

    try:
        ts = pd.to_datetime(text, errors="coerce", dayfirst=True)
    except Exception:
        # Best effort: a value the generic parser chokes on is treated as unparseable.
        return pd.NaT

    if pd.isna(ts):
        return pd.NaT
    if ts.tzinfo is not None:
        # Keep the wall-clock date as written instead of shifting it to another zone.
        ts = ts.tz_localize(None)
    return ts.normalize()

def norm_upper(x):
    """Return the upper-cased string form of ``x``, or None when ``x`` is missing."""
    if pd.isna(x):
        return None
    return str(x).upper()

def cents_to_float(v):
    """Convert an amount given in cents to a float in currency units; None stays None."""
    if v is None:
        return None
    return float(v) / 100.0



In [None]:
# -- Normalize raw steam_prices.csv to steam_prices_normalized.csv --

def normalize_steam_prices(raw_path: str, out_path: str) -> pd.DataFrame:
    """
    Read a raw steam_prices.csv and produce a normalized table with:
      - columns: platform, appid, country, currency, initial, final, discount_percent,
                 release_date (raw), release_date_iso (ISO date)
      - uppercase for country & currency (missing values stay missing)
      - numeric coercion for initial/final/discount_percent
      - robust parsing for release_date -> release_date_iso

    Parameters
    ----------
    raw_path : str
        Path of the raw CSV to read.
    out_path : str
        Path the normalized CSV is written to (without the index).

    Returns
    -------
    pandas.DataFrame
        The normalized table, identical to what was written to ``out_path``.

    Raises
    ------
    FileNotFoundError
        If ``raw_path`` does not exist.
    """
    if not os.path.exists(raw_path):
        raise FileNotFoundError(f"Raw prices file not found: {raw_path}")

    df = pd.read_csv(raw_path)

    # Ensure required columns exist (create if missing)
    base_cols = ["platform","appid","country","currency","initial","final","discount_percent","release_date"]
    for c in base_cols:
        if c not in df.columns:
            df[c] = np.nan

    # Normalize keys/casing
    df["platform"] = df["platform"].fillna("steam")
    df["appid"]    = df["appid"].astype(str)
    for col in ("country", "currency"):
        values = df[col]
        # Upper-case only real values. A blanket astype(str) would turn NaN into the
        # literal text "NAN", which later isna()-based quality checks cannot detect.
        df[col] = values.where(values.isna(), values.astype(str).str.upper())

    # Coerce numerics
    for col in ["initial","final","discount_percent"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Compute discount_percent if missing but initial/final present
    # (initial must be > 0 to avoid dividing by zero for free items)
    need = df["discount_percent"].isna() & df["initial"].gt(0) & df["final"].ge(0)
    df.loc[need, "discount_percent"] = (1 - df.loc[need, "final"]/df.loc[need, "initial"]) * 100

    # Parse release_date → ISO
    df["release_date_iso"] = df["release_date"].apply(parse_release_date_to_iso)

    # Save
    df.to_csv(out_path, index=False)
    print(f"Saved normalized prices to: {out_path} | rows={len(df)}")

    return df

# Run cleaning and keep DataFrame in memory for later modules.
# The call reads PRICES_RAW_PATH, writes PRICES_NORMALIZED_PATH and returns the frame;
# the trailing .head() only previews the first rows in the cell output.
prices_norm = normalize_steam_prices(PRICES_RAW_PATH, PRICES_NORMALIZED_PATH)
prices_norm.head()



Saved normalized prices to: /content/steam_prices_normalized.csv | rows=513


Unnamed: 0,platform,appid,country,currency,initial,final,discount_percent,release_date,release_date_iso
0,steam,3164500,AR,USD,10.49,7.34,30,"24 Mar, 2025",2025-03-24
1,steam,1757300,AR,USD,7.99,7.99,0,"19 Sep, 2025",2025-09-19
2,steam,1142710,JP,JPY,7000.0,1750.0,75,"16 Feb, 2022",2022-02-16
3,steam,1144200,GB,GBP,44.99,22.49,50,"13 Dec, 2023",2023-12-13
4,steam,1903340,BR,BRL,199.0,179.1,10,"24 Apr, 2025",2025-04-24


In [None]:
# -- Detect missing or bad (appid,country) pairs within TARGET scope --

def build_targets(appids, countries) -> pd.DataFrame:
    """Return every (appid, country) combination as a two-column DataFrame.

    Rows follow the cartesian-product order: all countries for the first appid,
    then all countries for the second, and so on.
    """
    grid = pd.MultiIndex.from_product(
        [appids, countries],
        names=["appid", "country"],
    )
    return grid.to_frame(index=False)

def detect_missing_and_bad(prices_df: pd.DataFrame,
                           target_appids, target_cc):
    """Find target (appid, country) pairs that are absent or incomplete in ``prices_df``.

    A pair is "missing" when no row exists for it, and "bad" when a row exists but
    release_date_iso, currency or initial is null.

    Returns
    -------
    tuple of DataFrame
        ``(targets, missing_pairs, bad_pairs, todo_pairs)``; ``todo_pairs`` is the
        de-duplicated concatenation of missing and bad pairs, missing first.
    """
    keys = ["appid", "country"]
    quality_cols = ["release_date_iso", "currency", "initial"]

    work = prices_df.copy()
    work["appid"] = work["appid"].astype(str)
    work["country"] = work["country"].astype(str).str.upper()

    targets = build_targets(target_appids, target_cc)

    # Anti-join: target pairs with no matching row at all.
    seen = work[keys].drop_duplicates()
    joined = targets.merge(seen, on=keys, how="left", indicator=True)
    missing_pairs = joined.loc[joined["_merge"] == "left_only", keys].reset_index(drop=True)

    # Rows inside the target scope whose quality columns contain a null.
    for col in quality_cols:
        if col not in work.columns:
            work[col] = np.nan
    in_scope = work.merge(targets, on=keys, how="inner")
    in_scope["_bad"] = in_scope[quality_cols].isna().any(axis=1)
    bad_pairs = in_scope.loc[in_scope["_bad"], keys].drop_duplicates().reset_index(drop=True)

    combined = pd.concat([missing_pairs, bad_pairs], ignore_index=True)
    todo_pairs = combined.drop_duplicates().reset_index(drop=True)
    return targets, missing_pairs, bad_pairs, todo_pairs

# IMPORTANT: use in-memory prices_norm from Module 2
# (the frame returned by normalize_steam_prices, not a re-read of the CSV).
targets, missing_pairs, bad_pairs, todo_pairs = detect_missing_and_bad(prices_norm, TARGET_APPIDS, TARGET_CC)

# Quick counts of each category; todo_pairs is the union that gets fetched next.
print("Missing entirely:", len(missing_pairs))
print("Present but bad:", len(bad_pairs))
todo_pairs.head()


Missing entirely: 72
Present but bad: 0


Unnamed: 0,appid,country
0,1085660,AR
1,1085660,AU
2,1085660,BR
3,1085660,CA
4,1085660,CN


In [None]:
# -- Steam Storefront 'appdetails' client (no API key required) --

def fetch_appdetails(appid: str, cc: str) -> dict:
    """
    Query the Steam Storefront ``appdetails`` endpoint for one app in one country
    and return a row shaped like the normalized price table.

    NOTE: initial/final/discount_percent are *current* prices, not guaranteed 'at launch'.

    Returns an empty dict when the response has no entry for the app or the entry is
    not flagged as successful. HTTP errors propagate via ``raise_for_status``.
    """
    app_key = str(appid)
    response = requests.get(
        "https://store.steampowered.com/api/appdetails",
        params={"appids": app_key, "cc": cc.lower(), "l": "en"},
        headers=HEADERS,
        cookies=COOKIES,
        timeout=REQ_TIMEOUT,
    )
    response.raise_for_status()

    entry = response.json().get(app_key, {})
    if not (entry and entry.get("success")):
        return {}

    details = entry.get("data", {})
    release_info = details.get("release_date") or {}
    price_info = details.get("price_overview") or {}
    is_free = bool(details.get("is_free", False))
    raw_date = release_info.get("date")

    if price_info:
        # Prices come back in cents; convert them to currency units.
        currency = norm_upper(price_info.get("currency"))
        initial = cents_to_float(price_info.get("initial"))
        final = cents_to_float(price_info.get("final"))
        discount = price_info.get("discount_percent")
    else:
        # No price block: free apps get explicit zeros, everything else stays unknown.
        currency = None
        initial = 0.0 if is_free else None
        final = 0.0 if is_free else None
        discount = 0 if is_free else None

    return {
        "platform": "steam",
        "appid": app_key,
        "country": norm_upper(cc),
        "currency": currency,
        "initial": initial,
        "final": final,
        "discount_percent": float(discount) if discount is not None else None,
        "release_date": raw_date,                                # raw string
        "release_date_iso": parse_release_date_to_iso(raw_date), # ISO/NaT
        "is_free": is_free,
        "source": "appdetails",
    }


In [None]:
# -- Build a patch only for missing/bad pairs --
def build_appdetails_patch(todo_pairs: pd.DataFrame, out_path: str) -> pd.DataFrame:
    """Fetch appdetails rows for every (appid, country) pair and save them as a patch CSV.

    Parameters
    ----------
    todo_pairs : pandas.DataFrame
        Frame with ``appid`` and ``country`` columns; one request is made per row.
    out_path : str
        Path the patch CSV is written to (without the index).

    Returns
    -------
    pandas.DataFrame
        One row per pair that produced either data or an error. Pairs for which
        ``fetch_appdetails`` returns an empty dict are skipped. When nothing at all
        was collected, an empty frame with the patch schema is returned.

    Notes
    -----
    Failures are deliberately not fatal: an exception for one pair is recorded in
    an ``error`` column and the loop moves on.
    """
    patch_columns = [
        "platform","appid","country","currency","initial","final","discount_percent",
        "release_date","release_date_iso","is_free","source"
    ]

    if todo_pairs.empty:
        print("Nothing to fetch; todo_pairs is empty.")
        patch = pd.DataFrame(columns=patch_columns)
        patch.to_csv(out_path, index=False)
        return patch

    rows = []
    # itertuples keeps each column's own dtype; iterrows would upcast mixed numeric rows.
    for pair in tqdm(todo_pairs.itertuples(index=False), total=len(todo_pairs)):
        appid, cc = str(pair.appid), str(pair.country)
        try:
            res = fetch_appdetails(appid, cc)
            if res:
                rows.append(res)
        except Exception as e:
            # Best effort: keep going and remember why this pair failed.
            rows.append({"platform":"steam","appid":appid,"country":cc,"source":"appdetails","error":str(e)})
        time.sleep(random.uniform(*SLEEP_RANGE))

    patch = pd.DataFrame(rows)
    if patch.empty:
        # Every lookup came back empty: still emit the schema so the CSV has a header
        # and can be read back.
        patch = pd.DataFrame(columns=patch_columns)
    else:
        patch["appid"]   = patch["appid"].astype(str)
        patch["country"] = patch["country"].astype(str).str.upper()
        if "currency" in patch.columns:
            cur = patch["currency"]
            # Upper-case real codes only; astype(str) alone would turn a missing
            # currency (e.g. free apps) into the literal text "NONE"/"NAN".
            patch["currency"] = cur.where(cur.isna(), cur.astype(str).str.upper())
        for c in ["initial","final","discount_percent"]:
            if c in patch.columns:
                patch[c] = pd.to_numeric(patch[c], errors="coerce")
        if "release_date_iso" in patch.columns and "release_date" in patch.columns:
            mask = patch["release_date_iso"].isna()
            if mask.any():
                patch.loc[mask, "release_date_iso"] = patch.loc[mask, "release_date"].apply(parse_release_date_to_iso)

    patch.to_csv(out_path, index=False)
    print(f"Saved appdetails patch to: {out_path} | rows={len(patch)}")
    return patch

# Fetch the patch for every pair in todo_pairs. This issues one HTTP request per pair
# with a random pause (SLEEP_RANGE) after each, and writes PATCH_APPDETAILS_PATH.
patch = build_appdetails_patch(todo_pairs, PATCH_APPDETAILS_PATH)
patch.head()


  0%|          | 0/72 [00:00<?, ?it/s]

Saved appdetails patch to: /content/steam_prices_patch_appdetails.csv | rows=71


Unnamed: 0,platform,appid,country,currency,initial,final,discount_percent,release_date,release_date_iso,is_free,source
0,steam,1085660,AR,NONE,0.0,0.0,0.0,"1 Oct, 2019",2019-10-01,True,appdetails
1,steam,1085660,AU,NONE,0.0,0.0,0.0,"1 Oct, 2019",2019-10-01,True,appdetails
2,steam,1085660,BR,NONE,0.0,0.0,0.0,"1 Oct, 2019",2019-10-01,True,appdetails
3,steam,1085660,CA,NONE,0.0,0.0,0.0,"1 Oct, 2019",2019-10-01,True,appdetails
4,steam,1085660,CN,NONE,0.0,0.0,0.0,"1 Oct, 2019",2019-10-01,True,appdetails


In [None]:
# -- Merge normalized + patch with priority, output final filled table --
def merge_with_priority(prices_norm_df: pd.DataFrame, patch_df: pd.DataFrame, out_path: str) -> pd.DataFrame:
    """Combine the normalized table with the patch, keeping one row per (appid, country).

    Selection rule for pairs present in more than one source:
      1. a complete row (release_date_iso, currency and initial all present) beats an
         incomplete one, so a patch fetched for a "present but bad" pair actually
         replaces the bad original;
      2. among equally complete rows the source order orig > inferred > appdetails
         decides; unknown sources rank last.

    Parameters
    ----------
    prices_norm_df : pandas.DataFrame
        Normalized base table. Rows are tagged ``source="orig"`` when the frame has
        no ``source`` column.
    patch_df : pandas.DataFrame
        Patch rows; a missing ``source`` is filled with ``"appdetails"``.
    out_path : str
        Path the merged CSV is written to (without the index).

    Returns
    -------
    pandas.DataFrame
        The de-duplicated table, sorted by appid and country.
    """
    base = prices_norm_df.copy()
    if "source" not in base.columns:
        base["source"] = "orig"

    df = pd.concat([base, patch_df], ignore_index=True)
    df["source"]  = df["source"].fillna("appdetails")
    df["appid"]   = df["appid"].astype(str)
    df["country"] = df["country"].astype(str).str.upper()
    if "currency" in df.columns:
        cur = df["currency"]
        # Upper-case real codes only, and turn the placeholders an earlier blanket
        # astype(str) may have produced ("NAN", "NONE", "") back into missing values.
        cur = cur.where(cur.isna(), cur.astype(str).str.upper())
        df["currency"] = cur.mask(cur.isin(["", "NAN", "NONE"]))

    # Concatenating with an empty or object-typed patch can leave these as object dtype.
    for col in ("initial", "final", "discount_percent"):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # compute discount if missing
    if set(["discount_percent","initial","final"]).issubset(df.columns):
        need = df["discount_percent"].isna() & df["initial"].gt(0) & df["final"].ge(0)
        df.loc[need, "discount_percent"] = (1 - df.loc[need, "final"]/df.loc[need, "initial"]) * 100

    # completeness first (False sorts before True), then source priority
    required = [c for c in ("release_date_iso", "currency", "initial") if c in df.columns]
    df["_incomplete"] = df[required].isna().any(axis=1)
    df["source_cat"] = pd.Categorical(df["source"], categories=["orig","inferred","appdetails"], ordered=True)
    df = (df.sort_values(["appid","country","_incomplete","source_cat"])
            .drop_duplicates(["appid","country"], keep="first")
            .drop(columns=["source_cat","_incomplete"]))

    df.to_csv(out_path, index=False)
    print(f"Saved filled table to: {out_path} | rows={len(df)}")
    return df

# Merge the in-memory normalized table with the fetched patch; the result is written
# to FILLED_OUT_PATH and kept as `filled` for the QC step.
filled = merge_with_priority(prices_norm, patch, FILLED_OUT_PATH)
filled.head()



Saved filled table to: /content/steam_prices_normalized_filled.csv | rows=584


Unnamed: 0,platform,appid,country,currency,initial,final,discount_percent,release_date,release_date_iso,source,is_free
74,steam,1004640,AR,USD,39.99,39.99,0.0,"30 Sep, 2025",2025-09-30,orig,
435,steam,1004640,AU,AUD,74.95,74.95,0.0,"30 Sep, 2025",2025-09-30,orig,
341,steam,1004640,BR,BRL,219.9,219.9,0.0,"30 Sep, 2025",2025-09-30,orig,
277,steam,1004640,CA,CAD,66.99,66.99,0.0,"30 Sep, 2025",2025-09-30,orig,
217,steam,1004640,CN,CNY,298.0,298.0,0.0,"30 Sep, 2025",2025-09-30,orig,


In [None]:
# -- QC: parse success by country and export still-missing pairs (for future runs/QA) --
def qc_and_export_missing(filled_df: pd.DataFrame,
                          target_appids, target_cc,
                          pairs_path: str) -> pd.DataFrame:
    """Print release-date parse success per country and export uncovered target pairs.

    Parameters
    ----------
    filled_df : pandas.DataFrame
        Table with at least ``appid``, ``country`` and ``release_date_iso`` columns.
        It is not modified.
    target_appids, target_cc : iterable
        App ids and country codes whose cartesian product defines the target scope.
    pairs_path : str
        Path the still-missing pairs are written to (without the index).

    Returns
    -------
    pandas.DataFrame
        Target (appid, country) pairs that have no row in ``filled_df``.
    """
    df = filled_df.copy()
    # Normalize the join keys as detect_missing_and_bad does: a table reloaded from CSV
    # has integer appids, which cannot be matched against the string targets.
    df["appid"]   = df["appid"].astype(str)
    df["country"] = df["country"].astype(str).str.upper()
    df["release_date_iso"] = pd.to_datetime(df["release_date_iso"], errors="coerce")

    # Share of rows per country whose release date parsed to a real timestamp.
    qc = (df["release_date_iso"].notna()
            .groupby(df["country"])
            .mean()
            .rename("release_date_parse_success")
            .sort_index())
    print("--- Parse success by country ---")
    print(qc)

    # still missing within scope
    wanted_appids = [str(a) for a in target_appids]
    wanted_cc     = [str(c).upper() for c in target_cc]
    targets = pd.MultiIndex.from_product([wanted_appids, wanted_cc], names=["appid","country"]).to_frame(index=False)
    present = df[["appid","country"]].drop_duplicates()
    still_missing = (targets.merge(present, on=["appid","country"], how="left", indicator=True)
                           .query("_merge=='left_only'")[["appid","country"]]
                           .reset_index(drop=True))
    still_missing.to_csv(pairs_path, index=False)
    print(f"Exported still-missing pairs: {len(still_missing)} → {pairs_path}")
    return still_missing

# Final QC over the filled table: prints per-country parse success and writes the
# target pairs that are still uncovered to PAIRS_STILL_MISSING_PATH.
still_missing = qc_and_export_missing(filled, TARGET_APPIDS, TARGET_CC, PAIRS_STILL_MISSING_PATH)
still_missing.head()



--- Parse success by country ---
country
AR    1.0
AU    1.0
BR    1.0
CA    1.0
CN    1.0
DE    1.0
FR    1.0
GB    1.0
JP    1.0
PL    1.0
TR    1.0
US    1.0
Name: release_date_parse_success, dtype: float64
Exported still-missing pairs: 1 → /content/pairs_still_missing.csv


Unnamed: 0,appid,country
0,236390,JP
