# 00 · BGG Scraper

Stáhne data o deskových hrách a kolekcích uživatelů z BoardGameGeek API.

Výstupy:
- `data/games.parquet`
- `data/user_<username>_collection.parquet`
- `data/user_<username>_plays.parquet`

In [1]:
import os, time, requests
import pandas as pd
from bs4 import BeautifulSoup

DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

def bgg_get(url, params, retry_wait=2.0, max_retries=5):
    r = requests.get(url, params=params, timeout=60)
    tries = 0
    while r.status_code == 202 and tries < max_retries:
        time.sleep(retry_wait)
        r = requests.get(url, params=params, timeout=60)
        tries += 1
    r.raise_for_status()
    return r

## Stahování detailů her (`thing`)

In [2]:
def fetch_games(ids, batch_size=200, sleep=1.2):
    url = "https://boardgamegeek.com/xmlapi2/thing"
    rows = []
    for i in range(0,len(ids),batch_size):
        batch = ids[i:i+batch_size]
        r = bgg_get(url,{"id": ",".join(map(str,batch)), "stats":1, "type":"boardgame"})
        soup = BeautifulSoup(r.text, "lxml-xml")
        for item in soup.find_all("item"):
            gid = int(item.get("id"))
            name = item.find("name", {"type":"primary"}).get("value") if item.find("name", {"type":"primary"}) else None
            minp = int(item.find("minplayers").get("value")) if item.find("minplayers") else None
            maxp = int(item.find("maxplayers").get("value")) if item.find("maxplayers") else None
            pt = int(item.find("playingtime").get("value")) if item.find("playingtime") else None
            year = int(item.find("yearpublished").get("value")) if item.find("yearpublished") else None
            cats = [l.get("value") for l in item.find_all("link", {"type":"boardgamecategory"})]
            mechs= [l.get("value") for l in item.find_all("link", {"type":"boardgamemechanic"})]
            desc = item.find("description").text if item.find("description") else ""
            rating= users= weight=None
            stats = item.find("statistics")
            if stats and stats.find("ratings"):
                rat = stats.find("ratings")
                rating = float(rat.find("average").get("value")) if rat.find("average") else None
                users = int(rat.find("usersrated").get("value")) if rat.find("usersrated") else None
                weight = float(rat.find("averageweight").get("value")) if rat.find("averageweight") else None
            rows.append({"bgg_id":gid,"name":name,"min_players":minp,"max_players":maxp,
                         "playing_time":pt,"year":year,"categories":" ".join(cats).lower(),
                         "mechanics":" ".join(mechs).lower(),"description":desc,
                         "rating":rating,"rating_count":users,"weight":weight})
        time.sleep(sleep)
    return pd.DataFrame(rows)

# Demo stáhnutí několika her
demo_ids=[173346,9209,30549,68448,174430]
df_games = fetch_games(demo_ids)
df_games.to_parquet(os.path.join(DATA_DIR,"games.parquet"),index=False)
df_games.head()

Unnamed: 0,bgg_id,name,min_players,max_players,playing_time,year,categories,mechanics,description,rating,rating_count,weight
0,173346,7 Wonders Duel,2,2,30,2015,ancient card game city building civilization e...,end game bonuses income melding and splaying m...,In many ways 7 Wonders Duel resembles its pare...,8.08275,104654,2.2267
1,9209,Ticket to Ride,2,5,60,2004,trains,connections contracts end game bonuses hand ma...,"With elegantly simple gameplay, Ticket to Ride...",7.38782,95778,1.8216
2,30549,Pandemic,2,4,45,2008,medical travel,action points chaining contracts cooperative g...,"In Pandemic, several virulent diseases have br...",7.52026,132190,2.3956
3,68448,7 Wonders,2,7,30,2010,ancient card game city building civilization e...,closed drafting end game bonuses hand manageme...,You are the leader of one of the 7 great citie...,7.66761,110220,2.3146
4,174430,Gloomhaven,1,4,120,2017,adventure exploration fantasy fighting miniatures,action queue action retrieval campaign / battl...,Gloomhaven is a game of Euro-inspired tactica...,8.55433,65736,3.9145


## Kolekce uživatele (`collection`)

In [3]:
def fetch_collection(username):
    url = "https://boardgamegeek.com/xmlapi2/collection"
    params = {"username": username, "stats": 1, "own": 1}
    r = bgg_get(url, params)
    soup = BeautifulSoup(r.text, "lxml-xml")
    rows = []
    for item in soup.find_all("item"):
        gid = int(item.get("objectid"))
        name = item.find("name").text if item.find("name") else None
        rating = item.find("rating")
        val = rating.get("value") if rating else None
        if val is None or val == "N/A":
            user_rating = None
        else:
            try: user_rating = float(val)
            except: user_rating = None
        plays = int(item.find("numplays").text) if item.find("numplays") else 0
        rows.append({"bgg_id": gid, "name": name, "user_rating": user_rating, "plays": plays})
    return pd.DataFrame(rows)

# Demo kolekce
df_col = fetch_collection("w0nderCZ")
df_col.to_parquet(os.path.join(DATA_DIR,"user_w0nderCZ_collection.parquet"),index=False)
df_col.head()

Unnamed: 0,bgg_id,name,user_rating,plays
0,26997,1989: Dawn of Freedom,,0
1,309116,7 Divů světa Duel: Agora,,0
2,202976,7 Divů Světa: Duel – Panteon,,0
3,173346,7 Wonders Duel,,0
4,155987,Abyss,,0


## Plays uživatele (`plays`)

In [4]:
def fetch_plays(username, max_pages=2):
    url = "https://boardgamegeek.com/xmlapi2/plays"
    rows = []
    for page in range(1,max_pages+1):
        r = bgg_get(url,{"username":username,"page":page})
        soup = BeautifulSoup(r.text,"lxml-xml")
        for play in soup.find_all("play"):
            gid = int(play.get("gameid")) if play.get("gameid") else None
            name = play.get("name")
            date = play.get("date")
            qty = int(play.get("quantity")) if play.get("quantity") else 1
            rows.append({"bgg_id":gid,"name":name,"date":date,"quantity":qty})
        time.sleep(1.2)
    return pd.DataFrame(rows)

df_plays = fetch_plays("w0nderCZ")
df_plays.to_parquet(os.path.join(DATA_DIR,"user_w0nderCZ_plays.parquet"),index=False)
df_plays.head()

Unnamed: 0,bgg_id,name,date,quantity
0,,,2023-08-27,1
1,,,2023-08-13,1
2,,,2023-06-14,1
3,,,2023-05-19,1
4,,,2023-05-07,1
