<a href="https://colab.research.google.com/github/jinyang628/catan/blob/main/Catan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Credits

MR. MUCHO BUCHO Game Data (2025)
43,947 anonymized 4-player Catan games
https://github.com/Catan-data/dataset
"""

'\nCredits\n\nMR. MUCHO BUCHO Game Data (2025)\n43,947 anonymized 4-player Catan games\nhttps://github.com/Catan-data/dataset\n'

In [None]:
from enum import StrEnum
import pandas as pd
import os
import json
import logging
from typing import Optional
import tarfile
from collections import defaultdict

log = logging.getLogger(__name__)

In [None]:
# Numeric resource codes -> resource names (codes start at 1).
# NOTE(review): presumably the codes used in the dataset's JSON; confirm against its README.
RESOURCE_MAPPING = dict(
    enumerate(("BRICK", "WOOL", "GRAIN", "ORE", "LUMBER"), start=1)
)

# Numeric development-card codes (10-15) -> card names.
# Codes 10 and 11 both map to "KNIGHT".
DEVELOPMENT_CARD_MAPPING = dict(
    enumerate(
        (
            "KNIGHT",
            "KNIGHT",
            "MONOPOLY",
            "VICTORY POINT",
            "ROAD BUILDING",
            "YEAR OF PLENTY",
        ),
        start=10,
    )
)

# https://github.com/Catan-data/dataset/issues/3 (Mapping in README is probably wrong)
# String keys "0".."4" -> victory point category names.
VICTORY_POINTS_MAPPING = {
    str(position): category
    for position, category in enumerate(
        ("SETTLEMENTS", "CITIES", "VP CARD", "LARGEST ARMY", "LONGEST ROAD")
    )
}

# Largest count accepted per category when validating an end-game state.
MAX_VICTORY_POINTS_PER_CATEGORY = dict(
    [
        ("SETTLEMENTS", 5),
        ("CITIES", 4),
        ("LARGEST ARMY", 1),
        ("LONGEST ROAD", 1),
        ("VP CARD", 5),
    ]
)


In [None]:
# Archive holding the game JSON files and the directory it is unpacked into.
# NOTE(review): the archive path looks like a mounted Google Drive location;
# the drive must be mounted before this cell runs.
tar_gz_file_path = '/content/drive/MyDrive/datasets/games.tar.gz'
extract_path = '/content/extracted_games'
os.makedirs(extract_path, exist_ok=True)
with tarfile.open(tar_gz_file_path, 'r:gz') as tar:
    # filter="data" refuses members that would escape extract_path (absolute
    # paths, "..", unsafe links) and avoids the DeprecationWarning that newer
    # Python versions emit when extractall() is called without a filter.
    tar.extractall(path=extract_path, filter="data")
# os.listdir returns entries in arbitrary order; sort so that slicing the list
# (e.g. to take a sample of files) picks the same files on every run.
json_game_files = sorted(os.listdir(f"{extract_path}/games"))


  tar.extractall(path=extract_path)


File extracted to: /content/extracted_games


In [None]:
# Goal: find the distribution of VP sources among winners (e.g. how many settlements the winner built, whether they held Largest Army, etc.)

def is_valid_end_game_vp_state(end_game_vp_state: dict) -> bool:
  """Check that a winner's end-game victory point breakdown is plausible.

  Args:
    end_game_vp_state: Maps a category name ("SETTLEMENTS", "CITIES",
      "LARGEST ARMY", "LONGEST ROAD", "VP CARD") to the count recorded for
      that category.

  Returns:
    False (after logging an error) when a count exceeds the limit in
    MAX_VICTORY_POINTS_PER_CATEGORY or when the weighted total is below 10;
    True otherwise.

  Raises:
    ValueError: If the state contains a category name that is not recognised.
  """
  # Victory points awarded per unit counted in each category.
  points_per_unit = {
      "SETTLEMENTS": 1,
      "CITIES": 2,
      "LARGEST ARMY": 2,
      "LONGEST ROAD": 2,
      "VP CARD": 1,
  }

  total = 0
  for key, value in end_game_vp_state.items():
    # Validate the key before any lookup so an unknown category surfaces as
    # the intended ValueError rather than a KeyError from the limits table.
    if key not in points_per_unit:
      raise ValueError(f"Unexpected key found in end game vp state {key}")
    if value > MAX_VICTORY_POINTS_PER_CATEGORY[key]:
      log.error(f"Invalid victory points for category {key}: {value}. Expected a maximum of {MAX_VICTORY_POINTS_PER_CATEGORY[key]}")
      return False
    total += value * points_per_unit[key]

  if total < 10:
    log.error(f"Stated winner does not have at least 10 VPs: {end_game_vp_state}")
    return False
  return True

def get_end_game_json_vp_state(filename: str) -> Optional[dict]:
  """Read one game file and return the winner's victory point breakdown.

  Args:
    filename: Name of a JSON file inside "{extract_path}/games".

  Returns:
    A defaultdict(int) mapping category names (the values of
    VICTORY_POINTS_MAPPING) to the counts stored under the first winning
    player's "victoryPoints". None (after logging an error) when that entry
    contains a key missing from VICTORY_POINTS_MAPPING, or when no player is
    flagged as the winner.
  """
  # JSON is UTF-8 by specification; do not rely on the platform's default
  # text encoding. The file is closed as soon as it has been parsed.
  with open(f"{extract_path}/games/{filename}", "r", encoding="utf-8") as f:
    data = json.load(f)

  players = data["data"]["eventHistory"]["endGameState"]["players"]
  for metadata in players.values():
    if not metadata["winningPlayer"]:
      continue
    end_game_vp_state = defaultdict(int)
    for key, count in metadata["victoryPoints"].items():
      if key not in VICTORY_POINTS_MAPPING:
        log.error(f"Unexpected key found in victory points metadata: {key}. Expected {VICTORY_POINTS_MAPPING}")
        return None
      end_game_vp_state[VICTORY_POINTS_MAPPING[key]] = count
    # Only the first player flagged as winner is reported.
    return end_game_vp_state

  # No player was flagged as the winner; make the rejection explicit and
  # visible, like the other rejection path.
  log.error(f"No winning player found in {filename}")
  return None

def get_end_game_vp_state_df(sample_size: Optional[int] = None) -> pd.DataFrame:
  """Build a DataFrame of winners' victory point breakdowns.

  Args:
    sample_size: Number of entries to take from the front of
      json_game_files. None (or 0) means every file.

  Returns:
    One row per game whose winner state was read and passed
    is_valid_end_game_vp_state. "SETTLEMENTS", "CITIES" and "VP CARD" are
    integer counts; "LARGEST ARMY" and "LONGEST ROAD" are booleans. All five
    columns are always present, even when no sampled game recorded them.
  """
  sampled_files = json_game_files[:sample_size or len(json_game_files)]

  valid_end_game_states: list[dict] = []
  for filename in sampled_files:
    end_game_vp_state: Optional[dict] = get_end_game_json_vp_state(filename)
    if end_game_vp_state is None:
      continue
    if not is_valid_end_game_vp_state(end_game_vp_state):
      continue
    valid_end_game_states.append(end_game_vp_state)

  df = pd.DataFrame(valid_end_game_states)

  # Game files only list the categories a player scored in, so a category can
  # be absent from every sampled row (or the sample can be empty). Append any
  # missing column so the conversions below never hit a KeyError; existing
  # column order is kept.
  missing_columns = [
      column for column in VICTORY_POINTS_MAPPING.values() if column not in df.columns
  ]
  df = df.reindex(columns=[*df.columns, *missing_columns], fill_value=0).fillna(0)

  for flag_column in ("LARGEST ARMY", "LONGEST ROAD"):
    df[flag_column] = df[flag_column].eq(1)
  # NaN padding turns count columns into floats; restore integer dtype on all
  # of them, not just "VP CARD".
  df = df.astype({"SETTLEMENTS": int, "CITIES": int, "VP CARD": int})

  # Compare against the files actually read, not the requested sample size,
  # so asking for more files than exist does not inflate the dropped count.
  num_dropped = len(sampled_files) - len(df)
  if num_dropped > 0:
    log.error(f"{num_dropped} row(s) dropped due to inaccurate end game VP state")
  return df

# Quick check on the first 10 entries of json_game_files; as the cell's last
# expression, the resulting DataFrame is displayed.
get_end_game_vp_state_df(10)

ERROR:__main__:Unexpected key found in victory points metadata: 6. Expected {'0': 'SETTLEMENTS', '1': 'CITIES', '2': 'VP CARD', '3': 'LARGEST ARMY', '4': 'LONGEST ROAD'}
ERROR:__main__:1 row(s) dropped due to inaccurate end game VP state


Unnamed: 0,SETTLEMENTS,CITIES,VP CARD,LONGEST ROAD,LARGEST ARMY
0,3,2,1,True,False
1,4,3,1,True,True
2,2,3,0,False,True
3,2,2,2,False,True
4,5,1,1,True,False
5,3,1,1,True,True
6,1,2,3,False,True
7,2,3,2,False,False
8,0,4,0,True,False
