<a href="https://colab.research.google.com/github/jinyang628/catan/blob/main/Catan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Credits

MR. MUCHO BUCHO Game Data (2025)
43,947 anonymized 4-player Catan games
https://github.com/Catan-data/dataset
"""

In [47]:
from enum import StrEnum, Enum
import pandas as pd
import os
import json
import logging
from typing import Optional
import tarfile
from collections import defaultdict
from dataclasses import dataclass

import logging
import sys

class LevelBasedFormatter(logging.Formatter):
    def format(self, record):
        # Define formats
        error_format = "%(levelname)s: %(message)s"
        info_format = "%(message)s"

        # Save the original format to restore it later
        orig_fmt = self._style._fmt

        # Switch format based on level
        if record.levelno >= logging.ERROR:
            self._style._fmt = error_format
        else:
            self._style._fmt = info_format

        # Call the original formatter
        result = super().format(record)

        # Restore the original format
        self._style._fmt = orig_fmt
        return result

handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(LevelBasedFormatter())

root = logging.getLogger()
root.handlers = []
root.addHandler(handler)
root.setLevel(logging.INFO)

log = logging.getLogger(__name__)

In [48]:
RESOURCE_MAPPING = {
    1: "BRICK",
    2: "WOOL",
    3: "GRAIN",
    4: "ORE",
    5: "LUMBER"
}

DEVELOPMENT_CARD_MAPPING = {
    10: "KNIGHT",
    11: "KNIGHT",
    12: "MONOPOLY",
    13: "VICTORY POINT",
    14: "ROAD BUILDING",
    15: "YEAR OF PLENTY"
}

class VPCategory(Enum):
    SETTLEMENTS = "SETTLEMENTS"
    CITIES = "CITIES"
    VP_CARD = "VP CARD"
    LARGEST_ARMY = "LARGEST ARMY"
    LONGEST_ROAD = "LONGEST ROAD"

@dataclass(frozen=True)
class VPRule:
    points_per_item: int
    max_items: int

# https://github.com/Catan-data/dataset/issues/3 (Mapping in README is probably wrong)
VICTORY_POINTS_MAPPING = {
    "0": VPCategory.SETTLEMENTS,
    "1": VPCategory.CITIES,
    "2": VPCategory.VP_CARD,
    "3": VPCategory.LARGEST_ARMY,
    "4": VPCategory.LONGEST_ROAD
}

VP_RULES = {
    VPCategory.SETTLEMENTS: VPRule(points_per_item=1, max_items=5),
    VPCategory.CITIES: VPRule(points_per_item=2, max_items=4),
    VPCategory.VP_CARD: VPRule(points_per_item=1, max_items=5),
    VPCategory.LARGEST_ARMY: VPRule(points_per_item=2, max_items=1),
    VPCategory.LONGEST_ROAD: VPRule(points_per_item=2, max_items=1),
}



In [5]:
tar_gz_file_path = '/content/drive/MyDrive/datasets/games.tar.gz'
extract_path = '/content/extracted_games'
os.makedirs(extract_path, exist_ok=True)
with tarfile.open(tar_gz_file_path, 'r:gz') as tar:
    tar.extractall(path=extract_path)
json_game_files = os.listdir(f"{extract_path}/games")


  tar.extractall(path=extract_path)


In [None]:
# Goal: Find what is the distribution of VPs for winners (E.g. how many settlements did he build, did he get largest army, etc.)

def is_valid_end_game_vp_state(end_game_vp_state: dict) -> bool:
    total = 0
    for key, value in end_game_vp_state.items():
        if key not in VP_RULES:
            raise ValueError(f"Unexpected key found in end game vp state: {key}")

        if value > VP_RULES[key].max_items:
            log.error(f"Invalid number of items for category {key}: {value}. "
                     f"Expected a maximum of {VP_RULES[key].max_items}")
            return False

        total += value * VP_RULES[key].points_per_item

    if total < 10:
        log.error(f"Stated winner does not have at least 10 VPs: {end_game_vp_state}")
        return False
    return True

def get_end_game_json_vp_state(filename: str) -> Optional[dict]:
  with open(f"{extract_path}/games/{filename}", "r") as f:
    data = json.loads(f.read())
    players = data["data"]["eventHistory"]["endGameState"]["players"]
    for _, metadata in players.items():
      if not metadata["winningPlayer"]:
        continue
      victory_points_metadata = metadata["victoryPoints"]
      end_game_vp_state = defaultdict(int)
      for key, count in victory_points_metadata.items():
        if key not in VICTORY_POINTS_MAPPING:
          log.error(f"Unexpected key found in victory points metadata: {key}. Expected {VICTORY_POINTS_MAPPING}")
          return None
        end_game_vp_state[VICTORY_POINTS_MAPPING[key]] = count
      return end_game_vp_state

def get_end_game_vp_state_df(sample_size: Optional[int]) -> pd.DataFrame:
  sample_size = sample_size or len(json_game_files)
  valid_end_game_states: list[dict] = []
  for filename in json_game_files[:sample_size]:
    end_game_vp_state: Optional[dict] = get_end_game_json_vp_state(filename)
    if end_game_vp_state is None:
      continue
    if not is_valid_end_game_vp_state(end_game_vp_state):
      continue
    valid_end_game_states.append(end_game_vp_state)

  df = pd.DataFrame(valid_end_game_states)
  df.fillna(0, inplace=True)

  df[VPCategory.LARGEST_ARMY] = df[VPCategory.LARGEST_ARMY].apply(lambda x: True if x == 1 else False)
  df[VPCategory.LONGEST_ROAD] = df[VPCategory.LONGEST_ROAD].apply(lambda x: True if x == 1 else False)
  df[VPCategory.VP_CARD] = df[VPCategory.VP_CARD].astype(int)

  num_rows = len(df)
  if num_rows < sample_size:
    log.error(f"{sample_size - num_rows} row(s) dropped due to inaccurate end game VP state")
  return df

def analyze_winner_distributions(df: pd.DataFrame):
    total_winners = len(df)
    log.info(f"\n\nAnalyzing {total_winners} winning games:\n")

    # 1. Individual Percentages for the longest road / largest army
    largest_army_pct = df[VPCategory.LARGEST_ARMY].mean() * 100
    longest_road_pct = df[VPCategory.LONGEST_ROAD].mean() * 100

    log.info(f"--- Special Victories (Raw Percentages) ---")
    log.info(f"Largest Army: {largest_army_pct:.2f}%")
    log.info(f"Longest Road: {longest_road_pct:.2f}%")

    # 2. Combinations (Both, Neither, Only) for the longest road / largest army
    both = (df[VPCategory.LARGEST_ARMY] & df[VPCategory.LONGEST_ROAD]).sum()
    only_army = (df[VPCategory.LARGEST_ARMY] & ~df[VPCategory.LONGEST_ROAD]).sum()
    only_road = (~df[VPCategory.LARGEST_ARMY] & df[VPCategory.LONGEST_ROAD]).sum()
    neither = (~df[VPCategory.LARGEST_ARMY] & ~df[VPCategory.LONGEST_ROAD]).sum()

    log.info(f"\n--- Special Victories (Combinations) ---")
    log.info(f"Both Army & Road:   {both} winners ({(both/total_winners)*100:.2f}%)")
    log.info(f"Only Largest Army:  {only_army} winners ({(only_army/total_winners)*100:.2f}%)")
    log.info(f"Only Longest Road:  {only_road} winners ({(only_road/total_winners)*100:.2f}%)")
    log.info(f"Neither:  {neither} winners ({(neither/total_winners)*100:.2f}%)")


    # 3. Settlement + City Combinations
    log.info(f"\n--- Build Strategy (Settlement + City Combinations) ---")
    combo_counts = df.groupby([VPCategory.SETTLEMENTS, VPCategory.CITIES]).size().reset_index(name='count')
    combo_counts['percentage'] = (combo_counts['count'] / total_winners) * 100
    combo_counts = combo_counts.sort_values(by='count', ascending=False)
    for _, row in combo_counts.iterrows():
        log.info(f"{int(row[VPCategory.SETTLEMENTS])} Settlements + {int(row[VPCategory.CITIES])} Cities: {int(row['count'])} winners ({row['percentage']:.2f}%)")

    # 3. Cities Distribution
    log.info(f"\n--- Cities Distribution (Cumulative) ---")
    for i in range(1, 6):
        count = (df[VPCategory.CITIES] >= i).sum()
        pct = (count / total_winners) * 100
        log.info(f"At least {i} Cities: {count} winners ({pct:.2f}%)")

    # 4. VP Cards Distribution
    log.info(f"\n--- VP Cards Distribution (Cumulative) ---")
    for i in range(1, 6):
        count = (df[VPCategory.VP_CARD] >= i).sum()
        pct = (count / total_winners) * 100
        log.info(f"At least {i} VP Card: {count} winners ({pct:.2f}%)")

df = get_end_game_vp_state_df(
    # sample_size=100,
    sample_size=None
)
analyze_winner_distributions(df)


ERROR: Unexpected key found in victory points metadata: 5. Expected {'0': <VPCategory.SETTLEMENTS: 'SETTLEMENTS'>, '1': <VPCategory.CITIES: 'CITIES'>, '2': <VPCategory.VP_CARD: 'VP_CARD'>, '3': <VPCategory.LARGEST_ARMY: 'LARGEST_ARMY'>, '4': <VPCategory.LONGEST_ROAD: 'LONGEST_ROAD'>}
ERROR: Unexpected key found in victory points metadata: 5. Expected {'0': <VPCategory.SETTLEMENTS: 'SETTLEMENTS'>, '1': <VPCategory.CITIES: 'CITIES'>, '2': <VPCategory.VP_CARD: 'VP_CARD'>, '3': <VPCategory.LARGEST_ARMY: 'LARGEST_ARMY'>, '4': <VPCategory.LONGEST_ROAD: 'LONGEST_ROAD'>}
ERROR: Stated winner does not have at least 10 VPs: defaultdict(<class 'int'>, {<VPCategory.SETTLEMENTS: 'SETTLEMENTS'>: 3, <VPCategory.CITIES: 'CITIES'>: 2, <VPCategory.LONGEST_ROAD: 'LONGEST_ROAD'>: 1})
ERROR: Unexpected key found in victory points metadata: 6. Expected {'0': <VPCategory.SETTLEMENTS: 'SETTLEMENTS'>, '1': <VPCategory.CITIES: 'CITIES'>, '2': <VPCategory.VP_CARD: 'VP_CARD'>, '3': <VPCategory.LARGEST_ARMY: 'LARG