<a href="https://colab.research.google.com/github/jinyang628/catan/blob/main/Catan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [184]:
"""
Credits

MR. MUCHO BUCHO Game Data (2025)
43,947 anonymized 4-player Catan games
https://github.com/Catan-data/dataset
"""

'\nCredits\n\nMR. MUCHO BUCHO Game Data (2025)\n43,947 anonymized 4-player Catan games\nhttps://github.com/Catan-data/dataset\n'

In [185]:
from enum import StrEnum, Enum
import pandas as pd
import os
import json
import logging
from typing import Optional
import tarfile
from collections import defaultdict
from dataclasses import dataclass

import logging
import sys

class LevelBasedFormatter(logging.Formatter):
    def format(self, record):
        error_format = "%(levelname)s: %(message)s"
        info_format = "%(message)s"
        orig_fmt = self._style._fmt
        if record.levelno >= logging.ERROR:
            self._style._fmt = error_format
        else:
            self._style._fmt = info_format
        result = super().format(record)
        self._style._fmt = orig_fmt
        return result

handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(LevelBasedFormatter())

root = logging.getLogger()
root.handlers = []
root.addHandler(handler)
root.setLevel(logging.INFO)

log = logging.getLogger(__name__)

In [186]:
RESOURCE_MAPPING = {
    1: "BRICK",
    2: "WOOL",
    3: "GRAIN",
    4: "ORE",
    5: "LUMBER"
}

DEVELOPMENT_CARD_MAPPING = {
    10: "KNIGHT",
    11: "KNIGHT",
    12: "MONOPOLY",
    13: "VICTORY POINT",
    14: "ROAD BUILDING",
    15: "YEAR OF PLENTY"
}

class VPCategory(Enum):
    SETTLEMENTS = "SETTLEMENTS"
    CITIES = "CITIES"
    VP_CARD = "VP CARD"
    LARGEST_ARMY = "LARGEST ARMY"
    LONGEST_ROAD = "LONGEST ROAD"

@dataclass(frozen=True)
class VPRule:
    points_per_item: int
    max_items: int

# https://github.com/Catan-data/dataset/issues/3 (Mapping in README is probably wrong)
VICTORY_POINTS_MAPPING = {
    "0": VPCategory.SETTLEMENTS,
    "1": VPCategory.CITIES,
    "2": VPCategory.VP_CARD,
    "3": VPCategory.LARGEST_ARMY,
    "4": VPCategory.LONGEST_ROAD
}

VP_RULES = {
    VPCategory.SETTLEMENTS: VPRule(points_per_item=1, max_items=5),
    VPCategory.CITIES: VPRule(points_per_item=2, max_items=4),
    VPCategory.VP_CARD: VPRule(points_per_item=1, max_items=5),
    VPCategory.LARGEST_ARMY: VPRule(points_per_item=2, max_items=1),
    VPCategory.LONGEST_ROAD: VPRule(points_per_item=2, max_items=1),
}



In [55]:
tar_gz_file_path = '/content/drive/MyDrive/datasets/games.tar.gz'
extract_path = '/content/extracted_games'
os.makedirs(extract_path, exist_ok=True)
with tarfile.open(tar_gz_file_path, 'r:gz') as tar:
    tar.extractall(path=extract_path)
json_game_files = os.listdir(f"{extract_path}/games")


  tar.extractall(path=extract_path)


In [187]:
def is_valid_end_game_vp_state(end_game_vp_state: dict) -> bool:
    total = 0
    for key, value in end_game_vp_state.items():
        if key not in VP_RULES:
            # raise ValueError(f"Unexpected key found in end game vp state: {key}")
            return False

        if value > VP_RULES[key].max_items:
            log.error(f"Invalid number of items for category {key}: {value}. "
                     f"Expected a maximum of {VP_RULES[key].max_items}")
            return False

        total += value * VP_RULES[key].points_per_item

    if total < 10:
        # log.error(f"Stated winner does not have at least 10 VPs: {end_game_vp_state}")
        return False
    return True

In [188]:
def getInitialPlacements(events: dict) -> dict:
  initialPlacements = defaultdict(list)
  for event in events[:20]:
      sc = event.get('stateChange', {})
      corners = sc.get('mapState', {}).get('tileCornerStates', {})

      for corner_id, corner_info in corners.items():
          owner = corner_info.get('owner')
          building_type = corner_info.get('buildingType')

          if owner and building_type == 1:  # Settlement
              initialPlacements[owner].append(corner_id)
  return initialPlacements

In [189]:
def getInitialSettlementResources(eventHistory: dict) -> dict:
    initialPlacements = getInitialPlacements(eventHistory["events"])
    initialState = eventHistory["initialState"]
    hexStates = initialState['mapState']['tileHexStates']
    cornerStates = initialState['mapState']['tileCornerStates']
    result = defaultdict(list)
    for player_id, corner_ids in initialPlacements.items():
        for corner_id in corner_ids:
            corner = cornerStates[corner_id]
            corner_x, corner_y, corner_z = corner['x'], corner['y'], corner['z']
            adjacent_hexes = []

            for hex_id, hex_data in hexStates.items():
                hex_x, hex_y = hex_data['x'], hex_data['y']
                if is_corner_adjacent_to_hex(corner_x, corner_y, corner_z, hex_x, hex_y):
                    adjacent_hexes.append(hex_id)

            resources = []
            for hex_id in adjacent_hexes:
                hex_data = hexStates[hex_id]
                resource_type = hex_data['type']
                dice_number = hex_data['diceNumber']
                if resource_type in RESOURCE_MAPPING:
                  resources.append({
                      'resource_name': RESOURCE_MAPPING[resource_type],
                      'dice_number': dice_number
                  })
            result[str(player_id)].append(resources)

    return result


def is_corner_adjacent_to_hex(corner_x, corner_y, corner_z, hex_x, hex_y):
    if corner_z == 0:  # Up corner
        return (
            (corner_x == hex_x and corner_y == hex_y) or
            (corner_x == hex_x + 1 and corner_y == hex_y - 1) or
            (corner_x == hex_x and corner_y == hex_y - 1)
        )
    else:  # z == 1, Down corner
        return (
            (corner_x == hex_x and corner_y == hex_y) or
            (corner_x == hex_x - 1 and corner_y == hex_y + 1) or
            (corner_x == hex_x and corner_y == hex_y + 1)
        )


In [190]:
def get_game_metadata(filename: str) -> Optional[dict]:
  metadata: dict = {}
  with open(f"{extract_path}/games/{filename}", "r") as f:
    eventHistory = json.loads(f.read())["data"]["eventHistory"]
    # Get Initial Placements
    initial_settlement_resources = getInitialSettlementResources(eventHistory)

    # Get End Game Stats
    players = eventHistory["endGameState"]["players"]
    for player_id, metadata in players.items():
      if not metadata["winningPlayer"]:
        continue
      metadata["winner_player_id"] = player_id
      victory_points_metadata = metadata["victoryPoints"]
      end_game_vp_state = {
          VPCategory.SETTLEMENTS: 0,
          VPCategory.CITIES: 0,
          VPCategory.VP_CARD: 0,
          VPCategory.LARGEST_ARMY: 0,
          VPCategory.LONGEST_ROAD: 0
      }
      for key, count in victory_points_metadata.items():
        if key not in VICTORY_POINTS_MAPPING:
          # log.error(f"Unexpected key found in victory points metadata: {key}. Expected {VICTORY_POINTS_MAPPING}")
          return None
        end_game_vp_state[VICTORY_POINTS_MAPPING[key]] = count
      metadata["winner_end_game_vp_state"] = end_game_vp_state
      metadata["winner_initial_settlement_resources"] = initial_settlement_resources[player_id]

      del metadata["winningPlayer"]
      del metadata["rank"]
      del metadata["color"]
      del metadata["victoryPoints"]
      if "title" in metadata:
        del metadata["title"]
      return metadata


In [191]:
def get_end_game_vp_state_df(sample_size: Optional[int]) -> pd.DataFrame:
  sample_size = sample_size or len(json_game_files)
  valid_end_game_states: list[dict] = []
  for filename in json_game_files[:sample_size]:
    metadata: Optional[dict] = get_game_metadata(filename)
    if metadata is None:
      continue
    if not is_valid_end_game_vp_state(metadata["winner_end_game_vp_state"]):
      continue
    valid_end_game_states.append(metadata)

  df = pd.DataFrame(valid_end_game_states)
  vp_state_df = pd.json_normalize(df['winner_end_game_vp_state'])
  df = pd.concat([df.drop('winner_end_game_vp_state', axis=1), vp_state_df], axis=1)

  df.fillna(0, inplace=True)
  df[VPCategory.LARGEST_ARMY] = df[VPCategory.LARGEST_ARMY].apply(lambda x: True if x == 1 else False)
  df[VPCategory.LONGEST_ROAD] = df[VPCategory.LONGEST_ROAD].apply(lambda x: True if x == 1 else False)
  df[VPCategory.VP_CARD] = df[VPCategory.VP_CARD].astype(int)
  df[VPCategory.CITIES] = df[VPCategory.CITIES].astype(int)
  df[VPCategory.SETTLEMENTS] = df[VPCategory.SETTLEMENTS].astype(int)

  num_rows = len(df)
  if num_rows < sample_size:
    log.error(f"{sample_size - num_rows} row(s) dropped due to inaccurate end game VP state")
  return df

In [192]:
def analyze_winner_distributions(df: pd.DataFrame):
    total_winners = len(df)
    log.info(f"\n\nAnalyzing {total_winners} winning games:\n")

    # 1. Individual Percentages for the longest road / largest army
    largest_army_pct = df[VPCategory.LARGEST_ARMY].mean() * 100
    longest_road_pct = df[VPCategory.LONGEST_ROAD].mean() * 100

    log.info(f"--- Special Conditions (Raw Percentages) ---")
    log.info(f"Largest Army: {largest_army_pct:.2f}%")
    log.info(f"Longest Road: {longest_road_pct:.2f}%")

    # 2. Combinations (Both, Neither, Only) for the longest road / largest army
    both = (df[VPCategory.LARGEST_ARMY] & df[VPCategory.LONGEST_ROAD]).sum()
    only_army = (df[VPCategory.LARGEST_ARMY] & ~df[VPCategory.LONGEST_ROAD]).sum()
    only_road = (~df[VPCategory.LARGEST_ARMY] & df[VPCategory.LONGEST_ROAD]).sum()
    neither = (~df[VPCategory.LARGEST_ARMY] & ~df[VPCategory.LONGEST_ROAD]).sum()

    log.info(f"\n--- Special Conditions (Combinations) ---")
    log.info(f"Both Army & Road:   {both} winners ({((both/total_winners)*100):.2f}%)")
    log.info(f"Only Largest Army:  {only_army} winners ({((only_army/total_winners)*100):.2f}%)")
    log.info(f"Only Longest Road:  {only_road} winners ({((only_road/total_winners)*100):.2f}%)")
    log.info(f"Neither:  {neither} winners ({((neither/total_winners)*100):.2f}%)")


    # 3. Settlement + City Combinations
    log.info(f"\n--- Build Strategy (Settlement + City Combinations) ---")
    combo_counts = df.groupby([VPCategory.SETTLEMENTS, VPCategory.CITIES]).size().reset_index(name='count')
    combo_counts['percentage'] = (combo_counts['count'] / total_winners) * 100
    combo_counts = combo_counts.sort_values(by='count', ascending=False)
    for _, row in combo_counts.iterrows():
        log.info(f"{int(row[VPCategory.SETTLEMENTS])} Settlements + {int(row[VPCategory.CITIES])} Cities: {int(row['count'])} winners ({row['percentage']:.2f}%)")

    # 3. Cities Distribution
    log.info(f"\n--- Cities Distribution (Cumulative) ---")
    for i in range(1, 6):
        count = (df[VPCategory.CITIES] >= i).sum()
        pct = (count / total_winners) * 100
        log.info(f"At least {i} Cities: {count} winners ({pct:.2f}%)")

    # 4. VP Cards Distribution
    log.info(f"\n--- VP Cards Distribution (Cumulative) ---")
    for i in range(1, 6):
        count = (df[VPCategory.VP_CARD] >= i).sum()
        pct = (count / total_winners) * 100
        log.info(f"At least {i} VP Card: {count} winners ({pct:.2f}%)")

    # 5. Resource Pits Distribution (CDF-like analysis)
    log.info(f"\n--- Initial Resource Pits Distribution (Cumulative) ---")
    pit_thresholds = [2, 4, 6, 8]
    for resource_name in RESOURCE_MAPPING.values():
        col_name = f'{resource_name}_pits'
        if col_name in df.columns:
            log.info(f"  {resource_name}:")
            for threshold in pit_thresholds:
                count = (df[col_name] >= threshold).sum()
                pct = (count / total_winners) * 100
                log.info(f"    At least {threshold} pits: {count} winners ({pct:.2f}%)")
        else:
            log.info(f"  {resource_name}: Column '{col_name}' not found.")


In [193]:
def get_pits_from_dice_number(dice_number: int) -> int:
    if dice_number == 7:
        return 0 # No resource production on 7
    elif dice_number in [2, 12]:
        return 1
    elif dice_number in [3, 11]:
        return 2
    elif dice_number in [4, 10]:
        return 3
    elif dice_number in [5, 9]:
        return 4
    elif dice_number in [6, 8]:
        return 5
    return 0 # for all invalid numbers (instead of throwing)

def extract_initial_resources_pits_for_winner(initial_settlement_resources_list: list) -> dict:
    """
    Extracts and sums the number of resource pits for a single winner based on their initial settlements.
    The input `initial_settlement_resources_list` is a list of lists, where each inner list represents
    the resources adjacent to one of the winner's initial settlements.
    """
    resource_pits = defaultdict(int)
    for settlement_resources in initial_settlement_resources_list:
        for resource_info in settlement_resources:
            resource_name = resource_info['resource_name']
            dice_number = resource_info['dice_number']
            pits = get_pits_from_dice_number(dice_number)
            resource_pits[resource_name] += pits
    return dict(resource_pits)

def add_initial_resource_pits_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Processes the 'winner_initial_settlement_resources' column to add new columns
    for the total number of pits for each resource type.
    """
    if 'winner_initial_settlement_resources' not in df.columns:
        log.error("Column 'winner_initial_settlement_resources' not found in DataFrame.")
        return df

    resource_pits_series = df['winner_initial_settlement_resources'].apply(extract_initial_resources_pits_for_winner)
    resource_pits_df = pd.json_normalize(resource_pits_series)
    resource_pits_df = resource_pits_df.add_suffix('_pits')
    df = pd.concat([df.drop('winner_initial_settlement_resources', axis=1), resource_pits_df], axis=1)
    for res_name in RESOURCE_MAPPING.values():
        col_name = f'{res_name}_pits'
        if col_name not in df.columns:
            df[col_name] = 0
    df.fillna(0, inplace=True) # General fillna for any remaining NaNs

    return df

In [194]:
df = get_end_game_vp_state_df(
    sample_size=None
)
df = add_initial_resource_pits_to_df(df)
analyze_winner_distributions(df)

df

ERROR: 1082 row(s) dropped due to inaccurate end game VP state


Analyzing 42865 winning games:

--- Special Conditions (Raw Percentages) ---
Largest Army: 46.39%
Longest Road: 57.53%

--- Special Conditions (Combinations) ---
Both Army & Road:   7267 winners (16.95%)
Only Largest Army:  12616 winners (29.43%)
Only Longest Road:  17392 winners (40.57%)
Neither:  5590 winners (13.04%)

--- Build Strategy (Settlement + City Combinations) ---
3 Settlements + 2 Cities: 5362 winners (12.51%)
4 Settlements + 2 Cities: 4554 winners (10.62%)
1 Settlements + 3 Cities: 4394 winners (10.25%)
2 Settlements + 2 Cities: 4091 winners (9.54%)
2 Settlements + 3 Cities: 4029 winners (9.40%)
4 Settlements + 1 Cities: 3753 winners (8.76%)
5 Settlements + 1 Cities: 3360 winners (7.84%)
0 Settlements + 3 Cities: 2067 winners (4.82%)
0 Settlements + 4 Cities: 2020 winners (4.71%)
3 Settlements + 1 Cities: 1810 winners (4.22%)
1 Settlements + 2 Cities: 1332 winners (3.11%)
5 Settlements + 0 Cities: 1194 winne

Unnamed: 0,winner_player_id,VPCategory.SETTLEMENTS,VPCategory.CITIES,VPCategory.VP_CARD,VPCategory.LARGEST_ARMY,VPCategory.LONGEST_ROAD,GRAIN_pits,BRICK_pits,ORE_pits,LUMBER_pits,WOOL_pits
0,2,1,3,1,False,True,2.0,9.0,7.0,0.0,0.0
1,2,1,3,1,False,True,2.0,2.0,5.0,0.0,0.0
2,1,0,4,2,False,False,4.0,9.0,0.0,3.0,5.0
3,2,1,4,1,False,False,0.0,6.0,2.0,8.0,2.0
4,5,0,3,2,True,False,0.0,7.0,6.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
42860,5,2,3,2,False,False,0.0,3.0,2.0,0.0,4.0
42861,5,3,1,1,True,True,0.0,4.0,12.0,0.0,3.0
42862,2,3,2,1,False,True,0.0,0.0,1.0,4.0,10.0
42863,2,3,2,1,True,False,3.0,3.0,5.0,4.0,2.0
