In [None]:
import itertools
from collections import defaultdict
import random

class Card:
    """A playing card identified by its rank and suit.

    Cards compare and hash by ``(rank, suit)``, so two objects describing the
    same physical card are treated as one card by membership tests
    (``card in seen_cards``), sets and dict keys. Without this, card-counting
    code that de-duplicates with ``in`` would count a re-created card twice.
    """

    def __init__(self, rank, suit):
        self.rank = rank
        self.suit = suit

    def __str__(self):
        return f"{self.rank}{self.suit}"

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        # Defer to the other operand for foreign types instead of guessing.
        if not isinstance(other, Card):
            return NotImplemented
        return self.rank == other.rank and self.suit == other.suit

    def __hash__(self):
        # Must agree with __eq__: equal cards hash equally.
        return hash((self.rank, self.suit))

    def score(self):
        """Return the point value of this card (lower is better).

        Ace is 1, Jack is 0, Queen and King are 10; every other rank scores
        its face value.

        Raises:
            ValueError: if the rank is neither a face card nor a number.
        """
        if self.rank == 'A':
            return 1
        elif self.rank == 'J':
            return 0
        elif self.rank in ['Q', 'K']:
            return 10
        else:
            return int(self.rank)

class GolfGame:
    """Decision helper for 4-card Golf played on a 2x2 grid.

    The solver remembers every card it has been shown (``self.memory``),
    derives rank probabilities for the cards it has not seen, and scores the
    two kinds of move available for each face-down position: taking the top
    discard, or drawing an unseen card. Lower scores are better, and two cards
    of equal rank cancel each other out.

    Grid positions are indexed 0..3: top-left, top-right, bottom-left,
    bottom-right.
    """

    RANKS = ['A', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'J', 'Q', 'K']
    SUITS = ['♠', '♥', '♦', '♣']

    def __init__(self):
        self.deck = self.create_deck()
        self.grid = [None, None, None, None]  # top-left, top-right, bottom-left, bottom-right
        self.known_cards = [False, False, True, True]  # initially know bottom 2 cards
        self.turn = 1
        self.discard_pile = []  # Complete history of discarded cards
        self.opponents_visible = []  # Cards visible in opponents' grids
        self.memory = {
            'all_seen_cards': [],  # Every card we've seen
            'discard_history': [],  # Full discard pile
            'cards_per_rank': {rank: 0 for rank in self.RANKS}  # Count of seen cards per rank
        }

    def create_deck(self):
        """Return a fresh, unshuffled 52-card deck."""
        return [Card(rank, suit) for rank, suit in itertools.product(self.RANKS, self.SUITS)]

    @staticmethod
    def _face_up_view(grid, known):
        """Return a copy of ``grid`` with every face-down slot replaced by None."""
        return [card if known[i] else None for i, card in enumerate(grid)]

    def _already_seen(self, card):
        """Return True if a card with the same rank and suit is in memory."""
        if card is None:
            return False
        return any(
            seen.rank == card.rank and seen.suit == card.suit
            for seen in self.memory['all_seen_cards']
        )

    def _expected_hidden_total(self, known, probabilities):
        """Sum the expected score over every still-face-down grid slot."""
        total = 0
        for i in range(4):
            if not known[i]:
                total += self.expected_score_for_unknown_position(probabilities)
        return total

    def calculate_score(self, grid):
        """Calculate score for a 2x2 grid with pair cancellation.

        Args:
            grid: four entries, each a card or None (None scores 0).

        Returns:
            ``(total_score, pairs)`` where ``pairs`` is a list of
            ``(pos1, pos2)`` index tuples whose cards cancelled. Each
            position takes part in at most one pair.
        """
        scores = [card.score() if card else 0 for card in grid]
        total_score = sum(scores)

        ranks = [card.rank if card else None for card in grid]
        pairs = []
        used_positions = set()

        # Check all possible pairs (6 combinations for 4 positions)
        for pos1, pos2 in itertools.combinations(range(4), 2):
            if (ranks[pos1] and ranks[pos2] and
                ranks[pos1] == ranks[pos2] and
                pos1 not in used_positions and pos2 not in used_positions):
                pairs.append((pos1, pos2))
                used_positions.add(pos1)
                used_positions.add(pos2)
                total_score -= (scores[pos1] + scores[pos2])

        return total_score, pairs

    def update_memory(self, new_cards):
        """Record newly seen cards, counting each physical card only once.

        Cards are de-duplicated by ``(rank, suit)`` rather than by object
        identity, so passing a freshly constructed object for a card that is
        already in memory does not inflate the per-rank counts.

        Args:
            new_cards: iterable of cards; falsy entries (None) are ignored.
        """
        seen_cards = self.memory['all_seen_cards']
        seen_ids = {(card.rank, card.suit) for card in seen_cards}
        for card in new_cards:
            if not card:
                continue
            ident = (card.rank, card.suit)
            if ident in seen_ids:
                continue
            seen_ids.add(ident)
            seen_cards.append(card)
            self.memory['cards_per_rank'][card.rank] += 1

    def add_to_discard(self, card):
        """Add card to the discard history and update memory."""
        if card:
            self.memory['discard_history'].append(card)
            self.update_memory([card])

    def get_deck_probabilities(self, additional_seen_cards=None):
        """Estimate the rank distribution of the cards not yet seen.

        Args:
            additional_seen_cards: optional cards to treat as seen for this
                calculation only; memory itself is not modified.

        Returns:
            ``(probabilities, total_remaining, rank_counts)``: a rank ->
            probability dict, the number of unseen cards, and the rank ->
            seen-count dict used (including the additional cards).
        """
        # Work on a copy so hypothetical cards never leak into memory.
        rank_counts = self.memory['cards_per_rank'].copy()

        if additional_seen_cards:
            for card in additional_seen_cards:
                if card:
                    rank_counts[card.rank] += 1

        # Four cards per rank in the deck; clamp so over-counting cannot go negative.
        remaining_cards = {rank: max(0, 4 - rank_counts[rank]) for rank in self.RANKS}
        total_remaining = sum(remaining_cards.values())

        probabilities = {
            rank: remaining_cards[rank] / total_remaining if total_remaining > 0 else 0
            for rank in self.RANKS
        }

        return probabilities, total_remaining, rank_counts

    def expected_score_for_unknown_position(self, probabilities):
        """Return the expected score of one face-down card under ``probabilities``."""
        expected = 0
        for rank, prob in probabilities.items():
            card_score = Card(rank, '♠').score()  # suit doesn't matter for scoring
            expected += prob * card_score
        return expected

    def get_memory_analysis(self):
        """Summarize what has been seen so far.

        Returns:
            A dict with ``total_cards_seen``, ``total_remaining_in_deck``,
            ``discard_pile_size`` and a per-rank ``rank_analysis`` holding
            ``seen``, ``remaining``, ``probability`` and ``score``.
        """
        probs, total_remaining, seen_counts = self.get_deck_probabilities()

        analysis = {
            'total_cards_seen': len(self.memory['all_seen_cards']),
            'total_remaining_in_deck': total_remaining,
            'discard_pile_size': len(self.memory['discard_history']),
            'rank_analysis': {}
        }

        for rank in self.RANKS:
            analysis['rank_analysis'][rank] = {
                'seen': seen_counts[rank],
                # Clamped like get_deck_probabilities so the two always agree.
                'remaining': max(0, 4 - seen_counts[rank]),
                'probability': probs[rank],
                'score': Card(rank, '♠').score()
            }

        return analysis

    def evaluate_take_discard_action(self, position, discard_card, current_grid, known_cards):
        """Evaluate taking the discard card and placing it at ``position``.

        Args:
            position: grid index (0..3) that receives the card.
            discard_card: the card on top of the discard pile.
            current_grid: the current four-slot grid (not modified).
            known_cards: four booleans marking face-up slots (not modified).

        Returns:
            A dict with ``action``, ``expected_score``, ``known_score``,
            ``pairs`` and ``position``.
        """
        new_grid = current_grid.copy()
        new_grid[position] = discard_card
        new_known = known_cards.copy()
        new_known[position] = True

        known_score, pairs = self.calculate_score(self._face_up_view(new_grid, new_known))

        # The discard is normally already in memory; counting it again would
        # understate how many cards of its rank are still unseen.
        extra_seen = [] if self._already_seen(discard_card) else [discard_card]
        deck_probs, _, _ = self.get_deck_probabilities(extra_seen)
        unknown_expected = self._expected_hidden_total(new_known, deck_probs)

        total_expected = known_score + unknown_expected

        return {
            'action': f'Take {discard_card} → Position {position + 1}',
            'expected_score': total_expected,
            'known_score': known_score,
            'pairs': pairs,
            'position': position
        }

    def evaluate_draw_deck_action(self, position, current_grid, known_cards):
        """Evaluate drawing an unseen card and placing it at ``position``.

        The outcome is averaged over every rank that may still be drawn.

        Returns:
            A dict with ``action``, ``expected_score``, ``position``,
            ``prob_good_card``, ``best_possible`` and ``cards_remaining``.
            When no unseen cards remain, the dict instead carries an
            ``error`` key and an infinite ``expected_score``.
        """
        deck_probs, total_remaining, _ = self.get_deck_probabilities()

        if total_remaining == 0:
            return {
                'action': f'Draw from deck → Position {position + 1}',
                'expected_score': float('inf'),  # No cards left
                'position': position,
                'error': 'No cards remaining in deck'
            }

        new_known = known_cards.copy()
        new_known[position] = True

        total_expected_score = 0
        best_cards = []

        for rank, prob in deck_probs.items():
            if prob == 0:
                continue

            # Simulate drawing this rank; suit doesn't matter for scoring.
            drawn_card = Card(rank, '♠')
            new_grid = current_grid.copy()
            new_grid[position] = drawn_card

            known_score, _ = self.calculate_score(self._face_up_view(new_grid, new_known))

            # The drawn card is no longer available for the other hidden slots.
            remaining_deck_probs, _, _ = self.get_deck_probabilities([drawn_card])
            unknown_expected = self._expected_hidden_total(new_known, remaining_deck_probs)

            total_expected_score += prob * (known_score + unknown_expected)

            if drawn_card.score() <= 1:  # Good cards (A=1, J=0)
                best_cards.append((rank, prob))

        return {
            'action': f'Draw from deck → Position {position + 1}',
            'expected_score': total_expected_score,
            'position': position,
            'prob_good_card': sum(prob for rank, prob in best_cards),
            'best_possible': best_cards,
            'cards_remaining': total_remaining
        }

    def get_recommendations(self, grid, known_cards, discard_top, turn):
        """Rank the available moves for the current position.

        Args:
            grid: the current four-slot grid.
            known_cards: four booleans marking face-up slots.
            discard_top: top card of the discard pile, or None if there is
                none (discard moves are then skipped).
            turn: current turn number, echoed back in the result.

        Returns:
            When every card is face-up, a dict with ``message``,
            ``baseline_score`` and ``memory_analysis``. Otherwise a dict with
            ``recommendations`` (best first), ``baseline_score``,
            ``current_known_score``, ``current_pairs``, ``turn`` and
            ``memory_analysis``.
        """
        # Everything visible right now goes into memory before estimating.
        current_known = [card for i, card in enumerate(grid) if known_cards[i] and card]
        self.update_memory(current_known + [discard_top])

        current_score, current_pairs = self.calculate_score(self._face_up_view(grid, known_cards))

        # Baseline: expected final score if the grid is left untouched.
        deck_probs, _, _ = self.get_deck_probabilities()
        baseline_expected = current_score + self._expected_hidden_total(known_cards, deck_probs)

        # Only face-down slots can receive a card.
        available_positions = [i for i in range(4) if not known_cards[i]]

        if not available_positions:
            return {
                "message": "No moves available - all cards are face-up!",
                "baseline_score": current_score,
                "memory_analysis": self.get_memory_analysis()
            }

        recommendations = []

        # Taking the discard is only possible when there is one.
        if discard_top is not None:
            for pos in available_positions:
                eval_result = self.evaluate_take_discard_action(pos, discard_top, grid, known_cards)
                improvement = baseline_expected - eval_result['expected_score']
                confidence = min(95, max(5, 50 + improvement * 15))

                # A pair is created only if the move yields more cancelling
                # pairs than the grid already has (a third card of an
                # already-paired rank does not count).
                creates_pair = len(eval_result['pairs']) > len(current_pairs)

                recommendations.append({
                    **eval_result,
                    'improvement': improvement,
                    'confidence': confidence,
                    'type': 'take_discard',
                    'creates_pair': creates_pair
                })

        for pos in available_positions:
            eval_result = self.evaluate_draw_deck_action(pos, grid, known_cards)
            if 'error' in eval_result:
                continue

            improvement = baseline_expected - eval_result['expected_score']
            confidence = min(85, max(10, 40 + improvement * 10))  # Lower confidence for unknown cards

            recommendations.append({
                **eval_result,
                'improvement': improvement,
                'confidence': confidence,
                'type': 'draw_deck'
            })

        # Sort by improvement (best first)
        recommendations.sort(key=lambda x: x['improvement'], reverse=True)

        return {
            'recommendations': recommendations,
            'baseline_score': baseline_expected,
            'current_known_score': current_score,
            'current_pairs': current_pairs,
            'turn': turn,
            'memory_analysis': self.get_memory_analysis()
        }

def main():
    """Run a worked example: prime the memory, analyse one position, print advice."""
    solver = GolfGame()

    # Cards discarded earlier in the game; feeding them in primes the counts.
    earlier_discards = (
        Card('K', '♠'), Card('9', '♥'), Card('Q', '♦'), Card('8', '♣'), Card('A', '♠')
    )
    for past_card in earlier_discards:
        solver.add_to_discard(past_card)

    # Example position: only the top-left card is still face-down.
    grid = [None, Card('Q', '♠'), Card('6', '♥'), Card('J', '♣')]
    known_cards = [False, True, True, True]
    discard_top = Card('6', '♠')  # Current top of discard pile
    turn = 2

    def face(slot):
        """Text for one grid cell: the card when face-up, otherwise '?'."""
        return str(grid[slot]) if known_cards[slot] else '?'

    print("=== 4-CARD GOLF STRATEGY SOLVER (with Memory) ===\n")
    print("Current Grid:")
    print(f"[ {face(0)} | {face(1)} ]")
    print(f"[ {face(2)} | {face(3)} ]")
    print(f"\nDiscard pile top: {discard_top}")
    print(f"Turn: {turn}/4\n")

    result = solver.get_recommendations(grid, known_cards, discard_top, turn)

    # A 'message' result means every card is face-up and nothing can be done.
    if 'message' in result:
        print(result['message'])
        print(f"Final score: {result['baseline_score']}")
        return

    memory = result['memory_analysis']
    rank_info = memory['rank_analysis']
    print("=== MEMORY ANALYSIS ===")
    print(f"Cards seen: {memory['total_cards_seen']}")
    print(f"Cards remaining in deck: {memory['total_remaining_in_deck']}")
    print(f"Discard pile size: {memory['discard_pile_size']}")

    print("\nRank probabilities remaining:")
    for rank in ('A', 'J', '2', '6', 'Q', 'K'):  # Show key ranks
        info = rank_info[rank]
        print(f"  {rank}: {info['remaining']}/4 left ({info['probability']:.1%}) - Score: {info['score']}")

    print(f"\nCurrent known score: {result['current_known_score']}")
    existing_pairs = result['current_pairs']
    if existing_pairs:
        pairs_str = ', '.join(f"Pos {p1+1} & {p2+1}" for p1, p2 in existing_pairs)
        print(f"Current pairs: {pairs_str}")
    print(f"Expected final score if no action: {result['baseline_score']:.1f}\n")

    print("RECOMMENDATIONS (best first):\n")

    all_recs = result['recommendations']
    medals = ("🥇", "🥈", "🥉")
    for place, rec in enumerate(all_recs[:5]):  # Show top 5
        label = medals[place] if place < len(medals) else f"{place+1}."
        is_discard = rec['type'] == 'take_discard'

        print(f"{label} {rec['action']}")
        print(f"   Expected improvement: {rec['improvement']:+.1f} points")
        print(f"   Confidence: {rec['confidence']:.0f}%")
        print(f"   Expected final score: {rec['expected_score']:.1f}")

        if is_discard and rec.get('creates_pair'):
            print("   ⭐ CREATES A PAIR! ⭐")

        if rec['type'] == 'draw_deck':
            print(f"   Chance of good card (A, J): {rec['prob_good_card']:.1%}")
            if 'cards_remaining' in rec:
                print(f"   Cards left in deck: {rec['cards_remaining']}")

        print()

    # Strategy advice keyed off the size of the best available gain.
    print("STRATEGY NOTES:")
    top_gain = all_recs[0]['improvement']

    if top_gain > 2:
        note = "• Strong move available - high confidence recommendation"
    elif top_gain > 0.5:
        note = "• Decent improvement possible"
    elif top_gain > -0.5:
        note = "• Marginal decision - consider position and remaining turns"
    else:
        note = "• No great options - might want to play conservatively"
    print(note)

    if turn >= 3:
        print("• Late in the game - prioritize certainty over potential")

    # Memory-based insight: how many low-scoring cards are still unseen.
    good_cards_left = sum(rank_info[rank]['remaining'] for rank in ['A', 'J'])
    total_left = memory['total_remaining_in_deck']
    if total_left > 0:
        good_card_prob = good_cards_left / total_left
        print(f"• {good_cards_left} good cards (A, J) left in {total_left} remaining cards ({good_card_prob:.1%})")

    # Flag any discard move that was marked as pair-creating.
    if any(rec.get('creates_pair') for rec in all_recs if rec['type'] == 'take_discard'):
        print("• 🎯 PAIR OPPORTUNITY: Taking discard creates a matching pair!")

    # Ranks with no unseen cards left cannot be drawn any more.
    depleted_ranks = [rank for rank, info in rank_info.items() if info['remaining'] == 0]
    if depleted_ranks:
        print(f"• No more {', '.join(depleted_ranks)} cards available")


if __name__ == "__main__":
    main()

In [None]:
import itertools
import random
import numpy as np
from collections import defaultdict, deque
import time
from typing import List, Dict, Tuple, Optional, Any
import json

class Card:
    """A playing card; equality and hashing are based on (rank, suit)."""

    def __init__(self, rank: str, suit: str):
        self.rank = rank
        self.suit = suit

    def __str__(self):
        return f"{self.rank}{self.suit}"

    def __repr__(self):
        return self.__str__()

    def score(self):
        """Return the card's point value: A=1, J=0, Q/K=10, else face value."""
        rank = self.rank
        if rank == 'A':
            return 1
        if rank == 'J':
            return 0
        if rank == 'Q' or rank == 'K':
            return 10
        # Remaining ranks are numeric strings such as '2'..'10'.
        return int(rank)

    def __eq__(self, other):
        # Anything that is not a Card is never equal to one.
        if isinstance(other, Card):
            return self.rank == other.rank and self.suit == other.suit
        return False

    def __hash__(self):
        key = (self.rank, self.suit)
        return hash(key)

class Player:
    """One seat at the table: a 2x2 grid, face-up flags and card-counting memory.

    Grid positions are indexed 0..3 as top-left, top-right, bottom-left,
    bottom-right. ``known[i]`` is True when the card at position ``i`` is
    visible to this player.
    """

    # Single source of truth for the rank keys used by the memory counters
    # and by the probability tables, so the two can never drift apart.
    RANKS = ('A', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'J', 'Q', 'K')

    def __init__(self, name, agent_type="random"):
        self.name = name
        self.agent_type = agent_type
        self.grid = [None] * 4  # 2x2 grid: [TL, TR, BL, BR]
        self.known = [False, False, True, True]  # Only bottom two known at start
        # Memory for tracking seen cards
        self.memory = {
            'all_seen_cards': [],
            'discard_history': [],
            'cards_per_rank': {rank: 0 for rank in self.RANKS}
        }

    def reveal_all(self):
        """Mark all four grid positions as face-up."""
        self.known = [True] * 4

    def __str__(self):
        def show(i):
            return str(self.grid[i]) if self.known[i] else '?'
        return f"[ {show(0)} | {show(1)} ]\n[ {show(2)} | {show(3)} ]"

    def update_memory(self, new_cards):
        """Record newly seen cards; cards already in memory are not recounted.

        Args:
            new_cards: iterable of cards; falsy entries (None) are ignored.
        """
        for card in new_cards:
            if card and card not in self.memory['all_seen_cards']:
                self.memory['all_seen_cards'].append(card)
                self.memory['cards_per_rank'][card.rank] += 1

    def add_to_discard_memory(self, card):
        """Append ``card`` to the discard history and record it as seen."""
        if card:
            self.memory['discard_history'].append(card)
            self.update_memory([card])

    def get_deck_probabilities(self, additional_seen_cards=None):
        """Estimate the rank distribution of the cards this player has not seen.

        Args:
            additional_seen_cards: optional cards to treat as seen for this
                calculation only; memory itself is not modified.

        Returns:
            ``(probabilities, total_remaining)``: a rank -> probability dict
            and the number of unseen cards. All probabilities are 0 when no
            unseen cards remain.
        """
        # Copy so hypothetical cards never leak into the real counters.
        rank_counts = self.memory['cards_per_rank'].copy()

        if additional_seen_cards:
            for card in additional_seen_cards:
                if card:
                    rank_counts[card.rank] += 1

        # Four cards per rank; clamp so an over-count cannot go negative.
        remaining_cards = {rank: max(0, 4 - rank_counts[rank]) for rank in self.RANKS}
        total_remaining = sum(remaining_cards.values())

        probabilities = {
            rank: remaining_cards[rank] / total_remaining if total_remaining > 0 else 0
            for rank in self.RANKS
        }

        return probabilities, total_remaining

    def expected_score_for_unknown_position(self, probabilities):
        """Return the expected score of one face-down card under ``probabilities``."""
        expected = 0
        for rank, prob in probabilities.items():
            card_score = Card(rank, '♠').score()  # suit doesn't matter for scoring
            expected += prob * card_score
        return expected

class RandomAgent:
    """Baseline agent: picks a uniformly random move among face-down slots."""

    def choose_action(self, player, game_state, trajectory=None):
        """Return a random action dict, or None when no slot is face-down.

        The move kind is drawn first and the target slot second; a
        'take_discard' pick falls back to drawing when the discard pile is
        empty. Draw actions also carry a random ``keep`` flag.
        """
        hidden_slots = [slot for slot, face_up in enumerate(player.known) if not face_up]
        if not hidden_slots:
            return None

        move_kind = random.choice(['draw_deck', 'take_discard'])
        slot = random.choice(hidden_slots)

        if move_kind != 'take_discard' or not game_state.discard_pile:
            # Drawing also needs a decision on whether to keep the drawn card.
            return {'type': 'draw_deck', 'position': slot, 'keep': random.choice([True, False])}

        return {'type': 'take_discard', 'position': slot}

class HeuristicAgent:
    """Greedy one-step agent: picks the move with the best expected improvement.

    Each candidate move is compared against the expected score of leaving the
    grid untouched, using the rank probabilities from the player's memory.
    """

    def choose_action(self, player, game_state, trajectory=None):
        """Return the best action dict, or None when no slot is face-down."""
        hidden_slots = [slot for slot, face_up in enumerate(player.known) if not face_up]
        if not hidden_slots:
            return None

        # Everything visible right now is recorded before estimating.
        visible_cards = [card for slot, card in enumerate(player.grid) if player.known[slot] and card]
        discard_top = game_state.discard_pile[-1] if game_state.discard_pile else None
        player.update_memory(visible_cards + ([discard_top] if discard_top else []))

        face_up_view = [card if player.known[slot] else None for slot, card in enumerate(player.grid)]
        current_score = self.calculate_score(face_up_view)

        deck_probs, _ = player.get_deck_probabilities()

        # Baseline: known score plus the expectation of every hidden slot.
        hidden_expectation = 0
        for slot in range(4):
            if player.known[slot]:
                continue
            hidden_expectation += player.expected_score_for_unknown_position(deck_probs)
        baseline_expected = current_score + hidden_expectation

        def candidates():
            """Yield (improvement, action) pairs: discard moves first, then draws."""
            if discard_top:
                for slot in hidden_slots:
                    gain = self.evaluate_take_discard_action(
                        slot, discard_top, player, deck_probs, baseline_expected)
                    yield gain, {'type': 'take_discard', 'position': slot}
            for slot in hidden_slots:
                gain = self.evaluate_draw_deck_action(slot, player, deck_probs, baseline_expected)
                yield gain, {'type': 'draw_deck', 'position': slot, 'keep': True}

        # Strict '>' keeps the earliest candidate on ties.
        best_action = None
        best_improvement = float('-inf')
        for gain, action in candidates():
            if gain > best_improvement:
                best_improvement, best_action = gain, action

        if best_action:
            return best_action

        # Nothing beat -inf: fall back to the discard if there is one, else draw.
        if discard_top:
            return {'type': 'take_discard', 'position': random.choice(hidden_slots)}
        return {'type': 'draw_deck', 'position': random.choice(hidden_slots), 'keep': True}

    def calculate_score(self, grid):
        """Return the grid's score; None entries count 0 and equal ranks cancel.

        Each position takes part in at most one cancelling pair.
        """
        points = [card.score() if card else 0 for card in grid]
        ranks = [card.rank if card else None for card in grid]
        total = sum(points)
        paired = set()

        for first, second in itertools.combinations(range(4), 2):
            is_new_pair = (
                ranks[first] and ranks[second]
                and ranks[first] == ranks[second]
                and first not in paired and second not in paired
            )
            if not is_new_pair:
                continue
            paired.update((first, second))
            total -= points[first] + points[second]

        return total

    def evaluate_take_discard_action(self, position, discard_card, player, deck_probs, baseline_expected):
        """Return the expected improvement from placing the discard at ``position``."""
        grid_after = player.grid.copy()
        known_after = player.known.copy()
        grid_after[position] = discard_card
        known_after[position] = True

        visible = [card if known_after[slot] else None for slot, card in enumerate(grid_after)]
        known_score = self.calculate_score(visible)

        # Slots that stay hidden contribute their expected value.
        hidden_expectation = 0
        for slot in range(4):
            if known_after[slot]:
                continue
            hidden_expectation += player.expected_score_for_unknown_position(deck_probs)

        return baseline_expected - (known_score + hidden_expectation)

    def evaluate_draw_deck_action(self, position, player, deck_probs, baseline_expected):
        """Return the expected improvement from drawing into ``position``.

        The outcome is averaged over every rank with non-zero probability.
        """
        weighted_total = 0

        for rank, prob in deck_probs.items():
            if prob == 0:
                continue

            grid_after = player.grid.copy()
            grid_after[position] = Card(rank, '♠')  # suit is irrelevant to scoring
            known_after = player.known.copy()
            known_after[position] = True

            visible = [card if known_after[slot] else None for slot, card in enumerate(grid_after)]
            known_score = self.calculate_score(visible)

            hidden_expectation = 0
            for slot in range(4):
                if known_after[slot]:
                    continue
                hidden_expectation += player.expected_score_for_unknown_position(deck_probs)

            outcome_score = known_score + hidden_expectation
            weighted_total += prob * outcome_score

        return baseline_expected - weighted_total

class QLearningAgent:
    """Tabular, epsilon-greedy Q-learning agent.

    Q-values live in ``q_table[state_key][action_key]``. Read-only lookups go
    through ``_q_value`` so that merely evaluating a state never inserts
    entries into the table; only ``update`` writes to it.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=0.2):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.training_mode = True

    def _q_value(self, state_key, action_key):
        """Return the stored Q-value (0.0 if absent) without mutating the table.

        Indexing the nested defaultdict directly would create an entry on
        every read, growing the table and inflating ``get_q_table_size``.
        """
        row = self.q_table.get(state_key)
        if row is None:
            return 0.0
        return row.get(action_key, 0.0)

    def get_state_key(self, player, game_state):
        """Convert the game state to a simplified string key for the Q-table.

        The key is built from the sorted ranks of the player's face-up cards
        (suits and positions are dropped), the number of face-down slots, the
        rank of the top discard ('None' if the pile is empty) and the current
        round.
        """
        known_cards = []
        for i, card in enumerate(player.grid):
            if player.known[i] and card:
                known_cards.append(card.rank)  # Only rank, not suit
            else:
                known_cards.append('?')

        # Sort known cards for consistency (same state regardless of position)
        known_cards_sorted = sorted([c for c in known_cards if c != '?'])
        unknown_count = known_cards.count('?')

        discard_top = game_state.discard_pile[-1].rank if game_state.discard_pile else 'None'

        return f"{known_cards_sorted}_{unknown_count}_{discard_top}_{game_state.round}"

    def get_action_key(self, action):
        """Convert an action dict to its ``"<type>_<position>"`` string key."""
        return f"{action['type']}_{action['position']}"

    def choose_action(self, player, game_state, trajectory=None):
        """Pick an action epsilon-greedily among the legal moves.

        Args:
            player: the acting player (``grid`` and ``known`` are read).
            game_state: the game (``discard_pile``, ``deck`` and ``round``
                are read).
            trajectory: optional list; when given, a record with
                ``state_key``, ``action_key`` and ``action`` is appended.

        Returns:
            The chosen action dict, or None when no legal move exists.
        """
        positions = [i for i, known in enumerate(player.known) if not known]
        if not positions:
            return None

        actions = []
        if game_state.discard_pile:
            for pos in positions:
                actions.append({'type': 'take_discard', 'position': pos})
        if game_state.deck:
            for pos in positions:
                actions.append({'type': 'draw_deck', 'position': pos, 'keep': True})

        if not actions:
            return None

        # Epsilon-greedy policy: explore only while training.
        if self.training_mode and random.random() < self.epsilon:
            action = random.choice(actions)
        else:
            state_key = self.get_state_key(player, game_state)
            best_action = None
            best_value = float('-inf')

            # Strict '>' keeps the earliest action on ties.
            for candidate in actions:
                q_value = self._q_value(state_key, self.get_action_key(candidate))
                if q_value > best_value:
                    best_value = q_value
                    best_action = candidate

            action = best_action or random.choice(actions)

        # Record action in trajectory for training
        if trajectory is not None:
            trajectory.append({
                'state_key': self.get_state_key(player, game_state),
                'action_key': self.get_action_key(action),
                'action': action
            })

        return action

    def update(self, state_key, action_key, reward, next_state_key, next_actions):
        """Apply one temporal-difference update to ``Q(state, action)``.

        ``Q <- Q + lr * (reward + gamma * max_a' Q(next_state, a') - Q)``,
        where the max ranges over ``next_actions`` and is 0 when that list is
        empty (terminal step).
        """
        max_next_q = 0
        if next_actions:
            # Read-only lookup: bootstrapping must not create table entries.
            max_next_q = max(self._q_value(next_state_key, self.get_action_key(a))
                             for a in next_actions)

        current_q = self._q_value(state_key, action_key)
        new_q = current_q + self.learning_rate * (reward + self.discount_factor * max_next_q - current_q)
        self.q_table[state_key][action_key] = new_q

    def train_on_trajectory(self, trajectory, final_reward, final_score):
        """Update Q-values along one complete game trajectory.

        Args:
            trajectory: list of records as appended by ``choose_action``.
            final_reward: reward added to the last step of the trajectory.
            final_score: accepted for interface compatibility; not used here.
        """
        if not trajectory:
            return

        last_index = len(trajectory) - 1
        for i, step in enumerate(trajectory):
            state_key = step['state_key']
            action_key = step['action_key']

            # Small constant reward for every action; the final reward is
            # added on the last step only.
            immediate_reward = 0.1

            if i < last_index:
                next_step = trajectory[i + 1]
                next_state_key = next_step['state_key']
                # Only the action actually taken next is recorded, so the
                # bootstrap uses that single action.
                next_actions = [next_step['action']]
            else:
                next_state_key = state_key  # Terminal state
                next_actions = []
                immediate_reward += final_reward

            self.update(state_key, action_key, immediate_reward, next_state_key, next_actions)

    def set_training_mode(self, training):
        """Enable or disable training mode (exploration is off when disabled)."""
        self.training_mode = training

    def get_q_table_size(self):
        """Return ``(number_of_states, number_of_state_action_entries)``."""
        total_entries = sum(len(actions) for actions in self.q_table.values())
        return len(self.q_table), total_entries

    def decay_epsilon(self, factor=0.995):
        """Multiply epsilon by ``factor``, never letting it drop below 0.01."""
        self.epsilon = max(0.01, self.epsilon * factor)

class GolfGame:
    """One game of 2x2 Golf played by a table of pluggable agents.

    Each player is dealt four cards from a shuffled deck and one further card
    starts the discard pile. Players then act in seat order until
    ``max_rounds`` full rounds have been completed, after which every grid is
    revealed and scored; the lowest score wins.
    """

    RANKS = ['A', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'J', 'Q', 'K']
    SUITS = ['♠', '♥', '♦', '♣']

    def __init__(self, num_players=4, agent_types=None, q_agents=None):
        """Create players and agents, build the deck and deal.

        Args:
            num_players: Number of seats at the table.
            agent_types: One agent-type string per seat ("random",
                "heuristic" or "qlearning"); defaults to all "random".
            q_agents: Optional list aligned with the seats holding persistent
                Q-learning agents to reuse instead of creating fresh ones.
        """
        self.num_players = num_players
        if agent_types is None:
            agent_types = ["random"] * num_players
        self.players = [Player(f'P{i+1}', agent_types[i]) for i in range(num_players)]
        self.agents = self.create_agents(agent_types, q_agents)
        self.deck = self.create_deck()
        self.discard_pile = []
        self.turn = 0  # Player index
        self.round = 1
        self.max_rounds = 4
        self.deal()

    def create_agents(self, agent_types, q_agents=None):
        """Build one agent per entry of ``agent_types``.

        Unknown type strings fall back to a random agent. For "qlearning"
        seats the matching entry of ``q_agents`` is reused when present;
        otherwise a fresh ``QLearningAgent`` is created.
        """
        agents = []
        for i, agent_type in enumerate(agent_types):
            if agent_type == "random":
                agents.append(RandomAgent())
            elif agent_type == "heuristic":
                agents.append(HeuristicAgent())
            elif agent_type == "qlearning":
                # Use persistent Q-learning agent if provided
                persistent = q_agents[i] if q_agents and i < len(q_agents) else None
                # A None placeholder must not be seated as an agent: it would
                # only fail later when the game asks it to choose an action.
                agents.append(persistent if persistent is not None else QLearningAgent())
            else:
                agents.append(RandomAgent())  # Default to random
        return agents

    def create_deck(self):
        """Return a fresh, unshuffled deck: one card per rank/suit combination."""
        return [Card(rank, suit) for rank, suit in itertools.product(self.RANKS, self.SUITS)]

    def deal(self):
        """Shuffle the deck, give every player four cards and start the discard pile."""
        random.shuffle(self.deck)
        for player in self.players:
            for i in range(4):
                player.grid[i] = self.deck.pop()
        # Start discard pile
        self.discard_pile.append(self.deck.pop())

    def play_turn(self, player, trajectory=None):
        """Ask the current seat's agent for an action and apply it to ``player``.

        The action is a dict with a 'type' of 'take_discard' or 'draw_deck',
        a grid 'position', and for deck draws an optional 'keep' flag
        (default True). A falsy action, or one whose source pile is empty,
        leaves the game state untouched.
        """
        agent = self.agents[self.turn]
        action = agent.choose_action(player, self, trajectory)

        if not action:
            return  # No moves left

        if action['type'] == 'take_discard' and self.discard_pile:
            # Take from discard pile, swap with pos
            new_card = self.discard_pile.pop()
            old_card = player.grid[action['position']]
            player.grid[action['position']] = new_card
            player.known[action['position']] = True
            player.add_to_discard_memory(old_card)
            self.discard_pile.append(old_card)
        elif action['type'] == 'draw_deck' and self.deck:
            # Draw from deck
            new_card = self.deck.pop()
            if action.get('keep', True):
                # Keep the drawn card: the replaced grid card is discarded
                old_card = player.grid[action['position']]
                player.grid[action['position']] = new_card
                player.known[action['position']] = True
                player.add_to_discard_memory(old_card)
                self.discard_pile.append(old_card)
            else:
                # Reject the drawn card: it goes straight to the discard pile
                player.add_to_discard_memory(new_card)
                self.discard_pile.append(new_card)

    def all_players_done(self):
        """Return True when every position of every player's grid is known."""
        return all(all(p.known) for p in self.players)

    def next_player(self):
        """Advance to the next seat; wrapping back to seat 0 starts a new round."""
        self.turn = (self.turn + 1) % self.num_players
        if self.turn == 0:
            self.round += 1

    def play_game(self, verbose=True, trajectories=None):
        """Play all rounds, reveal every grid and return the final scores.

        Args:
            verbose: Print each turn, the final grids, scores and the winner.
            trajectories: Optional per-seat list passed through to the agents
                (``None`` entries for seats that do not record anything).

        Returns:
            List of scores in seat order. ``self.round`` is left at
            ``max_rounds + 1`` once the loop has finished.
        """
        if trajectories is None:
            trajectories = [None] * self.num_players

        # Each player must take exactly 4 turns, so game should last exactly 4 rounds
        while self.round <= self.max_rounds:
            player = self.players[self.turn]
            if verbose:
                print(f"\n-- {player.name}'s turn (Round {self.round}) --")
                print(f"Agent: {player.agent_type}")
                print(player)
                print(f"Top of discard: {self.discard_pile[-1]}")

            # Check if player has any moves available
            available_positions = [i for i, known in enumerate(player.known) if not known]
            if available_positions:
                self.play_turn(player, trajectories[self.turn])
            else:
                # Player has no moves (all cards face-up), but still counts as a turn
                if verbose:
                    print(f"{player.name} has no moves available (all cards face-up)")

            self.next_player()

        # Reveal all cards
        for p in self.players:
            p.reveal_all()
        if verbose:
            print("\n=== FINAL GRIDS ===")
            for p in self.players:
                print(f"{p.name} ({p.agent_type}):\n{p}\n")
        scores = [self.calculate_score(p.grid) for p in self.players]
        if verbose:
            for i, s in enumerate(scores):
                print(f"{self.players[i].name} ({self.players[i].agent_type}) score: {s}")
            # Ties go to the earliest seat because index() returns the first match
            winner_idx = scores.index(min(scores))
            print(f"Winner: {self.players[winner_idx].name} ({self.players[winner_idx].agent_type})")
        return scores

    def calculate_score(self, grid):
        """Score a four-card grid, cancelling pairs of equal rank.

        Every card contributes ``card.score()`` (empty slots count 0). Position
        pairs are scanned in ``itertools.combinations`` order; the first
        disjoint pairs with matching ranks have both of their scores removed
        from the total, so three of a kind cancels only two cards.
        """
        scores = [card.score() if card else 0 for card in grid]
        total_score = sum(scores)
        ranks = [card.rank if card else None for card in grid]
        used = set()  # positions already consumed by a cancelling pair
        for pos1, pos2 in itertools.combinations(range(4), 2):
            if (ranks[pos1] and ranks[pos2] and ranks[pos1] == ranks[pos2]
                    and pos1 not in used and pos2 not in used):
                used.add(pos1)
                used.add(pos2)
                total_score -= (scores[pos1] + scores[pos2])
        return total_score

def run_simulations_with_training(num_games=1000, agent_types=None, verbose=False):
    """
    Run multiple simulations with Q-learning training

    Q-learning agents persist across games. After every game each of them is
    trained on its recorded trajectory with an outcome reward (10.0 for the
    winner, otherwise 2.0 / 0.0 / -2.0 / -5.0 by final-score bracket), and
    its epsilon is decayed on every 100th game.

    Args:
        num_games: Number of games to simulate
        agent_types: List of agent types for each player
        verbose: Whether to print detailed output for each game

    Returns:
        Dictionary with simulation results and statistics. Score lists are
        keyed by agent type, so a type that occupies several seats
        contributes one entry per seat per game.
    """
    # NOTE(review): relies on a module-level `import numpy as np` that is not
    # visible in this part of the file - confirm it exists.
    if agent_types is None:
        agent_types = ["random", "heuristic", "qlearning", "random"]

    num_players = len(agent_types)

    # Several seats may share one agent type (the default has two "random"
    # seats). Per-type statistics are computed once per distinct type, and the
    # per-type score lists grow by `seats` entries per game.
    unique_types = list(dict.fromkeys(agent_types))
    seats_per_type = {agent_type: agent_types.count(agent_type) for agent_type in unique_types}

    # Create persistent Q-learning agents (None placeholder for other seats)
    q_agents = [
        QLearningAgent(epsilon=0.2) if agent_type == "qlearning" else None  # Higher epsilon for more exploration
        for agent_type in agent_types
    ]

    # Statistics tracking
    stats = {
        'total_games': num_games,
        'agent_types': agent_types,
        'wins_by_agent': defaultdict(int),
        'scores_by_agent': defaultdict(list),
        'average_scores': {},
        'win_rates': {},
        'score_distributions': defaultdict(list),
        'game_durations': [],
        'q_learning_progress': [],
        'learning_curves': defaultdict(list),  # Track scores over time
        'score_by_interval': defaultdict(list)  # Track average scores by intervals
    }

    print(f"Running {num_games} simulations with agents: {agent_types}")
    print("Q-learning agents will learn from experience!")

    # Track scores for learning curves
    interval_size = max(1, num_games // 20)  # 20 intervals for tracking
    interval_start = 0  # index of the first game in the interval being filled

    for game_num in range(num_games):
        if verbose and game_num % 100 == 0:
            print(f"Game {game_num + 1}/{num_games}")

        # Create trajectories for Q-learning agents
        trajectories = [[] if agent_type == "qlearning" else None for agent_type in agent_types]

        # Create and play game
        game = GolfGame(num_players=num_players, agent_types=agent_types, q_agents=q_agents)
        scores = game.play_game(verbose=False, trajectories=trajectories)

        # Train Q-learning agents
        winner_idx = scores.index(min(scores))
        for i, agent_type in enumerate(agent_types):
            if agent_type == "qlearning" and trajectories[i]:
                # Calculate reward: stronger signals for learning
                if i == winner_idx:
                    reward = 10.0  # Big reward for winning
                elif scores[i] <= 5:
                    reward = 2.0  # Good score
                elif scores[i] <= 10:
                    reward = 0.0  # Average score
                elif scores[i] <= 15:
                    reward = -2.0  # Bad score
                else:
                    reward = -5.0  # Very bad score

                q_agents[i].train_on_trajectory(trajectories[i], reward, scores[i])

                # Decay epsilon for better learning
                if game_num % 100 == 0:  # Decay every 100 games
                    q_agents[i].decay_epsilon()

        # Record results
        winner_agent = agent_types[winner_idx]
        stats['wins_by_agent'][winner_agent] += 1

        # Record scores for each agent
        for i, score in enumerate(scores):
            agent_type = agent_types[i]
            stats['scores_by_agent'][agent_type].append(score)
            stats['score_distributions'][agent_type].append(score)

            # Track learning curves (every game)
            stats['learning_curves'][agent_type].append(score)

        # Record game duration (number of rounds). play_game only stops once
        # game.round has moved past max_rounds, so the counter is one ahead of
        # the rounds actually played.
        stats['game_durations'].append(game.round - 1)

        # Track Q-learning progress and scores by intervals
        if game_num % 100 == 0 or game_num == num_games - 1:
            q_progress = {}
            for i, agent_type in enumerate(agent_types):
                if agent_type == "qlearning":
                    states, entries = q_agents[i].get_q_table_size()
                    q_progress[f"qlearning_{i}"] = {"states": states, "entries": entries}
            stats['q_learning_progress'].append((game_num, q_progress))

        # Track average scores by intervals
        interval_end = game_num + 1
        if interval_end % interval_size == 0 or interval_end == num_games:
            for agent_type in unique_types:
                # Convert game indices into list indices: each game appended
                # `seats` scores for this agent type.
                seats = seats_per_type[agent_type]
                interval_scores = stats['scores_by_agent'][agent_type][
                    interval_start * seats:interval_end * seats
                ]
                intervals = stats['score_by_interval'][agent_type]
                intervals.append({
                    'interval': len(intervals) + 1,
                    'games': f"{interval_start+1}-{interval_end}",
                    'avg_score': np.mean(interval_scores),
                    'min_score': min(interval_scores),
                    'max_score': max(interval_scores)
                })
            # The next interval begins right after this one, so a shorter
            # final interval never overlaps the previous one.
            interval_start = interval_end

    # Calculate final statistics
    for agent_type in unique_types:
        if agent_type in stats['scores_by_agent']:
            scores = stats['scores_by_agent'][agent_type]
            stats['average_scores'][agent_type] = np.mean(scores)
            stats['win_rates'][agent_type] = stats['wins_by_agent'][agent_type] / num_games

    return stats

def run_simulations(num_games=1000, agent_types=None, verbose=False):
    """
    Run multiple simulations and collect statistics (without Q-learning training)

    No persistent agents are passed to the games, so each game builds its
    own agents from ``agent_types``.

    Args:
        num_games: Number of games to simulate
        agent_types: List of agent types for each player
        verbose: Whether to print detailed output for each game

    Returns:
        Dictionary with simulation results and statistics. Score lists are
        keyed by agent type, so a type that occupies several seats
        contributes one entry per seat per game.
    """
    # NOTE(review): relies on a module-level `import numpy as np` that is not
    # visible in this part of the file - confirm it exists.
    if agent_types is None:
        agent_types = ["random", "heuristic", "qlearning", "random"]

    num_players = len(agent_types)

    # Statistics tracking
    stats = {
        'total_games': num_games,
        'agent_types': agent_types,
        'wins_by_agent': defaultdict(int),
        'scores_by_agent': defaultdict(list),
        'average_scores': {},
        'win_rates': {},
        'score_distributions': defaultdict(list),
        'game_durations': []
    }

    print(f"Running {num_games} simulations with agents: {agent_types}")

    for game_num in range(num_games):
        if verbose and game_num % 100 == 0:
            print(f"Game {game_num + 1}/{num_games}")

        # Create and play game
        game = GolfGame(num_players=num_players, agent_types=agent_types)
        scores = game.play_game(verbose=False)

        # Record results (ties go to the earliest seat)
        winner_idx = scores.index(min(scores))
        stats['wins_by_agent'][agent_types[winner_idx]] += 1

        # Record scores for each agent
        for agent_type, score in zip(agent_types, scores):
            stats['scores_by_agent'][agent_type].append(score)
            stats['score_distributions'][agent_type].append(score)

        # Record game duration (number of rounds). play_game only stops once
        # game.round has moved past max_rounds, so the counter is one ahead of
        # the rounds actually played.
        stats['game_durations'].append(game.round - 1)

    # Calculate final statistics once per distinct agent type
    for agent_type in dict.fromkeys(agent_types):
        if agent_type in stats['scores_by_agent']:
            scores = stats['scores_by_agent'][agent_type]
            stats['average_scores'][agent_type] = np.mean(scores)
            stats['win_rates'][agent_type] = stats['wins_by_agent'][agent_type] / num_games

    return stats

def print_simulation_results(stats):
    """Print formatted simulation results

    Args:
        stats: Statistics dictionary as returned by ``run_simulations`` or
            ``run_simulations_with_training``. The Q-learning progress,
            interval and learning-trend sections are printed only when the
            corresponding keys are present.
    """
    # NOTE(review): relies on a module-level `import numpy as np` that is not
    # visible in this part of the file - confirm it exists.

    # All statistics are keyed by agent type. A type that occupies several
    # seats must be reported once, not once per seat.
    agent_types = list(dict.fromkeys(stats['agent_types']))

    print("\n" + "="*60)
    print("SIMULATION RESULTS")
    print("="*60)
    print(f"Total games: {stats['total_games']}")
    print(f"Agents: {stats['agent_types']}")

    print("\nWIN RATES:")
    for agent_type in agent_types:
        win_rate = stats['win_rates'].get(agent_type, 0)
        wins = stats['wins_by_agent'].get(agent_type, 0)
        print(f"  {agent_type}: {win_rate:.2%} ({wins} wins)")

    print("\nAVERAGE SCORES:")
    for agent_type in agent_types:
        avg_score = stats['average_scores'].get(agent_type, 0)
        scores = stats['scores_by_agent'].get(agent_type, [])
        if scores:
            min_score = min(scores)
            max_score = max(scores)
            print(f"  {agent_type}: {avg_score:.2f} (range: {min_score}-{max_score})")

    print(f"\nAverage game duration: {np.mean(stats['game_durations']):.1f} rounds")

    # Show Q-learning progress if available
    if 'q_learning_progress' in stats and stats['q_learning_progress']:
        print("\nQ-LEARNING PROGRESS:")
        for game_num, progress in stats['q_learning_progress']:
            print(f"  Game {game_num}: {progress}")

    # Show learning curves and score intervals
    if 'score_by_interval' in stats:
        print("\nLEARNING CURVES (Score by Intervals):")
        for agent_type in agent_types:
            if agent_type in stats['score_by_interval'] and stats['score_by_interval'][agent_type]:
                print(f"\n  {agent_type.upper()} LEARNING PROGRESS:")
                intervals = stats['score_by_interval'][agent_type]

                # Show first few, middle, and last intervals
                if len(intervals) <= 6:
                    to_show = intervals
                else:
                    middle = len(intervals) // 2
                    to_show = intervals[:3] + intervals[middle-1:middle+1] + intervals[-3:]

                for interval in to_show:
                    print(f"    Interval {interval['interval']} (Games {interval['games']}): "
                          f"Avg={interval['avg_score']:.2f}, Range={interval['min_score']}-{interval['max_score']}")

                # Show overall improvement (positive means the average score dropped)
                if len(intervals) >= 2:
                    first_avg = intervals[0]['avg_score']
                    last_avg = intervals[-1]['avg_score']
                    improvement = first_avg - last_avg
                    print(f"    Overall improvement: {improvement:+.2f} points "
                          f"({first_avg:.2f} → {last_avg:.2f})")

    # Show some interesting statistics
    print("\nDETAILED ANALYSIS:")
    for agent_type in agent_types:
        scores = stats['scores_by_agent'].get(agent_type, [])
        if scores:
            perfect_games = sum(1 for s in scores if s == 0)
            print(f"  {agent_type}: {perfect_games} perfect games (score = 0)")

            # Score distribution
            score_counts = defaultdict(int)
            for score in scores:
                score_counts[score] += 1
            most_common_score = max(score_counts.items(), key=lambda x: x[1])
            print(f"    Most common score: {most_common_score[0]} (occurred {most_common_score[1]} times)")

            # For Q-learning agents, show learning trend
            if agent_type == "qlearning" and 'learning_curves' in stats:
                learning_curve = stats['learning_curves'][agent_type]
                if len(learning_curve) >= 100:
                    first_100_avg = np.mean(learning_curve[:100])
                    last_100_avg = np.mean(learning_curve[-100:])
                    trend = first_100_avg - last_100_avg
                    print(f"    Learning trend: {trend:+.2f} points improvement "
                          f"({first_100_avg:.2f} → {last_100_avg:.2f})")

def plot_learning_curves(stats):
    """Plot learning curves for visualization (if matplotlib is available)

    Draws, per agent type, a faint scatter of individual scores and a
    centred moving average, then saves the figure as
    'golf_learning_curves.png' and shows it. If matplotlib cannot be
    imported, a hint is printed instead.

    Args:
        stats: Statistics dictionary; 'agent_types' is required and
            'learning_curves' is used when present (stats produced without
            training have no such key and yield an empty plot).
    """
    # NOTE(review): relies on a module-level `import numpy as np` that is not
    # visible in this part of the file - confirm it exists.
    try:
        import matplotlib.pyplot as plt

        learning_curves = stats.get('learning_curves', {})
        seat_types = list(stats['agent_types'])

        plt.figure(figsize=(12, 8))

        # Curves are keyed by agent type: draw each type once, even when it
        # occupies several seats, to avoid duplicate series and legend rows.
        for agent_type in dict.fromkeys(seat_types):
            if agent_type in learning_curves:
                scores = learning_curves[agent_type]
                # Each game contributes one score per seat of this type, so
                # entry k belongs to game k // seats (1-based on the axis).
                seats = seat_types.count(agent_type)
                games = [entry // seats + 1 for entry in range(len(scores))]

                # Plot individual scores with low alpha
                plt.scatter(games, scores, alpha=0.1, s=1, label=f'{agent_type} (individual)')

                # Plot moving average
                window_size = max(1, len(scores) // 50)  # 50 points for moving average
                if len(scores) >= window_size:
                    half_window = window_size // 2
                    moving_avg = [
                        np.mean(scores[max(0, i - half_window):min(len(scores), i + half_window + 1)])
                        for i in range(len(scores))
                    ]
                    plt.plot(games, moving_avg, linewidth=2, label=f'{agent_type} (moving avg)')

        plt.xlabel('Game Number')
        plt.ylabel('Score')
        plt.title('Learning Curves - Score vs Game Number')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.ylim(bottom=0)  # Scores can't be negative

        # Save plot
        plt.savefig('golf_learning_curves.png', dpi=300, bbox_inches='tight')
        print("\nLearning curves plot saved as 'golf_learning_curves.png'")
        plt.show()

    except ImportError:
        print("\nMatplotlib not available. Install with 'pip install matplotlib' to see learning curves plot.")

def main():
    """Run the demo: one verbose game, a training run, a plot, then a comparison."""
    print("=== GOLF GAME SIMULATION SUITE WITH Q-LEARNING ===")

    table = ["heuristic", "random", "qlearning", "random"]
    duel = ["qlearning", "random"]
    duel_games = 20000

    # Example 1: Single game with different agents
    print("\n1. Single game example:")
    GolfGame(num_players=4, agent_types=table).play_game(verbose=True)

    # Example 2: Run simulations with Q-learning training
    print("\n2. Running simulations with Q-learning training...")
    training_stats = run_simulations_with_training(num_games=1000, agent_types=table, verbose=True)
    print_simulation_results(training_stats)

    # Example 3: Plot learning curves
    print("\n3. Plotting learning curves...")
    plot_learning_curves(training_stats)

    # Example 4: Compare trained vs untrained Q-learning
    print("\n4. Comparing trained vs untrained Q-learning:")

    # Untrained Q-learning (plain simulation, no training step)
    print("\nUntrained Q-learning vs Random:")
    print_simulation_results(
        run_simulations(num_games=duel_games, agent_types=duel, verbose=False)
    )

    # Trained Q-learning (agent learns while the games are played)
    print("\nTrained Q-learning vs Random:")
    print_simulation_results(
        run_simulations_with_training(num_games=duel_games, agent_types=duel, verbose=False)
    )

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()