In [1]:
import gymnasium as gym
from gymnasium import spaces, vector
import numpy as np
from scipy.stats import binom, nbinom, beta, poisson, gamma, norm, geom
import random
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial
from itertools import combinations



In [2]:
# !pip install pokerlib
# from pokerlib.enums import Rank, Suit, Hand
# from pokerlib import HandParser
# import random

# These are attempts for 2

# Good Env

In [105]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
import random

suit_to_int = {'Hearts': 0, 'Diamonds': 1, 'Clubs': 2, 'Spades': 3}
val_to_int = {'2': 0, '3': 1, '4': 2, '5': 3, '6': 4, '7': 5, '8': 6, '9': 7, '10': 8, 'J': 9, 'Q': 10, 'K': 11, 'A': 12}
def card_to_int(card):
    """converts each cart to a unique integer

    Args:
        card (_type_): _description_

    Returns:
        _type_: _description_
    """
    return card[1] * 13 + card[0]


class CustomBinaryActionSpace(spaces.Discrete):
    """
    Custom binary action space with 8 elements.
    """

    def __init__(self):
        super().__init__(2 ** 8)  # 2^8 = 256 possible combinations

    def sample(self):
        """
        Sample a binary action.
        """
        return np.random.randint(2, size=8)  # Generate random binary array of length 8



class CardGameEnv(gym.Env):
    def __init__(self, seed = None):
        # self.action_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)  # Binary array to represent discarding individual cards
        # action space is a binary array of length 8
        # using custom action space due to issues with MultiBinary
        self.action_space = CustomBinaryActionSpace()
        self.observation_space = spaces.Box(low=0, high=51, shape = [8] ,dtype=np.int16)
        self.deck =  [(value, suit) for value in range(0, 13) for suit in range(4)]
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, *, seed=None, options=None):
        self.hand = self.draw_hand(8)  # Number, Suit
        # print(self.hand)
        self.discards = 3
        self.plays = 4
        self.total_reward = 0
        return np.array([card_to_int(card) for card in self.hand]), {}

    def draw_hand(self, num_cards_to_draw):
        """draws num_cards_to_draw number of cards from remaining deck and removes from avail deck

    Args:
        num_cards_to_draw (_type_): number of cards to draw

    Returns:
        _type_: list of newly drawn cards
    """
        cards = []
        for i in range(num_cards_to_draw):
            card = random.choice(self.deck)
            #  Remove the selected card from available deck
            self.deck.remove(card)
            cards.append(card)
        # print("Cards remaining:", len(self.deck))
        return cards

    def step(self, action):
        # print("Startinghand", self.hand)
        
        # print("action", action)
        discards = [idx for idx, val in enumerate(action) if val == 1]
        num_discards = len(discards)
        # Ensure that number of discards is at most 5 (prevent invalid action space)
        while len(discards) > 5:
            discards.pop()
            num_discards = len(discards)

        # print("Discards", discards, len(discards))

        if num_discards > 0:  # choosing to discard something
            new_cards = self.draw_hand(num_discards)
            # Calculate remaining indices ensuring they are within the range of self.hand
            remaining_indices = [i for i in range(8) if i not in discards]
            # print("Remaining Indices:", remaining_indices)
            # print("Hand before discarding:", self.hand)
            self.hand = [self.hand[idx] for idx in remaining_indices] + new_cards
            # print("Hand after discarding:", self.hand)

            self.discards -= 1
        else: # choosing to play, so collect reward
            reward = self.calculate_hand_value(self.hand)
            self.total_reward += reward
            self.plays -= 1
            
        # check for terminal state
        if self.discards == 0 or self.plays == 0:
            done = True
        else:
            done = False
        # If game is over, need to add up total_reward if last action was discard
        if self.discards == 0:
            reward = self.calculate_hand_value(self.hand)
            self.total_reward += reward
        
        # print(self.hand)
        return np.array([card_to_int(card) for card in self.hand]), self.total_reward, done, False, {}


    def calculate_hand_value(self, hand):
        # Generate all combinations of 5 cards
        combinations_5 = combinations(hand, 5)
        best_reward = 0
        for combo in combinations_5:
            card_values = sorted([card[0] for card in combo])
            card_suits = [card[1] for card in combo]
            counts = np.bincount(card_values)
            num_unique_cards = np.count_nonzero(counts)
            is_flush = len(set(card_suits)) == 1

            if num_unique_cards == 2:
                if 4 in counts:
                    reward = 420  # Four of a Kind
                else:
                    reward = 160  # Full House
            elif num_unique_cards == 3:
                if 3 in counts:
                    reward = 90  # Three of a Kind
                else:
                    reward = 40  # Two Pair
            elif num_unique_cards == 4:
                reward = 20  # Pair
            elif num_unique_cards == 5:
                is_straight = all(card_values[i] == card_values[i-1] + 1 for i in range(1, len(card_values)))
                if is_straight:
                    if is_flush:
                        reward = 800  # Straight Flush
                    else:
                        reward = 120  # Straight
                elif is_flush:
                    reward = 140  # Flush
                else:
                    reward = 5  # High Card
            
            if reward > best_reward:
                best_reward = reward
        return best_reward

# Test the environment
env = CardGameEnv()
obs, _ = env.reset()
total_reward = 0
done = False
while not done:
    assert obs.shape 
    action = env.action_space.sample()  # Random action for testing
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
print("Reward:", total_reward)

# env.calculate_hand_value([(9, 2), (5, 2), (7, 1), (0, 2), (6, 2), (1, 0), (11, 0), (3, 0)])


Reward: 40


In [106]:
if ray.is_initialized():
  ray.shutdown()

runtime_env = {"py_modules": ["../.."]}
ray.init(runtime_env=runtime_env)

2024-05-11 12:44:16,673	INFO worker.py:1642 -- Started a local Ray instance.
2024-05-11 12:44:16,759	INFO packaging.py:518 -- Creating a file package for local directory '../..'.
2024-05-11 12:44:16,848	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_cd9b17a67b350ecd.zip' (9.78MiB) to Ray cluster...
2024-05-11 12:44:16,898	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_cd9b17a67b350ecd.zip'.


0,1
Python version:,3.10.12
Ray version:,2.7.0


In [107]:
config = (PPOConfig()
          .environment(CardGameEnv)
          .framework('torch')
          .training(gamma=1,)
          .rollouts(num_rollout_workers=3)
)
stop = {"timesteps_total": 10}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

res = tuner.fit()


0,1
Current time:,2024-05-11 12:44:28
Running for:,00:00:07.43
Memory:,32.5/125.5 GiB

Trial name,# failures,error file
PPO_CardGameEnv_dc77d_00000,1,/home/healthcare/ray_results/PPO_2024-05-11_12-44-20/PPO_CardGameEnv_dc77d_00000_0_2024-05-11_12-44-20/error.txt

Trial name,status,loc
PPO_CardGameEnv_dc77d_00000,ERROR,206.211.132.160:1164601


2024-05-11 12:44:28,253	ERROR tune_controller.py:1502 -- Trial task failed for trial PPO_CardGameEnv_dc77d_00000
Traceback (most recent call last):
  File "/home/healthcare/.local/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/healthcare/.local/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/healthcare/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/healthcare/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): [36mray::PPO.train()[39m (pid=1164601, ip=206.211.132.160, actor_id=d4c263160862f9027b82335c01000000, repr=PPO)
  File "/home/healthcare/.local/lib/python3.10/site-packages/ray/tune/trainable/traina