In [1]:
import gymnasium as gym
from gymnasium import spaces, vector
import numpy as np
from scipy.stats import binom, nbinom, beta, poisson, gamma, norm, geom
import random
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial
from itertools import combinations

import numpy as np
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
import random
from ray.air.integrations.wandb import WandbLoggerCallback, setup_wandb




# Question 1: 1 play and 1 discard

In [2]:
suit_to_int = {'Hearts': 0, 'Diamonds': 1, 'Clubs': 2, 'Spades': 3}
val_to_int = {'2': 0, '3': 1, '4': 2, '5': 3, '6': 4, '7': 5, '8': 6, '9': 7, '10': 8, 'J': 9, 'Q': 10, 'K': 11, 'A': 12}
def card_to_int(card):
    """converts each cart to a unique integer

    Args:
        card (_type_): _description_

    Returns:
        _type_: _description_
    """
    return card[1] * 13 + card[0]


class CardGameEnv(gym.Env):
    def __init__(self, seed = None):
        # self.action_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)  # Binary array to represent discarding individual cards
        # action space is one of 2**8 options, later turned into a list of 1s and 0s
        # 1s representing a card to discard
        # 0 representing a card to keep
        self.action_space = spaces.Tuple([spaces.Discrete(2**8), spaces.Discrete(2)])
        self.observation_space = spaces.Box(low=0, high=51, shape = [8] ,dtype=np.int16)
        self.deck =  [(value, suit) for value in range(0, 13) for suit in range(4)] 
        self.plays = 1
        self.discards = 1
        self.total_reward = 0
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, *, seed=None, options=None):
        self.hand = self.draw_hand(8)  # Number, Suit
        # print(self.hand)
        self.discards = 1
        self.plays = 1
        self.total_reward = 0
        self.deck =  [(value, suit) for value in range(0, 13) for suit in range(4)] 
        return np.array([card_to_int(card) for card in self.hand]), {}

    def draw_hand(self, num_cards_to_draw):
        """draws num_cards_to_draw number of cards from remaining deck and removes from avail deck

    Args:
        num_cards_to_draw (_type_): number of cards to draw

    Returns:
        _type_: list of newly drawn cards
    """
        cards = []
        for i in range(num_cards_to_draw):
            card = random.choice(self.deck)
            #  Remove the selected card from available deck
            self.deck.remove(card)
            cards.append(card)
        # print("Cards remaining:", len(self.deck))
        return cards
    def action_to_list(self, action):
        bin_num = bin(action)
        list_of_nums = [int(x) for x in str(bin_num)[2:]]
        while len(list_of_nums) < 8:
            list_of_nums.insert(0, 0)
        return list_of_nums
    def step(self, action):
        if action[1] == 0:
            self.discards -= 1
        else: # choosing to play, so collect reward
            reward = self.calculate_hand_value(self.hand)
            self.total_reward += reward
            self.plays -= 1

        # either way, perform discard stuff

        # Grab cards to discard
        action = self.action_to_list(action[0])
        discards = [idx for idx, val in enumerate(action) if val == 1]
        num_discards = len(discards)
        # Ensure that number of discards is at most 5 (prevent invalid action space)
        while len(discards) > 5:
            discards.pop()
            num_discards = len(discards)

        if num_discards > 0:  # choosing to discard something
            new_cards = self.draw_hand(num_discards)
            # Calculate remaining indices ensuring they are within the range of self.hand
            remaining_indices = [i for i in range(8) if i not in discards]
            self.hand = [self.hand[idx] for idx in remaining_indices] + new_cards

        # check for terminal state
        if self.discards == 0 or self.plays == 0:
            done = True
        else:
            done = False

        # If game is over, need to add up total_reward if last action was discard
        if self.discards == 0:
            reward = self.calculate_hand_value(self.hand)
            self.total_reward += reward
        
        return np.array([card_to_int(card) for card in self.hand]), self.total_reward, done, False, {}


    def calculate_hand_value(self, hand):
        # Generate all combinations of 5 cards
        combinations_5 = combinations(hand, 5)
        best_reward = 0
        for combo in combinations_5:
            card_values = sorted([card[0] for card in combo])
            card_suits = [card[1] for card in combo]
            counts = np.bincount(card_values)
            num_unique_cards = np.count_nonzero(counts)
            is_flush = len(set(card_suits)) == 1

            if num_unique_cards == 2:
                if 4 in counts:
                    reward = 420  # Four of a Kind
                else:
                    reward = 160  # Full House
            elif num_unique_cards == 3:
                if 3 in counts:
                    reward = 90  # Three of a Kind
                else:
                    reward = 40  # Two Pair
            elif num_unique_cards == 4:
                reward = 20  # Pair
            elif num_unique_cards == 5:
                is_straight = all(card_values[i] == card_values[i-1] + 1 for i in range(1, len(card_values)))
                if is_straight:
                    if is_flush:
                        reward = 800  # Straight Flush
                    else:
                        reward = 120  # Straight
                elif is_flush:
                    reward = 140  # Flush
                else:
                    reward = 5  # High Card
            
            if reward > best_reward:
                best_reward = reward
        return best_reward


In [3]:
# Test the environment
env = CardGameEnv()
obs, _ = env.reset()
total_reward = 0
done = False
while not done:
    assert obs.shape 
    action = env.action_space.sample()  # Random action for testing
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
print("Reward:", total_reward)

Reward: 5


In [4]:
if ray.is_initialized():
  ray.shutdown()

runtime_env = {"py_modules": ["../.."]}
ray.init(runtime_env=runtime_env)

2024-05-12 16:20:46,406	INFO worker.py:1642 -- Started a local Ray instance.
2024-05-12 16:20:46,502	INFO packaging.py:518 -- Creating a file package for local directory '../..'.
2024-05-12 16:20:46,606	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_89a7fa5104382740.zip' (13.72MiB) to Ray cluster...
2024-05-12 16:20:46,669	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_89a7fa5104382740.zip'.


0,1
Python version:,3.10.12
Ray version:,2.7.0


In [5]:
checkpoint_config = air.CheckpointConfig(checkpoint_frequency=5)

In [6]:
config = (PPOConfig()
          .environment(CardGameEnv)
          .framework('torch')
          .training(gamma=0.9,)
          .rollouts(num_rollout_workers=3)
)
stop = {"timesteps_total": 100000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop,
                             storage_path="~/Desktop/Tristan/rl-projects/results",
                             checkpoint_config=checkpoint_config,
                             callbacks=[WandbLoggerCallback(project="RL-Projects",upload_checkpoints=True,
                                                            save_checkpoints=True, group="rl-class-2024")],)
)

res = tuner.fit()

0,1
Current time:,2024-05-12 16:27:03
Running for:,00:06:15.56
Memory:,49.1/125.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CardGameEnv_43ed5_00000,TERMINATED,206.211.132.160:1677726,25,362.477,100000,62.5637,800,5,1


2024-05-12 16:20:48,180	INFO wandb.py:308 -- Already logged into W&B.
[2m[36m(_WandbLoggingActor pid=1677975)[0m wandb: Currently logged in as: tristntran. Use `wandb login --relogin` to force relogin
[2m[36m(_WandbLoggingActor pid=1677975)[0m wandb: Tracking run with wandb version 0.17.0
[2m[36m(_WandbLoggingActor pid=1677975)[0m wandb: Run data is saved locally in /home/healthcare/ray_results/PPO_2024-05-12_16-20-47/PPO_CardGameEnv_43ed5_00000_0_2024-05-12_16-20-48/wandb/run-20240512_162100-43ed5_00000
[2m[36m(_WandbLoggingActor pid=1677975)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(_WandbLoggingActor pid=1677975)[0m wandb: Syncing run PPO_CardGameEnv_43ed5_00000
[2m[36m(_WandbLoggingActor pid=1677975)[0m wandb: ⭐️ View project at https://wandb.ai/tristntran/RL-Projects
[2m[36m(_WandbLoggingActor pid=1677975)[0m wandb: 🚀 View run at https://wandb.ai/tristntran/RL-Projects/runs/43ed5_00000
[2m[36m(PPO pid=1677726)[0m Checkpoint successfully cre

# Question 2: 4 plays and 3 discards

In [7]:


suit_to_int = {'Hearts': 0, 'Diamonds': 1, 'Clubs': 2, 'Spades': 3}
val_to_int = {'2': 0, '3': 1, '4': 2, '5': 3, '6': 4, '7': 5, '8': 6, '9': 7, '10': 8, 'J': 9, 'Q': 10, 'K': 11, 'A': 12}
def card_to_int(card):
    """converts each cart to a unique integer

    Args:
        card (_type_): _description_

    Returns:
        _type_: _description_
    """
    return card[1] * 13 + card[0]


class CardGameEnv(gym.Env):
    def __init__(self, seed = None):
        # self.action_space = spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)  # Binary array to represent discarding individual cards
        # action space is one of 2**8 options, later turned into a list of 1s and 0s
        # 1s representing a card to discard
        # 0 representing a card to keep
        self.action_space = spaces.Tuple([spaces.Discrete(2**8), spaces.Discrete(2)])
        self.observation_space = spaces.Box(low=0, high=51, shape = [8] ,dtype=np.int16)
        self.deck =  [(value, suit) for value in range(0, 13) for suit in range(4)] 
        self.plays = 4
        self.discards = 3
        self.total_reward = 0
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, *, seed=None, options=None):
        self.hand = self.draw_hand(8)  # Number, Suit
        # print(self.hand)
        self.discards = 3
        self.plays = 4
        self.total_reward = 0
        self.deck =  [(value, suit) for value in range(0, 13) for suit in range(4)] 
        return np.array([card_to_int(card) for card in self.hand]), {}

    def draw_hand(self, num_cards_to_draw):
        """draws num_cards_to_draw number of cards from remaining deck and removes from avail deck

    Args:
        num_cards_to_draw (_type_): number of cards to draw

    Returns:
        _type_: list of newly drawn cards
    """
        cards = []
        for i in range(num_cards_to_draw):
            card = random.choice(self.deck)
            #  Remove the selected card from available deck
            self.deck.remove(card)
            cards.append(card)
        # print("Cards remaining:", len(self.deck))
        return cards
    def action_to_list(self, action):
        bin_num = bin(action)
        list_of_nums = [int(x) for x in str(bin_num)[2:]]
        while len(list_of_nums) < 8:
            list_of_nums.insert(0, 0)
        return list_of_nums
    def step(self, action):
        if action[1] == 0:
            self.discards -= 1
        else: # choosing to play, so collect reward
            reward = self.calculate_hand_value(self.hand)
            self.total_reward += reward
            self.plays -= 1

        # either way, perform discard stuff

        # Grab cards to discard
        action = self.action_to_list(action[0])
        discards = [idx for idx, val in enumerate(action) if val == 1]
        num_discards = len(discards)
        # Ensure that number of discards is at most 5 (prevent invalid action space)
        while len(discards) > 5:
            discards.pop()
            num_discards = len(discards)

        if num_discards > 0:  # choosing to discard something
            new_cards = self.draw_hand(num_discards)
            # Calculate remaining indices ensuring they are within the range of self.hand
            remaining_indices = [i for i in range(8) if i not in discards]
            self.hand = [self.hand[idx] for idx in remaining_indices] + new_cards

        # check for terminal state
        if self.discards == 0 or self.plays == 0:
            done = True
        else:
            done = False

        # If game is over, need to add up total_reward if last action was discard
        if self.discards == 0:
            reward = self.calculate_hand_value(self.hand)
            self.total_reward += reward
        
        return np.array([card_to_int(card) for card in self.hand]), self.total_reward, done, False, {}


    def calculate_hand_value(self, hand):
        # Generate all combinations of 5 cards
        combinations_5 = combinations(hand, 5)
        best_reward = 0
        for combo in combinations_5:
            card_values = sorted([card[0] for card in combo])
            card_suits = [card[1] for card in combo]
            counts = np.bincount(card_values)
            num_unique_cards = np.count_nonzero(counts)
            is_flush = len(set(card_suits)) == 1

            if num_unique_cards == 2:
                if 4 in counts:
                    reward = 420  # Four of a Kind
                else:
                    reward = 160  # Full House
            elif num_unique_cards == 3:
                if 3 in counts:
                    reward = 90  # Three of a Kind
                else:
                    reward = 40  # Two Pair
            elif num_unique_cards == 4:
                reward = 20  # Pair
            elif num_unique_cards == 5:
                is_straight = all(card_values[i] == card_values[i-1] + 1 for i in range(1, len(card_values)))
                if is_straight:
                    if is_flush:
                        reward = 800  # Straight Flush
                    else:
                        reward = 120  # Straight
                elif is_flush:
                    reward = 140  # Flush
                else:
                    reward = 5  # High Card
            
            if reward > best_reward:
                best_reward = reward
        return best_reward


In [8]:
# Test the environment
env = CardGameEnv()
obs, _ = env.reset()
total_reward = 0
done = False
while not done:
    assert obs.shape 
    action = env.action_space.sample()  # Random action for testing
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
print("Reward:", total_reward)

Reward: 410


In [9]:
if ray.is_initialized():
  ray.shutdown()

runtime_env = {"py_modules": ["../.."]}
ray.init(runtime_env=runtime_env)

2024-05-12 16:27:14,673	INFO worker.py:1642 -- Started a local Ray instance.
2024-05-12 16:27:14,764	INFO packaging.py:518 -- Creating a file package for local directory '../..'.
2024-05-12 16:27:14,872	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_89a7fa5104382740.zip' (13.72MiB) to Ray cluster...
2024-05-12 16:27:14,934	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_89a7fa5104382740.zip'.


0,1
Python version:,3.10.12
Ray version:,2.7.0


In [10]:
checkpoint_config = air.CheckpointConfig(checkpoint_frequency=5)

In [11]:
config = (PPOConfig()
          .environment(CardGameEnv)
          .framework('torch')
          .training(gamma=0.9,)
          .rollouts(num_rollout_workers=3)
)
stop = {"timesteps_total": 100000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop,
                             storage_path="~/Desktop/Tristan/rl-projects/results",
                             checkpoint_config=checkpoint_config,
                             callbacks=[WandbLoggerCallback(project="RL-Projects",upload_checkpoints=True,
                                                            save_checkpoints=True, group="rl-class-2024")],)
)

res = tuner.fit()


0,1
Current time:,2024-05-12 16:33:02
Running for:,00:05:45.90
Memory:,49.2/125.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CardGameEnv_2b391_00000,TERMINATED,206.211.132.160:1682694,25,333.718,100000,660.359,3380,20,4.71378


2024-05-12 16:27:16,211	INFO wandb.py:308 -- Already logged into W&B.
[2m[36m(_WandbLoggingActor pid=1682943)[0m wandb: Currently logged in as: tristntran. Use `wandb login --relogin` to force relogin
[2m[36m(_WandbLoggingActor pid=1682943)[0m wandb: Tracking run with wandb version 0.17.0
[2m[36m(_WandbLoggingActor pid=1682943)[0m wandb: Run data is saved locally in /home/healthcare/ray_results/PPO_2024-05-12_16-27-16/PPO_CardGameEnv_2b391_00000_0_2024-05-12_16-27-16/wandb/run-20240512_162728-2b391_00000
[2m[36m(_WandbLoggingActor pid=1682943)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(_WandbLoggingActor pid=1682943)[0m wandb: Syncing run PPO_CardGameEnv_2b391_00000
[2m[36m(_WandbLoggingActor pid=1682943)[0m wandb: ⭐️ View project at https://wandb.ai/tristntran/RL-Projects
[2m[36m(_WandbLoggingActor pid=1682943)[0m wandb: 🚀 View run at https://wandb.ai/tristntran/RL-Projects/runs/2b391_00000
[2m[36m(PPO pid=1682694)[0m Checkpoint successfully cre