# Learning an optimal policy for the Yahtzee dice game with reinforcement learning

## 1. Environment definition

In [None]:
import gymnasium as gym
import numpy as np


class YahtzeeEnv(gym.Env):
    """Single-player Yacht/Yahtzee environment with a flat integer state.

    State layout (45 values, see ``get_state``):
        * 30 values: five dice, each one-hot encoded over the six faces
          (an all-zero row means the die has not been rolled yet),
        * 1 value: rolls remaining in the current turn (3 before the
          initial roll of a turn),
        * 1 value: number of categories scored so far (0-12),
        * 12 values: the scorecard,
        * 1 value: 1 once the upper section (first six categories) totals
          at least 63, else 0.

    Actions (44 discrete values):
        * 0: initial roll of all five dice,
        * 1-31: reroll the dice selected by the 5-bit binary mask of the
          action number (most significant bit = die 0),
        * 32-43: score the current dice in category ``action - 32``.

    Note: ``step`` returns the 4-tuple ``(state, reward, done, info)`` and
    an episode is restarted with ``_reset()``; callers in this file rely on
    that shape.
    """

    def __init__(self):
        super().__init__()
        # (dice: 5 * 6 one-hot bits, rerolls, turn, scores: 12, bonus flag)
        self.observation_space = gym.spaces.Box(low=0, high=50, shape=(45,),
                                                dtype=np.int32)
        # 0: initial roll, 1-31: reroll masks, 32-43: scoring categories
        self.action_space = gym.spaces.Discrete(44)
        # Create the generator before the first reset so every attribute
        # exists as soon as the game state is initialised.
        self.rng = np.random.default_rng()
        self._reset()

    def _reset(self):
        """Put the environment back into the state of a fresh game (in place)."""
        self.dice = np.zeros((5, 6), dtype=np.int32)  # one one-hot row per die
        self.rerolls = 3
        self.turn = 0
        self.scorecard = np.zeros(12, dtype=np.int32)
        self.bonus = False
        self.done = False
        self.bonusRewarded = False
        # Builtin ``bool`` instead of the ``np.bool`` alias, which does not
        # exist on every NumPy release.
        self.scored = np.zeros(12, dtype=bool)

    def _reroll_under_mask(self, mask: list):
        """Reroll every die whose mask entry is 1 and consume one roll.

        Args:
            mask: sequence of 0/1 flags, one per die.

        Raises:
            ValueError: if no roll remains in the current turn.
        """
        if self.rerolls == 0:
            raise ValueError("No reroll remains. ")
        self.rerolls -= 1
        for die, flag in enumerate(mask):
            if flag == 1:
                self.dice[die, :] = 0  # clear the previous face of this die
                face = self.rng.integers(low=0, high=5, endpoint=True, dtype=np.int32)
                self.dice[die, face] = 1

    def get_score_for_action(self, action) -> int:
        """Return the score the current dice would earn for a scoring action.

        Args:
            action: scoring action in the range 32-43; the category index
                is ``action - 32``.

        Returns:
            The score as a Python int. Categories 0-5 score the matching
            faces, 6 (Choice) the sum of all dice, 7 (Four-of-a-Kind) and
            8 (Full House) the sum of all dice when the pattern is present,
            9 (Little Straight) 15, 10 (Big Straight) 30, 11 (Yacht) 50.
            Any other category prints a warning and returns 0.
        """
        category = action - 32
        # counts[f] = number of dice currently showing face f + 1
        counts = [int(c) for c in self.dice.sum(axis=0)]
        # All six faces contribute to the total (sixes included).
        total = sum(count * face for face, count in enumerate(counts, start=1))

        if 0 <= category <= 5:  # Ones .. Sixes
            return counts[category] * (category + 1)
        if category == 6:  # Choice
            return total
        if category == 7:  # Four-of-a-Kind: any face appears at least 4 times
            return total if max(counts) >= 4 else 0
        if category == 8:  # Full House: three of one face and two of another
            return total if (3 in counts) and (2 in counts) else 0
        if category == 9:  # Little Straight: 1-2-3-4, 2-3-4-5 or 3-4-5-6
            # The fifth die is free, so it may duplicate a face of the run:
            # require "at least one" of each face rather than "exactly one".
            for start in range(3):
                if all(count >= 1 for count in counts[start:start + 4]):
                    return 15
            return 0
        if category == 10:  # Big Straight: 1-2-3-4-5 or 2-3-4-5-6
            if counts[:5] == [1, 1, 1, 1, 1] or counts[1:6] == [1, 1, 1, 1, 1]:
                return 30
            return 0
        if category == 11:  # Yacht: all five dice show the same face
            return 50 if 5 in counts else 0

        print("Undealt case raised. scoring function must be modified.")
        return 0

    def _score_action(self, action, score=None):
        """Record a score on the scorecard and advance to the next turn (in place).

        Besides writing the score this resets the roll counter to 3,
        increments the turn counter, marks the game as done after the
        twelfth category and raises the bonus flag once the first six
        categories total at least 63.

        Args:
            action (int): scoring action in the range 32-43.
            score (int, optional): score to record; computed from the
                current dice when omitted.

        Raises:
            ValueError: if the action is not a scoring action or the
                category has already been filled.
        """
        if action < 32 or action > 43:
            raise ValueError(f"Invalid scoring action: {action}. Action must be between 32 and 43 for scoring.")

        index = action - 32
        if self.scored[index]:
            raise ValueError(f"Already filled in that category : {index}, value : {self.scorecard[index]}")
        val = score if score is not None else self.get_score_for_action(action)
        self.scorecard[index] = val
        self.scored[index] = True
        self.rerolls = 3
        self.turn += 1

        if self.turn == 12:  # every category has been filled
            self.done = True
        if np.sum(self.scorecard[0:6]) >= 63:  # upper-section bonus threshold
            self.bonus = True

    def _initiate_turn(self):
        """Roll all five dice, but only if the turn has not started yet."""
        if self.rerolls == 3:
            self._reroll_under_mask([1, 1, 1, 1, 1])

    def get_state(self) -> np.ndarray:
        """Return the current state as a flat integer array of length 45."""
        return np.concatenate([
            self.dice.flatten(),
            np.array([self.rerolls]),
            np.array([self.turn]),
            self.scorecard,
            np.array([self.bonus], dtype=int),  # boolean flag stored as 0/1
        ])

    def get_valid_action(self) -> list:
        """Return the list of actions that are legal in the current state.

        Before the initial roll of a turn only action 0 is legal. After it,
        every unfilled category (32-43) can be scored, and the reroll
        actions 1-31 are legal as long as at least one roll remains.
        """
        if self.rerolls == 3:
            return [0]  # the turn must start with the initial roll
        scoring = [index + 32 for index, filled in enumerate(self.scored) if not filled]
        if self.rerolls == 0:
            return scoring
        return list(range(1, 32)) + scoring

    def step(self, action):
        """Apply one action and return ``(next_state, reward, done, info)``.

        Rewards:
            * invalid action: -15, state unchanged,
            * initial roll (0): 0,
            * reroll (1-31): 0.08 times the sum of the scores currently
              obtainable in the still-open categories (a shaping signal,
              not a final score),
            * scoring (32-43): the recorded score, plus 35 the first time
              the bonus flag is set.

        Raises:
            ValueError: if a "valid" action falls outside 0-43 (should not
                happen given ``get_valid_action``).
        """
        valid = self.get_valid_action()

        if action not in valid:
            # Penalise the invalid choice and leave the game untouched.
            return self.get_state(), -15, self.done, {}

        if action == 0:
            self._initiate_turn()
            return self.get_state(), 0, self.done, {}

        if 1 <= action <= 31:
            self._reroll_under_mask(self.int_to_bitmask(action))
            reward = self.get_sum_possible_score() * 0.08
            return self.get_state(), reward, self.done, {}

        if 32 <= action <= 43:
            score = self.get_score_for_action(action)
            reward = score
            self._score_action(action, score)
            if self.bonus and (self.bonusRewarded is False):
                reward += 35
                self.bonusRewarded = True
            return self.get_state(), reward, self.done, {}

        raise ValueError(f"Invalid action: {action}. Action must be between 0 and 43.")

    def get_sum_possible_score(self) -> int:
        """Sum the scores the current dice would earn in every currently valid scoring action."""
        total = 0
        for candidate in self.get_valid_action():
            if candidate >= 32:
                total += self.get_score_for_action(candidate)
        return total

    @staticmethod
    def int_to_bitmask(num):
        """Convert a reroll action number into its 5-bit binary mask.

        Input: integer number (1-31)
        Output: list of five 0/1 integers, most significant bit first.

        Example:
            input: 21
            return: [1, 0, 1, 0, 1]

        Raises:
            ValueError: if ``num`` is outside 1-31.
        """
        if not (1 <= num <= 31):
            raise ValueError("Input number must be between 1 (inclusive) and 31 (inclusive).")

        # Zero-padded 5-digit binary string, one list entry per digit.
        return [int(bit) for bit in format(num, '05b')]


## 2. Model definition

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

class DQNet(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    The network has three hidden layers of 256, 256 and 128 units, each
    followed by a ReLU, and a linear output layer of ``action_dim`` units.
    """

    def __init__(self, state_dim=45, action_dim=44):
        super().__init__()

        # Consecutive pairs of these widths define the hidden Linear layers.
        widths = (state_dim, 256, 256, 128)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], action_dim))

        # Stored as ``net`` so parameter names keep the form ``net.<index>.*``.
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the Q-values computed by the layer stack for input ``x``."""
        q_values = self.net(x)
        return q_values

## 3. Agent definition (optimizer, loss)

In [None]:
class DQNAgent:
    """Double-DQN agent with epsilon-greedy exploration over valid actions.

    The agent owns a policy network, a target network initialised as a copy
    of it, an RMSprop optimizer for the policy network and a bounded replay
    memory of transitions
    ``(state, action, reward, next_state, done, valid_actions_next)``.
    """

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99,
                 epsilon_start=1.0, epsilon_end=0.010, epsilon_decay=0.995,
                 buffer_size=10000, batch_size=64, target_update=100, rng=None):
        # Problem size and discount factor.
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        # Exploration schedule (the decay itself is applied by the caller).
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        # Replay / update bookkeeping.
        self.batch_size = batch_size
        self.target_update = target_update
        self.steps_done = 0
        self.memory = deque(maxlen=buffer_size)

        # Run on the GPU when one is available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.policy_net = DQNet(state_dim, action_dim).to(self.device)
        self.target_net = DQNet(state_dim, action_dim).to(self.device)
        self.update_target()  # start the target net as a copy of the policy net
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr, alpha=0.99)
        self.rng = rng if rng is not None else np.random.default_rng()

    def select_action(self, state, valid_actions):
        """Pick an action epsilon-greedily, considering only ``valid_actions``.

        With probability ``epsilon`` a uniformly random valid action is
        returned; otherwise the valid action with the highest policy-net
        Q-value. Every call increments ``steps_done``.
        """
        self.steps_done += 1

        # Exploration branch: uniform draw among the valid actions.
        if self.rng.random() < self.epsilon:
            return self.rng.choice(valid_actions)

        # Exploitation branch: greedy over the valid actions only.
        with torch.no_grad():
            observation = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            q_row = self.policy_net(observation).squeeze(0)  # (action_dim,)

            # Everything that is not valid stays at -inf and cannot win the argmax.
            restricted = torch.full((self.action_dim,), float('-inf'), device=self.device)
            restricted[valid_actions] = q_row[valid_actions]
            action = int(torch.argmax(restricted))

        if action not in valid_actions:
            print(f"Invalid actions are selected. valid actions : {valid_actions}, action : {action}")
        return action

    def push_memory(self, transition):
        """Append one transition tuple to the replay memory."""
        self.memory.append(transition)

    def optimize_model(self):
        """Run one Double-DQN gradient step on a random minibatch.

        Does nothing until the memory holds at least ``batch_size``
        transitions. The next action is chosen by the policy net among the
        valid next actions and evaluated by the target net; transitions
        with no valid next action bootstrap from 0.
        """
        if len(self.memory) < self.batch_size:
            return

        # Draw distinct transitions and split them into per-field columns.
        picked = self.rng.choice(len(self.memory), self.batch_size, replace=False)
        states, actions, rewards, next_states, dones, next_valid = zip(
            *(self.memory[i] for i in picked)
        )

        def to_tensor(values, dtype):
            return torch.from_numpy(np.array(values, dtype=dtype))

        batch_state = to_tensor(states, np.float32).to(self.device)
        batch_next_state = to_tensor(next_states, np.float32).to(self.device)
        batch_action = to_tensor(actions, np.int64).unsqueeze(1).to(self.device)
        batch_reward = to_tensor(rewards, np.float32).unsqueeze(1).to(self.device)
        batch_done = to_tensor(dones, np.float32).unsqueeze(1).to(self.device)

        # Q(s, a) of the actions that were actually taken: [batch_size, 1]
        current_q = self.policy_net(batch_state).gather(1, batch_action)

        with torch.no_grad():
            online_next = self.policy_net(batch_next_state)   # used to select
            frozen_next = self.target_net(batch_next_state)   # used to evaluate

            bootstrap = torch.zeros((self.batch_size, 1), device=self.device)
            for row, candidates in enumerate(next_valid):
                if len(candidates) == 0:
                    continue  # no valid next action: bootstrap value stays 0
                winner = torch.argmax(online_next[row, candidates])
                bootstrap[row] = frozen_next[row, candidates[winner]]

        # Bellman target: r + (1 - done) * gamma * Q_target(s', a*)
        expected_q = batch_reward + (1 - batch_done) * self.gamma * bootstrap

        loss = nn.functional.mse_loss(current_q, expected_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())


## 4. Train process definition

In [None]:
import torch
import torch.nn as nn
import numpy as np
import os
import re
from DeepQNet import DQNet
from DeepQNet import DQNAgent
from YahtzeeEnv import YahtzeeEnv

def train_agent(num_episodes=500, print_interval=10, load_filepath=None, save_filepath=None, lr=1e-3, gamma=0.99,
                 epsilon_start=1.0, epsilon_end=0.010, epsilon_decay=0.995,
                 buffer_size=10000, batch_size=64, target_update=100, rng=None):
    """
    Train the DQN agent on YahtzeeEnv for a specified number of episodes.
    Optionally load an existing agent's parameters from 'load_filepath'
    and/or save the agent after training to 'save_filepath'.

    Args:
        num_episodes (int): Number of episodes (full games) to train.
        print_interval (int): Print progress every this many episodes
            (0 or None disables progress output).
        load_filepath (str or None): Path to a saved agent checkpoint.
                                     If provided, loads that agent first.
        save_filepath (str or None): Path to save the trained agent after training.
        lr (float): Optimizer learning rate.
        gamma (float): Discount factor.
        epsilon_start (float): Exploration rate at the start of this run.
        epsilon_end (float): Lower bound of the exploration rate.
        epsilon_decay (float): Multiplicative decay applied to epsilon.
        buffer_size (int): Maximum number of transitions kept in replay memory.
        batch_size (int): Minibatch size of each optimization step.
        target_update (int): Number of environment steps between target-network syncs.
        rng (numpy.random.Generator or None): Random generator used by the agent.

    Returns:
        DQNAgent: The trained (or further trained) DQN agent.
    """

    # Initialize environment.
    env = YahtzeeEnv()
    # The state dimension is determined from the environment.
    state_dim = env.get_state().shape[0]
    # YahtzeeEnv defines 44 discrete actions (0..43).
    action_dim = 44

    # 1) Either load an existing agent or create a new one.
    if load_filepath is not None:
        print(f"Loading agent from {load_filepath}...")
        agent = load_agent(load_filepath, state_dim, action_dim)
        # load_agent only receives the dimensions, so the loaded agent carries
        # default hyperparameters; re-apply the ones requested for this run so
        # a resumed run trains with the same settings as a fresh one.
        agent.epsilon = epsilon_start
        agent.gamma = gamma
        agent.epsilon_end = epsilon_end
        agent.epsilon_decay = epsilon_decay
        agent.batch_size = batch_size
        agent.target_update = target_update
        # Rebuild the replay memory with the requested capacity.
        agent.memory = type(agent.memory)(agent.memory, maxlen=buffer_size)
        for group in agent.optimizer.param_groups:
            group["lr"] = lr
        if rng is not None:
            agent.rng = rng
    else:
        agent = DQNAgent(state_dim, action_dim, lr=lr, gamma=gamma,
                         epsilon_start=epsilon_start, epsilon_end=epsilon_end, epsilon_decay=epsilon_decay,
                         buffer_size=buffer_size, batch_size=batch_size, target_update=target_update, rng=rng)

    total_steps = 0  # Counts total steps across episodes (for target_net updates).

    # Epsilon decays once every `decay_interval` episodes. For runs shorter
    # than 600 episodes the integer division yields 0, so clamp to 1 (decay
    # every episode) instead of dividing by zero and never decaying.
    decay_interval = max(1, num_episodes // 600)

    for episode in range(num_episodes):
        # Reset the environment for a new episode (a new Yahtzee game).
        env._reset()
        state = env.get_state()

        episode_reward = 0.0
        done = False

        while not done:
            # Agent picks an action (epsilon-greedy restricted to valid actions).
            valid_actions = env.get_valid_action()
            action = agent.select_action(state, valid_actions)

            # Environment processes the action.
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            # A terminal state has no valid follow-up actions.
            next_valid_actions = env.get_valid_action() if not done else []

            # Store the transition and run one optimization step.
            agent.push_memory((state, action, reward, next_state, done, next_valid_actions))
            agent.optimize_model()

            state = next_state
            total_steps += 1

            # Periodically update the target network with policy_net weights.
            if total_steps % agent.target_update == 0:
                agent.update_target()

        # Decay epsilon after the episode ends (not after every batch update),
        # never going below the configured floor.
        if agent.epsilon > agent.epsilon_end and episode % decay_interval == 0:
            agent.epsilon = max(agent.epsilon_end, agent.epsilon * agent.epsilon_decay)

        # Print training progress every 'print_interval' episodes.
        if print_interval and (episode + 1) % print_interval == 0:
            print(f"Episode {episode+1}/{num_episodes} - Reward: {episode_reward:.2f}, Score: {env.scorecard[0:6]}|{env.scorecard[6:]},total : {np.sum(env.scorecard)}, Epsilon: {agent.epsilon:.3f}")

    # After training, optionally save the agent.
    if save_filepath is not None:
        save_agent(agent, save_filepath)

    return agent


## 5. Helper functions

In [None]:

def test_agent(agent, num_test_episodes=20):
    """
    Test a trained DQN agent greedily (no exploration) on YahtzeeEnv.

    Args:
        agent (DQNAgent): A trained DQNAgent (with .policy_net on agent.device).
        num_test_episodes (int): Number of episodes to test (full Yahtzee games).

    Returns:
        float: The average episode reward over the test episodes
        (NaN when ``num_test_episodes`` is 0).
    """
    # Epsilon is set to 0 for the duration of the test and restored in the
    # ``finally`` clause, so an error in the middle of a game cannot leave
    # the agent permanently without exploration.
    old_epsilon = agent.epsilon
    agent.epsilon = 0.0

    device = agent.device
    env = YahtzeeEnv()
    rewards = []

    try:
        for _ in range(num_test_episodes):
            env._reset()
            state = env.get_state()
            episode_reward = 0.0
            done = False

            while not done:
                valid_actions = env.get_valid_action()

                # Q-values of the current state, computed on the agent's device
                # and brought back to the CPU for NumPy-based masking.
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
                    q_values = agent.policy_net(state_tensor).squeeze(0).cpu().numpy()

                # Invalid actions are masked with -inf so they can never be chosen.
                masked_q_values = np.full(agent.action_dim, -np.inf)
                masked_q_values[valid_actions] = q_values[valid_actions]
                best_action = int(np.argmax(masked_q_values))

                next_state, reward, done, _ = env.step(best_action)
                episode_reward += reward
                state = next_state

            rewards.append(episode_reward)
    finally:
        agent.epsilon = old_epsilon

    # np.mean of an empty list would emit a RuntimeWarning; report NaN directly.
    avg_reward = np.mean(rewards) if rewards else float("nan")
    print(f"Tested on {num_test_episodes} episodes. Avg reward = {avg_reward:.2f}")
    return avg_reward


def save_agent(agent, filepath="trained_agent.pth"):
    """
    Write a checkpoint of the agent to disk.

    The checkpoint is a dict holding the state dicts of the policy network,
    the target network and the optimizer, plus the current epsilon.

    Args:
        agent (DQNAgent): The trained agent to save.
        filepath (str): Path to save the model.
    """
    # One "<attribute>_state_dict" entry per stateful component, in this order.
    stateful_parts = ("policy_net", "target_net", "optimizer")
    checkpoint = {
        f"{part}_state_dict": getattr(agent, part).state_dict()
        for part in stateful_parts
    }
    checkpoint['epsilon'] = agent.epsilon

    torch.save(checkpoint, filepath)
    print(f"Agent saved to {filepath}")

def save_model_info(model, optimizer, loss_fn, filename="modelinfo.md"):
    """
    Write a Markdown summary of the model, optimizer and loss function.

    Args:
        model (nn.Module): Model whose string representation and activation
            layers are reported. Activations are searched in the model and
            all of its nested sub-modules.
        optimizer (torch.optim.Optimizer): Its class name, learning rate and
            first parameter group are reported.
        loss_fn: Loss object; only its class name is reported.
        filename (str): Path of the Markdown file to write.
    """
    # Activation types that are recognised, with the label written for each.
    known_activations = (
        (nn.ReLU, "ReLU"),
        (nn.Sigmoid, "Sigmoid"),
        (nn.Tanh, "Tanh"),
    )

    # modules() walks the whole module tree (the model itself first, then its
    # descendants in registration order), so activations inside nested
    # containers are found too, not only direct children.
    activation_functions = []
    for layer in model.modules():
        for layer_type, label in known_activations:
            if isinstance(layer, layer_type):
                activation_functions.append(label)
                break

    first_group = optimizer.param_groups[0]

    with open(filename, "w", encoding="utf-8") as file:
        file.write("# Model Information\n\n")

        # Layer structure of the model
        file.write("## Model Architecture\n")
        file.write("```python\n")
        file.write(str(model) + "\n")
        file.write("```\n\n")

        # Optimizer information
        file.write("## Optimizer\n")
        file.write(f"Optimizer: {optimizer.__class__.__name__}\n")
        file.write(f"Learning rate: {first_group['lr']}\n")
        file.write(f"Parameters: {first_group}\n\n")

        # Loss function information
        file.write("## Loss Function\n")
        file.write(f"Loss Function: {loss_fn.__class__.__name__}\n\n")

        # Activation functions used by the model
        file.write("## Activation Functions\n")
        if activation_functions:
            file.write(f"Used activation functions: {', '.join(activation_functions)}\n")
        else:
            file.write("No activation function found\n")

def load_agent(filepath, state_dim, action_dim):
    """
    Rebuild a DQN agent from a checkpoint file.

    A fresh agent is created with the given dimensions (all other
    hyperparameters take DQNAgent's defaults); its policy network, target
    network, optimizer and epsilon are then restored from the checkpoint.

    Args:
        filepath (str): Path to the saved model checkpoint.
        state_dim (int): State dimension (should match training).
        action_dim (int): Action dimension (should match training).

    Returns:
        DQNAgent: The loaded agent.
    """
    restored = DQNAgent(state_dim, action_dim)

    # Tensors are deserialized onto the CPU so the file can be read on
    # machines without a GPU.
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    snapshot = torch.load(filepath, map_location=torch.device('cpu'))

    # Each stateful component is stored under "<attribute>_state_dict".
    for part in ("policy_net", "target_net", "optimizer"):
        getattr(restored, part).load_state_dict(snapshot[f"{part}_state_dict"])
    restored.epsilon = snapshot['epsilon']

    print(f"Agent loaded from {filepath}")
    return restored

def find_latest_trial(num_episodes):
    """
    Return the highest trial number among checkpoint files in the current
    working directory named trained_agent_{trial}_{num_episodes}.pth.

    Only files whose episode count equals ``num_episodes`` are considered.

    Returns:
        int: the largest trial found, or -1 if no matching file exists.
    """
    checkpoint_name = re.compile(r"^trained_agent_(\d+)_(\d+)\.pth$")

    # Match every directory entry lazily, then keep the trial numbers of the
    # entries that are checkpoints for the requested episode count.
    hits = (checkpoint_name.match(entry) for entry in os.listdir('.'))
    trials = [
        int(hit.group(1))
        for hit in hits
        if hit and int(hit.group(2)) == num_episodes
    ]
    return max(trials, default=-1)

def play_episode(agent, md_filename="yahtzee_playthrough.md"):
    """
    Play one episode of Yahtzee with a trained agent and log it to Markdown.

    The agent acts greedily: its epsilon is set to 0 for the duration of the
    episode and restored afterwards. One table row is written per
    environment step, followed by a summary of the final scorecard.

    Args:
        agent (DQNAgent): Agent providing ``select_action`` and ``epsilon``.
        md_filename (str): Path of the Markdown file to write.
    """
    env = YahtzeeEnv()
    env._reset()
    done = False
    steps = 0
    cumulative_reward = 0

    # Human-readable label for every action number.
    category = {0: "initiate roll"}  # first roll of a turn
    # Reroll actions: the label shows the 5-bit dice mask of the action.
    category.update({a: f"reroll {a:05b}" for a in range(1, 32)})
    # Scoring actions: 32 + category index.
    score_names = (
        "ones", "twos", "threes", "fours", "fives", "sixes", "choices",
        "four of a kind", "full house", "small straight", "large straight",
        "yahtzee",
    )
    category.update({32 + i: f"score : {name}" for i, name in enumerate(score_names)})

    # Header of the Markdown document.
    md_lines = []
    md_lines.append("# Yahtzee Episode Playthrough\n")
    md_lines.append("**Environment:** Yahtzee\n")
    md_lines.append("**Agent:** Trained DQN (placeholder)\n")
    md_lines.append("---\n")

    md_lines.append("## Step-by-Step Decisions\n")
    # The fifth column shows the scorecard after the step, so it is labelled as such.
    md_lines.append("| Step | Dice (One-Hot) | Rerolls | Turn | Scorecard | Chosen Action | Reward | Cumulative Reward | Done? |")
    md_lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- |")

    # Log the learned policy itself rather than leftover training exploration.
    old_epsilon = agent.epsilon
    agent.epsilon = 0.0
    try:
        state = env.get_state()
        while not done and steps < 100:  # a game takes at most 48 steps; 100 is a safe upper bound
            valid_actions = env.get_valid_action()
            action = agent.select_action(state, valid_actions)
            next_state, reward, done, info = env.step(action)

            steps += 1
            cumulative_reward += reward

            # Convert the one-hot dice into face values (an unrolled,
            # all-zero die is displayed as 1 by argmax).
            dice_str = ", ".join(str(np.argmax(env.dice[i]) + 1) for i in range(5))

            # State layout: [30] rerolls, [31] turn, [32:44] the 12 scorecard entries.
            line = f"| {steps} | **{dice_str}** | {next_state[30]} | {next_state[31]} | `{next_state[32:44].astype(int)}` | **{category[action]}** | {reward:.2f} | {cumulative_reward:.2f} | {done} |"
            md_lines.append(line)

            state = next_state
    finally:
        agent.epsilon = old_epsilon

    # Summarize final score
    final_score = np.sum(env.scorecard)
    bonus_desc = "(Bonus Active)" if env.bonus else ""
    md_lines.append("\n---\n")
    md_lines.append(f"**Episode finished** after **{steps}** steps.\n\n")
    md_lines.append(f"**Final Scorecard** = {env.scorecard}  \n")
    md_lines.append(f"**Sum of Scorecard** = {final_score} {bonus_desc}\n")
    md_lines.append(f"**Cumulative Reward** = {cumulative_reward:.2f}\n")

    # Write to Markdown file
    with open(md_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(md_lines))

    print(f"Playthrough complete. Markdown log written to {md_filename}.")


## 6. Main execution

In [None]:

if __name__ == "__main__":

    # Debug check: prints True if a GPU is available.
    print("CUDA available?", torch.cuda.is_available())
    # NOTE: this only constructs a device object and discards it; it does not
    # change where the agent runs.
    torch.device("cpu")

    # Number of training episodes for this run; it is also part of the
    # checkpoint file name.
    num_episodes = 1000

    # 1) Resume from the newest checkpoint for this episode count, if any.
    trial = find_latest_trial(num_episodes)
    if trial < 0:
        load_filepath = None
        print("No prior training file found; starting from scratch.")
    else:
        load_filepath = f"trained_agent_{trial}_{num_episodes}.pth"
        print(f"Loading from file: {load_filepath}")

    # 2) The next trial is saved under the following trial number,
    #    e.g. trial 2 -> trained_agent_3_{num_episodes}.pth.
    save_filepath = f"trained_agent_{trial+1}_{num_episodes}.pth"

    # 3) Train (or continue training) and save the model. The starting
    #    epsilon shrinks with every completed trial: 1/1, 1/2, 1/3, ...
    rng = np.random.default_rng()
    training_kwargs = dict(
        num_episodes=num_episodes,
        print_interval=50,
        load_filepath=load_filepath,
        save_filepath=save_filepath,
        lr=5e-5,
        gamma=0.99,
        epsilon_start=1.0 / (trial+2),
        epsilon_end=0.010,
        epsilon_decay=0.995,
        buffer_size=100000,
        batch_size=128,
        target_update=100,
        rng=rng,
    )
    trained_agent = train_agent(**training_kwargs)

    # 4) Evaluate the agent and log one full game.
    test_score = test_agent(trained_agent, num_test_episodes=100)
    print(f"Final average test reward: {test_score:.2f}")
    play_episode(trained_agent)

    # 5) Save the model info in modelinfo.md.
    save_model_info(trained_agent.policy_net.net, trained_agent.optimizer, nn.MSELoss())