# 使用機器學習的方法改善策略

# import package

In [50]:
# import packages
## niuniu function
from niuniu_func import *

## caculating
import random
import numpy as np

## torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torch.nn.functional as F
import multiprocessing as mp

## os
import os

# build niuniu env

In [70]:
# env of niuniu
# set myself as player 0
class NiuNiuEnv:
    # init 
    def __init__(self):
        # generate deck
        self.deck = self.generate_deck()
        # generate player == 4
        self.players = [[] for _ in range(4)]
        self.banker_index = -1
        # banker multiplier
        self.banker_multiplier = 1
        # bet number
        self.bets = [0, 0, 0, 0]
        # state, 0: bank step, 1: bet step, 2: result step
        self.current_phase = 0
        # reset
        self.reset()

    # generate deck
    def generate_deck(self):
        suits = ['heart', 'spade', 'diamond', 'club']
        ranks = ['2', '3', '4', '5', '6', '7', '8', '9', '10', 'J', 'Q', 'K', 'A']
        return [(suit, rank) for suit in suits for rank in ranks]

    # reset
    def reset(self):
        # regenerate deck & shuffle
        self.deck = self.generate_deck()        
        random.shuffle(self.deck)

        # every player have 4 cards
        self.players = [[self.deck.pop() for _ in range(4)] for _ in range(4)]

        # init bets
        self.bets = [0] * 4
        self.banker_index = -1
        self.banker_multiplier = 1

        # init step
        self.current_phase = 0

        # reload state
        self.state = self.get_state()
        return self.state
    
    # get myself's hand number
    def get_state(self):
        # myself's hand
        state = []
        for card in self.players[0]:
            state.append(get_suit_rank(card))
            state.append(get_card_rank(card))
        # which step
        state.append(self.current_phase)
        # who is banker
        state.append(self.banker_index)
        # banker multiplayer
        state.append(self.banker_multiplier)
        # every player's bet
        state.extend(self.bets)
        return np.array(state, dtype=np.float32)
        
    # step    
    def step(self, action):
        """
        action: [banker_action, bet_action]
        banker_action: 0-4 is baker multi
        bet_action: 1-5 is bet multi
        """
        # unpack action
        banker_action, bet_action = action 
        # banker
        self.banker_bid = banker_action
        # bet
        self.bet_amount = bet_action

        """
        step 1 : decide whether to get banker
            * myself : by ppo agent
            * others : by simulate_ev to decide
        """
        bank_multipliers = [simulate_ev(self.players[i], 100000)[0] for i in range(1, 4)]
        bank_multipliers[0] = self.banker_bid
        # run time : 22s

        """
        step 2 : decide final banker(the max multiplier)
            * if all not want to be banker, random choose one & set multiplier = 1
            * if more than one have same multiplier, random choose one
        """
        max_bet = max(bank_multipliers)
        if max_bet == 1:
            random_banker = random.choice(range(4))
            bank_multipliers[random_banker] = 1
        banker_candidates = [i for i, b in enumerate(bank_multipliers) if b == max_bet]
        self.banker_index = random.choice(banker_candidates)
        self.banker_multiplier = max_bet
        banker_hand = self.players[self.banker_index]

        # whether myself is banker
        is_banker = (self.banker_index == 0)

        # go to next action -- bet
        self.current_phase = 1

        # bet action
        if is_banker:
            """
            step 3 : if myself is banker
                * I don't need to bet
                * others use `calculate_ev_against_banker` to bet, besides
                if banker multiplier over 3, we assume banker have niu
            """
            self.bets[0] = self.bet_amount
            for i in range(1, 4):
                have_niu = self.banker_multiplier >= 3
                self.bets[i] = calculate_ev_against_banker(self.players[i], 100000, have_niu)[1]
        else:
            """
            step 4 : if myself is not banker
                * let ppo decide bet
                * others we don't care
            """
            self.bets[0] = max(1, min(5, action[1]))

        """
        step 5 : add the 5th card to every player
        """
        for i in range(4):
            self.players[i].append(self.deck.pop())

        # go to next action -- result
        self.current_phase = 2

        """
        step 6 : caculate ev of myself
            * I am banker : caculate payout of the sum of me against others(use negative)
            * I am not banker : calculate the payout against the banker
        """
        if is_banker:
            # I am banker
            total_payout = -sum(
                calculate_payout(self.players[i], banker_hand, False) * self.bets[i] * self.banker_multiplier
                for i in range(4) if i != self.banker_index
            )

            """
            step 7 : caculate reward(scaling & punishing)
            """
            max_payout = 3 * 5 * self.banker_multiplier  # 假設最多 3 個人都押最大 (5)
            min_payout = -max_payout  # 最大可能損失 (全部玩家贏最大)
            reward = (total_payout - min_payout) / (max_payout - min_payout)  # 使 reward 在 [-1, 1] 之間

            if reward > 0.8:
                reward += 0.3  # 高獲利獎勵
            elif reward > 0.5:
                reward += 0.2  # 中等獲利獎勵
            elif reward < 0:
                reward -= 0.2  # 只要是負的就給懲罰
            elif reward < -0.5:
                reward -= 0.4  # 大虧損懲罰
            elif reward < -0.8:
                reward -= 0.6  # 超大額虧損懲罰

        else:
            # I am not banker
            total_payout = calculate_payout(self.players[0], banker_hand, False) * self.bets[0] * self.banker_multiplier
            
            """
            step 7 : caculate reward(scaling & punishing)
            """
            max_possible_profit = 5 * self.banker_multiplier  # 假設最大下注 5
            max_possible_loss = -max_possible_profit  # 最壞情況：輸掉最大押注
            # reward 標準化到 [-1, 1]
            reward = (total_payout - max_possible_loss) / (max_possible_profit - max_possible_loss)

            # 強化獎勵與懲罰
            if reward > 0.8:
                reward += 0.3  # 高獲利獎勵
            elif reward > 0.5:
                reward += 0.2  # 中等獲利獎勵
            elif reward > 0.2:
                reward += 0.1  # 低獲利獎勵
            elif reward < -0.2:
                reward -= 0.1  # 輕微虧損懲罰
            elif reward < -0.5:
                reward -= 0.3  # 大額虧損懲罰
            elif reward < -0.8:
                reward -= 0.6  # 超大額虧損懲罰

        if is_banker:
            if self.banker_multiplier >= 3 and total_payout < 0:
                reward -= 0.3  # 如果搶 3 倍但輸了，額外懲罰

        """
        step 8 : finish one round
        """
        done = True

        """
        step 9 : reset
        """
        self.reset()

        return self.get_state(), reward, done, {}



## simple test
test whether niuniu env is runnable <br>
to avoid getting error of having NaN <br>

In [71]:
# test env of niuniu
def test_env():
    env = NiuNiuEnv()
    state = env.reset()
    print("Initial State:", state)
    # test 10 times
    for i in range(10):
        # random action
        action = [np.random.randint(0, 5), np.random.randint(1, 6)]
        state, reward, done, _ = env.step(action)
        if np.isnan(state).any():
            print(f"NaN detected in state at step {i}!")
        if np.isnan(reward):
            print(f"NaN detected in reward at step {i}!")
        print(f"Step {i} - State: {state}, Reward: {reward}")

test_env()
# run time : 3m 55s
"""
result explain :
    * the first 8 numbers represent 4 card in hands, (suit, card)
    * the others represent the other states
"""

Initial State: [ 3.  1.  4. 12.  1.  1.  4.  6.  0. -1.  1.  0.  0.  0.  0.]
Step 0 - State: [ 2. 10.  2.  6.  2. 12.  3. 10.  0. -1.  1.  0.  0.  0.  0.], Reward: 0.8
Step 1 - State: [ 1. 12.  4.  3.  4.  4.  2.  3.  0. -1.  1.  0.  0.  0.  0.], Reward: 0.8666666666666667
Step 2 - State: [ 4.  7.  1.  8.  2. 10.  1.  1.  0. -1.  1.  0.  0.  0.  0.], Reward: 1.7
Step 3 - State: [ 1.  4.  3.  9.  1.  9.  4.  2.  0. -1.  1.  0.  0.  0.  0.], Reward: -0.06666666666666665
Step 4 - State: [ 2. 11.  2.  5.  4. 13.  3. 10.  0. -1.  1.  0.  0.  0.  0.], Reward: 1.4000000000000001
Step 5 - State: [ 4. 10.  1. 13.  2.  4.  1.  3.  0. -1.  1.  0.  0.  0.  0.], Reward: 1.6
Step 6 - State: [ 4.  7.  3. 10.  2.  9.  3.  1.  0. -1.  1.  0.  0.  0.  0.], Reward: 0.8999999999999999
Step 7 - State: [ 1.  2.  4.  1.  3.  5.  4. 11.  0. -1.  1.  0.  0.  0.  0.], Reward: 1.2
Step 8 - State: [ 4.  3.  2.  4.  3.  3.  4. 11.  0. -1.  1.  0.  0.  0.  0.], Reward: 1.3
Step 9 - State: [ 3.  2.  4.  1.  1.  3.  

'\nresult explain :\n    * the first 8 numbers represent 4 card in hands, (suit, card)\n    * the others represent the other states\n'

# build PPO

In [58]:
# PPO 價值網絡 (V(s))
class ValueNetwork(nn.Module):
    def __init__(self, input_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.LayerNorm(128, eps=1e-5),  # 避免標準化時出現 NaN
            nn.LeakyReLU(),
            nn.Linear(128, 128),
            nn.LayerNorm(128, eps=1e-5),
            nn.LeakyReLU(),
            nn.Linear(128, 1)  # 輸出 V(s)
        )

    def forward(self, x):
        return self.fc(x).squeeze(-1)  # 讓輸出維度變成 (batch,)

In [59]:
# PPO 策略網絡
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim1, output_dim2):
        super(PolicyNetwork, self).__init__()
        self.shared_fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU()
        )
        self.banker_fc = nn.Linear(128, output_dim1)  # 搶莊動作
        self.bet_fc = nn.Linear(128, output_dim2)  # 下注動作

    def forward(self, x):
        x = self.shared_fc(x)

        banker_logits = self.banker_fc(x)
        bet_logits = self.bet_fc(x)

        banker_probs = F.softmax(banker_logits, dim=-1)
        bet_probs = F.softmax(bet_logits, dim=-1)

        return banker_probs, bet_probs

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # 轉成 batch
        banker_probs, bet_probs = self.forward(state)

        banker_dist = Categorical(banker_probs)
        bet_dist = Categorical(bet_probs)

        banker_action = banker_dist.sample().item()
        bet_action = bet_dist.sample().item()  # 這裡不加 +1，讓外部處理

        banker_log_prob = banker_dist.log_prob(torch.tensor(banker_action))
        bet_log_prob = bet_dist.log_prob(torch.tensor(bet_action))

        return (banker_action, bet_action), banker_log_prob, bet_log_prob


In [60]:
# PPO Agent
class PPOAgent:
    def __init__(self, input_dim, output_dim1, output_dim2, lr=3e-4, gamma=0.99, eps_clip=0.2, K_epochs=10, model_path="niu_ppo_model", num_envs=8):
        self.device = torch.device("cpu")
        self.num_envs = num_envs
        
        self.policy = PolicyNetwork(input_dim, output_dim1, output_dim2).to(self.device)
        self.value = ValueNetwork(input_dim).to(self.device)
        self.optimizer_policy = optim.Adam(self.policy.parameters(), lr=lr)
        self.optimizer_value = optim.Adam(self.value.parameters(), lr=lr)

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.model_path = model_path
        # load saved model
        self.load_model()


    def compute_returns(self, rewards, dones):
        returns = []
        R = torch.zeros(1, dtype=torch.float32).to(self.device)  # 改為標量初始化

        # 確保 rewards 和 dones 是 (T, 1) 形狀
        rewards = torch.tensor(rewards, dtype=torch.float32).view(-1, 1)
        dones = torch.tensor(dones, dtype=torch.float32).view(-1, 1)

        for t in reversed(range(len(rewards))):
            R = rewards[t] + self.gamma * R * (1 - dones[t])
            returns.insert(0, R)

        return torch.cat(returns).detach()


    def update(self, states, actions, log_probs, rewards, dones):
        returns = self.compute_returns(rewards, dones)

        states = torch.tensor(states, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions, dtype=torch.long).view(-1, 2).to(self.device)  # 確保 actions 維度正確
        old_log_probs = torch.tensor(log_probs, dtype=torch.float32).view(-1).to(self.device)  # 轉為 1D


        for _ in range(self.K_epochs):
            banker_probs, bet_probs = self.policy(states)

            banker_probs = torch.nan_to_num(banker_probs, nan=0.0)
            bet_probs = torch.nan_to_num(bet_probs, nan=0.0)

            banker_dist = Categorical(banker_probs)
            bet_dist = Categorical(bet_probs)

            new_banker_log_prob = banker_dist.log_prob(actions[:, 0])
            new_bet_log_prob = bet_dist.log_prob(actions[:, 1] - 1)
            new_log_probs = new_banker_log_prob + new_bet_log_prob
            new_log_probs = torch.nan_to_num(new_banker_log_prob, nan=0.0) + torch.nan_to_num(new_bet_log_prob, nan=0.0)
    
            value_estimates = self.value(states).view(-1)
            value_estimates = torch.nan_to_num(value_estimates, nan=0.0)

            advantages = returns - value_estimates.detach()

            ratio = torch.exp(new_log_probs - old_log_probs)
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.mse_loss(value_estimates, returns)

            self.optimizer_policy.zero_grad()
            policy_loss.backward()
            self.optimizer_policy.step()

            self.optimizer_value.zero_grad()
            value_loss.backward()
            self.optimizer_value.step()

    def select_action(self, state):
        # state = torch.FloatTensor(state).unsqueeze(0)
        # state = torch.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
        state = torch.FloatTensor(state).to(self.device)
        state = torch.nan_to_num(state, nan=0.0)

        banker_probs, bet_probs = self.policy(state)
        banker_probs = torch.nan_to_num(banker_probs, nan=0.2)
        bet_probs = torch.nan_to_num(bet_probs, nan=0.2)

        banker_dist = Categorical(banker_probs)
        bet_dist = Categorical(bet_probs)

        banker_action = banker_dist.sample().cpu().numpy()
        bet_action = (bet_dist.sample() + 1).cpu().numpy()

        banker_log_prob = banker_dist.log_prob(torch.tensor(banker_action, device=self.device))
        bet_log_prob = bet_dist.log_prob(torch.tensor(bet_action - 1, device=self.device))

        return np.array([banker_action, bet_action]), banker_log_prob.detach().cpu().numpy(), bet_log_prob.detach().cpu().numpy()
    
    def save_model(self):
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)
        torch.save(self.policy.state_dict(), f"{self.model_path}_policy.pth")
        torch.save(self.value.state_dict(), f"{self.model_path}_value.pth")
        print("model saved")

    def load_model(self):
        policy_path = f"{self.model_path}_policy.pth"
        value_path = f"{self.model_path}_value.pth"

        if os.path.exists(policy_path) and os.path.exists(value_path):
            self.policy.load_state_dict(torch.load(policy_path))
            self.value.load_state_dict(torch.load(value_path))
            print("load saved model")
        else:
            print("model not found, start training from begining")




# train

In [63]:
num_episodes = 120  # 訓練回合數
batch_size = 2048
gamma = 0.99
clip_epsilon = 0.2
lr = 3e-4
update_epochs = 10
save_interval = 5  # 每 5 回合存一次模型

save_model_path = "D:/python/poker_gto/ppo_models/ppo_model_v1"
num_envs = 12  # 使用 8 個環境

In [64]:
env = NiuNiuEnv()
state_dim = len(env.get_state())
banker_action_dim = 5  # 搶莊倍率 (0~4)
bet_action_dim = 5  # 下注倍率 (1~5)

ppo_agent = PPOAgent(state_dim, banker_action_dim, bet_action_dim, model_path=save_model_path, num_envs=num_envs)


model not found, start training from begining


In [65]:
# 訓練
for episode in range(num_episodes):
    state = env.reset()
    done = False
    episode_reward = 0

    states, actions, log_probs, rewards, dones = [], [], [], [], []

    while not done:
        action, banker_log_prob, bet_log_prob = ppo_agent.select_action(state)

        # 確保動作格式正確
        banker_action, bet_action = action
        next_state, reward, done, _ = env.step((banker_action, bet_action))  # 確保和環境兼容

        # 記錄數據
        states.append(state)
        actions.append([banker_action, bet_action])  # 確保 actions 格式正確
        log_probs.append(banker_log_prob + bet_log_prob)
        rewards.append(reward)
        dones.append(done)

        state = next_state
        episode_reward += reward

    # 更新 PPO
    ppo_agent.update(states, actions, log_probs, rewards, dones)

    # 顯示訓練結果
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")
    
    # 每 `save_interval` 次存一次模型
    if (episode + 1) % save_interval == 0:
        ppo_agent.save_model()


Episode 1: Total Reward = -0.1
Episode 2: Total Reward = 1.4000000000000001
Episode 3: Total Reward = 0.4
Episode 4: Total Reward = -0.6
Episode 5: Total Reward = 0.2
model saved
Episode 6: Total Reward = 1.2
Episode 7: Total Reward = 0.8
Episode 8: Total Reward = 0.1
Episode 9: Total Reward = 1.7
Episode 10: Total Reward = -0.7999999999999999
model saved
Episode 11: Total Reward = 0.2
Episode 12: Total Reward = 1.2
Episode 13: Total Reward = -0.1
Episode 14: Total Reward = 0.4
Episode 15: Total Reward = 0.5
model saved
Episode 16: Total Reward = 1.0
Episode 17: Total Reward = 0.4
Episode 18: Total Reward = 0.1
Episode 19: Total Reward = -0.5
Episode 20: Total Reward = -1.1
model saved
Episode 21: Total Reward = -0.1
Episode 22: Total Reward = -0.1
Episode 23: Total Reward = 0.8
Episode 24: Total Reward = 0.5
Episode 25: Total Reward = -0.1
model saved
Episode 26: Total Reward = 0.8999999999999999
Episode 27: Total Reward = 0.1
Episode 28: Total Reward = -0.1
Episode 29: Total Reward =

In [66]:
def load_model(agent, policy_model_path, value_model_path):
    """
    載入 policy 和 value 模型，並顯示當前使用的模型名稱。
    """
    print(f"🔍 正在使用的模型: policy -> {policy_model_path}, value -> {value_model_path}")
    agent.policy.load_state_dict(torch.load(policy_model_path))
    agent.value.load_state_dict(torch.load(value_model_path))
    print("✅ 模型載入完成！")

In [67]:
def test_trained_model(env, agent, policy_model="D:\python\poker_gto\ppo_models\ppo_model_v1_policy.pth", value_model="D:\python\poker_gto\ppo_models\ppo_model_v1_value.pth"):
    """
    使用訓練好的模型，讓使用者輸入 4 張手牌，並讓模型決策搶莊與下注倍率
    """
    print("\U0001F50D 測試模式啟動！輸入 `exit` 可離開測試模式。")
    
    # 確保模型在 GPU 上運行
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent.policy.to(device)
    load_model(agent, policy_model, value_model)
    
    while True:
        try:
            # 取得環境的初始狀態
            state = env.reset()

            # 讓使用者輸入 4 張手牌
            print("\n請輸入 4 張手牌（格式：heart J diamond 10 club J spade A），或輸入 `exit` 離開:")
            input_cards = input().strip()

            if input_cards.lower() == 'exit':
                print("\U0001F44B 測試結束！")
                break

            input_cards = input_cards.split()

            if len(input_cards) != 8:
                print("❌ 錯誤！請輸入 4 張手牌的花色與數值（共 8 個字串）。")
                continue

            # 解析輸入的手牌
            player_hand = [(input_cards[i], input_cards[i + 1]) for i in range(0, 8, 2)]
            print(f"\U0001F3B4 你的手牌: {player_hand}")

            # 更新 state，確保手牌資訊正確
            for i in range(4):
                state[i * 2] = get_suit_rank(player_hand[i])  # 花色
                state[i * 2 + 1] = get_card_rank(player_hand[i])  # 點數

            # 轉換為 PyTorch Tensor 並移至 GPU
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

            # 模型預測搶莊倍率
            with torch.no_grad():
                banker_dist, bet_dist = agent.policy(state_tensor)
                print(f"🔍 banker_dist: {banker_dist}")  # 打印 banker_dist 的結構
                print(f"🔍 bet_dist: {bet_dist}")  # 打印 bet_dist 的結構
                
                # 根據模型輸出結構決定如何取得概率
                if hasattr(banker_dist, 'probs'):
                    banker_action = torch.argmax(banker_dist.probs).item()
                else:
                    # 如果 banker_dist 沒有 'probs' 屬性，可以打印 banker_dist 或作其他處理
                    banker_action = torch.argmax(banker_dist).item()
                    
                if hasattr(bet_dist, 'probs'):
                    bet_action = torch.argmax(bet_dist.probs).item()
                else:
                    bet_action = torch.argmax(bet_dist).item()

            print(f"\U0001F916 模型預測的搶莊倍率: {banker_action}")

            # 讓使用者輸入是否成功搶莊
            is_banker = input("✅ 是否搶到莊？ (y/n): ").strip().lower()
            if is_banker == 'y':
                print("\U0001F389 你是莊家！不需要下注")
            else:
                print("沒有搶到莊家")
                banker_multiplier = float(input("\U0001F4E2 請輸入莊家的倍率: ").strip())
                print(f"\U0001F916 莊家的倍率: {banker_multiplier}")

                # 更新 state 中的莊家倍率
                state[-4] = banker_multiplier

                # 轉換為 PyTorch Tensor 並移至 GPU
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

                # 模型預測下注倍率
                with torch.no_grad():
                    _, bet_dist = agent.policy(state_tensor)
                    print(f"🔍 bet_dist: {bet_dist}")  # 打印 bet_dist 的結構

                    if hasattr(bet_dist, 'probs'):
                        bet_action = torch.argmax(bet_dist.probs).item()
                    else:
                        bet_action = torch.argmax(bet_dist).item()

                print(f"\U0001F916 模型建議的下注倍率: {bet_action}")

        except Exception as e:
            print(f"❌ 發生錯誤: {e}")


In [69]:
test_trained_model(env, ppo_agent)

🔍 測試模式啟動！輸入 `exit` 可離開測試模式。
🔍 正在使用的模型: policy -> D:\python\poker_gto\ppo_models\ppo_model_v1_policy.pth, value -> D:\python\poker_gto\ppo_models\ppo_model_v1_value.pth
✅ 模型載入完成！

請輸入 4 張手牌（格式：heart J diamond 10 club J spade A），或輸入 `exit` 離開:
🎴 你的手牌: [('heart', '9'), ('diamond', '3'), ('club', '3'), ('spade', '6')]
🔍 banker_dist: tensor([[0.0113, 0.0166, 0.0020, 0.9658, 0.0043]])
🔍 bet_dist: tensor([[0.0047, 0.9180, 0.0500, 0.0215, 0.0057]])
🤖 模型預測的搶莊倍率: 3
🎉 你是莊家！不需要下注

請輸入 4 張手牌（格式：heart J diamond 10 club J spade A），或輸入 `exit` 離開:
👋 測試結束！


In [38]:
import torch
import numpy as np
from collections import Counter

def generate_random_hand():
    suits = ["heart", "diamond", "club", "spade"]
    ranks = ["2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K", "A"]
    hand = np.random.choice([f"{suit} {rank}" for suit in suits for rank in ranks], 4, replace=False)
    hand = [tuple(card.split()) for card in hand]
    return hand

def test_banker_distribution(env, agent, num_tests=100):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent.policy.to(device)
    
    banker_choices = []
    
    for _ in range(num_tests):
        state = env.reset()
        player_hand = generate_random_hand()
        
        for i in range(4):
            state[i * 2] = get_suit_rank(player_hand[i])
            state[i * 2 + 1] = get_card_rank(player_hand[i])

        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
        
        with torch.no_grad():
            banker_dist, _ = agent.policy(state_tensor)
            banker_action = torch.argmax(banker_dist).item()
            banker_choices.append(banker_action)
    
    # 計算分布
    counter = Counter(banker_choices)
    total = sum(counter.values())
    for action, count in sorted(counter.items()):
        print(f"倍率 {action}: {count} 次 ({count / total:.2%})")
    
    return counter

# 測試 100 次隨機手牌的 banker_dist 分布
test_banker_distribution(env, ppo_agent, num_tests=100)


倍率 3: 100 次 (100.00%)


Counter({3: 100})

In [37]:
test_trained_model(env, ppo_agent)

🔍 測試模式啟動！輸入 `exit` 可離開測試模式。

請輸入 4 張手牌（格式：heart J diamond 10 club J spade A），或輸入 `exit` 離開:
🎴 你的手牌: [('heart', 'J'), ('diamond', '10'), ('club', 'J'), ('spade', 'A')]
📊 banker_dist 統計 (測試 100 次): {3: 100}

請輸入 4 張手牌（格式：heart J diamond 10 club J spade A），或輸入 `exit` 離開:
👋 測試結束！


# to do list
* **(finish)** 訓練時應該包含花色 : 可能將花色也轉為數值，變成一個 2 維度的 state  <br>
* **(finish)** 訓練後應該可以保存模型，並且疊家每次訓練的成果上去 <br>
* 新增一個輸入 : 假設我不是莊家時，現在的倍率是幾倍，這會影響到我後續的下注策略 <br>
* <b>(maybe finish)</b>輸入我的手牌之類的資訊後應該要可以當作回測，紀錄到模型訓練當中，並加以改進 <br>

* 完成後可能可以架設簡單的 app 或 api  <br>
* 完成後可以接著改做 德州撲克的訓練 <br>

In [None]:
def backtest_with_real_hands(env, agent):
    """
    讓使用者輸入實際手牌，讓 AI 提供決策建議，並將結果回報給模型，增強學習
    """
    print("\U0001F4CA 進入回測模式！輸入 `exit` 可離開回測模式。")
    
    while True:
        try:
            state = env.reset()

            print("\n請輸入 4 張手牌（格式：heart J diamond 10 club J spade A），或輸入 `exit` 離開:")
            input_cards = input().strip()

            if input_cards.lower() == 'exit':
                print("\U0001F44B 回測結束！")
                break

            input_cards = input_cards.split()

            if len(input_cards) != 8:
                print("❌ 錯誤！請輸入 4 張手牌的花色與數值（共 8 個字串）。")
                continue

            player_hand = [(input_cards[i], input_cards[i + 1]) for i in range(0, 8, 2)]
            print(f"\U0001F3B4 你的手牌: {player_hand}")

            for i in range(4):
                state[i] = card_value(player_hand[i])

            state_tensor = torch.tensor(state, dtype=torch.float32)

            with torch.no_grad():
                banker_dist, bet_dist = agent.policy(state_tensor)
                banker_action = torch.argmax(banker_dist.probs).item()
                bet_action = torch.argmax(bet_dist.probs).item()

            print(f"\U0001F916 AI 建議的搶莊倍率: {banker_action}")

            is_banker = input("✅ 是否搶到莊？ (y/n): ").strip().lower()
            if is_banker == 'y':
                print("\U0001F389 你是莊家！不需要下注")
                bet_action = 0
            else:
                print(f"\U0001F916 AI 建議的下注倍率: {bet_action}")

            reward = float(input("\U0001F4B0 請輸入這局的最終收益（負值代表虧損）: ").strip())

            states = [state]
            actions = [[banker_action, bet_action]]
            rewards = [reward]
            dones = [True]

            agent.update(states, actions, rewards, dones)
            print("\U0001F4C8 AI 已學習這局的結果！")

        except Exception as e:
            print(f"❌ 發生錯誤: {e}")


# parellel computing 