# 使用機器學習的方法改善策略

# import package

In [1]:
from niuniu_func import *
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical


In [2]:
deck = generate_deck()
random.shuffle(deck)
players = [[deck.pop() for _ in range(4)] for _ in range(4)]
print(players)
bank_multipliers = [simulate_ev(players[i], 100000)[0] for i in range(4)]
print(bank_multipliers)
max_bet = max(bank_multipliers)
print(max_bet)
banker_candidates = [i for i, b in enumerate(bank_multipliers) if b == max_bet]
banker_index = random.choice(banker_candidates)
print(banker_index)
banker_multiplier = max_bet
print(bank_multipliers)
banker_hand = players[banker_index]
print(banker_hand)

[[('diamond', '7'), ('diamond', '9'), ('diamond', '6'), ('club', '4')], [('heart', 'J'), ('club', '2'), ('club', '6'), ('diamond', 'K')], [('heart', '7'), ('spade', '3'), ('heart', '3'), ('heart', '5')], [('spade', 'Q'), ('club', '3'), ('diamond', '3'), ('heart', '10')]]
[4, 4, 1, 4]
4
0
[4, 4, 1, 4]
[('diamond', '7'), ('diamond', '9'), ('diamond', '6'), ('club', '4')]


In [3]:
players[0]

[('diamond', '7'), ('diamond', '9'), ('diamond', '6'), ('club', '4')]

In [4]:
bets = [0] * 4
is_banker = (banker_index == 0)
if is_banker:
    bets[0] = 0  # 莊家不下注
    for i in range(1, 4):
        have_niu = banker_multiplier >= 3
        bets[i] = calculate_ev_against_banker(players[i], 100000, have_niu)[1]
else:
    # 4 step : if myslef is not banker, decide the bet
    # also use calculate_ev_against_banker(player_hand, num_simulations=100000, have_niu=True)
    # if banker multi is 3 or 4, have_niu == True, otherwise, have_niu == Flase
    have_niu = banker_multiplier >= 3
    bets[0] = calculate_ev_against_banker(players[0], 100000, have_niu)[1]
    
print(bets)    

[0, 1, 1, 1]


In [5]:
for i in range(4):
    players[i].append(deck.pop())

In [6]:
banker_hand

[('diamond', '7'),
 ('diamond', '9'),
 ('diamond', '6'),
 ('club', '4'),
 ('spade', '6')]

In [7]:
players[0]

[('diamond', '7'),
 ('diamond', '9'),
 ('diamond', '6'),
 ('club', '4'),
 ('spade', '6')]

In [8]:


# 6 step : caculate ev of myself
# if myself is banker, caculate the payout of sum me against other 3 players
# if myself is not banke, caculate the payout of me agaginst banker
# the payout == calculate_payout(player_hand, banker_hand, verbose=False) multi * bet
if is_banker:
    # If I am the banker, calculate the payout against other 3 players
    total_payout = -sum(
        calculate_payout(players[i], banker_hand, False) * bets[i] * banker_multiplier
        for i in range(4) if i != banker_index
    )
else:
    # If I am not the banker, calculate the payout against the banker
    total_payout = calculate_payout(players[0], banker_hand, False) * bets[0] * banker_multiplier

# 7 step : get reward
reward = total_payout
print(reward)

8


In [9]:
# env of niuniu
# set myself as player 0
class NiuNiuEnv:
    # init 
    def __init__(self):
        # generate deck
        self.deck = self.generate_deck()
        # generate player == 4
        self.players = [[] for _ in range(4)]
        # generate banker == none
        self.banker_index = -1
        # banker multiplier
        self.banker_multiplier = 1
        # bet number
        self.bets = [0, 0, 0, 0]
        # generate state
        self.state = None
        # reset
        self.reset()

    # generate deck
    def generate_deck(self):
        suits = ['heart', 'spade', 'diamond', 'club']
        ranks = ['2', '3', '4', '5', '6', '7', '8', '9', '10', 'J', 'Q', 'K', 'A']
        deck = [(suit, rank) for suit in suits for rank in ranks]
        return deck

    # reset
    def reset(self):
        # regenerate deck & shuffle
        self.deck = self.generate_deck()
        random.shuffle(self.deck)
        # every player have 4 cards
        self.players = [[self.deck.pop() for _ in range(4)] for _ in range(4)]

        # init bets
        self.bets = [0] * 4
        self.banker_index = -1
        self.banker_multiplier = 1

        # reload state
        self.state = self.get_state()
        return self.state
    
    # get myself's hand number
    def get_state(self):
        # myslef's hand
        state = [card_value(card) for card in self.players[0]]
        # who is banker
        state.append(self.banker_index)
        # banker multiplayer
        state.append(self.banker_multiplier)
        # every player's bet
        state.extend(self.bets)
        return np.array(state, dtype=np.float32)
    
    # step    
    def step(self, action):
        """
        action: 0-4 為搶莊倍率，5-9 為下注倍率 (共 10 種行為)
        """

        # 1 step : decide whether to get banker, using simulate_ev(player_hand, num_simulations=100000)
        # to find use what multipler to get
        # return 4 player's bank multi
        ### bank_multipliers = [simulate_ev(self.players[i], 100000)[0] for i in range(4)]
        # use ppo to decide whether to get banker
        self.banker_bid = action if action < 5 else 0
        # other player use simulate_ev to decide
        bank_multipliers = [simulate_ev(self.players[i], 100000)[0] for i in range(4)]
        bank_multipliers[0] = self.banker_bid
        # run time : 22s

        # 2 step : decide final banker, the one have max multiplier is banker
        # if more than one have the same multi, random choose one
        max_bet = max(bank_multipliers)
        banker_candidates = [i for i, b in enumerate(bank_multipliers) if b == max_bet]
        self.banker_index = random.choice(banker_candidates)
        self.banker_multiplier = max_bet
        banker_hand = self.players[self.banker_index]
        # whether myself is banker
        is_banker = (self.banker_index == 0)

        # 3 step : if myslef is banker, no need to decide the bet
        # however the other player have to decide the bet(1~5)
        # using calculate_ev_against_banker(player_hand, num_simulations=100000, have_niu=True)
        # if banker multi is 3 or 4, have_niu == True, otherwise, have_niu == Flase
        if is_banker:
            self.bets[0] = 0
            for i in range(1, 4):
                have_niu = self.banker_multiplier >= 3
                self.bets[i] = calculate_ev_against_banker(self.players[i], 100000, have_niu)[1]
        else:
            # 4 step : if myslef is not banker, decide the bet
            # also use calculate_ev_against_banker(player_hand, num_simulations=100000, have_niu=True)
            # if banker multi is 3 or 4, have_niu == True, otherwise, have_niu == Flase
            # let ppo decide bets
            self.bets[0] = (action - 4) if action >= 5 else 1


        # 5 step : append the 5th card to every player's hand
        for i in range(4):
            self.players[i].append(self.deck.pop())

        # 6 step : caculate ev of myself
        # if myself is banker, caculate the payout of sum me against other 3 players
        # if myself is not banke, caculate the payout of me agaginst banker
        # the payout == calculate_payout(player_hand, banker_hand, verbose=False) multi * bet
        if is_banker:
            # If I am the banker, calculate the payout against other 3 players
            total_payout = -sum(
                calculate_payout(self.players[i], banker_hand, False) * self.bets[i] * self.banker_multiplier
                for i in range(4) if i != self.banker_index
            )
        else:
            # If I am not the banker, calculate the payout against the banker
            total_payout = calculate_payout(self.players[0], banker_hand, False) * self.bets[0] * self.banker_multiplier


        # 7 step : get reward
        reward = total_payout

        # 8 step : finish one round
        done = True

        # 9 step : reset
        self.reset()

        return self.state, reward, done, {}



In [10]:
# PPO 價值網絡
class ValueNetwork(nn.Module):
    def __init__(self, input_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        return self.fc(x)

In [None]:
# PPO 策略網絡

class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, x):
        return self.fc(x)

In [None]:
# PPO Agent
class PPOAgent:
    def __init__(self, state_dim, action_dim, lr=0.001):
        self.policy = PolicyNetwork(state_dim, action_dim)
        self.value = ValueNetwork(state_dim)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.value_optimizer = optim.Adam(self.value.parameters(), lr=lr)
        self.gamma = 0.99
        self.eps_clip = 0.2
    
    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32)
        probs = self.policy(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def compute_returns(self, rewards):
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        return torch.tensor(returns, dtype=torch.float32)
    
    def train(self, states, actions, log_probs, rewards):
        returns = self.compute_returns(rewards)
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        log_probs = torch.stack(log_probs)
        
        values = self.value(states).squeeze()
        advantages = returns - values.detach()
        
        new_probs = self.policy(states)
        new_dist = Categorical(new_probs)
        new_log_probs = new_dist.log_prob(actions)

        ratio = torch.exp(new_log_probs - log_probs)
        clipped_ratio = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip)
        loss_policy = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()

        loss_value = (returns - values).pow(2).mean()
        
        self.policy_optimizer.zero_grad()
        loss_policy.backward()
        self.policy_optimizer.step()

        self.value_optimizer.zero_grad()
        loss_value.backward()
        self.value_optimizer.step()


In [19]:
# 最終推論
class NiuNiuDecisionHelper:
    def __init__(self, agent):
        self.agent = agent
    
    def decide_action(self, hand):
        state = np.array([card_value(card) for card in hand], dtype=np.float32)
        action, _ = self.agent.select_action(state)
        return action

In [20]:
env = NiuNiuEnv()
agent = PPOAgent(state_dim=6, action_dim=10)

# 訓練
for episode in range(10):
    state = env.reset()
    done = False
    rewards = []
    states = []
    actions = []
    log_probs = []

    while not done:
        action, log_prob = agent.select_action(state)
        new_state, reward, done, _ = env.step(action)
        rewards.append(reward)
        states.append(state)
        actions.append(action)
        log_probs.append(log_prob)
        state = new_state

    agent.train(states, actions, log_probs, rewards)

helper = NiuNiuDecisionHelper(agent)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x10 and 6x64)

In [51]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# 確保異常檢測開啟
torch.autograd.set_detect_anomaly(True)

class PPOAgent:
    def __init__(self, input_dim, action_dim, lr=1e-3, gamma=0.99, clip_epsilon=0.2):
        self.policy_network = PolicyNetwork(input_dim, action_dim)
        self.optimizer_policy = optim.Adam(self.policy_network.parameters(), lr=lr)
        self.gamma = gamma
        self.clip_epsilon = clip_epsilon

        self.memory = []

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.policy_network(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob.item()

    def store_transition(self, state, action, reward, log_prob, next_state):
        self.memory.append((state, action, reward, log_prob, next_state))

    def train(self):
        if not self.memory:
            return

        states, actions, rewards, log_probs, next_states = zip(*self.memory)

        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        log_probs = torch.tensor(log_probs, dtype=torch.float32)

        # 計算新的 log_prob
        new_probs = self.policy_network(states)
        new_dist = torch.distributions.Categorical(new_probs)
        new_log_probs = new_dist.log_prob(actions)

        # 計算 ratio (並確保不會 in-place 操作)
        ratio = torch.exp(new_log_probs - log_probs.detach())

        # 計算 advantage
        advantages = rewards - rewards.mean()

        # PPO 損失函數
        unclipped = ratio * advantages
        clipped = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages
        policy_loss = -torch.min(unclipped, clipped).mean()

        # 更新策略網路
        self.optimizer_policy.zero_grad()
        policy_loss.backward()
        self.optimizer_policy.step()

        self.memory = []



env = NiuNiuEnv()
agent = PPOAgent(input_dim=len(env.get_state()), action_dim=10)  # 確保動作空間是 10


In [53]:
num_episodes = 100  # 訓練 10,000 場遊戲
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action, log_prob = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)

        agent.store_transition(state, action, reward, log_prob, next_state)
        state = next_state
        total_reward += reward

    agent.train()  # 更新 PPO
    print(f"Episode {episode}, Total Reward: {total_reward}")

    # if episode % 100 == 0:
    #     print(f"Episode {episode}, Total Reward: {total_reward}")

# everage : 22.5s/epoch

Episode 0, Total Reward: -12
Episode 1, Total Reward: 8
Episode 2, Total Reward: -12
Episode 3, Total Reward: -24
Episode 4, Total Reward: -16
Episode 5, Total Reward: -12
Episode 6, Total Reward: -8
Episode 7, Total Reward: 8
Episode 8, Total Reward: 4
Episode 9, Total Reward: 16
Episode 10, Total Reward: -4
Episode 11, Total Reward: -8
Episode 12, Total Reward: -16
Episode 13, Total Reward: 4
Episode 14, Total Reward: -60
Episode 15, Total Reward: 8
Episode 16, Total Reward: 3
Episode 17, Total Reward: 5
Episode 18, Total Reward: -48
Episode 19, Total Reward: -16
Episode 20, Total Reward: 6
Episode 21, Total Reward: -8
Episode 22, Total Reward: 24
Episode 23, Total Reward: -4
Episode 24, Total Reward: 8
Episode 25, Total Reward: 4
Episode 26, Total Reward: -4
Episode 27, Total Reward: 20
Episode 28, Total Reward: -24
Episode 29, Total Reward: -4
Episode 30, Total Reward: -8
Episode 31, Total Reward: 4
Episode 32, Total Reward: -2
Episode 33, Total Reward: -4
Episode 34, Total Reward:

In [58]:
# 假設你的環境已經初始化
env = NiuNiuEnv()

# 假設你已經訓練好 PPO Agent
agent = PPOAgent(input_dim=env.get_state().shape[0], action_dim=10)

my_hand=[('diamond', 'K'), ('diamond', '9'), ('diamond', '6'), ('club', '4')]
# 測試：給定目前的手牌，讓 AI 決定下注策略
state = env.get_state()  # 取得手牌狀態
action, _ = agent.select_action(state)  # 讓 PPO 決定動作

# 解讀動作
if action < 5:
    print(f"AI 決定搶莊，倍率為 {action + 1} 倍")
else:
    print(f"AI 決定下注，倍率為 {action - 4} 倍")


AI 決定下注，倍率為 3 倍


In [43]:
class NiuNiuDecisionHelper:
    def __init__(self, agent, env):
        self.agent = agent
        self.env = env
        self.suit_map = {'diamond': 0, 'club': 1, 'heart': 2, 'spade': 3}
        self.rank_map = {'J': 11, 'Q': 12, 'K': 13, 'A': 14}
        self.rank_map.update({str(i): i for i in range(2, 11)})

    def preprocess_state(self, hand_cards):
        """
        將手牌轉換為數值型 NumPy 陣列
        :param hand_cards: 玩家的手牌 (ex: [('diamond', '7'), ('diamond', '9')])
        :return: 數值型 NumPy 陣列
        """
        numerical_hand = []
        for suit, rank in hand_cards:
            suit_num = self.suit_map[suit]
            rank_num = self.rank_map[rank]
            numerical_hand.extend([suit_num, rank_num])  
        return np.array(numerical_hand, dtype=np.float32)

    def decide_qiangzhuang(self, hand_cards):
        """
        根據手牌決定是否搶莊，並提供搶莊倍率 (1~4 倍) 或 不搶時下注倍率 (1~5 倍)
        """
        processed_state = self.preprocess_state(hand_cards)
        action, _ = self.agent.select_action(processed_state)

        if action < 4:  # 0~3 搶莊 (倍率 1~4)
            return True, action + 1
        else:  # 4~9 不搶莊 (倍率 1~5)
            return False, action - 3

    def compute_banker_loss(self, hand_cards, final_banker, qiangzhuang_multiplier):
        """
        計算搶莊失敗時應該下注的倍率
        :param hand_cards: 玩家的手牌
        :param final_banker: 是否成為莊家 (True/False)
        :param qiangzhuang_multiplier: 當初搶莊時的倍率
        :return: 搶莊失敗時，應該下注的倍率 (1~5 倍)，若成功成為莊家則回傳 None
        """
        if final_banker:
            return None  # 成功當莊，不用算賠率

        # **使用 AI 來決定下注倍率**
        processed_state = self.preprocess_state(hand_cards)
        action, _ = self.agent.select_action(processed_state)

        if action >= 4:  # 4~9 代表下注 (對應倍率 1~5)
            return action - 3
        else:
            return 1  # 預設為最低倍率 1 倍


In [49]:
env = NiuNiuEnv()
agent = PPOAgent(input_dim=8, action_dim=10)  # 輸入 8 維，輸出 10 個動作
decision_helper = NiuNiuDecisionHelper(agent, env)

hand_cards = [('diamond', 'J'), ('diamond', 'K'), ('diamond', '10'), ('club', '4')]
should_qiang, multiplier = decision_helper.decide_qiangzhuang(hand_cards)

if should_qiang:
    print(f"應該搶莊，建議倍率: {multiplier} 倍")
    final_banker = bool(int(input("最後是否成功成為莊家？(1: 是, 0: 否): ")))
    banker_loss = decision_helper.compute_banker_loss(hand_cards, final_banker, multiplier)
    if banker_loss:
        print(f"搶莊失敗，AI 建議下注倍率: {banker_loss} 倍")
else:
    print(f"不搶莊，建議下注倍率: {multiplier} 倍")


應該搶莊，建議倍率: 3 倍
搶莊失敗，AI 建議下注倍率: 1 倍


In [None]:
# PPO 代理人
class PPOAgent:
    def __init__(self, input_dim, action_dim, lr=0.002, gamma=0.99, epsilon=0.2, update_steps=5):
        self.policy = PolicyNetwork(input_dim, action_dim)  # ✅ 確保這裡有 PolicyNetwork
        self.value = ValueNetwork(input_dim)
        self.optimizer_policy = optim.Adam(self.policy.parameters(), lr=lr)
        self.optimizer_value = optim.Adam(self.value.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = epsilon
        self.update_steps = update_steps
        self.memory = []
    
    def select_action(self, state):
        """ 使用策略網絡選擇行動 """
        state = torch.FloatTensor(state).unsqueeze(0)  # 加維度 (batch=1)
        probs = self.policy(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)
    
    def store_transition(self, state, action, reward, log_prob, next_state):
        """ 存儲交互數據 """
        self.memory.append((state, action, reward, log_prob, next_state))

    def compute_discounted_rewards(self, rewards):
        """ 計算折扣回報 G_t """
        discounted_rewards = []
        G = 0
        for r in reversed(rewards):
            G = r + self.gamma * G
            discounted_rewards.insert(0, G)
        return torch.FloatTensor(discounted_rewards)

    def train(self):
        """ 使用 PPO 來訓練策略網絡和價值網絡 """
        if len(self.memory) == 0:
            return
        
        # 1. 解析記憶
        states, actions, rewards, log_probs, next_states = zip(*self.memory)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        log_probs = torch.stack(log_probs)
        rewards = self.compute_discounted_rewards(rewards)
        
        # 2. 計算優勢值 Advantage = G_t - V(s)
        values = self.value(states).squeeze()
        advantages = rewards - values.detach().clone()  # 使用 clone() 來避免原地修改

        # 3. 更新策略網絡 (PPO Loss)
        for _ in range(self.update_steps):  # 重複多次更新
            probs = self.policy(states)
            dist = Categorical(probs)
            new_log_probs = dist.log_prob(actions)
            
            ratio = torch.exp(new_log_probs - log_probs)  # 重要性權重
            clipped_ratio = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
            policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()
            
            self.optimizer_policy.zero_grad()
            policy_loss.backward(retain_graph=True)  # 確保計算圖不會被釋放
            self.optimizer_policy.step()

        # 4. 更新價值網絡 (MSE 損失)
        value_loss = (self.value(states).squeeze() - rewards).pow(2).mean()
        self.optimizer_value.zero_grad()
        value_loss.backward()
        self.optimizer_value.step()

        # 5. 清空記憶
        self.memory = []


In [117]:
env = NiuNiuEnv()
agent = PPOAgent(input_dim=len(env.get_state()), action_dim=10)

num_episodes = 10  # 進行 10 次訓練

for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0
    
    while not done:
        action, log_prob = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)  # 注意這裡解構返回值
        
        agent.store_transition(state, action, reward, log_prob, next_state)
        
        state = next_state
        total_reward += reward  # 確保 reward 是數字

    agent.train()  # 訓練一次

    if episode % 10 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")

# 最後的訓練結果
print("Training completed.")


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 10]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [64]:
# 訓練 PPO
env = NiuNiuEnv()
agent = PPOAgent(input_dim=4, action_dim=10)
num_episodes = 10000

for episode in range(num_episodes):
    state = env.reset()
    memory = []
    done = False
    
    while not done:
        action, log_prob = agent.select_action(state)
        
        # 呼叫 step() 並獲得下一步狀態和獎勳
        next_state, reward, done = env.step(action)
        
        memory.append((state, action, reward, log_prob, next_state))
        state = next_state
    
    agent.train(memory)
    
    if episode % 10 == 0:
        print(f"Episode {episode}, Last Reward: {reward}")

print("Training complete!")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x10 and 4x64)

In [65]:
# 測試主程式
if __name__ == "__main__":
    # 初始化遊戲環境和代理
    env = NiuNiuEnv()
    agent = PPOAgent(input_dim=4, action_dim=10)
    
    # 測試 1: 使用代理選擇行為
    state = env.reset()
    action, log_prob = agent.select_action(state)
    print(f"Chosen Action: {action}, Log Probability: {log_prob}")
    
    # 測試 2: 執行一步遊戲並查看結果
    is_banker = (action < 5)  # 假設選擇 0-4 為搶莊，5-9 為下注
    next_state, reward, done = env.step(action, is_banker)
    print(f"Next State: {next_state}, Reward: {reward}, Done: {done}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x10 and 4x64)

In [None]:
# 假設模型已經訓練完成並保存在 agent 中
def get_action_from_model(hand, agent, is_banker=False):
    """
    根據手牌與模型決策是否搶莊，並在搶莊與否時決定下注策略。
    hand: 玩家手牌
    agent: 訓練好的PPOAgent
    is_banker: 是否為莊家，True表示是莊家，False表示閒家
    """
    # 將手牌轉換為模型的狀態向量
    state = np.array([card_value(card) for card in hand], dtype=np.float32)  # 轉換為數字狀態
    
    # 如果是閒家，先決定是否搶莊（選擇動作）
    if not is_banker:
        action, _ = agent.select_action(state)  # 根據狀態選擇動作
        if action < 5:  # 如果選擇的動作是搶莊（0-4表示搶莊）
            is_banker = True
            print("決定搶莊！")
        else:
            print("決定不搶莊，選擇下注。")
    
    # 根據是否搶莊來決定下注
    if is_banker:
        # 如果是莊家，決定下注策略
        action, _ = agent.select_action(state)  # 莊家可以下注的動作範圍是 0-4
        print(f"作為莊家，下注倍率為 {action % 5 + 1}")
    else:
        # 如果是閒家，根據手牌決定下注
        action, _ = agent.select_action(state)  # 閒家的下注動作範圍是 5-9
        print(f"作為閒家，下注倍率為 {action % 5 + 1}")

    return action

# 假設我們有一副手牌
player_hand = ['3♠', '7♣', 'K♦', '9♥']  # 玩家手牌

# 使用訓練完成的模型來決定策略
get_action_from_model(player_hand, agent)


In [37]:
player_hand = [('heart', '9'), ('diamond', 'J'), ('club', '3'), ('spade', '6'), ('heart', '2')]
banker_hand = [('spade', '10'), ('club', 'J'), ('heart', '4'), ('diamond', '6'), ('diamond', '2')]
print(type(calculate_payout(player_hand, banker_hand, verbose=False)))

<class 'int'>
