<a href="https://colab.research.google.com/github/kevinkao777/Deep-Reinforcement-Learning-Book/blob/master/%E9%A6%AC%E7%88%BE%E5%8F%AF%E5%A4%AB%E6%B1%BA%E7%AD%96%E9%81%8E%E7%A8%8B_(MDP)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

class MDP:
    def __init__(self, states, actions, transition_probs, rewards, discount_factor):
        self.states = states
        self.actions = actions
        self.transition_probs = transition_probs
        self.rewards = rewards
        self.discount_factor = discount_factor
        
    def value_iteration(self, error_tolerance=1e-3):
        values = {state: 0 for state in self.states}
        while True:
            new_values = {state: 0 for state in self.states}
            for state in self.states:
                action_values = []
                for action in self.actions:
                    action_value = 0
                    for next_state, prob in self.transition_probs[state][action].items():
                        action_value += prob * (self.rewards[state][action][next_state] + 
                                                self.discount_factor * values[next_state])
                    action_values.append(action_value)
                new_values[state] = max(action_values)
            delta = max([abs(new_values[state] - values[state]) for state in self.states])
            if delta < error_tolerance:
                break
            values = new_values
        return values

states = [0, 1, 2, 3]
actions = [0, 1]

transition_probs = {
    0: {
        0: {0: 1.0},
        1: {2: 1.0},
    },
    1: {
        0: {0: 0.5, 1: 0.5},
        1: {2: 1.0},
    },
    2: {
        0: {0: 0.5, 3: 0.5},
        1: {3: 1.0},
    },
    3: {
        0: {3: 1.0},
        1: {3: 1.0},
    },
}

rewards = {
    0: {0: {0: 2.0}, 1: {2: 2.0}},
    1: {0: {0: 2.0, 1: 2.0}, 1: {2: 2.0}},
    2: {0: {0: 2.0, 3: 2.0}, 1: {3: 2.0}},
    3: {0: {3: 0.0}, 1: {3: 0.0}},
}

discount_factor = 0.95

mdp = MDP(states, actions, transition_probs, rewards, discount_factor)
print(mdp.value_iteration())


{0: 39.98081873791644, 1: 39.98081873791644, 2: 20.99040936895822, 3: 0.0}


這個答案是一個字典，其中鍵為0,1,2,3，值為39.98081873791644，39.98081873791644，20.99040936895822和0.0。

這個字典可能是一個結果，表示在馬爾可夫決策過程中，不同狀態（在這種情況下可能為0,1,2,3）對於給定的報酬函數的值的預測。具體的含義取決於實際的問題定義和使用的方法。