In [18]:
from mdptoolbox import example
import mdptoolbox
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

In [12]:
# Load the toolbox's built-in `forest` example MDP. P is the transition
# array and R the reward array; both are passed to ValueIteration below.
P, R = example.forest()

In [13]:
# Check which container types the example returns for P and R.
print(type(P), type(R))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [14]:
# Inspect the array layouts: P prints as (2, 3, 3) and R as (3, 2).
print(P.shape, R.shape)

(2, 3, 3) (3, 2)


In [15]:
# IPython introspection (not plain Python): a trailing `?` displays the
# signature and docstring of ValueIteration.
mdptoolbox.mdp.ValueIteration?

Init signature:
mdptoolbox.mdp.ValueIteration(
    transitions,
    reward,
    discount,
    epsilon=0.01,
    max_iter=1000,
    initial_value=0,
)
Docstring:
A discounted MDP solved using the value iteration algorithm.

Description
-----------
ValueIteration applies the value iteration algorithm to solve a
discounted MDP. The algorithm consists of solving Bellman's equation
iteratively.
Iteration is stopped when an epsilon-optimal policy is found or after a
specified number (``max_iter``) of iterations.
This function uses verbose and si

In [16]:
# Solve the forest example with value iteration, using a discount of 0.9.
# Arguments are passed by keyword to make their roles explicit.
vi = mdptoolbox.mdp.ValueIteration(
    transitions=P,
    reward=R,
    discount=0.9,
)
vi.run()

In [17]:
# Display the policy found by value iteration (one action index per state).
vi.policy

(0, 0, 0)

In [19]:
gamma = 0.9  # discount factor

# Transition matrix of a 4-state Markov reward process (MRP):
# row s holds the probabilities of moving from state s to each next state.
# The last state only transitions to itself.
P = np.array([
    [0.5, 0.5, 0.0, 0.0],
    [0.3, 0.3, 0.4, 0.0],
    [0.0, 0.3, 0.3, 0.4],
    [0.0, 0.0, 0.0, 1.0]
])

r = np.array([2, 0, -1, 0])  # immediate reward of each state

# Bellman equation for an MRP: V = r + gamma * P @ V, i.e. (I - gamma * P) V = r.
# Solve that linear system directly instead of forming the explicit inverse:
# np.linalg.solve is cheaper and numerically more accurate than inv(...).dot(r).
I = np.eye(P.shape[0])  # sized from P rather than a hard-coded 4
V = np.linalg.solve(I - gamma * P, r)

print("各状态价值 V =", V)

各状态价值 V = [ 4.69988075  1.29985425 -0.889095    0.        ]


In [36]:
gamma = 0.9  # discount factor

# Transition matrix under action 0 ("Nap"): row = current state, column = next state.
P0 = np.array([
    [0.9, 0.1, 0.0, 0.0],
    [0.1, 0.8, 0.1, 0.0],
    [0.0, 0.2, 0.7, 0.1],
    [0.0, 0.0, 0.0, 1.0],
])

# Transition matrix under action 1 ("Roam"), same layout as P0.
P1 = np.array([
    [0.5, 0.5, 0.0, 0.0],
    [0.3, 0.3, 0.4, 0.0],
    [0.0, 0.3, 0.3, 0.4],
    [0.0, 0.0, 0.0, 1.0],
])

# Bundle the per-action matrices so they are indexed as P[a][s][s'].
# pymdptoolbox accepts a list of length n_actions whose elements are
# (n_states x n_states) matrices.
P = [P0, P1]

# Rewards, one row per action and one column per state.
R = np.array([
    [2, 0, -1, 0],  # action 0
    [1, 0, -2, 0],  # action 1
])

# Swap the axes so rewards are indexed [state, action]: (2, 4) -> (4, 2).
R_all_actions = R.T

print(f'P0 shape: {P0.shape}\nP1 shape: {P1.shape}\nP shape: {np.array(P).shape}\nR shape: {R.shape}\nR_all_actions shape: {R_all_actions.shape}')

P0 shape: (4, 4)
P1 shape: (4, 4)
P shape: (2, 4, 4)
R shape: (2, 4)
R_all_actions shape: (4, 2)


In [37]:
# Run value iteration on the two-action model defined above.
mdp = mdptoolbox.mdp.ValueIteration(
    transitions=P,
    reward=R_all_actions,
    discount=gamma,
)
mdp.run()

# mdp.policy holds the chosen action index for each state; mdp.V the state values.
for label, result in (
    ("Optimal policy:", mdp.policy),
    ("Optimal value function:", mdp.V),
):
    print(label, result)

Optimal policy: (0, 1, 0, 0)
Optimal value function: (12.5927089168427, 4.372077720880815, -0.5770590796636772, 0.0)


![Week 4 lab figure](https://github.com/jiaowoguanren0615/FIT-5226/blob/main/images/week4_lab.jpg?raw=true)

In [55]:
# Transition matrix of a single-action chain over five states, indexed in the
# order: home, shelter, dead, city, market.
# Row = current state, column = next state.
P_single = np.array([
    [0.5, 0.0, 0.1, 0.4, 0.0],  # from home
    [0.0, 0.2, 0.2, 0.2, 0.4],  # from shelter
    [0.0, 0.0, 1.0, 0.0, 0.0],  # from dead (absorbing)
    [0.1, 0.0, 0.4, 0.5, 0.0],  # from city
    [0.0, 0.0, 0.4, 0.0, 0.6],  # from market
])

# Per-state rewards, in the same state order as the rows of P_single.
R_single = np.array([0.4, 0.2, 0.0, 0.3, 0.7])

In [56]:
# Confirm the raw layouts before adding the action axis: (5, 5) and (5,).
print(P_single.shape, R_single.shape)

(5, 5) (5,)


In [57]:
# Give the transition matrix a leading action axis of length 1:
# (5, 5) -> (1, 5, 5). reshape is used instead of np.expand_dims so that
# re-running this cell is a no-op; expand_dims would add another axis on
# every execution, giving (1, 1, 5, 5) on the second run.
P_single = P_single.reshape((1,) + P_single.shape[-2:])
P_single.shape

(1, 5, 5)

In [58]:
# Turn the reward vector into a single-column matrix: (5,) -> (5, 1),
# i.e. one column for the model's only action. reshape(-1, 1) is idempotent,
# whereas np.expand_dims would add another axis each time the cell is re-run.
R_single = R_single.reshape(-1, 1)
R_single.shape

(5, 1)

In [59]:
gamma = 0.9  # discount factor

# Run value iteration on the single-action model. With only one action
# available the policy is necessarily action 0 in every state.
mdp = mdptoolbox.mdp.ValueIteration(
    transitions=P_single,
    reward=R_single,
    discount=gamma,
)
mdp.run()

for label, result in (
    ("Optimal policy:", mdp.policy),
    ("Optimal value function:", mdp.V),
):
    print(label, result)

Optimal policy: (0, 0, 0, 0, 0)
Optimal value function: (1.213044130367309, 1.0749256662104465, 0.0, 0.7435083661488171, 1.5215918155583958)
