## Imports

In [None]:
import numpy as np
from pprint import pprint

## Infrastructure

In [None]:
# Container for the components of a Markov Decision Process (MDP)
class MDP:
	"""Bundle the five components that define a Markov Decision Process.

	The constructor stores each argument, unchanged, on the attribute of the
	same name:

	states  -- all possible states
	actions -- all possible actions
	P       -- transition probability function, called as P(s, a, s')
	R       -- reward function, called as R(s, a, s')
	gamma   -- discount factor (importance of future rewards)
	"""

	def __init__(self, states, actions, P, R, gamma):
		# The environment: where the agent can be and what it can do.
		self.states, self.actions = states, actions
		# The dynamics: how likely a transition is and what it pays.
		self.P, self.R = P, R
		# How strongly future rewards are discounted.
		self.gamma = gamma


# Expected discounted return of taking action a in state s, given utilities U
def Q_value(mdp, s, a, U):
	"""Return the Q-value of the state-action pair (s, a).

	Adds up, over every candidate successor s' in mdp.states, the term
	P(s, a, s') * (R(s, a, s') + gamma * U[s']).
	"""
	def weighted_return(successor):
		# One successor's share: probability times (reward + discounted utility).
		probability = mdp.P(s, a, successor)
		payoff = mdp.R(s, a, successor) + mdp.gamma*U[successor]
		return probability*payoff

	# Accumulate with the built-in sum() so the floating-point result is the
	# same as summing the terms directly in one expression.
	return sum(map(weighted_return, mdp.states))


# Get the maximum difference between two sets of utilities
def max_diff_utilities(U0, U1):
	"""Return the largest absolute per-state difference between U0 and U1.

	U0 and U1 map states to utilities. Only the keys of U0 are compared, so
	each of them must also be present in U1 (otherwise KeyError is raised);
	keys that appear only in U1 are ignored.

	Returns 0 when U0 is empty: with no states there is nothing that differs.
	"""
	# default=0 covers the empty case, where a bare max() would raise ValueError.
	return max((abs(U0[s] - U1[s]) for s in U0), default=0)


# Value Iteration algorithm to find optimal utility values
def value_iteration(mdp, epsilon=0.01, max_iterations=100):
	"""Run value iteration on mdp and return (utilities, iteration_index).

	Every state's utility starts at -100. Each sweep replaces a state's
	utility with the best Q-value over mdp.actions, computed from the
	previous sweep's utilities.

	Parameters:
	mdp            -- object exposing states, actions, P, R and gamma
	epsilon        -- stop once no utility changed by more than this in a sweep
	max_iterations -- upper bound on the number of sweeps; must be >= 1

	Returns a tuple (U, n): U maps each state to its utility after the last
	sweep performed, and n is the zero-based index of that sweep. If the
	epsilon criterion is never met, n equals max_iterations - 1.

	Raises ValueError if max_iterations is less than 1.
	"""
	# Without at least one sweep there is no result to return; fail clearly
	# instead of hitting unbound local variables after an empty loop.
	if max_iterations < 1:
		raise ValueError("max_iterations must be at least 1")
	U = {s: -100 for s in mdp.states}
	for iteration_num in range(max_iterations):
		# Synchronous update: every new value is computed from the old U.
		U_next = { state: max([Q_value(mdp, state, a, U) for a in mdp.actions]) for state in mdp.states }
		if max_diff_utilities(U, U_next) <= epsilon:
			return U_next, iteration_num
		U = U_next
	# Sweep budget exhausted without meeting the epsilon criterion.
	return U_next, iteration_num


# Derive the best policy from utility values
def policy_from_utility(mdp, U):
	"""Return a dict mapping every state in mdp.states to its greedy action.

	For each state the Q-value of every action in mdp.actions is computed
	once from the utilities U. If no action has a Q-value above -100
	(which includes the case of no actions at all), the state is mapped to
	the string 'none'. Otherwise it is mapped to the action with the highest
	Q-value; among equally good actions the first in mdp.actions wins.
	"""
	policy = {}
	for state in mdp.states:
		# Score each action a single time; the scores feed both the 'none'
		# check and the arg-max below.
		scored = [(Q_value(mdp, state, a, U), a) for a in mdp.actions]
		# NOTE(review): -100 presumably mirrors the initial utility used by
		# value_iteration -- confirm before changing either constant.
		if all(q <= -100 for q, _ in scored):
			policy[state] = 'none'
		else:
			# Compare on the score only, so ties resolve by action order.
			policy[state] = max(scored, key=lambda pair: pair[0])[1]
	return policy

## Define the MDP

In [None]:
# Define transition probabilities for each state and action


# Define the probability of transitioning from one state to another


# Define the rewards for reaching different terminal states


# Instantiate the MDP
# TODO: None is only a placeholder. Replace it with
# MDP(states, actions, P, R, gamma) built from the definitions above;
# the value-iteration cell passes mdp to value_iteration, which reads
# mdp.states and therefore fails on None.
mdp = None

## Perform Value Iteration

In [None]:
# Solve the MDP with value iteration (epsilon = 0.01), derive the greedy
# policy from the resulting utilities, and print both.
print("Value Iteration")
print("----------------------")
# U maps each state to its utility; i is the zero-based index of the last
# sweep that value_iteration performed.
U, i = value_iteration(mdp, 0.01)
# policy maps each state to an action (or to 'none').
policy = policy_from_utility(mdp, U)
print(f"Iterations: {i}")
print("Utility:")
pprint(U)
print("Policy:")
pprint(policy)

Value Iteration
----------------------
Iterations: 55
Utility:
{(0, 0): 19.913687108245117,
 (0, 1): 19.913687108245117,
 (0, 2): 19.913687108245117,
 (1, 0): 19.913687108245117,
 (1, 2): 19.913687108245117,
 (2, 0): 19.9136868862271,
 (2, 1): 19.91368616776171,
 (2, 2): 19.9136868862271,
 (3, 0): 19.913685951359735,
 (3, 1): 0.0,
 (3, 2): 0.0}
Policy:
{(0, 0): 'up',
 (0, 1): 'up',
 (0, 2): 'up',
 (1, 0): 'left',
 (1, 2): 'left',
 (2, 0): 'left',
 (2, 1): 'left',
 (2, 2): 'left',
 (3, 0): 'down',
 (3, 1): 'up',
 (3, 2): 'up'}
