In [2]:
# Modifying the Reward Structure
import numpy as np

states = ['A', 'B', 'C', 'D']
transition_matrix = {'A': 'B', 'B': 'C', 'C': 'D', 'D': 'A'}

rewards = {'A': 2, 'B': 0, 'C': -2, 'D': 1}

policy = {'A': 'B', 'B': 'C', 'C': 'D', 'D': 'A'}

gamma = 0.9
value_function = {s: 0 for s in states}
iterations = 10

def bellman_update(state, value_function, gamma):
    next_state = policy[state]
    return rewards[state] + gamma * value_function[next_state]

for _ in range(iterations):
    for s in states:
        value_function[s] = bellman_update(s, value_function, gamma)

print("Estimated Value Function:")
for state, value in value_function.items():
    print(f"Value({state}) = {value:.2f}")

Estimated Value Function:
Value(A) = 2.88
Value(B) = 0.98
Value(C) = 0.52
Value(D) = 3.59


In [12]:
# Implementing a Probabilistic Policy
import numpy as np

states = ['A', 'B', 'C', 'D']
transitions = {'A': {'B':0.8, 'C':0.2}, 'B': {'C':1.0}, 'C': {'D':0.9, 'A':0.1}, 'D': {'A':1.0}}

# rewards = {'A': 2, 'B': 0, 'C': -2, 'D': 1}

# policy = {'A': 0.8*'B'+0.2*'C', 'B': 'C', 'C': 'D'*0.9+0.1*'A', 'D': 'A'}

gamma = 0.9
value_function = {s: 0 for s in states} #dictionary comprehension
iterations = 1000
threshold=0.01

def bellman_update(state, value_function, gamma):
  expected_val = 0
  for next_state, prob in transitions[state].items():
    expected_val += prob*(rewards[state]+gamma*value_function[next_state])
  return expected_val

for i in range(iterations):
  maxdel = 0
  new_val_func = value_function.copy()
  for s in states:
    updated_val = bellman_update(s, value_function, gamma)
    maxdel = (maxdel, abs(updated_val - value_function[s]))
    new_val_func[s] = updated_val
  value_function=new_val_func

print("Estimated Value Function:")
for state, value in value_function.items():
    print(f"Value({state}) = {value:.2f}")

Estimated Value Function:
Value(A) = 3.15
Value(B) = 1.25
Value(C) = 1.39
Value(D) = 3.84
