<a href="https://colab.research.google.com/github/johnjustine5646/RL/blob/main/Lab3_RL_522.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [2]:

class MDP:
    """Finite Markov decision process solved with value iteration.

    The model is described with plain containers supplied by the caller:

    * ``states`` -- collection of hashable state identifiers (iterated
      several times, so it must be re-iterable).
    * ``actions`` -- collection of hashable action identifiers.
    * ``transitions`` -- ``transitions[state][action]`` is a sequence of
      ``(probability, next_state)`` pairs.  A state that is missing from
      this mapping is treated as terminal.
    * ``rewards`` -- ``rewards[state][action]`` is the immediate reward for
      taking ``action`` in ``state``.
    * ``gamma`` -- discount factor applied to successor-state values.

    ``values`` holds the current value estimate for every state and starts
    at 0 everywhere.
    """

    def __init__(self, states, actions, transitions, rewards, gamma=0.9):
        self.states = states
        self.actions = actions
        self.transitions = transitions
        self.rewards = rewards
        self.gamma = gamma
        self.values = {s: 0 for s in states}  # Initialize value function to 0 for all states

    def _available_actions(self, state):
        """Return the actions from ``self.actions`` that ``state`` defines.

        ``state`` must be a key of ``self.transitions``.  Order follows
        ``self.actions`` so that tie-breaking in ``get_policy`` is stable.
        """
        state_transitions = self.transitions[state]
        return [action for action in self.actions if action in state_transitions]

    def _action_value(self, state, action):
        """Return the expected discounted return of ``action`` in ``state``.

        Computed from the current ``self.values`` as
        ``sum(prob * (reward + gamma * V[next_state]))`` over the outcomes
        listed in ``self.transitions[state][action]``.
        """
        outcomes = self.transitions[state][action]
        if not outcomes:
            # No outcomes listed: contributes nothing, and the reward table
            # is deliberately not consulted (it may have no entry here).
            return 0

        # The reward depends only on (state, action), so look it up once
        # instead of once per outcome.
        reward = self.rewards[state][action]
        return sum(
            prob * (reward + self.gamma * self.values[next_state])
            for prob, next_state in outcomes
        )

    def value_iteration(self, theta=0.0001):
        """Run value iteration in place until the values stop changing.

        Each sweep visits the states in order and overwrites
        ``self.values[state]`` immediately, so states later in the same
        sweep already see the updated estimates.  Sweeping stops once the
        largest absolute change in a sweep is below ``theta``.

        States without an entry in ``self.transitions``, and states that
        define none of ``self.actions``, are treated as terminal and keep
        their current value.

        NOTE(review): there is no iteration cap; termination relies on the
        updates contracting (the usual assumption is ``gamma < 1``).
        """
        while True:
            delta = 0  # Largest value change seen in this sweep
            for state in self.states:
                if state not in self.transitions:  # Skip terminal states
                    continue

                available = self._available_actions(state)
                if not available:
                    # Nothing can be done from here; leaving the value alone
                    # avoids writing -inf into the value table.
                    continue

                best_value = max(
                    self._action_value(state, action) for action in available
                )

                # Update value function and compute difference for convergence
                delta = max(delta, abs(self.values[state] - best_value))
                self.values[state] = best_value

            # Check if values have converged
            if delta < theta:
                break

    def get_policy(self):
        """Return the greedy policy for the current ``self.values``.

        The result maps every state to the action with the highest action
        value.  Ties go to the action that appears first in
        ``self.actions``.  Terminal states (no transitions entry, or no
        available action) map to ``None``.
        """
        policy = {}
        for state in self.states:
            if state not in self.transitions:  # Skip terminal states
                policy[state] = None
                continue

            # Choose the best action based on the current value function
            best_action = None
            best_value = float('-inf')
            for action in self._available_actions(state):
                action_value = self._action_value(state, action)
                # Strict comparison keeps the earliest action on ties.
                if action_value > best_value:
                    best_value = action_value
                    best_action = action

            policy[state] = best_action

        return policy


In [3]:
# --- Problem definition -----------------------------------------------------
# Three decision states plus an absorbing 'terminal' state (it has no entry in
# `transitions`, which is how the solver recognises it as terminal).
states = ['s1', 's2', 's3', 'terminal']
actions = ['a1', 'a2']

# Immediate reward for taking each action in each state.
rewards = {
    's1': {'a1': -1, 'a2': 0},
    's2': {'a1': -1, 'a2': 1},
    's3': {'a1': 1, 'a2': -1},
}

# transitions[state][action] -> list of (probability, next_state).
# Every move in this example is deterministic (probability 1.0).
transitions = {
    's1': {'a1': [(1.0, 's2')], 'a2': [(1.0, 's3')]},
    's2': {'a1': [(1.0, 's1')], 'a2': [(1.0, 'terminal')]},
    's3': {'a1': [(1.0, 'terminal')], 'a2': [(1.0, 's1')]},
}

# --- Solve ------------------------------------------------------------------
mdp = MDP(
    states=states,
    actions=actions,
    transitions=transitions,
    rewards=rewards,
    gamma=0.9,
)
mdp.value_iteration()
policy = mdp.get_policy()

# --- Report -----------------------------------------------------------------
print("Optimal Values:", mdp.values)
print("Optimal Policy:", policy)

Optimal Values: {'s1': 0.9, 's2': 1.0, 's3': 1.0, 'terminal': 0}
Optimal Policy: {'s1': 'a2', 's2': 'a2', 's3': 'a1', 'terminal': None}
