Markov descision process with policy evaluation 
**Markov Decision Process** is defined as a stochastic decision making process which uses a mathematical framework for modelling the decision-making of a dynamic system where the outcomes are either random or under the control of a decision maker (agent). 

A **Markov Decision Process (MDP)** consists of the following components:

- **State Space (S)**: A set of possible states of the system.
- **Actions (A)**: A list of actions the agent can perform.
- **Transition Probability (T)**: This represents the probability of transitioning from one state to another state when an action is performed. 
Transition Probability written as $ P(s' | s, a)$, which is the probability of ending in state $ s' $ after taking an action $ a $ in state $ s $.
- **Reward Function (R)**: The reward attained after transitioning to a new state.
- **Discount Factor ($ \gamma $)**: A value ranging from 0.0 to 1.0 that determines the future rewards compared to immediate rewards.


In [4]:
import numpy as np 

class MDP: 
    def __init__(self, states, actions, transition_probs, rewards, discount_factor=0.9):
        self.states = states
        self.actions = actions
        self.transition_probs = transition_probs
        self.rewards = rewards
        self.discount_factor = discount_factor
        self.value_table = np.zeros(len(states))  # Initialize value table with zeros
        self.policy = [0] * len(states) 



    def policy_evaluation(self,threshold=1e-6):
        while True:
            delta = 0
            for s in range(len(self.states)):
                v = self.value_table[s]
                self.value_table[s] = self.rewards[s] + self.discount_factor * np.sum([self.transition_probs[s][a] * self.value_table[self.states.index(next_state)]
                for a, next_state in zip(self.actions, self.transition_probs[s])])
                delta = max(delta, abs(v - self.value_table[s]))
                if delta < threshold:
                    break
    

    def policy_improvement(self):
        policy_stable=True
        for s in range(len(self.states)):
            oldaction=self.policy[s]
            # Compute action-values for all actions in the state
            action_values = []
            for a in self.actions:
                action_value = sum([p * (r + self.discount_factor * self.value_table[s_next])
                                    for p, s_next, r in self.transition_probs[s][a]])
                action_values.append(action_value)
            best_action = np.argmax(action_values)
            self.policy[s] = best_action
            if oldaction != best_action:
                policy_stable = False
        return policy_stable
    

    def policy_iteration(self):
        while True:
            self.policy_evaluation()
            if self.policy_improvement():
                break
        return self.policy, self.value_table
    
states = [0, 1, 2]  # Example states
actions = [0, 1]    # Example actions
# Transition probabilities and rewards in the form {state: {action: [(probability, next_state, reward), ...]}}
transition_probs = {
    0: {
        0: [(1.0, 0, 0)],  # Stay in the same state with reward 0
        1: [(1.0, 1, 1)],  # Move to state 1 with reward 1
    },
    1: {
        0: [(1.0, 2, 2)],
        1: [(1.0, 0, 0)],
    },
    2: {
        0: [(1.0, 1, 1)],
        1: [(1.0, 2, 2)],
    }
}
rewards = [0, 1, 2]  # Example rewards for each state
discount_factor = 0.9

# Initialize and run policy iteration
mdp = MDP(states, actions, transition_probs, rewards, discount_factor)
optimal_policy, optimal_values = mdp.policy_iteration()

print("Optimal Policy:", optimal_policy)
print("Optimal Value Function:", optimal_values)



TypeError: can't multiply sequence by non-int of type 'numpy.float64'

In [22]:
import numpy as np

class MDP:
    def __init__(self, states, actions, transition_prob, rewards, discount_factor=0.9):
        self.states = states
        self.actions = actions
        self.transition_probs = transition_prob
        self.rewards = rewards
        self.discount_factor = discount_factor
        self.value_table = np.zeros(len(states))  # Initialize value table with zeros
        self.policy = [0] * len(states)           # Initialize policy with zeros

    def policy_evaluation(self, threshold=1e-6):
        iteration = 0
        while True:
            delta = 0
            print(f"Policy Evaluation Iteration {iteration}")
            for s in self.states:
                v = self.value_table[s]
                a = self.policy[s]
                # Compute the value for the chosen action under the current policy
                self.value_table[s] = sum([p * (r + self.discount_factor * self.value_table[s_next])
                                           for p, s_next, r in self.transition_probs[s][a]])
                delta = max(delta, abs(v - self.value_table[s]))
            print(f"Value Table: {self.value_table}\n")
            if delta < threshold:
                break
            iteration += 1

    def policy_improvement(self):
        policy_stable = True
        iteration = 0
        while not policy_stable:
            policy_stable = True
            print(f"Policy Improvement Iteration {iteration}")
            for s in self.states:
                old_action = self.policy[s]
                # Compute action-values for all actions in the state
                action_values = []
                for a in self.actions:
                    action_value = sum([p * (r + self.discount_factor * self.value_table[s_next])
                                        for p, s_next, r in self.transition_probs[s][a]])
                    action_values.append(action_value)
                best_action = np.argmax(action_values)  # Choose the best action based on the action-values
                self.policy[s] = best_action
                if old_action != best_action:
                    policy_stable = False
            print(f"Policy: {self.policy}")
            print(f"Value Table: {self.value_table}\n")
            iteration += 1
        return policy_stable

    def policy_iteration(self):
        iteration = 0
        while True:
            print(f"Policy Iteration {iteration}")
            self.policy_evaluation()
            if self.policy_improvement():
                break
            iteration += 1
        return self.policy, self.value_table

# Define states, actions, transition probabilities, and rewards
states = [0, 1, 2]  # State indices
actions = [0, 1]    # Action indices, e.g., 0 and 1 could represent 'a1' and 'a2'

# Define transition probabilities and rewards
# transition_prob[state][action] = [(probability, next_state, reward), ...]
transition_prob = {
    0: {0: [(0.7, 0, 5), (0.3, 1, 5)], 1: [(1.0, 2, 10)]},
    1: {0: [(0.4, 0, -1), (0.6, 2, -1)], 1: [(1.0, 1, 2)]},
    2: {0: [(1.0, 2, 0)], 1: [(0.5, 0, 3), (0.5, 1, 3)]}
}

# Rewards can be accessed from the transition_probs directly for simplicity.

# Initialize and solve the MDP
mdp = MDP(states, actions, transition_prob, rewards=None)
optimal_policy, value_table = mdp.policy_iteration()

print("Optimal Policy:", optimal_policy)
print("Value Table:", value_table)


Policy Iteration 0
Policy Evaluation Iteration 0
Value Table: [5.  0.8 0. ]

Policy Evaluation Iteration 1
Value Table: [8.366   2.01176 0.     ]

Policy Evaluation Iteration 2
Value Table: [10.8137552   2.89295187  0.        ]

Policy Evaluation Iteration 3
Value Table: [12.59376278  3.5337546   0.        ]

Policy Evaluation Iteration 4
Value Table: [13.88818429  3.99974635  0.        ]

Policy Evaluation Iteration 5
Value Table: [14.82948762  4.33861554  0.        ]

Policy Evaluation Iteration 6
Value Table: [15.5140034   4.58504122  0.        ]

Policy Evaluation Iteration 7
Value Table: [16.01178327  4.76424198  0.        ]

Policy Evaluation Iteration 8
Value Table: [16.37376879  4.89455677  0.        ]

Policy Evaluation Iteration 9
Value Table: [16.63700467  4.98932168  0.        ]

Policy Evaluation Iteration 10
Value Table: [16.82842979  5.05823473  0.        ]

Policy Evaluation Iteration 11
Value Table: [16.96763415  5.10834829  0.        ]

Policy Evaluation Iteration 12


here we see that that the policy iteration remains unchanged i.e converging towards stable states  and the policy does not change 

## implementation of MDP using Grid Based 


In [23]:
import seaborn as sn


In [27]:
class MDP:
    def __init__(self,grid,transition_states,rewards,discount=0.9):
        self.grid = grid
        self.transition_states = transition_states
        self.rewards = rewards
        self.discount = discount
        self.action =["up", "down", "left", "right"]
    

    def transition_probabilities(self):
        # Transition probabilities for each state-action pair
        transition_probabilities = {}
        for row in range(self.grid_size):
            for col in range(self.grid_size):
                state = (row, col)
                transition_probabilities[state] = {}
                for action in self.actions:
                    transition_probabilities[state][action] = self.get_transitioning_state(state, action)
        return transition_probabilities
    

    def transition_states(self, state, action):
    # Transitioning state for a given state-action pair
        if action == "up" and state[0] > 0:
            return (state[0] - 1, state[1])
        elif action == "down" and state[0] < self.grid_size - 1:
            return (state[0] + 1, state[1])
        elif action == "left" and state[1] > 0:
            return (state[0], state[1] - 1)
        elif action == "right" and state[1] < self.grid_size - 1:
            return (state[0], state[1] + 1)
        else:
            return state 
        


    def exp_reward(self,state,action):
        # Expected reward for a given state-action pair
        return self.rewards.get((state, action), 0)
    

In [None]:
#calculate action value
