In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

A Markov decision process (MDP) is a framework for sequential decision-making problems in which the outcome of each step is partly random and partly under the control of the decision maker (the agent).

An MDP is described by the following components:

1. States: A set of possible situations the agent can be in.

2. Actions: A set of possible actions available to the agent.

3. Transition model: A function that gives the probability of transitioning from one state to another, given an action.

4. Reward function: A function that gives the reward received by the agent after taking an action in a state.

5. Policy: A mapping from states to actions.

6. Value function: A function that gives the value of being in a state, given a policy.


In [2]:
import numpy as np

class Mdp:
    """A finite Markov decision process bundled with a fixed policy.

    The constructor stores its arguments unchanged:

    states          -- sequence of hashable state labels.
    actions         -- sequence of action labels (stored; not read by
                       ``simulate``).
    transition_prob -- dict mapping ``(state, action, next_state)`` to a
                       non-negative weight; missing keys count as 0.
    rewards         -- dict mapping ``(state, action, next_state)`` to a
                       numeric reward; missing keys count as 0.
    policy          -- dict mapping each state to the action taken there.
    gamma           -- discount factor applied per time step.
    """

    def __init__(self, states, actions, transition_prob, rewards, policy, gamma=0.9):
        self.states = states
        self.actions = actions
        self.transition_prob = transition_prob
        self.rewards = rewards
        self.policy = policy
        self.gamma = gamma

    def simulate(self, start_state, horizon=10, rng=None, verbose=True):
        """Roll the policy forward and return the discounted sum of rewards.

        Parameters
        ----------
        start_state : hashable
            State the rollout begins in; must be a key of ``self.policy``.
        horizon : int, default 10
            Number of transitions to sample.
        rng : object with a ``choice(n, p=...)`` method, optional
            For example ``np.random.default_rng(seed)``. When omitted, the
            global ``np.random`` state is used.
        verbose : bool, default True
            Print a trace of every step and the final total.

        Returns
        -------
        The sum over ``t`` of ``gamma ** t * reward_t``.

        Raises
        ------
        KeyError
            If the policy has no entry for a visited state.
        ValueError
            If the weights for the current ``(state, action)`` pair do not
            sum to a positive number, so no next state can be sampled.
        """
        sampler = np.random if rng is None else rng
        candidates = list(self.states)

        state = start_state
        total_reward = 0
        if verbose:
            print(f"Starting simulation at state: {state}")

        for t in range(horizon):
            action = self.policy[state]
            if verbose:
                print(f"Time step {t + 1}: State = {state}, Action = {action}")

            weights = np.array(
                [self.transition_prob.get((state, action, s_prime), 0) for s_prime in candidates],
                dtype=float,
            )
            mass = weights.sum()
            # `not mass > 0` also rejects NaN; without this guard the
            # normalisation below would divide by zero.
            if not mass > 0:
                raise ValueError(
                    f"No transition probabilities defined for state {state!r} "
                    f"under action {action!r}; cannot sample a next state."
                )

            # Sample a position rather than the label itself so that states of
            # any hashable type (e.g. tuples) work and keep their original type.
            pick = int(sampler.choice(len(candidates), p=weights / mass))
            next_state = candidates[pick]

            # Reward for the sampled transition; unspecified transitions pay 0.
            reward = self.rewards.get((state, action, next_state), 0)
            if verbose:
                print(f"Transition to state: {next_state} with reward: {reward}")

            # Accumulate discounted reward
            total_reward += (self.gamma ** t) * reward

            state = next_state

        if verbose:
            print(f"Total discounted reward: {total_reward}")
        return total_reward

# Demo: a three-state MDP rolled out under a fixed policy

states = ['A', 'B', 'C']
actions = ['left', 'right']

# Transition model P(s' | s, a), keyed by (s, a, s').
transition_prob = {
    ('A', 'left', 'A'): 0.8,
    ('A', 'left', 'B'): 0.2,
    ('A', 'right', 'C'): 1.0,
    ('B', 'left', 'A'): 1.0,
    ('B', 'right', 'C'): 1.0,
    ('C', 'left', 'A'): 1.0,
    ('C', 'right', 'B'): 1.0,
}

# Reward R(s, a, s') paid on each transition, keyed the same way.
rewards = {
    ('A', 'left', 'A'): 1,
    ('A', 'left', 'B'): -1,
    ('A', 'right', 'C'): 2,
    ('B', 'left', 'A'): 0,
    ('B', 'right', 'C'): 1,
    ('C', 'left', 'A'): 0,
    ('C', 'right', 'B'): 1,
}

# Hand-picked policy π(s) -> a. Starting from 'A' it only ever uses the
# probability-1.0 transitions A -> C and C -> A.
policy = {'A': 'right', 'B': 'right', 'C': 'left'}

# Bundle everything into an Mdp with discount factor 0.9.
mdp = Mdp(
    states=states,
    actions=actions,
    transition_prob=transition_prob,
    rewards=rewards,
    policy=policy,
    gamma=0.9,
)

# Roll out 10 time steps from state 'A' and keep the discounted return.
total_reward = mdp.simulate('A', 10)


Starting simulation at state: A
Time step 1: State = A, Action = right
Transition to state: C with reward: 2
Time step 2: State = C, Action = left
Transition to state: A with reward: 0
Time step 3: State = A, Action = right
Transition to state: C with reward: 2
Time step 4: State = C, Action = left
Transition to state: A with reward: 0
Time step 5: State = A, Action = right
Transition to state: C with reward: 2
Time step 6: State = C, Action = left
Transition to state: A with reward: 0
Time step 7: State = A, Action = right
Transition to state: C with reward: 2
Time step 8: State = C, Action = left
Transition to state: A with reward: 0
Time step 9: State = A, Action = right
Transition to state: C with reward: 2
Time step 10: State = C, Action = left
Transition to state: A with reward: 0
Total discounted reward: 6.8560164200000004


To understand an MDP, we first need to understand the Markov process, which is defined by a set of states and a transition probability. A Markov process produces a sequence of states $s_1, s_2, \dots, s_n$, and the transition probability is the probability of jumping to a given next state from the current state.
 
 


Markov reward process
The Markov reward process is defined by the reward, the transition probability, and the discount factor $\gamma$ (gamma).
The state reward $R(s)$ is the expected reward over all the possible states that one can transition to from state $s$. This reward is received for being in state $s$.


Optimal Policy: [1 0 0]
Value Function: [34.8225197  27.58139372  0.        ]
