In [3]:
import numpy as np

# David Silver's Student Markov Chain
# States: C1 (Class1), C2 (Class2), C3 (Class3), FB (Facebook), Pub, Pass, Sleep


In [4]:
state_map = {
    "C1": "Class1",
    "C2": "Class2",
    "C3": "Class3",
    "FB": "Facebook",
    "Pub": "Pub",
    "Sleep": "Sleep",
    "Pass": "Pass",
}

states = ["Class1", "Class2", "Class3", "Facebook", "Pub", "Sleep", "Pass"]
terminal_states = ["Sleep", "Pass"]

transitions = {
    "Class1": [
        (0.5, "Class2"),  # Study
        (0.5, "Facebook"),  # Facebook
    ],
    "Class2": [
        (0.8, "Class3"),  # Study
        (0.2, "Sleep"),  # Sleep
    ],
    "Class3": [
        (0.6, "Pass"),
        (0.4, "Pub"),
    ],
    "Facebook": [
        (0.9, "Facebook"),  # Keep browsing
        (0.1, "Class1"),  # Quit
    ],
    "Pub": [
        (0.2, "Class1"),  # Leave pub, go to C1 (0.5 * 0.2)
        (0.4, "Class2"),  # Leave pub, go to C2 (0.5 * 0.4)
        (0.4, "Class3"),  # Leave pub, go to C3 (0.5 * 0.4)
    ],
    "Pass": [],  # Terminal state
    "Sleep": [],  # Terminal state
}


P = {s: [] for s in states}
for s, trans in transitions.items():
    # s = state_map[s_short]
    P[s] = [(p, s_next) for p, s_next in trans]

for t in terminal_states:
    P[t] = [(1.0, t)]  # we stay at terminal states

P

{'Class1': [(0.5, 'Class2'), (0.5, 'Facebook')],
 'Class2': [(0.8, 'Class3'), (0.2, 'Sleep')],
 'Class3': [(0.6, 'Pass'), (0.4, 'Pub')],
 'Facebook': [(0.9, 'Facebook'), (0.1, 'Class1')],
 'Pub': [(0.2, 'Class1'), (0.4, 'Class2'), (0.4, 'Class3')],
 'Sleep': [(1.0, 'Sleep')],
 'Pass': [(1.0, 'Pass')]}

In [5]:
r = {
    "Class1": -2.0,
    "Class2": -2.0,
    "Class3": -2.0,
    "Facebook": -1.0,
    "Pub": 1.0,
    "Sleep": 0.0,
    "Pass": 10.0,
}
r

{'Class1': -2.0,
 'Class2': -2.0,
 'Class3': -2.0,
 'Facebook': -1.0,
 'Pub': 1.0,
 'Sleep': 0.0,
 'Pass': 10.0}

In [6]:
def get_transitions(state):
    """Transition function for the MDP"""
    return P[state]


def reward(state) -> float:
    """Reward function for the MDP"""
    return float(r[state])

In [7]:
def step(state):
    """Take a step in the Markov Chain from current state"""
    if state in terminal_states:
        return state

    if state not in transitions:
        raise ValueError(f"Invalid state: {state}")

    # Get possible transitions
    possible_transitions = transitions[state]

    if not possible_transitions:
        return state

    # Sample next state based on probabilities
    probs = [t[0] for t in possible_transitions]
    idx = np.random.choice(len(possible_transitions), p=probs)
    _, next_state = possible_transitions[idx]

    return next_state


step("Class1")  # Example step

'Facebook'

In [8]:
def generate_trajectory(start_state, max_steps=20):
    """Generate a sample trajectory starting from start_state"""
    trajectory = [start_state]
    state = start_state

    for _ in range(max_steps):
        if state in terminal_states:
            break

        next_state = step(state)
        trajectory.append(next_state)
        state = next_state

    return trajectory


generate_trajectory("Class1")

['Class1', 'Class2', 'Class3', 'Pass']

In [9]:
gamma = 1.0
theta = 1e-12
max_iters = 30_000

V = {s: 0.0 for s in states}

# Terminal states yield their reward once and then the episode ends.
# So we set their values directly, and do not update them in the loop.
for t in terminal_states:
    V[t] = reward(t)

for _ in range(max_iters):
    delta = 0.0

    for s in states:
        if s in terminal_states:
            continue
        v_old = V[s]
        exp_next = sum(p * V[s_next] for p, s_next in get_transitions(s))
        V[s] = reward(s) + gamma * exp_next
        delta = max(delta, abs(v_old - V[s]))
    if delta < theta:
        break

V

{'Class1': -12.543209876532266,
 'Class2': 1.4567901234579068,
 'Class3': 4.320987654322317,
 'Facebook': -22.543209876523473,
 'Pub': 0.8024691358056364,
 'Sleep': 0.0,
 'Pass': 10.0}

In [12]:
## Analytical Value Calculation with bellman Equations


_rewards = [-2, -2, -2, 10, 1, -1, 0]
p_matrix = [
    [0, 0.5, 0, 0, 0, 0.5, 0],
    [0, 0, 0.8, 0, 0, 0, 0.2],
    [0, 0, 0, 0.6, 0.4, 0, 0],
    [0, 0, 0, 0, 0, 0, 1],
    [0.2, 0.4, 0.4, 0, 0, 0, 0],
    [0.1, 0, 0, 0, 0, 0.9, 0],
    [0, 0, 0, 0, 0, 0, 0],
]
gamma = 1
R = np.array(_rewards)
P = np.matrix(p_matrix)
I = np.identity(len(p_matrix))
solution = np.dot(np.linalg.inv((I - gamma * P)), R)
solution = solution.tolist()[0]
for state in range(len(states)):
    print(states[state], solution[state])


Class1 -12.543209876543214
Class2 1.4567901234567908
Class3 4.320987654320986
Facebook 10.0
Pub 0.8024691358024674
Sleep -22.543209876543223
Pass 0.0


In [None]:
# Transform MDP to a MRP

# Probabilities changed to reflect uniform random policy
# Notice Class 3 probabilities reflect possible "Pub" choice:
# (.5 * .2) = probability of picking pub (.5) AND
# probability of then being sent to class 1 (.2)
# Together they mean a total probability of (.5 * .2) = .1
# for ending up back in C1 from C3

state_names = ["C1", "C2", "C3", "FB", "Sleep"]

p_matrix = [
    [0, 0.5, 0, 0.5, 0],
    [0, 0, 0.5, 0, 0.5],
    [0.1, 0.2, 0.2, 0, 0.5],
    [0.5, 0, 0, 0.5, 0],
    [0, 0, 0, 0, 0],
]
# Action rewards are also weighted and summed by probability of occurring
# I.E: 5.5 = (.5 * 10) + (.5 * 1)
_rewards = [-1.5, -1, 5.5, -0.5, 0]
gamma = 1
R = np.array(_rewards)
P = np.matrix(p_matrix)
I = np.identity(len(p_matrix))
solution = np.dot(np.linalg.inv((I - gamma * P)), R)
solution = solution.tolist()[0]
for state in range(len(state_names)):
    print(state_names[state], solution[state])

C1 -1.307692307692308
C2 2.6923076923076925
C3 7.384615384615385
FB -2.3076923076923075
Sleep 0.0
