In [5]:
import numpy as np

# David Silver's Student Markov Chain
# States: C1 (Class1), C2 (Class2), C3 (Class3), FB (Facebook), Pub, Pass, Sleep


class StudentMarkovChain:
    def __init__(self):
        self.states = ["C1", "C2", "C3", "FB", "Pub", "Pass", "Sleep"]
        self.terminal_states = ["Pass", "Sleep"]

        # Transition probabilities: {state: [(prob, next_state), ...]}
        # Simplified model where student makes random choices at each state
        self.transitions = {
            "C1": [
                (0.5, "C2"),  # Study
                (0.5, "FB"),  # Facebook
            ],
            "C2": [
                (0.8, "C3"),  # Study
                (0.2, "Sleep"),  # Sleep
            ],
            "C3": [
                (0.6, "Pass"),
                (0.4, "Pub"),
            ],
            "FB": [
                (0.9, "FB"),  # Keep browsing
                (0.1, "C1"),  # Quit
            ],
            "Pub": [
                (0.2, "C1"),  # Leave pub, go to C1 (0.5 * 0.2)
                (0.4, "C2"),  # Leave pub, go to C2 (0.5 * 0.4)
                (0.4, "C3"),  # Leave pub, go to C3 (0.5 * 0.4)
            ],
            "Pass": [],  # Terminal state
            "Sleep": [],  # Terminal state
        }

    def step(self, state):
        """Take a step in the Markov Chain from current state"""
        if state in self.terminal_states:
            return state

        if state not in self.transitions:
            raise ValueError(f"Invalid state: {state}")

        # Get possible transitions
        possible_transitions = self.transitions[state]

        if not possible_transitions:
            return state

        # Sample next state based on probabilities
        probs = [t[0] for t in possible_transitions]
        idx = np.random.choice(len(possible_transitions), p=probs)
        _, next_state = possible_transitions[idx]

        return next_state

    def generate_trajectory(self, start_state, max_steps=20):
        """Generate a sample trajectory starting from start_state"""
        trajectory = [start_state]
        state = start_state

        for _ in range(max_steps):
            if state in self.terminal_states:
                break

            next_state = self.step(state)
            trajectory.append(next_state)
            state = next_state

        return trajectory


# Create the Markov Chain
mc = StudentMarkovChain()

# Generate sample trajectories from different starting states
print("=" * 70)
print("SAMPLE TRAJECTORIES FROM DAVID SILVER'S STUDENT MARKOV CHAIN")
print("=" * 70)

starting_states = ["C1", "C2", "C3", "FB", "Pub"]

for start_state in starting_states:
    print(f"\n{'=' * 70}")
    print(f"Starting from: {start_state}")
    print(f"{'=' * 70}")

    for i in range(3):  # 3 sample trajectories per starting state
        trajectory = mc.generate_trajectory(start_state)

        print(f"\nTrajectory {i + 1}: {' -> '.join(trajectory)}")

        if trajectory[-1] in mc.terminal_states:
            print(f"  Ended in terminal state: {trajectory[-1]}")


SAMPLE TRAJECTORIES FROM DAVID SILVER'S STUDENT MARKOV CHAIN

Starting from: C1

Trajectory 1: C1 -> C2 -> C3 -> Pub -> C2 -> C3 -> Pub -> C2 -> C3 -> Pub -> C2 -> C3 -> Pass
  Ended in terminal state: Pass

Trajectory 2: C1 -> FB -> FB -> FB -> FB -> FB -> FB -> C1 -> C2 -> C3 -> Pass
  Ended in terminal state: Pass

Trajectory 3: C1 -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> C1 -> C2 -> C3 -> Pass
  Ended in terminal state: Pass

Starting from: C2

Trajectory 1: C2 -> C3 -> Pub -> C2 -> C3 -> Pub -> C2 -> C3 -> Pass
  Ended in terminal state: Pass

Trajectory 2: C2 -> C3 -> Pub -> C1 -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> FB -> C1

Trajectory 3: C2 -> C3 -> Pub -> C3 -> Pass
  Ended in terminal state: Pass

Starting from: C3

Trajectory 1: C3 -> Pub -> C3 -> Pass
  Ended in terminal state: Pass

Trajectory 2: C3 -> Pass
  Ended in terminal state: Pass

Trajectory 3: C3 -> Pub -> C2

In [12]:
# Markov Reward Process (MRP): no actions, just P(s'|s) and r(s).
# We'll use the Markov chain dynamics from StudentMarkovChain as the MRP transition model.

state_map = {
    "C1": "Class1",
    "C2": "Class2",
    "C3": "Class3",
    "FB": "Facebook",
    "Pub": "Pub",
    "Sleep": "Sleep",
    "Pass": "Pass",
}


class StudentMRP:
    def __init__(self, mc: StudentMarkovChain):
        self.states = ["Class1", "Class2", "Class3", "Facebook", "Pub", "Sleep", "Pass"]
        self.terminal_states = ["Sleep", "Pass"]

        # Transition probabilities P(s'|s) induced by the Markov chain
        self.P = {s: [] for s in self.states}
        for s_short, trans in mc.transitions.items():
            s = state_map[s_short]
            self.P[s] = [(p, state_map[s_next]) for p, s_next in trans]

        # Optional: treat terminal states as absorbing (not required for evaluation below)
        for t in self.terminal_states:
            self.P[t] = [(1.0, t)]

        # State rewards r(s) (edit these as desired)
        self.r = {
            "Class1": -2.0,
            "Class2": -2.0,
            "Class3": -2.0,
            "Facebook": -1.0,
            "Pub": 1.0,
            "Sleep": 0.0,
            "Pass": 10.0,
        }

    def transitions(self, state):
        return self.P[state]

    def reward(self, state):
        return float(self.r[state])


def mrp_value_function(mrp: StudentMRP, gamma=0.9, theta=1e-12, max_iters=100_000):
    """Evaluate V(s) for an MRP: V(s) = r(s) + gamma * sum_{s'} P(s'|s) V(s')."""
    V = {s: 0.0 for s in mrp.states}

    # Keep terminal values pinned at 0.0 (common convention for episodic tasks)
    for _ in range(max_iters):
        delta = 0.0
        for s in mrp.states:
            if s in mrp.terminal_states:
                continue
            v_old = V[s]
            exp_next = sum(p * V[s_next] for p, s_next in mrp.transitions(s))
            V[s] = mrp.reward(s) + gamma * exp_next
            delta = max(delta, abs(v_old - V[s]))
        if delta < theta:
            break
    return V


mrp = StudentMRP(mc)
V_mrp = mrp_value_function(mrp, gamma=1.0)

print("=" * 70)
print("MRP STATE REWARDS r(s)")
print("=" * 70)
for s in mrp.states:
    print(f"{s:>8s} : {mrp.r[s]:>6.2f}")

print("\n" + "=" * 70)
print("MRP VALUE FUNCTION V(s) (NO ACTIONS)")
print("=" * 70)
for s in mrp.states:
    print(f"{s:>8s}  V={V_mrp[s]:>10.6f}")

MRP STATE REWARDS r(s)
  Class1 :  -2.00
  Class2 :  -2.00
  Class3 :  -2.00
Facebook :  -1.00
     Pub :   1.00
   Sleep :   0.00
    Pass :  10.00

MRP VALUE FUNCTION V(s) (NO ACTIONS)
  Class1  V=-19.950617
  Class2  V= -5.950617
  Class3  V= -4.938272
Facebook  V=-29.950617
     Pub  V= -7.345679
   Sleep  V=  0.000000
    Pass  V=  0.000000


In [9]:
# Turn the Student Markov Chain into an MDP by making actions explicit.
# In an MRP there are no actions (only states + rewards).
# NOTE: 'study' is an ACTION, not a state.
# States are: Class1, Class2, Class3, Facebook, Pub, Sleep, Pass.


class StudentMDP:
    def __init__(self):
        self.states = ["Class1", "Class2", "Class3", "Facebook", "Pub", "Sleep", "Pass"]
        self.terminal_states = ["Sleep", "Pass"]

        # Available actions per state
        self.actions = {
            "Class1": ["study", "facebook"],
            "Class2": ["study", "sleep"],
            "Class3": ["study", "pub"],
            "Facebook": ["facebook", "quit"],
            "Pub": ["drink"],
            "Sleep": [],
            "Pass": [],
        }

        # Transition probabilities P(s' | s, a)
        # (Matches the stochastic dynamics used above where applicable.)
        self.P = {
            "Class1": {
                "study": [(1.0, "Class2")],
                "facebook": [(1.0, "Facebook")],
            },
            "Class2": {
                "study": [(1.0, "Class3")],
                "sleep": [(1.0, "Sleep")],
            },
            "Class3": {
                "study": [(1.0, "Pass")],
                "pub": [(1.0, "Pub")],
            },
            "Facebook": {
                # Either keep browsing or (eventually) quit;
                # we expose an explicit quit action as well.
                "facebook": [(0.9, "Facebook"), (0.1, "Class1")],
                "quit": [(1.0, "Class1")],
            },
            "Pub": {
                "drink": [(0.2, "Class1"), (0.4, "Class2"), (0.4, "Class3")],
            },
            "Sleep": {},
            "Pass": {},
        }

        # Reward per action (edit these numbers as desired)
        self.R = {
            "study": -2.0,
            "facebook": -1.0,
            "sleep": 0.0,
            "pub": 1.0,
            "drink": 1.0,
            "quit": 0.0,
        }

    def available_actions(self, state):
        return self.actions.get(state, [])

    def transitions(self, state, action):
        return self.P[state][action]

    def reward(self, action):
        return float(self.R[action])


def value_iteration(mdp: StudentMDP, gamma=0.9, theta=1e-10, max_iters=10_000):
    """Compute optimal V(s) and greedy policy via value iteration.

    Bellman optimality update: V(s) = max_a [ r(a) + gamma * sum_{s'} P(s'|s,a) V(s') ]
    Terminal states are fixed to V(s)=0.
    """
    V = {s: 0.0 for s in mdp.states}

    for _ in range(max_iters):
        delta = 0.0
        for s in mdp.states:
            if s in mdp.terminal_states:
                continue

            acts = mdp.available_actions(s)
            if not acts:
                continue

            v_old = V[s]
            q_vals = []
            for a in acts:
                r = mdp.reward(a)
                exp_next = sum(p * V[s_next] for p, s_next in mdp.transitions(s, a))
                q_vals.append(r + gamma * exp_next)
            V[s] = float(max(q_vals))
            delta = max(delta, abs(v_old - V[s]))

        if delta < theta:
            break

    # Greedy policy w.r.t. V
    pi = {}
    for s in mdp.states:
        if s in mdp.terminal_states or not mdp.available_actions(s):
            pi[s] = None
            continue
        best_a = None
        best_q = -float("inf")
        for a in mdp.available_actions(s):
            r = mdp.reward(a)
            exp_next = sum(p * V[s_next] for p, s_next in mdp.transitions(s, a))
            q = r + gamma * exp_next
            if q > best_q:
                best_q = q
                best_a = a
        pi[s] = best_a

    return V, pi


mdp = StudentMDP()
V, pi = value_iteration(mdp, gamma=0.9)

print("=" * 70)
print("ACTION REWARDS")
print("=" * 70)
for a in sorted(mdp.R.keys()):
    print(f"{a:>10s} : {mdp.R[a]:>6.2f}")

print("\n" + "=" * 70)
print("OPTIMAL VALUE FUNCTION V(s) AND BEST ACTION (pi(s))")
print("=" * 70)
for s in mdp.states:
    a = pi[s]
    a_str = a if a is not None else "(terminal)"
    print(f"{s:>8s}  V={V[s]:>8.4f}   pi(s)={a_str}")

ACTION REWARDS
     drink :   1.00
  facebook :  -1.00
       pub :   1.00
      quit :   0.00
     sleep :   0.00
     study :  -2.00

OPTIMAL VALUE FUNCTION V(s) AND BEST ACTION (pi(s))
  Class1  V= -1.7640   pi(s)=study
  Class2  V=  0.2623   pi(s)=study
  Class3  V=  2.5136   pi(s)=pub
Facebook  V= -1.5876   pi(s)=quit
     Pub  V=  1.6818   pi(s)=drink
   Sleep  V=  0.0000   pi(s)=(terminal)
    Pass  V=  0.0000   pi(s)=(terminal)
