In [3]:
import random

import numpy as np
import pandas as pd

# Reward vectors indexed by state: the reward for choosing 'stay' / 'run' in that state
stay = np.array([0, 0, 0, 0, 0])
run = np.array([0.4, 0.2, 0.3, 0.7, 0.0])

# Define the Q-table with arbitrary values (rows = states, columns = [stay, run]).
# dtype=float is required: built from integer literals alone the array would get an
# integer dtype, and an in-place Q-learning update such as
# 6 + 0.5 * (0.3 + 8 - 6) = 7.15 would be silently truncated to 7 on assignment.
q_table = np.array([[1, 2],
                    [3, 4],
                    [5, 6],
                    [7, 8],
                    [9, 10]], dtype=float)

In [None]:
# Update the Q-table for one single transition (one Q-learning step, not a whole episode)

# Parameters
alpha = 0.5     # learning rate
gamma = 1    # discount factor

# Example: transition from state 2, take action 'run' (1), ends up in state 3
state = 2
action = 1
next_state = 3

# Get reward: looked up from the reward vector of the chosen action
reward = run[state] if action == 1 else stay[state]

# The table may hold integers (e.g. when it was built from integer literals).
# Assigning a fractional Q-value into an integer array silently truncates it,
# so make sure the table is floating point before writing the update.
q_table = q_table.astype(float)

# Calculate the max Q value for the next state (best possible future reward)
future_q = np.max(q_table[next_state])

# Q-learning update: Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
td_target = reward + gamma * future_q
q_table[state, action] += alpha * (td_target - q_table[state, action])


In [7]:
# Full q-learning loop

# Seed the stdlib RNG (used for start states, exploration and transitions) so
# that a Restart & Run All reproduces the same learned Q-table.
SEED = 42
random.seed(SEED)

# States: s0, s1, s2, s3, s4
num_states = 5
num_actions = 2  # 0 = stay, 1 = run

# Rewards, indexed by the state in which the action is taken
stay = np.array([0, 0, 0, 0, 0])
run = np.array([0.4, 0.2, 0.3, 0.7, 0.0])  # assume run ends in a new state based on transition logic

# Initialize Q-table with zeros (np.zeros yields a float array, one row per state)
q_table = np.zeros((num_states, num_actions))

# Hyperparameters
alpha = 0.1     # Learning rate
gamma = 0.9     # Discount factor
epsilon = 0.1   # Exploration rate
episodes = 1000
max_steps = 10  # per episode

# Toy transition model for this example: 'stay' keeps the state, anything else jumps at random
def get_next_state(state, action):
    """Return the state reached after taking `action` in `state`.

    Action 0 ('stay') leaves the state unchanged. Any other action picks,
    uniformly at random, one of the states in range(num_states) that differs
    from the current one.
    """
    if action != 0:
        # simulate movement: every state except the current one is a candidate
        candidates = [other for other in range(num_states) if other != state]
        return random.choice(candidates)
    return state  # staying

# Terminal state example: let's say state 4 (index 4) is 'dead'
def is_terminal(state, terminal_state=4):
    """Return True when `state` is the terminal ('dead') state.

    Args:
        state: index of the state to check.
        terminal_state: index treated as terminal; defaults to 4, so existing
            single-argument calls behave exactly as before.
    """
    return state == terminal_state

# Q-Learning Training Loop
for episode in range(episodes):
    # random.randint is inclusive on both ends, so this draws from s0..s(num_states - 2)
    state = random.randint(0, num_states - 2)  # don't start at terminal state

    for step in range(max_steps):
        # Choose action (epsilon-greedy).
        # NOTE: random.random() takes no arguments and returns a float in [0.0, 1.0);
        # calling it as random.random(0, 1) raises TypeError.
        if random.random() > epsilon:
            action = np.argmax(q_table[state])  # exploit
        else:
            action = random.choice([0, 1])  # explore

        # Get reward
        reward = stay[state] if action == 0 else run[state]

        # Transition to next state
        next_state = get_next_state(state, action)

        # Update Q-value using Q-learning formula:
        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        old_q = q_table[state, action]
        future_q = np.max(q_table[next_state])
        q_table[state, action] = old_q + alpha * (reward + gamma * future_q - old_q)

        # Move to next state
        state = next_state

        # End if terminal
        if is_terminal(state):
            break

# Display learned Q-table (pandas is imported as `pd` with the other imports at the top)
state_labels = [f"s{i}" for i in range(num_states)]  # row labels s0, s1, ...
df_q = pd.DataFrame(q_table, columns=["stay", "run"], index=state_labels)
print("Learned Q-table:")
print(df_q)

Learned Q-table:
        stay       run
s0  0.962081  1.013732
s1  0.781918  1.094868
s2  0.855823  1.176294
s3  1.235519  1.429134
s4  0.000000  0.000000
