In [3]:
import random
import numpy as np

In [6]:
class GridWorld:
    """Square grid in which an agent must pick up a load and deliver it.

    Positions are 2-tuples of indices in ``range(size)``. "up"/"down"
    decrement/increment the first coordinate, "left"/"right" the second.
    The agent and the pickup cell are drawn uniformly at random; the drop-off
    cell is always ``(size - 1, size - 1)``.
    """

    def __init__(self, size):
        """Create a ``size`` x ``size`` world with a random agent and pickup cell.

        Args:
            size: Side length of the grid.
        """
        self.size = size
        self.actions = ["up", "down", "left", "right"]

        self.agent_pos = (random.randint(0, self.size - 1), random.randint(0, self.size - 1))
        self.pickup_pos = (random.randint(0, self.size - 1), random.randint(0, self.size - 1))
        self.dropoff_pos = (self.size - 1, self.size - 1)
        self.loaded = False  # True once the agent has stepped onto pickup_pos
        self.done = False    # True once the load has been delivered

    def _get_state(self):
        """Return the observable state ``(agent_pos, loaded, pickup_pos)``."""
        return (self.agent_pos, self.loaded, self.pickup_pos)

    def step(self, action):
        """Advance the environment by one move.

        Args:
            action: One of ``self.actions``. A move that would leave the grid,
                or an unrecognised action, leaves the agent in place but still
                counts as a step.

        Returns:
            A ``(state, reward, done)`` tuple. Every step costs -1, except the
            step that delivers the load to the drop-off cell, which pays +20
            and ends the episode. After the episode has ended, further calls
            change nothing and return reward 0 with ``done=True``.
        """
        if self.done:
            return self._get_state(), 0, True

        x, y = self.agent_pos

        if action == "up" and x > 0:
            x -= 1
        elif action == "down" and x < self.size - 1:
            x += 1
        elif action == "left" and y > 0:
            y -= 1
        elif action == "right" and y < self.size - 1:
            y += 1

        self.agent_pos = (x, y)
        reward = -1

        if not self.loaded and self.agent_pos == self.pickup_pos:
            self.loaded = True
        elif self.loaded and self.agent_pos == self.dropoff_pos:
            # Delivery only counts when the agent is actually carrying the
            # load; otherwise the pickup phase could be skipped entirely.
            reward = 20
            self.done = True

        return self._get_state(), reward, self.done
            

In [5]:
# Per-state immediate rewards for the two actions (index i <-> state s_i)
stay = np.zeros(5, dtype=int)
run = np.array((0.4, 0.2, 0.3, 0.7, 0.0))

# Q-table filled with arbitrary placeholder values:
# one row per state, columns are [stay, run]
q_table = np.arange(1, 11).reshape(5, 2)

In [None]:
# Full q-learning loop

import numpy as np
import random

# States: s0, s1, s2, s3, s4
num_states = 5
num_actions = 2  # 0 = stay, 1 = run

# Seed Python's RNG so that Restart & Run All reproduces the same Q-table.
# All randomness below (start state, exploration, transitions) uses `random`.
SEED = 42
random.seed(SEED)

# Immediate rewards, indexed by the state the action is taken in
stay = np.array([0, 0, 0, 0, 0])
run = np.array([0.4, 0.2, 0.3, 0.7, 0.0])  # "run" moves to a new state; see the transition logic
assert stay.shape == run.shape == (num_states,), "expected one reward per state"

# Initialize Q-table with zeros: one row per state, one column per action
q_table = np.zeros((num_states, num_actions))

# Hyperparameters
alpha = 0.1     # Learning rate
gamma = 0.9     # Discount factor
epsilon = 0.1   # Exploration rate
episodes = 1000
max_steps = 10  # per episode

# Simple next state logic (optional): just randomly jump to another state for this example
def get_next_state(state, action, n_states=None):
    """Return the state reached by taking ``action`` in ``state``.

    Args:
        state: Index of the current state.
        action: 0 keeps the agent where it is; any other value moves it.
        n_states: Total number of states. Defaults to the module-level
            ``num_states``, so existing two-argument calls behave as before.

    Returns:
        ``state`` itself for action 0; otherwise a state index drawn uniformly
        at random from ``range(n_states)`` excluding ``state``.
    """
    if action == 0:
        return state  # staying

    if n_states is None:
        n_states = num_states
    # Simulate movement: jump to any state other than the current one.
    return random.choice([s for s in range(n_states) if s != state])

# Terminal state example: let's say state 4 (index 4) is 'dead'
def is_terminal(state, terminal_state=4):
    """Return whether ``state`` is the terminal state.

    Args:
        state: Index of the state to test.
        terminal_state: Index of the terminal state. Defaults to 4, so
            existing single-argument calls behave as before.
    """
    return state == terminal_state

# Q-Learning Training Loop
for episode in range(episodes):
    # Every episode begins in a random state, excluding the last index
    # (which is the terminal one).
    state = random.randint(0, num_states - 2)

    for step in range(max_steps):
        # Epsilon-greedy policy: explore with probability epsilon,
        # otherwise take the action with the highest current Q-value.
        exploring = random.uniform(0, 1) < epsilon
        action = random.choice([0, 1]) if exploring else np.argmax(q_table[state])

        # Immediate reward depends on the action and the state it is taken in.
        if action == 0:
            reward = stay[state]
        else:
            reward = run[state]

        next_state = get_next_state(state, action)

        # Q-learning update: move Q(s, a) a fraction alpha toward
        # reward + gamma * max_a' Q(s', a').
        old_q = q_table[state, action]
        future_q = q_table[next_state].max()
        q_table[state, action] = old_q + alpha * (reward + gamma * future_q - old_q)

        # Advance; the episode stops as soon as the terminal state is reached.
        state = next_state
        if is_terminal(state):
            break

# Show the learned Q-values as a labelled table (rows: states, columns: actions)
import pandas as pd
state_labels = [f"s{i}" for i in range(num_states)]
df_q = pd.DataFrame(q_table, index=state_labels, columns=["stay", "run"])
print("Learned Q-table:", df_q, sep="\n")
