# In [1]:
import numpy as np

# Define the environment (a simple grid world)
# 'S': starting point
# 'G': goal
# 'H': hazardous location
# 'F': frozen (safe) location
env = np.array([
    ['F', 'F', 'F', 'F'],
    ['F', 'H', 'F', 'H'],
    ['F', 'F', 'F', 'H'],
    ['S', 'H', 'F', 'G'],
])

# Define actions: up, down, left, right
actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']
num_actions = len(actions)

# Define rewards, keyed by the symbol of the cell the agent moves onto.
# Every symbol that appears in `env` must have an entry here.
rewards = {
    'G': 10,   # Goal
    'H': -10,  # Hazard
    'F': -1,   # Frozen
    # Start cell: the agent can land on it again (e.g. a move clamped at the
    # grid edge leaves it in place), so it is priced like a frozen cell.
    'S': -1,
}

# Define parameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
num_episodes = 1000  # Number of episodes

# Initialize Q-table: one row per grid cell (flattened row-major), one column per action
q_table = np.zeros((env.shape[0] * env.shape[1], num_actions))

# Helper function to convert coordinates to state index


def coords_to_state(x, y):
    """Return the flat state index for grid cell (x, y).

    `x` is multiplied by the number of columns of `env` and `y` is added,
    i.e. `x` acts as the row and `y` as the column of a row-major layout.
    """
    n_cols = env.shape[1]
    row_offset = x * n_cols
    return row_offset + y

# Helper function to choose action based on epsilon-greedy policy


def choose_action(state, epsilon):
    """Select an action label for `state` using an epsilon-greedy rule.

    A uniform sample from [0, 1) is compared against `epsilon`: if it is
    smaller, a label is drawn at random from `actions`; otherwise the label
    whose entry in `q_table[state]` is largest is returned (np.argmax picks
    the first index when several entries tie).
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return np.random.choice(actions)

    best_index = np.argmax(q_table[state])
    return actions[best_index]


# Q-learning algorithm
# Each episode starts on the 'S' cell and runs until the agent steps onto a
# goal ('G') or hazard ('H') cell. After every step the Q-value of the
# (state, action) pair just taken is moved towards
# reward + gamma * max_a Q(new_state, a) by a fraction alpha.
epsilon = 0.1  # Epsilon for epsilon-greedy policy (same value for every episode)
n_rows, n_cols = env.shape

for episode in range(num_episodes):
    state = coords_to_state(3, 0)  # Starting state
    done = False

    while not done:
        action = choose_action(state, epsilon)
        action_idx = actions.index(action)

        # Get new state based on action. Moves that would leave the grid are
        # clamped to the border, so the agent stays where it is.
        row, col = divmod(state, n_cols)
        if action == 'UP':
            row = max(row - 1, 0)
        elif action == 'DOWN':
            row = min(row + 1, n_rows - 1)
        elif action == 'LEFT':
            col = max(col - 1, 0)
        else:  # 'RIGHT'
            col = min(col + 1, n_cols - 1)
        new_state = coords_to_state(row, col)

        # A clamped move can leave the agent on (or bring it back to) the
        # start cell. Cell symbols without their own reward entry, such as
        # 'S', are priced like a frozen cell instead of raising KeyError.
        cell = env[row, col]
        reward = rewards.get(cell, rewards['F'])

        # Q-table update
        td_target = reward + gamma * np.max(q_table[new_state])
        q_table[state, action_idx] += alpha * (
            td_target - q_table[state, action_idx])

        # Goal and hazard cells end the episode; otherwise continue from the
        # new cell.
        done = cell in ('G', 'H')
        if not done:
            state = new_state

# Display the learned Q-table
print("Learned Q-table:")
print(q_table)

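# Recorded output of the original run: KeyError: 'S'
# (the reward lookup reached the start cell 'S', which had no entry in `rewards`)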