In [11]:
import gymnasium
import skyscraper
import numpy as np
import random

In [12]:
# Instantiate the environment by its registered id. The `skyscraper` import
# above presumably performs the registration -- NOTE(review): confirm.
env = gymnasium.make('skyscraper/GridWorld-v0')

In [13]:
# One-step model read from disk: (state, action) -> (reward, next_state),
# where a state is the pair (i, j). Each data line holds six whitespace-
# separated integers: i j a r i' j'.
model_M = {}

with open('powered_flight.txt', 'r') as file:
    for line in file:
        fields = line.split()
        if not fields:
            # tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise fail the six-value unpacking below
            continue

        i, j, a, r, i_prime, j_prime = map(int, fields)
        # adjusting for 0-indexing (states and actions; the reward is left as is)
        i, j, a, i_prime, j_prime = i - 1, j - 1, a - 1, i_prime - 1, j_prime - 1

        # key: current state and action, value: reward and next state
        model_M[((i, j), a)] = (r, (i_prime, j_prime))

In [14]:
# model_M

In [15]:
# for key in sorted(model_M):
#     print(key, "=", model_M[key])

In [16]:
def random_argmax(arr):
    max_value = np.max(arr)
    max_indices = np.where(arr == max_value)[0]
    random_index = random.choice(max_indices)
    return random_index

In [17]:
def eps_greedy_policy(Q, epsilon, action_space):
    """Build an epsilon-greedy policy backed by the Q table.

    Args:
        Q: table indexed by state; ``Q[state]`` yields one value per action.
            It is read on every call, so later in-place updates to ``Q`` are
            reflected by the returned policy.
        epsilon: total probability mass spread uniformly over all actions.
        action_space: number of available actions.

    Returns:
        A function mapping a state to an array of action probabilities.
    """
    def policy_func(state):
        # greedy action for this state, ties among maxima broken at random
        greedy = random_argmax(Q[state])
        # every action starts with an equal share of the exploration mass ...
        probs = np.full(action_space, epsilon) / action_space
        # ... and the greedy action additionally receives the remaining 1 - epsilon
        probs[greedy] += 1 - epsilon
        return probs

    return policy_func

In [18]:
def get_pos(state):
    """Extract the agent position from an observation as a tuple of ints.

    Reads ``state['agent']['pos']`` and converts each coordinate with
    ``int`` so the result can be used as a dict key or array index.
    """
    coords = state['agent']['pos']
    return tuple(int(coord) for coord in coords)

In [22]:
def _q_learning_update(Q, state, action, reward, next_state, alpha, gamma, terminal=False):
    """Apply one Q-learning backup to ``Q[state][action]`` in place."""
    # a terminal successor has no future return, so nothing is bootstrapped
    bootstrap = 0.0 if terminal else np.max(Q[next_state])
    td_target = reward + gamma * bootstrap
    Q[state][action] += alpha * (td_target - Q[state][action])


def tabular_dyna_q(env, model, num_episodes, alpha=0.1, gamma=1, epsilon=0.3, n=10, env_height=32, env_width=64):
    """Tabular Dyna-Q: Q-learning on real steps plus ``n`` planning backups per step.

    Args:
        env: Gymnasium-style environment with a discrete action space whose
            observations are accepted by ``get_pos``.
        model: dict mapping ``(state, action)`` to ``(reward, next_state)``.
            It may be pre-populated. It is updated IN PLACE with every
            transition observed during training.
        num_episodes: number of episodes to run.
        alpha: step size of the Q update.
        gamma: discount factor.
        epsilon: exploration rate of the epsilon-greedy behaviour policy.
        n: number of planning (simulated) backups after each real step.
        env_height: size of the first state axis of the Q table.
        env_width: size of the second state axis of the Q table.

    Returns:
        The learned Q table of shape ``(env_height, env_width, n_actions)``.
    """
    n_actions = env.action_space.n

    # initialize Q(s, a) with zeros for all state-action pairs
    Q = np.zeros((env_height, env_width, n_actions))

    # the policy reads Q on every call, so it follows the updates made below
    policy = eps_greedy_policy(Q, epsilon, n_actions)

    # Mirror the model's keys in a list so planning can sample one in O(1)
    # instead of rebuilding the key list on every planning step.
    model_keys = list(model)

    for episode in range(1, num_episodes + 1):
        if episode % 100 == 0:
            print(f"\rEpisode {episode}/{num_episodes}.", end="")

        state, _ = env.reset()
        pos = get_pos(state)

        while True:
            # choose A from S using eps-greedy policy derived from Q
            probs = policy(pos)
            action = np.random.choice(np.arange(len(probs)), p=probs)

            # take action A; observe resultant reward R, and state, S'
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_pos = get_pos(next_state)

            # Direct RL update. This runs for the final step of the episode
            # too, so the reward of the terminating transition is learned;
            # only a true termination (not a truncation) drops the bootstrap.
            _q_learning_update(Q, pos, action, reward, next_pos, alpha, gamma, terminal=terminated)

            # update the model with the new experience
            key = (pos, action)
            if key not in model:
                model_keys.append(key)
            model[key] = (reward, next_pos)

            # planning: update Q-values using simulated experiences from the model
            # NOTE(review): the model does not record terminal flags, so planned
            # backups always bootstrap from Q[next_state]; this is exact only
            # while Q stays zero at terminal states.
            for _ in range(n):
                # randomly select state and action previously observed
                sim_state, sim_action = model_keys[np.random.choice(len(model_keys))]
                sim_reward, sim_next_state = model[(sim_state, sim_action)]

                # update Q-values as if the selected s, a, r, s_prime were a real experience
                _q_learning_update(Q, sim_state, sim_action, sim_reward, sim_next_state, alpha, gamma)

            # end the episode on termination or on truncation (e.g. a time limit)
            if terminated or truncated:
                break

            pos = next_pos

    return Q

In [23]:
# Train for 500 episodes. NOTE: tabular_dyna_q writes observed transitions
# into the dict it is given, so model_M is modified in place and re-running
# this cell starts from the already-updated model.
Q = tabular_dyna_q(env, model_M, num_episodes=500)

{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
{'agent': {'pos': array([14., 54.])}} 1
