In [None]:
from tqdm import tqdm
import plotly.graph_objects as go
import gymnasium as gym
import numpy as np

# Dynamic Programming

In [None]:
# Deterministic 4x4 FrozenLake, rendered in a window so the rollout can be watched.
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False, render_mode='human')

# Hyper-parameters used by the value-iteration cell below.
theta = 0.0001    # value-change threshold
discount = 0.99   # discount factor applied to the next state's value
episodes = 100    # number of sweeps over the state space

# One value per state; two buffers (current estimate and the one being written).
state_values = np.zeros(env.observation_space.n)
new_state_values = np.zeros_like(state_values)

In [None]:
# Value iteration: repeatedly back up every state with the Bellman optimality
# operator until the values stop changing (or the sweep budget is exhausted).
#
# The transition table is read from the unwrapped environment because
# gym.make() returns the env inside wrappers, and attribute access through
# wrappers is not supported by every Gymnasium release.
transition_model = env.unwrapped.P
n_actions = env.action_space.n
sweeps_run = 0
for episode in tqdm(range(episodes)):
    for state in range(len(state_values)):
        q_values = np.zeros(n_actions)
        for action in range(n_actions):
            # Sum over *all* possible outcomes of (state, action), not just the
            # first one, so the backup is also correct for stochastic maps.
            for prob, next_state, reward, is_terminal in transition_model[state][action]:
                # A terminal successor contributes no future value.
                bootstrap = 0.0 if is_terminal else state_values[next_state]
                q_values[action] += prob * (reward + discount * bootstrap)
        new_state_values[state] = q_values.max()
    sweeps_run += 1
    # Largest change of any state value in this sweep.
    delta = np.max(np.abs(new_state_values - state_values))
    # Copy instead of rebinding: rebinding would make both names refer to the
    # same array and silently turn later sweeps into in-place updates.
    state_values = new_state_values.copy()
    if delta < theta:
        break
print(f"Sweep done for {sweeps_run} episodes")

In [None]:
# Show the state values laid out as rows of the 4-row grid.
np.reshape(state_values, (4, -1))

"4x4":<br> [ 
        "SFFF", \
        "FHFH", \
        "FFFH", \
        "HFFG"
        ]

In [None]:
# Derive a greedy policy from the converged state values: for each state pick
# an action whose one-step look-ahead value is maximal.
#
# The transition table is read from the unwrapped environment because
# gym.make() returns the env inside wrappers, and attribute access through
# wrappers is not supported by every Gymnasium release.
transition_model = env.unwrapped.P
optimal_actions = np.zeros(env.observation_space.n)
# The last state is skipped (its entry stays 0), as in the original loop.
for state in range(len(state_values)-1):
    q_values = np.zeros(env.action_space.n)
    for action in range(env.action_space.n):
        # Expected value over every outcome of (state, action), weighted by its
        # probability, so the look-ahead matches the backup used in the sweep.
        for prob, next_state, reward, is_terminal in transition_model[state][action]:
            # A terminal successor contributes no future value.
            bootstrap = 0.0 if is_terminal else state_values[next_state]
            q_values[action] += prob * (reward + discount * bootstrap)
    # Break ties uniformly at random; isclose guards against round-off making
    # equally good actions look different.
    best_actions = np.flatnonzero(np.isclose(q_values, q_values.max()))
    optimal_actions[state] = np.random.choice(best_actions)
print("Optimal actions selected for all states")

In [None]:
# Inspection only: q_values as left behind by the final iteration of the
# policy-extraction loop in the previous cell (i.e. for the last state it processed).
q_values

In [None]:
# Show the chosen action index for every state, laid out as rows of the 4-row grid.
np.reshape(optimal_actions, (4, -1))

In [None]:
# Roll out the greedy policy once in the rendered environment.
observation, info = env.reset()
terminate = False
state = observation
try:
    while not terminate:
        next_state, _, terminated, truncated, _ = env.step(action=int(optimal_actions[state]))
        # Stop on truncation (step limit) as well as termination; otherwise a
        # policy that never reaches a terminal cell would loop forever.
        terminate = terminated or truncated
        state = next_state
finally:
    # Always release the render window, even if the rollout is interrupted.
    env.close()

# Every-Visit Monte Carlo

In [None]:
# Fresh, non-rendered environment for training.
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)

# Exploration rate and training length.
epsilon = 0.01
episodes = 20000

# Per-episode bookkeeping (reset by the training loop after every episode).
episode_reward = 0
terminate = False
episode_values = []      # [state, action, reward] triples of the current episode

# Cross-episode bookkeeping.
episodes_rewards = []    # total reward of each finished episode
returns = {}             # (state, action) -> list of observed returns

# Action-value table and the policy derived from it.
action_values = np.zeros((env.observation_space.n, env.action_space.n))
optimal_actions = np.zeros(env.observation_space.n)

In [None]:
# Every-visit Monte Carlo control with an epsilon-greedy behaviour policy.
#
# Each episode: (1) generate a trajectory, (2) walk it backwards accumulating
# the undiscounted return G, (3) move Q(s, a) towards G with a running mean.

# State indices excluded from policy extraction (holes of the default 4x4 map,
# taken from the original hard-coded list); the last state is skipped as well.
hole_states = (5, 7, 11, 12)

for episode in tqdm(range(episodes)):
    # Start every episode from a clean slate so the cell does not depend on
    # leftovers from a previous (possibly interrupted) run.
    state, info = env.reset()
    terminate = False
    episode_values, episode_reward = [], 0

    while not terminate:
        if np.random.uniform() < epsilon:
            # Explore: uniformly random action.
            action = env.action_space.sample()
        else:
            # Exploit: greedy action, ties broken uniformly at random.
            greedy_actions = np.flatnonzero(action_values[state] == action_values[state].max())
            action = np.random.choice(greedy_actions)
        next_state, reward, terminated, truncated, _ = env.step(int(action))
        # End the episode on truncation (step limit) too, so a policy stuck
        # against a wall cannot produce an unbounded trajectory.
        terminate = terminated or truncated
        episode_values.append([state, action, reward])
        episode_reward += reward
        state = next_state

    # Backward pass: G is the return that followed each (state, action) visit.
    G = 0
    for s, a, r in reversed(episode_values):
        G += r
        returns.setdefault((s, a), []).append(G)
        # Incremental mean over all returns seen so far for (s, a).
        action_values[s, a] += (G - action_values[s, a]) / len(returns[(s, a)])

    episodes_rewards.append(episode_reward)

# Greedy policy from the final action values. Computed once after training:
# the behaviour policy above reads action_values directly, so rebuilding this
# array after every episode had no effect on learning.
for i in range(len(optimal_actions) - 1):
    if i not in hole_states:
        best_actions = np.flatnonzero(action_values[i] == action_values[i].max())
        optimal_actions[i] = np.random.choice(best_actions)

In [None]:
# Show the learned action index for every state, laid out as rows of the 4-row grid.
np.reshape(optimal_actions, (4, -1))

In [None]:
# Roll out the learned policy once in a rendered copy of the environment.
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False, render_mode='human')
observation, info = env.reset()
terminate = False
state = observation
try:
    while not terminate:
        next_state, _, terminated, truncated, _ = env.step(action=int(optimal_actions[state]))
        # Stop on truncation (step limit) as well as termination: a learned
        # policy may contain an action that never leaves its cell, which
        # would otherwise keep this loop running forever.
        terminate = terminated or truncated
        state = next_state
finally:
    # Always release the render window, even if the rollout is interrupted.
    env.close()

In [None]:
# Moving average of the per-episode reward over a sliding window;
# 'valid' keeps only positions where the window fits entirely.
window = 100
mean_rewards = np.convolve(episodes_rewards, np.ones(window), mode='valid') / window

In [None]:
# Line plot of the moving-average reward; the x value is the index of each
# averaging window in mean_rewards.
reward_trace = go.Scatter(x=np.arange(len(mean_rewards)), y=mean_rewards)
fig = go.Figure(data=reward_trace)
fig.update_layout(
    title='Mean Episode Reward',
    xaxis_title='Episode',
    yaxis_title='Mean Reward',
)
fig.show()