In [1]:
import numpy as np
import gym
import random

In [241]:
def reward_func0(env, field):
    if env.desc[field] == b'G':
        return 1
    else:
        return 0

def reward_func1(env, field):
    if env.desc[field] == b'G':
        return 1
    elif env.desc[field] == b'H':
        return -1
    else:
        return 0

def reward_func2(env, field):
    if env.desc[field] == b'G':
        return 5
    elif env.desc[field] == b'H':
        return -1
    else:
        return 0

In [91]:
env.desc[0]

array([b'S', b'F', b'F', b'F', b'F', b'F', b'F', b'F'], dtype='|S1')

In [176]:
env = gym.make("FrozenLake8x8-v1")
n_observations = env.observation_space.n
n_actions = env.action_space.n
Q_Table = np.zeros((n_observations, n_actions))

rewards_per_episode = list()
steps_per_episode = list()

exploration_prob = 1.0 
max_exploration_prob = 1.0             # Exploration probability at start
min_exploration_prob = 0.01 
exploration_decreasing_decay = 0.001
total_episodes = 100000
gamma = 0.99

for i in range(total_episodes):
    # reset env
    current_state = env.reset()
    done = False

    total_episode_reward = 0
    steps = 0
    for i in range(MAX_ACTIONS):
        # action random or using bellans

        if np.random.uniform(0,1) < exploration_prob:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_Table[current_state,:])

        next_state, reward, done, _ = env.step(action)
        Q_Table[current_state, action] = (1-lr) * Q_Table[current_state, action] +lr*(reward + gamma*max(Q_Table[next_state,:]))
        total_episode_reward = total_episode_reward + reward
        steps = steps + 1
        if done:
            break
        current_state = next_state

    steps_per_episode.append(steps)
    exploration_prob = max(min_exploration_prob, np.exp(-exploration_decreasing_decay*i))
    rewards_per_episode.append(total_episode_reward)

print(f"You've reached the goal {sum(rewards_per_episode)/total_episodes*100}% of the time")
print(f"It usually took {np.mean(steps_per_episode)} steps")



You've reached the goal 0.22300000000000003% of the time
It usually took 32.95583 steps


In [178]:
sum(rewards_per_episode)

223.0

In [17]:
print(f"You've reached the goal {sum(rewards_per_episode)/total_episodes*100}% of the time")
print(f"It usually took {np.mean(steps_per_episode)} steps")

You've reached the goal 0.23900000000000002% of the time
It usually took 33.02662 steps


In [8]:
env = gym.make("FrozenLake8x8-v1")
func = reward_func0
total_episodes = 100000
lr = 0.1
gamma = 0.99
n_observations = env.observation_space.n
n_actions = env.action_space.n

qtable = np.zeros((n_observations, n_actions))

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.1            # Minimum exploration probability 
decay_rate = 0.001            # Exponential decay rate for exploration prob




steps_per_episode = []
rewards = []


for i in range(total_episodes):
    current_state = env.reset()

    done = False
    total_episode_reward = 0
    steps = 0
    for step in range(MAX_ACTIONS):

        if np.random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[current_state,:])

        next_state, reward, done, _ = env.step(action)
        # field = divmod(next_state, 8)
        # reward = func(env, field)

        qtable[current_state, action] = qtable[current_state, action] + lr * (reward + gamma * np.max(qtable[next_state, :]) - qtable[current_state, action])
        total_episode_reward = total_episode_reward + reward

        steps += 1

        if done:
            break
        current_state = next_state

    steps_per_episode.append(steps)
    rewards.append(total_episode_reward)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*step) 

print(f"You've reached the goal {sum(rewards)/total_episodes*100}% of the time")
print(f"It usually took {np.mean(steps_per_episode)} steps")


You've reached the goal 0.233% of the time
It usually took 32.97696 steps


In [156]:
total_episode_reward

0.0

In [234]:
MAX_ACTIONS = 200

def train(env, func, total_episodes = 10000, lr = 0.1, gamma = 0.99):

    n_observations = env.observation_space.n
    n_actions = env.action_space.n

    qtable = np.zeros((n_observations, n_actions))

    # Exploration parameters
    epsilon = 1.0                 # Exploration rate
    max_epsilon = 1.0             # Exploration probability at start
    min_epsilon = 0.01            # Minimum exploration probability 
    decay_rate = 0.001            # Exponential decay rate for exploration prob




    steps_per_episode = []
    rewards = []


    for i in range(total_episodes):
        current_state = env.reset()

        done = False
        total_episode_reward = 0
        steps = 0
        for step in range(MAX_ACTIONS):

            if np.random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(qtable[current_state,:])

            next_state, reward_default, done, _ = env.step(action)
            field = divmod(next_state, 8)
            reward = func(env, field)

            qtable[current_state, action] = qtable[current_state, action] + lr * (reward + gamma * np.max(qtable[next_state, :]) - qtable[current_state, action])
            total_episode_reward += reward_default

            steps += 1

            if done:
                break
            current_state = next_state

        steps_per_episode.append(steps)
        rewards.append(total_episode_reward)
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*step) 
        # print("hi",sum(total_episode_reward)/total_episodes*100)
        # print((total_episode_reward)/total_episodes*100)
    return qtable, (total_episode_reward)/total_episodes*100

def test(env, qtable, attempts = 1000):
    env = gym.make("FrozenLake8x8-v1")
    success = 0
    done = False
    current_state = env.reset()

    for _ in range(attempts):
        for _ in range(MAX_ACTIONS):
            action = np.argmax(qtable[current_state,:])
            next_state, reward, done, _ = env.step(action)

            if done:
                # success += reward
                break
            current_state = next_state
        success += reward

    return ((success/attempts)*100)



In [None]:
env = gym.make("FrozenLake8x8-v1")
qt, succes_rate_train = train(env, reward_func0, total_episodes = 100000)
print(f'train succes rate: {succes_rate_train}%')
succes_rate_test = test(env, qt)
print(f'train succes rate: {succes_rate_test}%')

In [228]:
succes_rate_test = test_new(env, qt)
print(f'train succes rate: {succes_rate_test}%')

75.4
train succes rate: 75.4%


In [249]:
env = gym.make("FrozenLake8x8-v1")
total_episodes = 10000
qt, succes_rate_train = train(env, reward_func2, total_episodes)
print(f'train succes rate: {succes_rate_train}%')
succes_rate_test = test_new(env, qt, int(total_episodes/10))
print(f'train succes rate: {succes_rate_test}%')

train succes rate: 0.0%
train succes rate: 61.5%


In [246]:
def test_new(env, qt, iters):
    a = []

    for _ in range(iters):
        done = False
        current_state = env.reset()

        for _ in range(MAX_ACTIONS):
            action = np.argmax(qt[current_state,:])
            next_state, reward, done, _ = env.step(action)

            if done:
                break
            current_state = next_state
        a.append(reward)
    # print(sum(a)/iters*100)
    return sum(a)/iters*100

In [222]:
max_iter = 200

env = gym.make("FrozenLake8x8-v1")
a = []
iters = 1000

for _ in range(iters):
    done = False
    current_state = env.reset()

    for _ in range(max_iter):
        action = np.argmax(qt[current_state,:])
        next_state, reward, done, _ = env.step(action)

        if done:
            break
        current_state = next_state
    a.append(reward)

    # if reward:
    #     print("Agent succeded")
    # else:
    #     print("Agent failed")
print(sum(a)/iters*100)


74.7


In [201]:
sum(a)

33.0

In [84]:
reward

1.0