<a href="https://colab.research.google.com/github/joomj2000/Reinforcement-Learning/blob/main/Q_Learning_cliffwalking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install gym[toy_text,accept_rom_license]



In [21]:
import numpy as np
import random
import gym
# Single shared environment instance used by every cell below.
# new_step_api=True selects the step() signature that returns five values,
# which is what generate_next_step unpacks.
env = gym.make('CliffWalking-v0', new_step_api=True)

In [22]:
# Inspect the action space (the output shows 4 discrete actions).
env.action_space

Discrete(4)

In [23]:
# Inspect the observation space (the output shows 48 discrete states,
# which is the number of rows used for the Q table).
env.observation_space

Discrete(48)

In [24]:
# Step-type tags stored under 'step_type' in every step dict:
# the first step of an episode, an intermediate step, and the final step.
STEPTYPE_FIRST, STEPTYPE_MID, STEPTYPE_LAST = range(3)

# Action-value table indexed as Q[state, action] (48 states x 4 actions),
# initialised with uniform noise in [0, 1).
Q = np.random.uniform(low=0.0, high=1.0, size=(48, 4))

In [25]:
# wrapper for gym's CliffWalking environment
def generate_start_step():
    """Reset the environment and wrap the result as the first step of an episode.

    Returns:
        dict: ``observation`` holds whatever ``env.reset()`` returned, ``reward``
        is 0.0 and ``step_type`` is ``STEPTYPE_FIRST``.
    """
    first_step = dict(observation=env.reset(), reward=0., step_type=STEPTYPE_FIRST)
    return first_step

def generate_next_step(step, action):
    """Apply ``action`` to the environment and wrap the outcome as a step dict.

    Args:
        step: The current step dict. Not used here; kept so existing callers
            can keep passing it.
        action: Action forwarded to ``env.step``.

    Returns:
        dict: ``observation`` and ``reward`` as returned by ``env.step``, and
        ``step_type`` set to ``STEPTYPE_LAST`` when the episode ended,
        otherwise ``STEPTYPE_MID``.
    """
    # The environment is created with new_step_api=True, so step() returns
    # (obs, reward, terminated, truncated, info).
    obs, reward, terminated, truncated, _info = env.step(action)
    # Either flag ends the episode. Looking only at ``terminated`` would keep
    # stepping an environment that has been cut off (e.g. by a time limit).
    done = terminated or truncated
    step_type = STEPTYPE_LAST if done else STEPTYPE_MID
    return { 'observation': obs, 'reward': reward, 'step_type': step_type }

In [26]:
# Exploration probability read (as a global) by get_eps_soft_action.
# The training cell later reassigns it to 0.3.
epsilon = 0.1

def get_eps_soft_action(step):
    """Pick an action: uniformly random with probability ``epsilon``, else greedy.

    Args:
        step: Step dict; only ``step['observation']`` is read, as a row index into ``Q``.

    Returns:
        The chosen action index.
    """
    explore = random.random() < epsilon
    if not explore:
        # Exploit: best-valued action for the observed state.
        return np.argmax(Q[step['observation']])
    # Explore: draw one action uniformly from all available actions.
    sampled = np.random.choice(env.action_space.n, 1)
    return sampled[0]

In [27]:
def get_greedy_action(step):
    """Return the action with the highest Q value for the state in ``step``.

    Args:
        step: Step dict; only ``step['observation']`` is read, as a row index into ``Q``.
    """
    action_values = Q[step['observation']]
    return np.argmax(action_values)

In [28]:
def get_random_action(step):
    """Return an action drawn uniformly at random; ``step`` is ignored."""
    num_actions = env.action_space.n
    # randint is inclusive on both ends, hence the -1.
    return random.randint(0, num_actions - 1)

# 1 / (number of actions). Not referenced anywhere else in the visible notebook.
# NOTE(review): the "hit" in the name looks like a leftover from a different
# example — confirm it is unused before removing.
behavior_prob_hit = 1. / float(env.action_space.n)

In [29]:
def generate_episode(policy_func=get_random_action, max_steps=None):
    """Roll out one episode with ``policy_func`` and record what happened.

    Args:
        policy_func: Callable that takes the current step dict and returns an action.
        max_steps: Optional cap on the number of actions taken. With ``None``
            (the default) the rollout runs until a ``STEPTYPE_LAST`` step is
            produced, which never returns if the policy cannot reach the end
            of an episode (e.g. a greedy policy that cycles between states).

    Returns:
        tuple: ``(episode, actions, frames)`` where ``episode`` is the list of
        step dicts (starting with the initial step), ``actions`` is the list of
        actions taken (one fewer than the steps), and ``frames`` holds one
        ``env.render(mode='ansi')`` result per step.
    """
    step = generate_start_step()
    episode = [step]
    actions = []
    frames = [env.render(mode='ansi')]
    while step['step_type'] != STEPTYPE_LAST:
        # Stop early when a cap is given and has been reached; the last recorded
        # step is then not a STEPTYPE_LAST step.
        if max_steps is not None and len(actions) >= max_steps:
            break
        action = policy_func(step)
        step = generate_next_step(step, action)
        frames.append(env.render(mode='ansi'))
        episode.append(step)
        actions.append(action)
    return episode, actions, frames

In [30]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    """Play back ``frames`` in the cell output, one at a time.

    Each frame replaces the previous one and stays visible for 0.2 seconds.
    """
    for frame in frames:
        # wait=True defers clearing until the next output arrives.
        clear_output(wait=True)
        print(frame)
        sleep(.2)

In [31]:
# Q-learning hyper-parameters.
maxiter = 100000  # number of training episodes
gamma = 1         # discount applied to the bootstrapped next-state value (1 = no discounting)
epsilon = 0.3     # exploration probability read by get_eps_soft_action (replaces the earlier 0.1)
lr_rate = 0.8     # step size of the temporal-difference update

# Fresh action-value table, indexed as Q[state, action], initialised with uniform noise.
Q = np.random.uniform(size=(env.observation_space.n, env.action_space.n))

for _ in range(maxiter):
    # Each episode starts from a reset environment with a uniformly random first action.
    step = generate_start_step()
    action = get_random_action(step)
    done = False
    while not done:
        next_step = generate_next_step(step, action)
        state_action = (step['observation'], action)

        if next_step['step_type'] == STEPTYPE_LAST:
            # Episode ended: nothing to bootstrap from, the target is the reward alone.
            target = next_step['reward']
            done = True
        else:
            # Q-learning target: reward plus the discounted value of the greedy
            # action in the next state, regardless of the action actually taken next.
            best_action = get_greedy_action(next_step)
            next_state_action = (next_step['observation'], best_action)
            target = next_step['reward'] + gamma * Q[next_state_action]

        Q[state_action] = Q[state_action] + lr_rate * (target - Q[state_action])

        if not done:
            # The behaviour action must be chosen for the state the agent is now in
            # (next_step). Choosing it from the previous state made the agent act on
            # the Q values of a state it had already left.
            action = get_eps_soft_action(next_step)
            step = next_step


In [32]:
# Roll out one episode that follows the greedy policy over the learned Q table,
# then animate the recorded text frames in the cell output.
epi, actions, frames = generate_episode(policy_func=get_greedy_action)
print_frames(frames)

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  x


