In [1]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm_notebook

from collections import defaultdict

import itertools
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.cliff_walking import CliffWalkingEnv

matplotlib.style.use('ggplot')



In [2]:
# Single environment instance passed to the nstep_prediction runs in the cells below.
env = CliffWalkingEnv()

In [3]:
def random_policy(state):
    """
    Uniform random policy over four actions.

    The state argument is accepted so the function fits a state -> action
    interface, but it is not consulted: each action index 0..3 is drawn
    with probability 0.25.
    """
    uniform = np.full(4, 0.25)
    actions = np.arange(uniform.size)
    return np.random.choice(actions, p=uniform)

In [9]:
def nstep_prediction(policy, env, num_episodes, n_step = 2, alpha = 0.5, discount_factor=0.5):
    """
    n-step TD prediction: estimate the state-value function of a policy.

    Args:
        policy: callable taking a state and returning the action to take in it
            (the contract `random_policy` in this notebook follows).
        env: environment whose `reset()` returns the initial state and whose
            `step(action)` returns a `(next_state, reward, done, info)` tuple.
        num_episodes: number of episodes to sample.
        n_step: number of real rewards accumulated before bootstrapping from
            the current estimate; must be at least 1.
        alpha: step size of the TD update.
        discount_factor: per-step discount (gamma) applied to rewards.

    Returns:
        defaultdict(float) mapping each state to its estimated value; states
        without an entry read as 0.0.

    Raises:
        ValueError: if n_step is smaller than 1.
    """
    if n_step < 1:
        raise ValueError("n_step must be at least 1, got {}".format(n_step))

    # The final value function
    V = defaultdict(float)

    for _ in tqdm_notebook(range(num_episodes)):
        state = env.reset()  # S_0

        T = np.inf  # episode length; unknown until the terminal step is observed
        state_history = [state]  # S_0, S_1, ...
        reward_history = []  # R_1, R_2, ... (R_i is stored at index i - 1)

        for t in itertools.count():
            if t < T:
                # Act according to the policy that is being evaluated.
                action = policy(state)
                next_state, reward, done, _info = env.step(action)
                reward_history.append(reward)
                state_history.append(next_state)
                state = next_state

                if done:
                    T = t + 1

            # Time step whose state gets its estimate updated on this iteration.
            updating_time = t - n_step + 1

            if updating_time >= 0:
                # Discounted sum of R_{tau+1} .. R_{min(tau+n, T)}.
                last_reward_step = int(min(updating_time + n_step, T))
                G = 0
                for i in range(updating_time + 1, last_reward_step + 1):
                    G += (discount_factor ** (i - updating_time - 1)) * reward_history[i - 1]

                # Bootstrap only when the n-step window ends before the episode does.
                if updating_time + n_step < T:
                    G += (discount_factor ** n_step) * V[state_history[updating_time + n_step]]

                V[state_history[updating_time]] += alpha * (G - V[state_history[updating_time]])

            # The last non-terminal state has been updated: the episode is finished.
            if updating_time == T - 1:
                break
    return V

In [10]:
def ViewValue(V, shape=(4, 12), decimals=2):
    """
    Lay out a state-value mapping as a rounded grid for display.

    Args:
        V: mapping from integer state index to value; the index is the
            state's position in the row-major flattened grid.
        shape: (rows, columns) of the grid. Defaults to the 4x12 layout
            used by the runs in this notebook.
        decimals: number of decimal places kept in the table.

    Returns:
        pandas.DataFrame of the requested shape; states missing from V
        are shown as 0.0.
    """
    n_rows, n_cols = shape
    value_table = np.zeros(n_rows * n_cols)
    for state, value in V.items():
        value_table[state] = value
    grid = np.around(value_table.reshape(n_rows, n_cols), decimals)
    return pd.DataFrame(grid)

In [11]:
# 4-step TD estimate over 5000 episodes; alpha and discount_factor are left
# at the function defaults. The result is displayed as a grid via ViewValue.
V = nstep_prediction(random_policy, env, 5000, n_step = 4)
ViewValue(V)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-3.61,-2.32,-2.13,-2.22,-2.13,-5.94,-3.32,-3.92,-2.19,-2.83,-1.95,-1.88
1,-5.86,-29.61,-2.06,-2.6,-2.75,-9.15,-5.41,-3.22,-2.21,-2.43,-2.17,-1.75
2,-40.34,-52.42,-4.78,-52.62,-57.54,-71.81,-46.69,-3.87,-3.15,-6.84,-26.73,-1.44
3,-71.69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Same run with a 2-step return (n_step = 2 is also the function default,
# passed explicitly here); overwrites V from the previous run.
V = nstep_prediction(random_policy, env, 5000, n_step = 2)
ViewValue(V)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-2.28,-2.18,-3.24,-6.03,-3.57,-2.11,-5.34,-6.3,-2.44,-2.22,-2.31,-2.33
1,-3.29,-10.34,-28.98,-26.91,-9.05,-14.08,-4.33,-25.34,-26.99,-2.84,-7.05,-2.16
2,-5.75,-16.58,-53.5,-57.19,-56.43,-14.89,-33.25,-83.58,-92.73,-69.43,-27.67,-2.71
3,-2.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
