In [1]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm_notebook

from collections import defaultdict

import itertools
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.cliff_walking import CliffWalkingEnv

matplotlib.style.use('ggplot')



In [2]:
# Build the cliff-walking environment imported from lib.envs.cliff_walking;
# the later cells pass this same instance to the prediction runs.
env = CliffWalkingEnv()

In [3]:
def random_policy(state):
    """Pick one of four actions uniformly at random.

    Args:
        state: Current state. It is accepted so the function has the usual
            policy signature, but it does not influence the choice.

    Returns:
        The sampled action index (0, 1, 2 or 3), drawn with numpy's global
        random generator.
    """
    n_actions = 4
    # Every action gets the same probability mass (0.25 each).
    uniform_probs = np.full(n_actions, 1.0 / n_actions)
    candidate_actions = np.arange(n_actions)
    return np.random.choice(candidate_actions, p=uniform_probs)

In [4]:
def _select_action(policy, state):
    """Ask ``policy`` for an action in ``state``, sampling if it returns probabilities."""
    decision = policy(state)
    if np.ndim(decision) == 0:
        # The policy already chose a concrete action.
        return decision
    # Otherwise treat the result as a vector of action probabilities.
    action_probs = np.asarray(decision, dtype=float)
    return np.random.choice(np.arange(len(action_probs)), p=action_probs)


def nstep_prediction(policy, env, num_episodes, n_step = 2, alpha = 0.5, discount_factor=0.5):
    """Estimate the state-value function of ``policy`` with n-step TD prediction.

    Each episode is generated by following ``policy`` in ``env``. The value of
    the state visited at time ``tau`` is moved towards the n-step return

        G = R_{tau+1} + gamma * R_{tau+2} + ... + gamma^(n-1) * R_{tau+n}
            + gamma^n * V(S_{tau+n})

    where the reward sum is truncated at the end of the episode and the
    bootstrap term is dropped once ``tau + n`` reaches the terminal time.

    Args:
        policy: Callable taking the current state. It may return a single
            action (a scalar), or a vector of action probabilities from which
            an action is then sampled.
        env: Environment exposing ``reset()``, which returns the initial
            state, and ``step(action)``, which returns a 4-tuple
            ``(next_state, reward, done, info)``.
        num_episodes: Number of episodes to run.
        n_step: Number of rewards accumulated before bootstrapping; must be
            at least 1.
        alpha: Step size of the value update.
        discount_factor: Discount applied per time step (gamma).

    Returns:
        A ``defaultdict(float)`` mapping each updated state to its estimated
        value. States are used as dictionary keys, so they must be hashable.

    Raises:
        ValueError: If ``n_step`` is smaller than 1.
    """
    if n_step < 1:
        raise ValueError("n_step must be >= 1, got {!r}".format(n_step))

    # The progress bar is purely cosmetic: tqdm.auto chooses a notebook or a
    # console bar as appropriate, and plain iteration is used if tqdm is
    # not installed at all.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # The final value function
    V = defaultdict(float)

    for _episode in progress(range(num_episodes)):
        state = env.reset()          # S_0
        state_history = [state]      # S_0, S_1, ...
        reward_history = []          # R_1, R_2, ... (R_i is stored at index i-1)
        T = np.inf                   # terminal time, unknown until the episode ends

        for t in itertools.count():
            if t < T:
                # Still acting: take one step following the supplied policy.
                action = _select_action(policy, state)
                state, reward, done, _info = env.step(action)
                reward_history.append(reward)
                state_history.append(state)
                if done:
                    T = t + 1

            # Time step whose state estimate is updated on this iteration.
            tau = t - n_step + 1
            if tau >= 0:
                # Rewards R_{tau+1} .. R_{min(tau+n, T)}, discounted from tau.
                last = min(tau + n_step, T)
                G = sum(
                    (discount_factor ** (i - tau - 1)) * reward_history[i - 1]
                    for i in range(tau + 1, last + 1)
                )
                # Bootstrap from the current estimate unless the n-step
                # window already reaches the end of the episode.
                if tau + n_step < T:
                    G += (discount_factor ** n_step) * V[state_history[tau + n_step]]

                V[state_history[tau]] += alpha * (G - V[state_history[tau]])

            # The last non-terminal state has been updated: episode finished.
            if tau == T - 1:
                break
    return V

In [5]:
def ViewValue(V, shape=(4, 12)):
    """Lay out a state-value mapping as a grid for display.

    Args:
        V: Mapping from integer state index to value. Each key is used
            directly as an index into the flattened grid, so it must be a
            valid index for ``rows * cols`` cells.
        shape: ``(rows, cols)`` of the grid. Defaults to ``(4, 12)``, i.e.
            48 states.

    Returns:
        A ``pandas.DataFrame`` of the given shape holding the values rounded
        to two decimals; states absent from ``V`` are shown as 0.

    Raises:
        IndexError: If a key of ``V`` is out of range for the grid.
    """
    n_rows, n_cols = shape
    value_table = np.zeros(n_rows * n_cols)
    for state, value in V.items():
        value_table[state] = value
    value_table = np.around(value_table.reshape(n_rows, n_cols), 2)

    return pd.DataFrame(value_table)

In [6]:
# Run 5000 episodes of n-step prediction with n_step = 4 on the cliff-walking
# environment, then display the resulting state values as a table.
V = nstep_prediction(random_policy, env, 5000, n_step = 4)
ViewValue(V)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-2.09,-4.78,-2.85,-2.05,-4.41,-3.84,-2.56,-2.16,-2.09,-2.17,-2.06,-2.09
1,-22.21,-5.2,-3.43,-2.08,-2.21,-8.18,-14.48,-29.58,-2.52,-29.6,-14.38,-2.81
2,-32.93,-62.16,-8.1,-16.36,-31.38,-58.93,-64.32,-80.29,-22.88,-56.05,-32.94,-7.54
3,-59.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Same experiment with n_step = 2 (5000 episodes); V is overwritten with the
# new estimates, which are then displayed as a table.
V = nstep_prediction(random_policy, env, 5000, n_step = 2)
ViewValue(V)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-2.44,-2.12,-2.56,-2.48,-5.09,-3.43,-4.37,-2.74,-4.47,-9.98,-3.79,-3.07
1,-6.44,-8.9,-4.94,-27.9,-14.09,-10.11,-8.55,-15.53,-23.28,-12.76,-6.58,-3.27
2,-2.31,-42.7,-77.9,-58.49,-38.37,-54.19,-69.05,-83.11,-61.69,-45.43,-73.95,-3.17
3,-81.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
