In [None]:
import numpy as np
import torch
from torch.optim import AdamW
from grid_world import Grid
from actor import Actor, Actor_Loss, choose_action
from critic import Critic, Critic_Loss

In [None]:
# Seed every global RNG the notebook can draw from so a fresh-kernel run is
# repeatable: NumPy's generator and PyTorch's default generator (which torch-side
# random ops, such as default weight initialisation, draw from).
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Hyper-parameters for the run.
MAX_EPISODE = 450             # number of episodes the training loop iterates over
Actor_lr = Critic_lr = 1e-3   # learning rates handed to the two AdamW optimizers

In [None]:
# Environment: build the grid world and render its board.
grid = Grid()
grid.draw_board()
# Sizes passed to the networks: Actor(input_dim, output_dim) and Critic(input_dim).
state_dim, action_dim = 2, 4

In [None]:
# Networks first (actor, then critic — same construction order as before),
# then one AdamW optimizer per network with its own learning rate.
actor = Actor(output_dim=action_dim, input_dim=state_dim)
critic = Critic(input_dim=state_dim)
critic_opt = AdamW(critic.parameters(), lr=Critic_lr)
actor_opt = AdamW(actor.parameters(), lr=Actor_lr)

In [None]:
# Loss objects; later called as a_loss(pai, action, td) and c_loss(r, v_, v).
a_loss, c_loss = Actor_Loss(), Critic_Loss()

In [None]:
# Begin an episode: take the value returned by grid.reset() as the current state
# and clear the per-episode bookkeeping.
s = grid.reset()
total_action = list()   # actions taken so far in this episode
t, done = 0, False      # step counter and termination flag

In [None]:
# step 1: convert the state to a tensor and run the actor on a batch of one.
s = torch.Tensor(s)
pai = actor(s.unsqueeze(0))

In [None]:
# Report shapes first, then values, of the state and the actor output.
print("\n".join([
    "s.shape = {}".format(s.shape),
    "pai.shape = {}".format(pai.shape),
    "",
    "s = {}".format(s),
    "pai = {}".format(pai),
]))

In [None]:
# step 2
# Pick an action from the actor output `pai`; the result `a` is used in step 3
# as an index into grid.t_action.
a = choose_action(pai)

In [None]:
# Show the action index returned by choose_action.
print("a = {}".format(a))

In [None]:
# step 3: apply the selected move to the environment and record it.
s_, r, done = grid.step(grid.t_action[a])
total_action += [grid.t_action[a]]
# Any transition flagged `done` has its reward replaced by -200.
r = -200 if done else r

In [None]:
# Show what the environment returned for this step.
print("\n".join([
    "s_ = {}".format(s_),
    "r = {}".format(r),
    "done = {}".format(done),
]))

In [None]:
# step 4: critic outputs for the current and the next state (batch of one each).
s_ = torch.Tensor(s_)
v, v_ = critic(s.unsqueeze(0)), critic(s_.unsqueeze(0))

In [None]:
# Report shapes first, then values, of the next state and both critic outputs.
print("\n".join([
    "s_.shape = {}".format(s_.shape),
    "v.shape = {}".format(v.shape),
    "v_.shape = {}".format(v_.shape),
    "",
    "s_ = {}".format(s_),
    "v = {}".format(v),
    "v_ = {}".format(v_),
]))

In [None]:
# step 5, 6
# Critic loss from the reward and the two critic outputs (argument order: r, v_, v).
# The result is a tensor: later cells call .item() and .backward() on it.
critic_loss = c_loss(r, v_, v)

In [None]:
# Show the critic loss computed in step 5, 6.
print("critic_loss = {}".format(critic_loss))

In [None]:
# step 8
# Actor loss from the actor output, an action index, and the critic loss passed as
# a plain Python number (.item()), so no gradient flows from actor_loss into the critic.
# NOTE(review): this passes `a-1`, whereas the full training loop at the end of the
# notebook passes `a`, and `a` itself indexes grid.t_action in step 3. Confirm which
# indexing Actor_Loss expects — with a == 0 this call passes -1.
actor_loss = a_loss(pai, a-1, critic_loss.item())

In [None]:
# Show the actor loss computed in step 8.
print("actor_loss = {}".format(actor_loss))

In [None]:
# step 11, 12: one optimisation step on the critic.
# backward() adds into each parameter's .grad, so clear whatever an earlier update
# left there first; otherwise the step would use accumulated gradients.
critic_opt.zero_grad()
critic_loss.backward()
critic_opt.step()

In [None]:
# step 10: one optimisation step on the actor.
# backward() adds into each parameter's .grad, so clear whatever an earlier update
# left there first; otherwise the step would use accumulated gradients.
actor_opt.zero_grad()
actor_loss.backward()
actor_opt.step()

In [None]:
# other: the next state becomes the current one, and the step counter advances.
s, t = s_, t + 1

In [None]:
import numpy as np
import torch
from torch.optim import AdamW
from grid_world import Grid
from actor import Actor, Actor_Loss, choose_action
from critic import Critic, Critic_Loss

# Seed every global RNG the run can draw from so it is repeatable on a fresh kernel:
# NumPy's generator and PyTorch's default generator (which torch-side random ops,
# such as default weight initialisation, draw from).
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

# training config
MAX_EPISODE = 450   # number of episodes the training loop runs
Actor_lr = 1e-3     # learning rate of the actor's optimizer
Critic_lr = 1e-3    # learning rate of the critic's optimizer

# problem setting: build the grid world and render its board
grid = Grid()
grid.draw_board()
state_dim = 2       # passed as input_dim to both networks
action_dim = 4      # passed as output_dim to the actor

# init models: one network and one AdamW optimizer each for actor and critic
actor = Actor(input_dim=state_dim, output_dim=action_dim)
critic = Critic(input_dim=state_dim)
actor_opt = AdamW(actor.parameters(), lr=Actor_lr)
critic_opt = AdamW(critic.parameters(), lr=Critic_lr)

# init loss
a_loss = Actor_Loss()
c_loss = Critic_Loss()

# Per-episode step cap and the reward substituted whenever the environment
# reports `done` (previously the inline literals 200 and -200).
MAX_STEPS = 200
DONE_REWARD = -200

for i_episode in range(MAX_EPISODE):
    # Fresh episode: initial state plus per-episode bookkeeping.
    s = grid.reset()
    t = 0
    total_action = []
    done = False
    while not done and t < MAX_STEPS:
        # step 1: actor output for the current state (batch of one)
        s = torch.Tensor(s)
        pai = actor(s[None, :])
        # step 2: pick an action index from the actor output
        a = choose_action(pai)
        # step 3: apply the move to the environment and record it
        action = grid.t_action[a]
        s_, r, done = grid.step(action)
        total_action.append(action)
        if done:
            # NOTE(review): every `done` transition gets this reward, whatever
            # caused the episode to end — confirm that is intended.
            r = DONE_REWARD
        # step 4: critic outputs for the current and the next state
        s_ = torch.Tensor(s_)
        v = critic(s[None, :])
        v_ = critic(s_[None, :])
        # step 5, 6: critic loss from the reward and the two critic outputs
        critic_loss = c_loss(r, v_, v)
        # step 8 (steps 7 and 9 have no code of their own):
        # Pass the critic loss detached so actor_loss.backward() cannot reach the
        # critic's graph. That keeps the two backward passes independent, which is
        # what allows the critic update below to run at all.
        # NOTE(review): the step-by-step cell earlier in the notebook passes `a-1`
        # here instead of `a` — confirm which index Actor_Loss expects.
        actor_loss = a_loss(pai, a, critic_loss.detach())
        # step 11, 12: critic update. Gradients are cleared first because
        # backward() accumulates into .grad.
        critic_opt.zero_grad()
        critic_loss.backward()
        critic_opt.step()
        # step 10: actor update, with gradients cleared for the same reason
        actor_opt.zero_grad()
        actor_loss.backward()
        actor_opt.step()
        # other: advance to the next state
        s = s_
        t += 1
    print("episode:", i_episode, "  tracked actions to attempt goal:", total_action)