In [3]:
import gym
import collections
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
learning_rate = 0.0005
gamma = 0.98
buffer_limit = 50000
batch_size = 32

class ReplayBuffer():
  def __init__(self):
    self.buffer = collections.deque(maxlen=buffer_limit)

  def put(self,transition):
    self.buffer.append(transition)

  def sample(self,n):
    mini_batch = random.sample(self.buffer,n)
    s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [],[],[],[],[]

    for transition in mini_batch:
      s,a,r,s_prime,done_mask = transition
      s_lst.append(s)
      a_lst.append([a])
      r_lst.append([r])
      s_prime_lst.append(s_prime)
      done_mask_lst.append([done_mask])
    return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), torch.tensor(r_lst), torch.tensor(s_prime_lst,dtype = torch.float), torch.tensor(done_mask_lst)

  def size(self):
    return len(self.buffer)

  and should_run_async(code)


In [6]:
class Qnet(nn.Module):
  def __init__(self):
    super(Qnet,self).__init__()
    self.fc1 = nn.Linear(4,128)
    self.fc2 = nn.Linear(128,128)
    self.fc3 = nn.Linear(128,2)

  def forward(self,x):
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x

  def sample_action(self,obs,epsilon):
    out = self.forward(obs)
    coin = random.random()
    if coin <epsilon :
      return random.randint(0,1)
    else :
      return out.argmax().item()

def train(q, q_target, memory, optimizer):
  for i in range(10):
    s,a,r,s_prime,done_mask = memory.sample(batch_size)
    q_out = q(s)
    q_a = q_out.gather(1,a)
    max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
    target = r + gamma * max_q_prime * done_mask
    loss = F.smooth_l1_loss(q_a,target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def main():
  env = gym.make('CartPole-v1')
  q = Qnet()
  q_target = Qnet()
  q_target.load_state_dict(q.state_dict())
  memory = ReplayBuffer()
  print_interval = 20
  score = 0.0
  optimizer = optim.Adam(q.parameters(),lr=learning_rate)
  for n_epi in range(10000):
    epsilon = max(0.01,0.08-0.01*(n_epi/200)) #Linear annealing from 8% to 1%
    s=env.reset()
    done=False
    while not done :
      a=q.sample_action(torch.from_numpy(s).float(),epsilon)
      s_prime,r,done,info = env.step(a)
      done_mask = 0.0 if done else 1.0
      memory.put((s,a,r/100.0, s_prime, done_mask))
      s=s_prime
      score +=r
      if done:
        break

    if memory.size()>2000:
      train(q,q_target,memory,optimizer)

    if n_epi%print_interval ==0 and n_epi!=0:
      q_target.load_state_dict(q.state_dict())
      print("n_episode:{},score:{:.1f},n_buffer:{},eps:{:.1f}%".format(n_epi, score/print_interval , memory.size(), epsilon*100))
      score = 0.0

  env.close()

if __name__ == '__main__':
  main()


  and should_run_async(code)
  deprecation(
  deprecation(


n_episode:20,score:10.3,n_buffer:206,eps:7.9%
n_episode:40,score:9.4,n_buffer:395,eps:7.8%
n_episode:60,score:9.7,n_buffer:588,eps:7.7%
n_episode:80,score:9.6,n_buffer:779,eps:7.6%
n_episode:100,score:9.3,n_buffer:965,eps:7.5%
n_episode:120,score:9.7,n_buffer:1159,eps:7.4%
n_episode:140,score:9.7,n_buffer:1353,eps:7.3%
n_episode:160,score:9.6,n_buffer:1545,eps:7.2%
n_episode:180,score:9.4,n_buffer:1734,eps:7.1%
n_episode:200,score:9.8,n_buffer:1931,eps:7.0%
n_episode:220,score:10.1,n_buffer:2133,eps:6.9%
n_episode:240,score:9.8,n_buffer:2330,eps:6.8%
n_episode:260,score:9.7,n_buffer:2524,eps:6.7%
n_episode:280,score:10.5,n_buffer:2734,eps:6.6%
n_episode:300,score:13.2,n_buffer:2997,eps:6.5%
n_episode:320,score:17.2,n_buffer:3342,eps:6.4%
n_episode:340,score:31.2,n_buffer:3967,eps:6.3%
n_episode:360,score:38.9,n_buffer:4744,eps:6.2%
n_episode:380,score:142.3,n_buffer:7591,eps:6.1%
n_episode:400,score:166.8,n_buffer:10927,eps:6.0%
n_episode:420,score:164.9,n_buffer:14225,eps:5.9%
n_episo

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

#Hyperparameters
learning_rate = 0.0002
gamma = 0.98
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self). __init__()
        self.data = []

        self.fc1 = nn. Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=0)
        return x

    def put_data(self, item):
        self.data.append(item)

    def train_net(self):
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + gamma * R
            loss = -torch.log(prob) * R
            loss. backward()
        self.optimizer.step()
        self.data = []
def main() :
    env = gym.make('CartPole-v1')
    pi = Policy()
    score = 0.0
    print_interval = 20
    for n_epi in range (10000):
        s= env.reset()
        done = False
        while not done:
            prob = pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = env.step(a.item())
            pi.put_data((r,prob[a]))
            s = s_prime
            score += r
        pi.train_net()
        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
        score = 0.0
    env.close()
if __name__ == '__main__' :
    main()

# of episode :20, avg score : 0.7
# of episode :40, avg score : 1.15
# of episode :60, avg score : 2.45
# of episode :80, avg score : 1.5
# of episode :100, avg score : 1.55
# of episode :120, avg score : 0.7
# of episode :140, avg score : 1.05
# of episode :160, avg score : 1.0
# of episode :180, avg score : 1.9
# of episode :200, avg score : 0.6
# of episode :220, avg score : 1.9
# of episode :240, avg score : 2.25
# of episode :260, avg score : 1.75
# of episode :280, avg score : 2.25
# of episode :300, avg score : 3.25
# of episode :320, avg score : 2.05
# of episode :340, avg score : 4.85
# of episode :360, avg score : 1.15
# of episode :380, avg score : 1.35
# of episode :400, avg score : 2.4
# of episode :420, avg score : 1.6
# of episode :440, avg score : 3.3
# of episode :460, avg score : 0.95
# of episode :480, avg score : 2.95
# of episode :500, avg score : 1.6
# of episode :520, avg score : 4.85
# of episode :540, avg score : 2.3
# of episode :560, avg score : 0.9
# of epis

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

#Hyperparameters
learning_rate = 0.0002
gamma = 0.98
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self). __init__()
        self.data = []
        self.fc1 = nn. Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=0)
        return x

    def put_data(self, item):
        self.data.append(item)

    def train_net(self):
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + gamma * R
            loss = -torch.log(prob) * R
            loss. backward()
        self.optimizer.step()
        self.data = []
def main() :
    env = gym.make('CartPole-v1')
    pi = Policy()
    score = 0.0
    print_interval = 20
    for n_epi in range (10000):
        s= env.reset()
        done = False
        while not done:
            prob = pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = env.step(a.item())
            pi.put_data((r,prob[a]))
            s = s_prime
            score += r
        pi.train_net()
        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
        score = 0.0
    env.close()
if __name__ == '__main__' :
    main()

# of episode :20, avg score : 1.2
# of episode :40, avg score : 1.75
# of episode :60, avg score : 1.15
# of episode :80, avg score : 1.05
# of episode :100, avg score : 0.95
# of episode :120, avg score : 3.2
# of episode :140, avg score : 0.55
# of episode :160, avg score : 0.7
# of episode :180, avg score : 2.05
# of episode :200, avg score : 1.65
# of episode :220, avg score : 1.65
# of episode :240, avg score : 1.2
# of episode :260, avg score : 0.8
# of episode :280, avg score : 0.75
# of episode :300, avg score : 1.3
# of episode :320, avg score : 0.9
# of episode :340, avg score : 0.9
# of episode :360, avg score : 1.75
# of episode :380, avg score : 1.2
# of episode :400, avg score : 1.75
# of episode :420, avg score : 2.6
# of episode :440, avg score : 1.15
# of episode :460, avg score : 1.35
# of episode :480, avg score : 2.25
# of episode :500, avg score : 1.35
# of episode :520, avg score : 2.25
# of episode :540, avg score : 1.2
# of episode :560, avg score : 2.55
# of ep

In [None]:
# CartPole 문제 #Actor-Critic(AC)

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

#Hyperparameters
learning_rate = 0.0002
gamma = 0.98
n_rollout = 10

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self). __init__()
        self.data = []

        self.fc1 = nn. Linear(4, 256)
        self.fc2 = nn.Linear(256, 2)
        self.fc_v = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self,x,softmax_dim=0):
      x = F.relu(self.fc1(x))
      x = self.fc_pi(x)
      prob = F.softmax(x,dim = softmax_dim)
      return prob

    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        self.data.append(transition)

    def make_batch(self):
      s_lst, a_lst, r_lst, s_prime_lst, done_lst = [],[],[],[],[]

      for transition in self.data:
        s,a,r,s_prime,done = transition
        s_lst.append(s)
        a_lst.append([a])
        r_lst.append([r/100.0])
        s_prime_lst.append(s_prime)
        done_mask = 0.0 if done else 1.0
        done_lst.append([done_mask])

      s_batch, a_batch, r_batch, s_prime_batch, done_batch = torch.tensor(s_lst, dtype =torch.float), torch.tensor(a_lst),torch.tensor(r_lst, dtype =torch.float), torch.tensor(s_prime_lst, dtype =torch.float),torch.tensor(done_lst, dtype =torch.float)
      self.data = []
      return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def train_net(self):
        s,a,r,s_prime,done = self.make_batch()
        td_target = r+gamma*self.v(s_prime)*done
        delta = td_target - self.v(s)

        pi = self.pi(s,softmax_dim =1)
        pi_a = pi.gather(1,a)
        loss = -torch.log(pi_a) * delta.detach() + F.smooth_1l_loss(self.v(s),td_target.detach())

        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()

def main() :
    env = gym.make('CartPole-v1')
    pi = ActorCritic()
    score = 0.0
    print_interval = 20

    for n_epi in range (10000):
        s= env.reset()
        done = False
        while not done:
          for t in range(n_rollout):
              prob = model.pi(torch.from_numpy(s).float())
              m = Categorical(prob)
              a = m.sample().item()
              s_prime, r, done, truncated, info = env.step(a)
              model.put_data((s,a,r,s_prime,done))
              s = s_prime
              score += r
              if done :
                break
          model.train_net()

        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
        score = 0.0
    env.close()
if __name__ == '__main__' :
    main()