<a href="https://colab.research.google.com/github/hhant-max/RL3/blob/main/CMAES_POP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from multiprocessing.pool import ThreadPool
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import copy


# Before


In [None]:

# Hyperparameters shared by the PPO baseline below.
learning_rate = 0.0005  # step size handed to the Adam optimizer
gamma = 0.98  # discount applied to the bootstrapped next-state value
lmbda = 0.95  # decay of the advantage accumulated backwards over the batch
eps_clip = 0.1  # probability ratio is clamped to [1 - eps_clip, 1 + eps_clip]
K_epoch = 3  # gradient updates performed on each collected batch
T_horizon = 20  # maximum environment steps collected before each update

class PPO(nn.Module):
    """Actor-critic network with a clipped-surrogate PPO update.

    A single hidden layer (4 -> 256) is shared by a 2-way policy head and a
    scalar value head.  Transitions are buffered with ``put_data`` and
    consumed by ``train_net``, which reads the module-level hyperparameters
    ``gamma``, ``lmbda``, ``eps_clip`` and ``K_epoch``.
    """

    def __init__(self):
        super().__init__()
        # Buffer of (s, a, r, s_prime, prob_a, done) tuples awaiting training.
        self.data = []

        self.fc1 = nn.Linear(4, 256)
        self.fc_pi = nn.Linear(256, 2)
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim=0):
        """Return action probabilities for ``x``, normalized along ``softmax_dim``."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc_pi(hidden)
        return F.softmax(logits, dim=softmax_dim)

    def v(self, x):
        """Return the value estimate for ``x`` using the shared hidden layer."""
        return self.fc_v(F.relu(self.fc1(x)))

    def put_data(self, transition):
        """Append one ``(s, a, r, s_prime, prob_a, done)`` tuple to the buffer."""
        self.data.append(transition)

    def make_batch(self):
        """Turn the buffered transitions into tensors and empty the buffer.

        Returns:
            ``(s, a, r, s_prime, done_mask, prob_a)``.  Scalar fields are
            wrapped so they become column tensors, and ``done_mask`` is 0 for
            a terminal transition and 1 otherwise.
        """
        states, actions, rewards = [], [], []
        next_states, old_probs, masks = [], [], []
        for s, a, r, s_prime, prob_a, done in self.data:
            states.append(s)
            actions.append([a])
            rewards.append([r])
            next_states.append(s_prime)
            old_probs.append([prob_a])
            masks.append([0 if done else 1])

        batch = (
            torch.tensor(states, dtype=torch.float),
            torch.tensor(actions),
            torch.tensor(rewards),
            torch.tensor(next_states, dtype=torch.float),
            torch.tensor(masks, dtype=torch.float),
            torch.tensor(old_probs),
        )
        # The buffer is cleared only after every tensor was built successfully.
        self.data = []
        return batch

    def train_net(self):
        """Run ``K_epoch`` clipped-PPO updates on the buffered transitions."""
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for _ in range(K_epoch):
            # One-step TD target; done_mask removes the bootstrap term on
            # terminal transitions.
            td_target = r + gamma * self.v(s_prime) * done_mask
            td_error = (td_target - self.v(s)).detach().numpy()

            # Generalized advantage estimate, accumulated from the last
            # transition back to the first.
            running = 0.0
            reversed_advantages = []
            for step_error in td_error[::-1]:
                running = gamma * lmbda * running + step_error[0]
                reversed_advantages.append([running])
            advantage = torch.tensor(reversed_advantages[::-1], dtype=torch.float)

            pi_a = self.pi(s, softmax_dim=1).gather(1, a)
            # a/b == exp(log(a) - log(b))
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))

            unclipped = ratio * advantage
            clipped = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
            loss = -torch.min(unclipped, clipped) + F.smooth_l1_loss(self.v(s), td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        
def main():
    """Train a PPO agent on CartPole-v1 and print the average score periodically.

    Runs 10000 episodes.  Inside an episode, at most ``T_horizon`` steps are
    collected before each ``train_net`` call.  Rewards are stored divided by
    100, while the reported score sums the raw rewards.
    """
    env = gym.make('CartPole-v1')
    agent = PPO()
    print_interval = 20
    running_score = 0.0

    for n_epi in range(10000):
        state = env.reset()
        done = False
        while not done:
            for _ in range(T_horizon):
                action_probs = agent.pi(torch.from_numpy(state).float())
                action = Categorical(action_probs).sample().item()
                next_state, reward, done, info = env.step(action)

                # Keep the sampling probability so the update can form the
                # new/old policy ratio.
                agent.put_data((
                    state,
                    action,
                    reward / 100.0,
                    next_state,
                    action_probs[action].item(),
                    done,
                ))
                state = next_state

                running_score += reward
                if done:
                    break

            agent.train_net()

        if n_epi != 0 and n_epi % print_interval == 0:
            print("# of episode :{}, avg score : {:.1f}".format(n_epi, running_score / print_interval))
            running_score = 0.0

    env.close()


if __name__ == '__main__':
    main()

In [None]:
def run_env_PPO():
    """Roll out a freshly initialised PPO policy on CartPole-v1.

    Plays ``MAXEPISODE`` episodes, sampling actions from ``model.pi`` and
    storing every transition with ``model.put_data``.  ``train_net`` is never
    called here, so the policy is not updated and the transitions simply
    accumulate in ``model.data``.

    The environment is closed even if a rollout raises, matching ``main``,
    which also closes its environment.

    Returns:
        None.
    """
    # TODO: plot / aggregate the episode rewards (still an open question in
    # the original notes).
    MAXEPISODE = 1000
    model = PPO()
    env = gym.make("CartPole-v1")

    try:
        for _ in range(MAXEPISODE):
            s = env.reset()
            done = False
            while not done:
                # Two action probabilities, e.g. tensor([0.5188, 0.4812]).
                prob = model.pi(torch.from_numpy(s).float())
                a = Categorical(prob).sample().item()
                s_prime, r, done, info = env.step(a)

                # Same transition layout as main(): the reward is scaled by
                # 1/100 and the sampling probability is stored with it.
                model.put_data((s, a, r / 100.0, s_prime, prob[a].item(), done))
                s = s_prime
    finally:
        env.close()


# Execute the rollout loop when this cell runs.
run_env_PPO()

In [None]:
# rewards calculation
#
# NOTE(review): this cell is an unfinished sketch and cannot run as written:
#   * `model` is only assigned as a local variable inside main() and
#     run_env_PPO() in the code visible here, so it is undefined at this
#     level; PPO.pi also requires a state argument.
#   * ESPPOModule's constructor takes (population_size, learning_rate, ...),
#     not a policy and a rollout function, and the class defines es_step(),
#     not step().
#   * `iteration` is never assigned, and run_env_PPO() accepts no arguments
#     and returns nothing.
#   * The `while True` loop has no exit condition.

rewards = []
# Presumably a note on the reference policy network, which returns
# (value, log_prob, entropy) -- TODO confirm.
#policy = policies.get_policy(args, env) 
policy = model.pi()
alg= ESPPOModule(policy,run_env_PPO)
while True:
  weights = alg.step()
  # Evaluate every 10 iterations.
  if iteration % 10 == 0 :
    # The policy returns a softmax over the 2 actions.
    test_reward = run_env_PPO(policy, stochastic=False, render=False, reward_only=True)
    rewards.append(test_reward)  # reference rollout returns: states, actions, rewards, values.squeeze(), logprobs, returns

# After

In [None]:
# TODO: work out how the ES update should be applied (see the ES-PPO notes).
class ESPPOModule(nn.Module):
    """Evolution-strategy search over the weights of a small actor-critic policy.

    The policy maps a 4-feature state to probabilities over 2 actions and the
    value network maps the same state to a scalar.  ``es_step`` draws
    ``population_size`` Gaussian perturbations of the policy weights; each
    candidate is fine-tuned for one episode with actor-critic updates and
    scored by its episode return (``a2c``), with candidates evaluated on a
    thread pool.

    The update of the base weights from the candidate scores is not
    implemented; ``es_step`` only returns the scores.

    Args:
        population_size: Number of perturbed candidates per ``es_step``.
        learning_rate: Adam step size used while fine-tuning a candidate.
        softmax_dim: Stored for interface compatibility only.  The policy
            always normalizes over the last (action) axis so that the same
            network serves single states and batches.
        sigma: Scale of the Gaussian noise added to the weights.
        n_rollout: Maximum environment steps collected before each update.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(
        self,
        population_size=5,
        # NOTE(review): 0.95 is an unusually large Adam step size; it is kept
        # because it is the existing default -- confirm it is intended.
        learning_rate=0.95,
        softmax_dim=0,
        sigma=0.1,
        n_rollout=10,  # running depth in one batch
        gamma=0.98,
    ):
        super().__init__()
        # Buffer of (s, a, r, s_prime, prob_a, done) tuples for make_batch().
        self.data = []

        self.population_size = population_size
        self.learning_rate = learning_rate
        self.softmax_dim = softmax_dim
        self.sigma = sigma
        self.n_rollout = n_rollout
        self.gamma = gamma

        # Policy network: returns action probabilities.
        self.policy = nn.Sequential(
            nn.Linear(4, 256),
            nn.ReLU(),
            nn.Linear(256, 2),
            nn.Softmax(dim=-1),
        )
        # Value network: separate from the policy (no shared hidden layer).
        self.v = nn.Sequential(
            nn.Linear(4, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

        # Policy parameters in layer order: (256, 4), (256,), (2, 256), (2,).
        self.weights = list(self.policy.parameters())
        self.pool = ThreadPool(4)

    def put_data(self, transition):
        """Append one ``(s, a, r, s_prime, prob_a, done)`` tuple to the buffer."""
        self.data.append(transition)

    @staticmethod
    def _to_batch(transitions):
        """Convert ``(s, a, r, s_prime, prob_a, done)`` tuples into tensors.

        Returns:
            ``(s, a, r, s_prime, done_mask, prob_a)``.  Scalar fields become
            column tensors and ``done_mask`` is 0 for terminal transitions
            and 1 otherwise.
        """
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for s, a, r, s_prime, prob_a, done in transitions:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_lst.append([0 if done else 1])

        # Stacking through numpy first avoids building a tensor element by
        # element from a list of arrays.  Rewards and probabilities are
        # forced to float32 so numpy scalars cannot promote the loss to
        # float64.
        return (
            torch.tensor(np.asarray(s_lst), dtype=torch.float),
            torch.tensor(a_lst),
            torch.tensor(r_lst, dtype=torch.float),
            torch.tensor(np.asarray(s_prime_lst), dtype=torch.float),
            torch.tensor(done_lst, dtype=torch.float),
            torch.tensor(prob_a_lst, dtype=torch.float),
        )

    def make_batch(self):
        """Return the buffered transitions as tensors and empty the buffer."""
        batch = self._to_batch(self.data)
        self.data = []
        return batch

    def _perturbed_candidates(self):
        """Return ``population_size`` weight lists, each base weights + sigma * N(0, I)."""
        candidates = []
        for _ in range(self.population_size):
            candidate = []
            for weight in self.weights:
                noise = np.random.randn(*weight.shape)
                step = torch.from_numpy(self.sigma * noise).float()
                candidate.append(weight.detach() + step)
            candidates.append(candidate)
        return candidates

    def _clone_policy(self, weights):
        """Return a copy of the policy whose parameters are set to ``weights``.

        Raises:
            ValueError: If ``weights`` does not hold one tensor per parameter.
        """
        clone = copy.deepcopy(self.policy)
        params = list(clone.parameters())
        if len(params) != len(weights):
            raise ValueError(
                "expected {} weight tensors, got {}".format(len(params), len(weights))
            )
        # Copy in place: rebinding the loop variable would leave the clone's
        # parameters untouched.
        with torch.no_grad():
            for param, value in zip(params, weights):
                param.copy_(value)
        return clone

    def es_step(self):
        """Evaluate one population of perturbed policies.

        Returns:
            A list with one episode score per candidate, in sampling order.
        """
        candidates = self._perturbed_candidates()
        # Each pool task receives the complete weight list of one candidate.
        return self.pool.map(self.a2c, candidates)

    def run(self, policy, value_net=None, env=None):
        """Play one episode with ``policy``, training it with actor-critic updates.

        Args:
            policy: Module mapping a state tensor to action probabilities.
            value_net: Module mapping states to values; defaults to ``self.v``.
            env: Object with ``reset()``, ``step(action)`` returning
                ``(state, reward, done, info)``, and ``close()``.  When
                omitted, a CartPole-v1 environment is created and closed here.

        Returns:
            The sum of the rewards collected during the episode.
        """
        if value_net is None:
            value_net = self.v
        owns_env = env is None
        if owns_env:
            env = gym.make("CartPole-v1")
        optimizer = optim.Adam(
            list(policy.parameters()) + list(value_net.parameters()),
            lr=self.learning_rate,
        )

        score = 0.0
        try:
            s = env.reset()
            done = False
            while not done:
                # Transitions stay local so concurrent runs on the pool do
                # not mix their batches.
                transitions = []
                for _ in range(self.n_rollout):
                    prob = policy(torch.from_numpy(s).float())
                    a = Categorical(prob).sample().item()
                    s_prime, r, done, info = env.step(a)
                    transitions.append((s, a, r, s_prime, prob[a].item(), done))

                    s = s_prime
                    score += r

                    if done:
                        break

                states, actions, rewards, next_states, done_mask, _ = self._to_batch(transitions)
                td_target = rewards + self.gamma * value_net(next_states) * done_mask
                delta = td_target - value_net(states)

                pi_a = policy(states).gather(1, actions)
                # Policy-gradient term weighted by the TD error, plus the
                # value regression loss.
                loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(
                    value_net(states), td_target.detach()
                )

                optimizer.zero_grad()
                loss.mean().backward()
                optimizer.step()
        finally:
            if owns_env:
                env.close()

        return score

    def a2c(self, weights):
        """Score one candidate: load ``weights`` into a policy copy and run it.

        Args:
            weights: One tensor per policy parameter, in parameter order.

        Returns:
            The episode score reported by ``run``.
        """
        simulate_policy = self._clone_policy(weights)
        # Train a private copy of the value network too: parameters updated
        # in place by another pool worker between a forward and a backward
        # pass would invalidate the autograd graph.
        simulate_value = copy.deepcopy(self.v)
        return self.run(simulate_policy, simulate_value)


# Build a module with the default settings for interactive inspection.
test = ESPPOModule()

In [None]:
# First parameter of the policy network: the weight of its first Linear layer.
fir_wei = next(test.policy.parameters())
fir_wei

Parameter containing:
tensor([[-0.1741,  0.0036, -0.0396, -0.4171],
        [-0.3754, -0.4730,  0.1517,  0.2199],
        [-0.1115, -0.4842, -0.4735,  0.3360],
        ...,
        [ 0.2666,  0.3873,  0.2126, -0.2084],
        [ 0.2393, -0.1507, -0.2080, -0.0130],
        [-0.4058, -0.4969,  0.0038, -0.4174]], requires_grad=True)

In [None]:
# Scratch check: copying a list into a new list keeps order and duplicates.
result = [1, 2, 3, 4, 5, 2, 3]
lst = list(result)
lst


[1, 2, 3, 4, 5, 2, 3]