In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from datetime import timedelta
from itertools import count
import os
import sys
import pickle
from timeit import default_timer as timer
from datetime import timedelta
from IPython.display import clear_output

import gym

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, optimizers

import tensorflow_probability as tfp

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# tf.config.list_physical_devices(device_type='GPU')

In [2]:
seed = 1
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed) # 为了禁止hash随机化，使得实验可复现。

tf.random.set_seed(seed)
# tensorflow 如何设置在GPU上能够复现结果还不太清楚怎么弄

In [3]:
# hyperparameter
training_env_seed = 123
lr = 1e-4
gamma = 0.99

# Policy

In [4]:
class Policy(keras.Model):
    def __init__(self, action_dim):
        super(Policy, self).__init__()
        
        self.action_dim = action_dim
        
        self.affine1 = layers.Dense(128)
        self.dropout = layers.Dropout(rate=0.6)
        self.affine2 = layers.Dense(self.action_dim)
        
    def call(self, obs, training=None):
        x = self.affine1(obs)
        x = self.dropout(x)
        x = tf.nn.relu(x)
        action_logits = self.affine2(x)
        actions = tf.nn.softmax(action_logits, axis=-1)
        return actions

# Agent

## tensorflow和pytorch在写法上会有不同，如果按照pytorch相同的写法，会导致在计算梯度：tape.gradient(loss, self.policy.trainable_variables)时获取不到相应的梯度

In [5]:
class REINFORCEAgent(object):
    def __init__(self, env_name=None, policy=Policy, eval_mode=False):
        
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.env.seed(training_env_seed)
        
        self.action_dim = self.env.action_space.n
        
        self.policy = policy(self.action_dim)
        
        self.optimizer = optimizers.Adam(learning_rate=lr)
        
        self.eval_mode = eval_mode
        
        self.log_probs = []        # 用来记录每个时刻t的log(pi(a_t|s_t))
        self.rewards = []          # 用来记录每个时刻t的reward，r_t
        self.returns = []          # 用来记录每个时刻t的return，G_t
        self.loss = []             # 用来记录每个时刻t的loss：G_t * log(pi(a_t|s_t))
        
        self.eps = np.finfo(np.float32).eps.item()   # 创建一个很小的浮点数，加在分母，防止0的出现，直接写1e-10也行
        
    def get_action(self, obs, training=None):
        obs = tf.expand_dims(tf.convert_to_tensor(obs, dtype=tf.float32), axis=0)   # [1, obs_dim]
        probs = self.policy(obs, training=training)      # 产生策略函数，是一个关于action的概率
        m = tfp.distributions.Categorical(probs=probs)   # 生成一个Categorical分布，在CartPole里是二项分布
        action = m.sample()                              # 从分布里采样，采出的是索引
        return action.numpy()[0]
    
    
    def train(self):
        R = 0
        # policy gradient update
        for r in self.rewards[::-1]:     # 倒序
            R = r + gamma * R            # 计算t到T的reward折现和
            self.returns.insert(0, R)    # 在最前面插入

        returns = tf.constant(self.returns, dtype=tf.float32)
        returns = (returns - tf.reduce_mean(returns)) / (tf.math.reduce_std(returns) + self.eps)  # 把returns做一个标准化，这样对于action的矫正会更有效果一些，毕竟一条成功的轨迹里并不一定所有的action都是好的

        for log_prob, R in zip(self.log_probs, returns):
            self.loss.append(-log_prob * R)

        with tf.GradientTape() as tape:
            loss = tf.reduce_sum(tf.stack(self.loss, axis=-1))
            
        self.grads = tape.gradient(loss, self.policy.trainable_variables)
        print(self.grads)
        self.optimizer.apply_gradients(zip(self.grads, self.policy.trainable_variables))
        
        del self.rewards[:]    # 把列表清空，但列表还在，[]
        del self.returns[:]
        del self.log_probs[:]
        del self.loss[:]
        
    
    def eval_(self, env, n_trajs):
        returns = []
        
        for i in range(n_trajs):
            ep_return = 0
            obs = env.reset()
            
            for step in range(10000):
                action = self.get_action(obs, training=False)
                obs, reward, done, _ = env.step(action)
                ep_return += reward
                
                if done:
                    returns.append(ep_return)
                    break
        return np.array(returns).mean()
    
    
    def render(self, env):
        obs = env.reset()
        for _ in range(10000):
            env.render()
            action = self.get_action(obs, training=False)
            obs, reward, done, _ = env.step(action)
            if done:
                break
    
    
    def save(self, step):
        self.policy.save_weights('./reinforce_{}.ckpt'.format(step))
        
    def load(self, path):
        if os.path.isfile(path):
            self.policy.load_weights(path)
        else:
            print('No "{}" exits for loading'.format(path))

# Trainging Loop

In [6]:
name = 'CartPole-v0'
env_eval = gym.make(name)
env_eval.seed(seed)


start = timer()
running_returns = []
agent_reinforce = REINFORCEAgent(env_name=name, policy=Policy)

for episode in count(1): # 一直加1的while, 表示一条episode
    # print('episode%d'%episode)
    obs, ep_return = agent_reinforce.env.reset(), 0
    for step in range(10000):
        action = agent_reinforce.get_action(obs)
        obs, reward, done, _ = agent_reinforce.env.step(action)
        agent_reinforce.rewards.append(reward)
        ep_return += reward
        if done:
            running_returns.append(ep_return)
            break
    
    agent_reinforce.train()
    
    
    if episode % 10 == 0:
        clear_output(True)
        plt.plot(pd.Series(running_returns).rolling(100, 20).mean())
        plt.title('episide:{}, time:{}, returns'.format(episode, timedelta(seconds=int(timer()-start))))
        plt.show()
    if np.array(running_returns)[-10:].mean() > 195:
        eval_return = agent_reinforce.eval_(env_eval, 100)
        if eval_return > 195:
            print("Solved! eval return is now {}!".format(eval_return))
            break 

[None, None, None, None]


ValueError: No gradients provided for any variable: ['policy/dense/kernel:0', 'policy/dense/bias:0', 'policy/dense_1/kernel:0', 'policy/dense_1/bias:0'].

In [16]:
class model(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.d1 = tf.keras.layers.Dense(30,activation='relu')
        self.d2 = tf.keras.layers.Dense(30,activation='relu')
        self.out = tf.keras.layers.Dense(env.action_space.n, activation='softmax')

    def call(self, input_data):
        x = tf.convert_to_tensor(input_data)
        x = self.d1(x)
        x = self.d2(x)
        x = self.out(x)
        return x

In [23]:
class agent():
    def __init__(self, model=model):
        self.model = model()
        self.gamma = 0.9
        self.opt = optimizers.Adam(1e-4)
    def act(self,state):
        prob = self.model(np.array([state]))
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()
        return int(action.numpy()[0])

    def train(self, states, rewards, actions):
        sum_reward = 0
        discnt_rewards = []
        rewards.reverse()
        for r in rewards:
            sum_reward = r + self.gamma*sum_reward
            discnt_rewards.append(sum_reward)
        discnt_rewards.reverse()  

        for state, reward, action in zip(states, discnt_rewards, actions):
            with tf.GradientTape() as tape:
                p = self.model(np.array([state]), training=True)
                loss = self.a_loss(p, action, reward)
            grads = tape.gradient(loss, self.model.trainable_variables)
            self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
            
    
    def a_loss(self,prob, action, reward): 
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        loss = -log_prob*reward
        return loss 

In [24]:
env = gym.make('CartPole-v0')
agentoo7 = agent()
steps = 500
for s in range(steps):
    done = False
    state = env.reset()
    total_reward = 0
    rewards = []
    states = []
    actions = []
    while not done:
        #env.render()
        action = agentoo7.act(state)
        #print(action)
        next_state, reward, done, _ = env.step(action)
        rewards.append(reward)
        states.append(state)
        actions.append(action)
        state = next_state
        total_reward += reward

        if done:
            agentoo7.train(states, rewards, actions)
            #print("total step for this episord are {}".format(t))
            print("total reward after {} steps is {}".format(s, total_reward))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

total reward after 0 steps is 46.0
total reward after 1 steps is 31.0
total reward after 2 steps is 48.0
total reward after 3 steps is 10.0
total reward after 4 steps is 36.0
total reward after 5 steps is 40.0
total reward after 6 steps is 18.0
total reward after 7 steps is 11.0
total reward after 8 steps is 11.0
total reward after 9 steps is 12.0
total reward after 10 steps is 20.0
total reward after 11 steps is 44.0
total reward after 12 steps is 17.0
total reward after 13 steps is 18.0
total reward after 14 steps is 26.0
total reward after 15 steps is 21.0
total reward after 16 steps is 10.0
total reward after 17 steps is 10.0
total reward after 18 steps is 49.0
total reward after 19 ste

total reward after 206 steps is 77.0
total reward after 207 steps is 61.0
total reward after 208 steps is 25.0
total reward after 209 steps is 13.0
total reward after 210 steps is 10.0
total reward after 211 steps is 87.0
total reward after 212 steps is 38.0
total reward after 213 steps is 18.0
total reward after 214 steps is 28.0
total reward after 215 steps is 18.0
total reward after 216 steps is 21.0
total reward after 217 steps is 24.0
total reward after 218 steps is 21.0
total reward after 219 steps is 42.0
total reward after 220 steps is 13.0
total reward after 221 steps is 44.0
total reward after 222 steps is 35.0
total reward after 223 steps is 39.0
total reward after 224 steps is 66.0
total reward after 225 steps is 39.0
total reward after 226 steps is 15.0
total reward after 227 steps is 22.0
total reward after 228 steps is 17.0
total reward after 229 steps is 45.0
total reward after 230 steps is 50.0
total reward after 231 steps is 38.0
total reward after 232 steps is 15.0
t

total reward after 428 steps is 94.0
total reward after 429 steps is 107.0
total reward after 430 steps is 40.0
total reward after 431 steps is 47.0
total reward after 432 steps is 12.0
total reward after 433 steps is 17.0
total reward after 434 steps is 61.0
total reward after 435 steps is 75.0
total reward after 436 steps is 26.0
total reward after 437 steps is 61.0
total reward after 438 steps is 29.0
total reward after 439 steps is 66.0
total reward after 440 steps is 65.0
total reward after 441 steps is 19.0
total reward after 442 steps is 69.0
total reward after 443 steps is 27.0
total reward after 444 steps is 32.0
total reward after 445 steps is 41.0
total reward after 446 steps is 86.0
total reward after 447 steps is 73.0
total reward after 448 steps is 42.0
total reward after 449 steps is 79.0
total reward after 450 steps is 40.0
total reward after 451 steps is 100.0
total reward after 452 steps is 27.0
total reward after 453 steps is 72.0
total reward after 454 steps is 63.0