In [6]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from datetime import timedelta
import os
import sys
import pickle
from timeit import default_timer as timer
from datetime import timedelta
from IPython.display import clear_output

import gym

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, optimizers

import tensorflow_probability as tfp

# Ask TensorFlow's native layer to suppress low-severity log messages.
# NOTE(review): this assignment runs after `import tensorflow` above; the
# variable is presumably read when the library loads, so it may need to be
# set before the import to have any effect -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# tf.config.list_physical_devices(device_type='GPU')

In [None]:
# Seed every random number source used by the notebook from one value.
seed = 1
random.seed(seed)
np.random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here may only affect child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(seed) # Intended to disable hash randomization so that experiments are reproducible.

tf.random.set_seed(seed)
# It is not yet clear how to make TensorFlow results reproducible on a GPU.

In [None]:
# Hyperparameters
training_env_seed = 123  # seed applied to the training environment
lr = 1e-4  # learning rate handed to the Adam optimizer
gamma = 0.99  # presumably the reward discount factor; unused in the code shown here -- confirm

# Policy

In [None]:
class Policy(keras.Model):
    """Small feed-forward policy network producing action probabilities.

    Layer order: Dense(128) -> Dropout(rate=0.6) -> ReLU -> Dense(action_dim)
    -> softmax over the last axis.

    Args:
        action_dim: Number of units in the output layer, i.e. the number of
            discrete actions the policy chooses between.
    """

    def __init__(self, action_dim):
        super().__init__()

        self.action_dim = action_dim

        self.affine1 = layers.Dense(128)
        self.dropout = layers.Dropout(rate=0.6)
        self.affine2 = layers.Dense(self.action_dim)

    def call(self, obs, training=None):
        """Compute action probabilities for a batch of observations.

        Args:
            obs: Input tensor fed to the first dense layer.
            training: Keras training flag. It is forwarded to the dropout
                layer so that dropout follows the caller's choice rather than
                relying on implicit propagation.

        Returns:
            Tensor of softmax probabilities over the last axis, with
            ``action_dim`` entries per input row.
        """
        hidden = self.affine1(obs)
        # Pass the flag explicitly: dropout must be disabled at inference time.
        hidden = self.dropout(hidden, training=training)
        hidden = tf.nn.relu(hidden)
        action_logits = self.affine2(hidden)
        return tf.nn.softmax(action_logits, axis=-1)

# Agent

In [None]:
class REINFORCEAgent(object):
    """Agent holding the environment, policy network and episode buffers for REINFORCE.

    Args:
        env_name: Environment id passed to ``gym.make``.
        policy: Callable that builds the policy network from the number of
            actions; defaults to the ``Policy`` class.
        eval_mode: Flag stored on the instance as ``self.eval_mode``.
    """

    def __init__(self, env_name=None, policy=Policy, eval_mode=False):
        # The default must be the `Policy` class itself: a lowercase `policy`
        # name does not exist when this signature is evaluated.
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.env.seed(training_env_seed)

        self.action_dim = self.env.action_space.n

        self.policy = policy(self.action_dim)

        self.optimizer = optimizers.Adam(learning_rate=lr)

        self.eval_mode = eval_mode

        self.log_probs = []        # log(pi(a_t | s_t)) recorded at each time step t
        self.rewards = []          # reward r_t recorded at each time step t
        self.returns = []          # return G_t recorded at each time step t
        self.loss = []             # per-step loss term: G_t * log(pi(a_t | s_t))

        # A very small float intended to be added to denominators so they are
        # never zero; a literal such as 1e-10 would work as well.
        self.eps = np.finfo(np.float32).eps.item()

    def get_action(self, obs, training=None):
        """Sample an action from the policy and record its log-probability.

        Args:
            obs: A single observation; it is converted to a float32 tensor and
                given a leading batch axis.
            training: Training flag forwarded to the policy network.

        Returns:
            The sampled action index as a NumPy array (one entry, because of
            the added batch axis).
        """
        obs = tf.expand_dims(tf.convert_to_tensor(obs, dtype=tf.float32), axis=0)   # [1, obs_dim]
        probs = self.policy(obs, training=training)      # action probabilities from the policy
        # Categorical distribution over actions (the original note says that
        # for CartPole this is a two-outcome distribution).
        dist = tfp.distributions.Categorical(probs=probs)
        action = dist.sample()                           # sampling yields an action index
        # Keep the log-probability: the policy gradient is taken w.r.t. log pi.
        # The distribution method is `log_prob` (singular).
        self.log_probs.append(dist.log_prob(action))

        return action.numpy()
    
    

In [8]:
# Scratch example: a Categorical distribution over three outcomes built from explicit probabilities.
m = tfp.distributions.Categorical(probs=[0.1, 0.4, 0.5])