# Policy Gradient

## Introduction

This project reproduces the policy gradient algorithm and tests its performance on the CartPole environment.

The implementation is based on this [reference code](https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/7_Policy_gradient_softmax/RL_brain.py).


## Import Library

In [115]:
import gym
import tensorflow.compat.v1 as tf
import numpy as np

In [116]:
# Switch TensorFlow to graph mode: the placeholder / Session API used by the
# model below requires eager execution to be turned off.
tf.disable_eager_execution()

In [117]:
tf.__version__

'2.6.0'

## Code Project

In [118]:
# Start from an empty default graph so that re-running the notebook rebuilds
# the network from scratch instead of adding to a previously built graph.
tf.reset_default_graph()

In [119]:
# np.random.seed(1)
# tf.set_random_seed(1)


class PolicyGradient:
    """Monte-Carlo policy gradient (REINFORCE) agent with a softmax policy.

    The policy is a two-layer fully connected network mapping an observation
    to a probability distribution over discrete actions.  Transitions of one
    episode are buffered with ``store_transition`` and consumed by ``train``,
    which performs a single optimizer step and clears the buffer.

    Args:
        n_actions: Number of discrete actions (size of the softmax output).
        n_features: Length of a single observation vector.
        learning_rate: Step size passed to the Adam optimizer.
        reward_decay: Discount factor (gamma) applied to future rewards.
    """

    def __init__(self, n_actions, n_features, learning_rate=0.01, reward_decay=0.95):
        self.n_actions = n_actions     # dimension of the action space
        self.n_features = n_features   # dimension of the observation (feature) vector
        self.lr = learning_rate
        self.gamma = reward_decay
        self.sess = tf.Session()

        self.build_network()

        # Variables are created in build_network(), so initialise afterwards.
        self.sess.run(tf.global_variables_initializer())

        # Per-episode buffers: observations, actions and rewards.
        self.ep_obs = []
        self.ep_acts = []
        self.ep_rs = []

    def build_network(self):
        """Create the placeholders, policy network, loss and training op."""
        # The leading None dimension stands for the batch size.
        self.obs = tf.placeholder(dtype=tf.float32, shape=(None, self.n_features))
        self.acts = tf.placeholder(dtype=tf.int32, shape=(None,))
        self.vts = tf.placeholder(dtype=tf.float32, shape=(None,))

        # fc1: hidden layer
        hidden = tf.layers.dense(
            inputs=self.obs,
            units=10,
            activation=tf.nn.tanh,  # tanh activation
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc1'
        )
        # fc2: one unnormalised logit per action
        logits = tf.layers.dense(
            inputs=hidden,
            units=self.n_actions,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc2'
        )

        self.predict_probs = tf.nn.softmax(logits)

        # Cross entropy against the action actually taken is the negative
        # log-probability of that action under the current policy.
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.acts)

        # Weight each step's -log pi(a|s) by its normalised discounted return.
        loss = tf.reduce_mean(neg_log_prob * self.vts)

        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

    def train(self):
        """Run one optimizer step on the buffered episode, then clear the buffer.

        Does nothing when no transitions have been stored.
        """
        if not self.ep_rs:
            return

        discount_rewards = self.discount_reward()
        self.sess.run(self.train_op, feed_dict={
            self.obs: np.vstack(self.ep_obs),
            self.acts: self.ep_acts,
            self.vts: discount_rewards,
        })
        self.ep_obs = []
        self.ep_acts = []
        self.ep_rs = []

    def predict(self, observation):
        """Sample an action index from the policy for a single observation.

        Args:
            observation: 1-D array of length ``n_features``.

        Returns:
            An action index in ``[0, n_actions)`` drawn according to the
            network's softmax output.
        """
        probs = self.sess.run(self.predict_probs, feed_dict={self.obs: observation[np.newaxis, :]})
        return np.random.choice(list(range(self.n_actions)), p=probs.ravel())

    def store_transition(self, o, a, t):
        """Append one step to the episode buffer.

        Args:
            o: Observation for this step.
            a: Action index taken.
            t: Reward received for this step.
        """
        self.ep_obs.append(o)
        self.ep_acts.append(a)
        self.ep_rs.append(t)

    def discount_reward(self):
        """Return the normalised discounted returns of the buffered rewards.

        For each step ``i`` the return is ``r_i + gamma * G_{i+1}``.  The
        returns are then shifted to zero mean and, when they are not all
        equal, scaled to unit standard deviation.

        Returns:
            A float64 array with one entry per buffered reward (empty when
            the buffer is empty).
        """
        # Force a float buffer: integer rewards would otherwise truncate the
        # discounted sums and make the in-place normalisation below fail.
        rewards = np.asarray(self.ep_rs, dtype=np.float64)
        discount_rewards = np.zeros_like(rewards)

        running = 0.0
        for i in reversed(range(len(rewards))):
            running = self.gamma * running + rewards[i]
            discount_rewards[i] = running

        if discount_rewards.size == 0:
            return discount_rewards

        discount_rewards -= np.mean(discount_rewards)
        std = np.std(discount_rewards)
        # A one-step (or constant-return) episode has zero spread; dividing
        # would produce NaNs, so leave the centred values (all zeros) as is.
        if std > 0:
            discount_rewards /= std

        return discount_rewards
             

## Experiment

In [120]:
import gym

In [121]:
# Number of training episodes the loop below runs (despite the name, this is
# an episode count, not an exploration epsilon).
EPSILON_NUM = 10000
# Step count after which the training loop stops stepping the current episode.
THRESHOLD = 1000

In [122]:
# Create the CartPole task.
env = gym.make('CartPole-v0')
# env.seed(1)     
# Work on the underlying environment without gym's wrappers.
# NOTE(review): presumably done to remove the wrapper-imposed episode step
# limit, since the training loop applies its own THRESHOLD cap - confirm.
env = env.unwrapped

In [123]:
#env.render()

In [124]:
# Size the agent from the environment: one output per discrete action and one
# input per component of the observation vector.
model = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
)

In [125]:
#env.render()
# Train for EPSILON_NUM episodes, performing one policy update per episode.
for ep in range(EPSILON_NUM):
    observation = env.reset()
    index = 0
    while True:
        action = model.predict(observation)
        next_observation, reward, done, info = env.step(action)
        index += 1
        # Pair the action with the observation it was chosen from (the state
        # before the step), not with the state the environment moved to.
        model.store_transition(observation, action, reward)
        observation = next_observation
        # Also train when the step cap is hit; otherwise the buffered
        # transitions would leak into the next episode's update.
        if done or index > THRESHOLD:
            print(f"epsilon:{ep}, reward:{sum(model.ep_rs)}")
            model.train()
            break
        

epsilon:0, reward:18.0
epsilon:1, reward:27.0
epsilon:2, reward:11.0
epsilon:3, reward:18.0
epsilon:4, reward:25.0
epsilon:5, reward:56.0
epsilon:6, reward:33.0
epsilon:7, reward:35.0
epsilon:8, reward:58.0
epsilon:9, reward:23.0
epsilon:10, reward:14.0
epsilon:11, reward:142.0
epsilon:12, reward:45.0
epsilon:13, reward:43.0
epsilon:14, reward:49.0
epsilon:15, reward:13.0
epsilon:16, reward:34.0
epsilon:17, reward:14.0
epsilon:18, reward:29.0
epsilon:19, reward:81.0
epsilon:20, reward:56.0
epsilon:21, reward:27.0
epsilon:22, reward:53.0
epsilon:23, reward:27.0
epsilon:24, reward:19.0
epsilon:25, reward:27.0
epsilon:26, reward:43.0
epsilon:27, reward:68.0
epsilon:28, reward:45.0
epsilon:29, reward:73.0
epsilon:30, reward:33.0
epsilon:31, reward:19.0
epsilon:32, reward:55.0
epsilon:33, reward:36.0
epsilon:34, reward:24.0
epsilon:35, reward:69.0
epsilon:36, reward:41.0
epsilon:37, reward:60.0
epsilon:38, reward:32.0
epsilon:39, reward:140.0
epsilon:40, reward:92.0
epsilon:41, reward:92.0


KeyboardInterrupt: 

## Result