In [None]:
import tensorflow as tf
from tensorflow.keras import layers,Model,Input
import tensorflow_probability as tfp


from collections import namedtuple, deque
import numpy as np
import gym

# tf.compat.v1.disable_eager_execution()

In [None]:
# Build the CartPole environment and read off the two sizes the policy
# network needs: the observation shape and the number of discrete actions.
env_name = "CartPole-v0"
env = gym.make(env_name)

input_dims = env.observation_space.shape
n_actions = env.action_space.n

# One value per line, same as two separate print calls.
print(input_dims, n_actions, sep="\n")

(4,)
2


In [None]:
learning_rate = 0.0001

# Policy network: observation -> two 256-unit ReLU layers -> softmax over actions.
inputs = Input(shape=tuple(input_dims))
x = layers.Dense(units=256, activation=tf.nn.relu)(inputs)
x = layers.Dense(units=256, activation=tf.nn.relu)(x)
outputs = layers.Dense(units=n_actions, activation=tf.nn.softmax)(x)

train_net = Model(inputs=inputs, outputs=outputs)
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# The model is intentionally not compiled: Agent.control applies gradients
# through `opt` directly instead of going through Keras' fit/compile path.

In [None]:
class Agent:
  """REINFORCE agent built around the module-level policy network `train_net`.

  `predict` samples an action from the network's output distribution and
  `control` performs one policy-gradient step through the module-level
  optimizer `opt`.
  """

  def __init__(self):
    self.train_net = train_net

  def control(self, action_log_probs, values_of_states, tape):
    """Apply one policy-gradient update and close the gradient tape.

    Args:
      action_log_probs: sequence of log-probability tensors collected from
        `predict` while `tape` was recording (presumably one per time step).
      values_of_states: scalar or sequence of weights for the log-probs; it is
        reshaped to a column so a single scalar broadcasts over every step.
      tape: an already-entered `tf.GradientTape` that recorded the forward
        passes. It is always exited by this method, even on error.
    """
    try:
      # The loss has to be built while the tape is still recording so that the
      # stack/multiply/sum ops connect the log-probs to the network weights.
      neg_log_probs = -1*tf.convert_to_tensor(action_log_probs)
      weights = tf.reshape(tf.convert_to_tensor(values_of_states), (-1, 1))
      loss_val = tf.reduce_sum(tf.multiply(neg_log_probs, weights))
    finally:
      # Stop recording no matter what happened above; otherwise a failure
      # would leave the manually entered tape active for later cells.
      tape.__exit__(None, None, None)

    variables = self.train_net.trainable_variables
    gradients = tape.gradient(loss_val, variables)
    opt.apply_gradients(zip(gradients, variables))

  @tf.function
  def predict(self, obs):
    """Sample an action for a single observation.

    Returns:
      (action, log_prob): the sampled action and its log-probability under
      the categorical distribution produced by the network. Both keep the
      leading batch axis added here by `expand_dims`.
    """
    action_probs = self.train_net(tf.expand_dims(obs, axis=0))
    m = tfp.distributions.Categorical(probs=action_probs)
    action = m.sample()
    return action, m.log_prob(action)


In [None]:
def reinforce(n_episodes=1500, max_iters=1000, gamma=1.0, print_every=100):
  """Train the policy network with episodic REINFORCE updates.

  Args:
    n_episodes: number of episodes to run.
    max_iters: upper bound on the number of steps in a single episode.
    gamma: discount factor applied to the rewards of an episode.
    print_every: progress is printed every `print_every` episodes; it is also
      the size of the moving window used for the reported average.

  Returns:
    List with the undiscounted score (sum of rewards) of every episode.
  """
  agent = Agent()
  scores = []
  scores_window = deque(maxlen=print_every)

  for episode in range(n_episodes):
    # The context manager guarantees the tape is taken off the recording
    # stack even if the rollout raises (e.g. an interrupted kernel).
    with tf.GradientTape() as tape:
      obs = env.reset()
      saved_log_probs = []
      rewards = []
      for _ in range(max_iters):
        action, action_log_prob = agent.predict(obs)
        saved_log_probs.append(action_log_prob)
        obs, reward, done, _info = env.step(action.numpy()[0])
        rewards.append(reward)
        if done:
          break

      # Discounted return of the whole episode. Agent.control broadcasts this
      # single scalar over every step's log-probability, i.e. each action is
      # credited with the full-episode return rather than a per-step
      # reward-to-go; that is why the rewards are summed here.
      R = sum(gamma**t * r for t, r in enumerate(rewards))
      agent.control(saved_log_probs, R, tape)

    score = sum(rewards)
    scores.append(score)
    scores_window.append(score)

    if episode % print_every == 0:
      print('Episode {}  AverageScore: {:.2f}'.format(episode, np.mean(scores_window)))

  return scores

# Bind the result so the cell does not echo the full score list.
scores = reinforce()


Episode 0  AverageScore: 27.00
Episode 100  AverageScore: 21.04
Episode 200  AverageScore: 25.77
Episode 300  AverageScore: 32.55
Episode 400  AverageScore: 38.17
Episode 500  AverageScore: 41.01
Episode 600  AverageScore: 45.21
Episode 700  AverageScore: 56.01
Episode 800  AverageScore: 66.59
Episode 900  AverageScore: 101.40
Episode 1000  AverageScore: 108.41
Episode 1100  AverageScore: 142.18
Episode 1200  AverageScore: 118.58
Episode 1300  AverageScore: 152.98
Episode 1400  AverageScore: 180.32


In [None]:
# Sanity check: multiplying a tensor by -1 negates it element-wise.
k = tf.random.uniform(shape=(10,))
print(k, -1 * k, sep="\n")

tf.Tensor(
[0.02844322 0.47876954 0.91451204 0.7833332  0.67737305 0.38027084
 0.7910849  0.67119    0.6244683  0.82444537], shape=(10,), dtype=float32)
tf.Tensor(
[-0.02844322 -0.47876954 -0.91451204 -0.7833332  -0.67737305 -0.38027084
 -0.7910849  -0.67119    -0.6244683  -0.82444537], shape=(10,), dtype=float32)


In [None]:
import gym
gym.logger.set_level(40) # suppress warnings (please remove if gives error)
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
torch.manual_seed(0) # set random seed
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical


In [None]:
# Pick the compute device (first CUDA GPU when available, otherwise CPU).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seeded CartPole environment for the PyTorch experiment.
env = gym.make('CartPole-v0')
env.seed(0)
print('observation space: {}'.format(env.observation_space))
print('action space: {}'.format(env.action_space))

class Policy(nn.Module):
    """Two-layer softmax policy: state -> hidden (ReLU) -> action probabilities.

    Args:
        s_size: size of the state vector.
        h_size: number of hidden units.
        a_size: number of discrete actions.
    """

    def __init__(self, s_size=4, h_size=16, a_size=2):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return action probabilities for a batch of states (softmax over dim 1)."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=1)

    def act(self, state):
        """Sample an action for a single state given as a NumPy array.

        Returns:
            (action, log_prob): the sampled action as a Python int and the
            log-probability tensor of that action (keeps the batch axis of
            size 1 and stays attached to the autograd graph).
        """
        # Send the input to wherever this network's weights live instead of a
        # module-level device, so an instance that was never moved with
        # `.to(...)` still works when a GPU is present.
        param_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(param_device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

# Smoke test: sample one action from an untrained policy. The network is moved
# to `device` because `act` sends its input there; leaving the weights on the
# CPU would raise a device-mismatch error on a CUDA machine.
p = Policy().to(device)
p.act(env.reset())

observation space: Box(4,)
action space: Discrete(2)


(1, tensor([-0.5285], grad_fn=<SqueezeBackward1>))

In [None]:
# Fresh policy network on the selected device, trained with Adam.
policy = Policy()
policy = policy.to(device)
optimizer = optim.Adam(params=policy.parameters(), lr=0.01)

def reinforce(n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    """Train the module-level `policy` with episodic REINFORCE updates.

    Args:
        n_episodes: maximum number of training episodes.
        max_t: upper bound on the number of steps in a single episode.
        gamma: discount factor applied to the rewards of an episode.
        print_every: progress is printed every `print_every` episodes.

    Returns:
        List with the undiscounted score (sum of rewards) of every episode run.
        Training stops early once the mean of the last 100 scores reaches 195.
    """
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes+1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for _step in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the whole episode; every step's log-probability
        # is weighted by this same scalar.
        R = sum(gamma**t * r for t, r in enumerate(rewards))

        # Negative sign: the optimizer minimizes, the objective is maximized.
        policy_loss = torch.cat([-log_prob * R for log_prob in saved_log_probs]).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque)>=195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
            break

    return scores

scores = reinforce()

log_prob tensor([-0.5922], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.6147], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.7654], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.7765], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.5955], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.7760], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.7996], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.5815], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.7989], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.5798], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.5983], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.6166], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.6316], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.7493], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.6321], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob tensor([-0.6449], grad_fn=<SqueezeBackward1>)
R 18.0
log_prob

KeyboardInterrupt: ignored