In [38]:
import argparse
import gym
import numpy as np
from itertools import count
from collections import deque
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical, Normal

##Continuous Env

In [80]:
env = gym.make('Pendulum-v1')
env.reset(seed=543)
torch.manual_seed(543)

<torch._C.Generator at 0x11ca753f0>

In [64]:
env = gym.make('CartPole-v1')
env.reset(seed=543)
torch.manual_seed(543)

<torch._C.Generator at 0x11ca753f0>

In [35]:
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

 everytime an action is selected , saved_log_probs gets updated 
1. we get the probability of actions from the inputting the state to the policy network, 
2. we get a categorical dist over the output from policy network
3. action is a sample from the cat distribution 
4. we get the log probability of that action 

In [93]:
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
epsilon = 1e-6
class Gaussian_Policy(nn.Module):
    '''
    Gaussian policy that consists of a neural network with 1 hidden layer that
    outputs mean and log std dev (the params) of a gaussian policy
    '''

    def __init__(self, num_inputs, hidden_size, action_space):
        super(Gaussian_Policy, self).__init__()

        self.action_space = action_space
        num_outputs = action_space.shape[0] # the number of output actions

        self.linear = nn.Linear(num_inputs, hidden_size)
        self.mean = nn.Linear(hidden_size, num_outputs)
        self.log_std = nn.Linear(hidden_size, num_outputs)
        self.optimizer = optim.Adam(self.parameters(),lr=1e-2)

    def forward(self, inputs):

        # forward pass of NN
        x = inputs
        x = F.relu(self.linear(x))

        mean = self.mean(x)
        log_std = self.log_std(x) # if more than one action this will give you the diagonal elements of a diagonal covariance matrix
        log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX) # We limit the variance by forcing within a range of -2,20
        std = log_std.exp()

        return mean, std

In [112]:
"""For baseline in reinforce : V(s_t) = E[G_t | s_t] """
class ValueNetwork(nn.Module):
    def __init__(self, num_inputs,hidden_size):
        super(ValueNetwork,self).__init__()

        self.linear = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size,1)
        self.optimizer = optim.Adam(self.parameters(),lr=1e-2)

    def forward(self,state):

        x = F.relu(self.linear1(state))
        x = self.linear2(x)

        return x
    




In [41]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.dropout = nn.Dropout(p=0.6)
        self.affine2 = nn.Linear(128, 2)

        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = self.affine1(x)
        x = self.dropout(x)
        x = F.relu(x)
        action_scores = self.affine2(x)
        return F.softmax(action_scores, dim=1)

In [103]:
continuous_policy = Gaussian_Policy(obs_dim, 128,env.action_space)
# discrete_policy = Policy()
optimizer = optim.Adam(continuous_policy.parameters(), lr=1e-2)
eps = np.finfo(np.float32).eps.item()
sigma = 0.2
env_action = "continuous" # get from envs 
discount_factor = 0.99
no_of_episodes = 10
print_every = 2

In [94]:
def select_action_cont(state):
    state = torch.from_numpy(state).float().unsqueeze(0) # just to make it a Tensor obj
        # get mean and std
    mean, std = continuous_policy(state)

        # create normal distribution
    normal = Normal(mean, std)
        # sample action
    action = normal.sample()

        # get log prob of that action
    ln_prob = normal.log_prob(action)
    ln_prob = ln_prob.sum()
	# squeeze action into [-1,1]
    action = torch.tanh(action)
        # turn actions into numpy array
    action = action.numpy()

    return action[0], ln_prob 



In [110]:
#assuming continuous here 
#Reinforce

#main func:
#inititae the env

#initiate the policy network
#initiate the value network

#set the loop for certain number of exps

#reset the env , and set an empty list of trajectories

#collect a set of trajectories by letting the policy run in the env

#train both policy and value functions:
##remember : value funsction is mse between the value estimates and the return ; value estimates is just updating the network, feeding it the state 
##then to compute the advantage, we will minus rewards from the value function 
##policy training is as is ; but remember , the policy loss now uses advantage over simple return 
## accumulate policy loss and run gradient descent on it 

Train Function

In [None]:
gamma = 0.99
policy = Gaussian_Policy(obs_dim, 128,env.action_space)
value = ValueNetwork(obs_dim,128)


def train(data,use_baseline=True):
    """Args: data is the set of trajectories after running the policy in the env"""

    rew = 0
    rewards_togo = []
    trajectory_rewards = [r for r in data[0]] #fix list comp
    for i in trajectory_rewards[::-1]:
        rew = i + gamma * rew
        rewards_togo.insert(0,rew)

    if use_baseline:
        trajectory_observations = [obs[0] for obs in data] #check ordering 
        value_estimates = []
        for observation in trajectory_observations:
            obs = torch.from_numpy(obs).float().unsqueeze(0)
            value_estimates.append(value(obs))

        value_estimates = torch.stack(value_estimates).squeeze()

        value_loss = F.mse_loss(value_estimates,rewards_togo)
        value.optimizer.zero_grad
        value_loss.backward()
        value.optimizer.step()

        adv_est = []
        for trajectory_reward, val_est in zip(trajectory_rewards, value_estimates):
            adv_est.append(trajectory_reward - val_est) #doube check is this reversed ??

        log_probabilities = [lp[3] for lp in data] #check ordering 

        policy_loss = []
        for lp,adv in zip(log_probabilities,value_estimates):
            policy_loss.append(-lp * adv)

        policy_loss = torch.stack(policy_loss).sum()
        policy.optimizer.zero_grad()
        policy.optimizer.step()

        return policy_loss , value_loss


        
        

    







In [113]:
def main_func(no_episodes,is_continuos = True):
    if is_continuos:
        env = gym.make('Pendulum-v1')
        env.reset(seed=543)
        torch.manual_seed(543)

        policy = Gaussian_Policy(obs_dim, 128,env.action_space)
        value = ValueNetwork(obs_dim,128) 

    for episode in range(no_of_episodes):
        print('episode #', episode)
        scores_deque = deque(maxlen=100)
        trajectory_data = []
        terminated = False
  
        state , info = env.reset()
        for _ in range(500):
            action , ln_prob = select_action_cont(state)
            new_state,reward,terminated, truncated,_ = env.step(action)

            trajectory_data.append([np.array(state),ln_prob,reward])
            state = new_state

            if terminated or truncated:

                state, _ = env.reset()

        value_loss , policy_loss = train(trajectory_data)

        #go on to log value_loss and policy_loss

        







     





In [111]:
#for continuous
for episode in range(no_of_episodes):
    print('episode #', episode)
    scores_deque = deque(maxlen=100)
    log_probs = [] 
    rewards = []
    terminated = False
    episode_rewards = 0

    state , info = env.reset()

    for _ in range(500):
        action , ln_prob = select_action_cont(state)
        log_probs.append(ln_prob)
        state,reward,terminated, truncated,info = env.step(action)

        if terminated or truncated:
            observation, info = env.reset()

        rewards.append(reward)

    scores_deque.append(sum(rewards))

    cumulative = 0
    discounted_r = np.zeros(len(rewards))
    for rew in reversed(range(len(rewards))):
        cumulative = cumulative * discount_factor + rewards[rew]
        discounted_r[rew] = cumulative

#normalize the discounted rewards
    discounted_r -= np.mean(discounted_r)
    discounted_r /= np.std(discounted_r)

    loss = 0
    for t in range(len(rewards)):

        loss += -log_probs[t] * discounted_r[t] #loss is - log prob * total reward

    continuous_policy.optimizer.zero_grad()
    loss.backward() #update
    continuous_policy.optimizer.step()

    if episode % print_every == 0:
       print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_deque)))


episode # 0
Episode 0	Average Score: -4379.97
episode # 1
episode # 2
Episode 2	Average Score: -4213.78
episode # 3
episode # 4
Episode 4	Average Score: -4179.14
episode # 5
episode # 6
Episode 6	Average Score: -3943.01
episode # 7
episode # 8
Episode 8	Average Score: -4069.24
episode # 9


In [106]:
8 % 2

0

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal #continuous distribution
import numpy as np
import gym
import math


class Agent(nn.Module):
    def __init__(self,lr):
        super(Agent,self).__init__()
        self.fc1 = nn.Linear(3,64)
        self.fc2 = nn.Linear(64,32)
        self.fc3 = nn.Linear(32,1) #neural network with layers 3,64,32,1

        self.optimizer = optim.Adam(self.parameters(),lr=lr)

    def forward(self,x):
        x = torch.relu(self.fc1(x)) #relu and tanh for output
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x)) * 2
        return x

env = gym.make('Pendulum-v1')
agent = Agent(0.01) #hyperparameters
SIGMA = 0.2
DISCOUNT = 0.99
total = []

for e in range(1000): 
    log_probs, rewards = [], []
    done = False
    state = env.reset()
    while not done:
        mu = agent.forward(torch.from_numpy(state).float().unsq)
        distribution = Normal(mu, SIGMA)
        action = distribution.sample().clamp(-2.0,2.0)
        log_probs.append(distribution.log_prob(action))
        state, reward, done, info = env.step([action.item()])
        #reward = abs(state[1])
        rewards.append(reward)
        
    total.append(sum(rewards))

    cumulative = 0
    d_rewards = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))): #get discounted rewards
        cumulative = cumulative * DISCOUNT + rewards[t]
        d_rewards[t] = cumulative
    d_rewards -= np.mean(d_rewards) #normalize
    d_rewards /= np.std(d_rewards)

    loss = 0
    for t in range(len(rewards)):
        loss += -log_probs[t] * d_rewards[t] #loss is - log prob * total reward

    agent.optimizer.zero_grad()
    loss.backward() #update
    agent.optimizer.step()

## PPO