# CartPole-v0

In [1]:
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

env = gym.make('CartPole-v0').env
example_state = env.reset()

# plt.imshow(env.render('rgb_array'))

In [2]:
import torch 
import torch.nn as nn

In [3]:
sample = env.reset()

In [4]:
state_dim = sample.shape[0]
state_dim

4

In [5]:
n_actions = env.action_space.n
n_actions

2

In [6]:
# NN predicts policy logits
# logits more stable for computation
model = nn.Sequential(
    nn.Linear(state_dim, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, n_actions)
)

## Predict function

In [7]:
from torch.nn import functional as F

In [8]:
# prob of actions(by batch)
def predict_probs(states):
    states = torch.FloatTensor(states)
    logits = model(states).detach()
    probs = F.softmax(logits, dim=-1).numpy()
    return probs

## Play the game

In [9]:
def generate_session(t_max=1000):
    """ 
    play a full session with REINFORCE agent and train at the session end.
    returns sequences of states, actions andrewards
    """
    # arrays to record session
    states, actions, rewards = [], [], []
    s = env.reset()

    for t in range(t_max):
        # action probabilities array - pi(a|s)
        action_probs = predict_probs(np.array([s]))[0]

        # Sample action with given probabilities.
        a = np.random.choice(np.arange(n_actions), p = action_probs)
        new_s, r, done, info = env.step(a)

        # record session history to train later
        states.append(s)
        actions.append(a)
        rewards.append(r)

        s = new_s
        if done:
            break

    return states, actions, rewards

In [10]:
# test 
states, actions, rewards = generate_session()

In [11]:
len(states)

12

## Computing cumulative rewards

In [12]:
def get_cumulative_rewards(rewards,  # rewards at each step
                           gamma=0.99  # discount for reward
                           ):
    """
    take a list of immediate rewards r(s,a) for the whole session 
    compute cumulative returns (a.k.a. G(s,a) in Sutton '16)
    G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...

    The simple way to compute cumulative rewards is to iterate from last to first time tick
    and compute G_t = r_t + gamma*G_{t+1} recurrently

    You must return an array/list of cumulative rewards with as many elements as in the initial rewards.
    """
    G = np.zeros_like(rewards, dtype=float)
    G[-1] = rewards[-1]
    for idx in range(-2, -len(rewards)-1,-1):
        G[idx] = rewards[idx]+gamma*G[idx+1]
    return G

## Loss function and updates

In [13]:
def to_one_hot(y_tensor, ndims):
    """ helper: take an integer vector and convert it to 1-hot matrix. """
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    y_one_hot = torch.zeros(
        y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
    return y_one_hot

In [14]:
optimizer = torch.optim.Adam(model.parameters(), 1e-3)


def train_on_session(states, actions, rewards, gamma=0.99, entropy_coef=1e-2):
    """
    Takes a sequence of states, actions and rewards produced by generate_session.
    Updates agent's weights by following the policy gradient above.
    Please use Adam optimizer with default parameters.
    """

    # cast everything into torch tensors
    states = torch.tensor(states, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.int32)
    cumulative_returns = np.array(get_cumulative_rewards(rewards, gamma))
    cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)

    # predict logits, probas and log-probas using an agent.
    logits = model(states)
    probs = nn.functional.softmax(logits, -1)
    log_probs = nn.functional.log_softmax(logits, -1)

    assert all(isinstance(v, torch.Tensor) for v in [logits, probs, log_probs]), \
        "please use compute using torch tensors and don't use predict_probs function"

    # select log-probabilities for chosen actions, log pi(a_i|s_i)
    log_probs_for_actions = torch.sum(
        log_probs * to_one_hot(actions, env.action_space.n), dim=1)
   
    # Compute loss, entropy regularization with `entropy_coef` 
    entropy = - torch.mean(torch.sum(probs * log_probs), dim = -1) # we want bigger entropy to agent be explorator, do diff acts
    loss = -torch.mean(log_probs_for_actions * cumulative_returns) - entropy * entropy_coef

    # Gradient descent step
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    
    return np.sum(rewards)

## Training

In [15]:
for i in range(100):
    rewards = [train_on_session(*generate_session())
               for _ in range(100)]  # generate new sessions
    print("mean reward:%.3f" % (np.mean(rewards)))
    if np.mean(rewards) > 200:
        print("Win!")  
        break

mean reward:16.940
mean reward:31.160
mean reward:53.810
mean reward:68.500
mean reward:101.880
mean reward:164.210
mean reward:183.280
mean reward:193.790
mean reward:165.280
mean reward:119.990
mean reward:217.250
Win!


# Acrobot-v1

In [16]:
env = gym.make("Acrobot-v1")
env.reset()

# plt.imshow(env.render("rgb_array"))
state_dim = env.reset().shape[0]
n_actions = env.action_space.n

print(state_dim, n_actions)



6 3


In [17]:
model = nn.Sequential(
    nn.Linear(state_dim, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, n_actions)
)

In [18]:
states, actions, rewards = generate_session()

In [19]:
# here we need to use baseline

In [20]:
optimizer = torch.optim.Adam(model.parameters(), 1e-3)


def train_on_session(states, actions, rewards, gamma=0.99, entropy_coef=1e-3):
    """
    Takes a sequence of states, actions and rewards produced by generate_session.
    Updates agent's weights by following the policy gradient above.
    Please use Adam optimizer with default parameters.
    """

    # cast everything into torch tensors
    states = torch.tensor(states, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.int32)
    cumulative_returns = np.array(get_cumulative_rewards(rewards, gamma))
    cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)

    # predict logits, probas and log-probas using an agent.
    logits = model(states)
    probs = nn.functional.softmax(logits, -1)
    log_probs = nn.functional.log_softmax(logits, -1)

    assert all(isinstance(v, torch.Tensor) for v in [logits, probs, log_probs]), \
        "please use compute using torch tensors and don't use predict_probs function"

    # select log-probabilities for chosen actions, log pi(a_i|s_i)
    log_probs_for_actions = torch.sum(
        log_probs * to_one_hot(actions, env.action_space.n), dim=1)
   
    # Compute loss, entropy regularization with `entropy_coef` 
    entropy = - torch.mean(torch.sum(probs * log_probs), dim = -1) # we want bigger entropy to agent be explorator, do diff acts
    loss = -torch.mean(log_probs_for_actions * (cumulative_returns-torch.mean(cumulative_returns))) - entropy * entropy_coef

    # Gradient descent step
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    
    return np.sum(rewards)

In [22]:
for i in range(100):
    rewards = [train_on_session(*generate_session())
               for _ in range(100)]  # generate new sessions
    print("mean reward:%.3f" % (np.mean(rewards)))
    if np.mean(rewards) > 200:
        print("Win!")  
        break

mean reward:-302.710
mean reward:-156.270
mean reward:-148.510
mean reward:-138.580
mean reward:-139.490
mean reward:-118.560
mean reward:-113.210
mean reward:-116.210
mean reward:-102.080
mean reward:-104.220
mean reward:-110.560
mean reward:-105.740
mean reward:-114.270
mean reward:-112.480
mean reward:-109.910
mean reward:-102.640
mean reward:-103.740
mean reward:-96.520
mean reward:-101.500
mean reward:-96.810
mean reward:-95.660
mean reward:-93.480
mean reward:-103.640


KeyboardInterrupt: 

# I've add `(cumulative_returns-torch.mean(cumulative_returns))` like biseline, but it isn't enought
## Could you suggest what to use here?

### by the video, the stick sways and rises, but not stable 