# Laboratorium 6

Celem szĂłstego laboratorium jest zapoznanie siÄ oraz zaimplementowanie algorytmu gĹÄbokiego uczenia aktywnego - REINFORCE. Zaimplementowany algorytm bÄdzie testowany z wykorzystaniem Ĺrodowiska z OpenAI - _CartPole_.


DoĹÄczenie standardowych bibliotek


In [94]:
from collections import deque
import gym
import numpy as np
import random

DoĹÄczenie bibliotek do obsĹugi sieci neuronowych


In [95]:
import torch
import torch.nn as nn
from torch.optim import Adam, SGD

## Zadanie 1 - REINFORCE

<p style='text-align: justify;'>
Celem Äwiczenie jest zaimplementowanie algorytmu REINFORCE. Wagi sieci aktualizowane sÄ zgodnie ze wzorem:
\begin{equation*}
    \theta \leftarrow \theta + \alpha G_t \nabla_\theta log \pi_{\theta}(a_t, s_t | \theta)
\end{equation*}.
</p>


In [105]:
def get_cumulative_rewards(rewards,  # rewards at each step
                           gamma=0.99  # discount for reward
                           ):
    """
    based on https://github.com/yandexdataschool/Practical_RL/blob/spring20/week06_policy_based/reinforce_tensorflow.ipynb
    take a list of immediate rewards r(s,a) for the whole session
    compute cumulative rewards R(s,a) (a.k.a. G(s,a) in Sutton '16)
    R_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...

    The simple way to compute cumulative rewards is to iterate from last to first time tick
    and compute R_t = r_t + gamma*R_{t+1} recurrently

    You must return an array/list of cumulative rewards with as many elements as in the initial rewards.
    """

    def G_t(rewards):
        return sum([r*gamma**(i) for i, r in enumerate(rewards)])

    cumulative_rewards = [G_t(rewards[i:]) for i in range(len(rewards))]
    return cumulative_rewards


assert len(get_cumulative_rewards(range(100))) == 100

assert np.allclose(get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9),
                   [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])
assert np.allclose(get_cumulative_rewards([0, 0, 1, -2, 3, -4, 0], gamma=0.5),
                   [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])
assert np.allclose(get_cumulative_rewards([0, 0, 1, 2, 3, 4, 0], gamma=0), [0, 0, 1, 2, 3, 4, 0])

In [106]:
def to_one_hot(y_tensor, ndims):
    """ helper: take an integer vector and convert it to 1-hot matrix. """
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    y_one_hot = torch.zeros(
        y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
    return y_one_hot

In [122]:
class REINFORCEAgent:
    def __init__(self, learning_rate, state_size, action_size, model):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99    # discount rate
        self.learning_rate = 0.001
        self.model = model
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.optimizer = Adam(self.model.parameters(), learning_rate)

        
    def remember(self, state, action, reward):
        #Function adds information to the memory about last action and its results
        self.state_memory.append(state)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def get_action(self, state):
        """
        Compute the action to take in the current state, basing on policy returned by the network.

        Note: To pick action according to the probability generated by the network
        """

        #
        # INSERT CODE HERE to get action in a given state
        #        
        probs = nn.functional.softmax(self.model(torch.tensor(state, dtype=torch.float32)))
        return torch.multinomial(probs, 1).item()

    def replay(self, batch_size=None):
        """
        Function learn network using data stored in state, action and reward memory. 
        First calculates G_t for each state and train network
        """
        #
        # INSERT CODE HERE to train network
        #

        self.model.train(True) 
        self.optimizer.zero_grad()

        states_tensor = torch.tensor(self.state_memory, dtype=torch.float32)
        actions_tensor = torch.tensor(self.action_memory, dtype=torch.int64)
        returns_tensor = torch.tensor(get_cumulative_rewards(self.reward_memory), dtype=torch.float32)

        logits = self.model(states_tensor)
        log_probs = nn.functional.log_softmax(logits, dim=1)

        log_probs_for_actions = torch.gather(log_probs, 1, actions_tensor.unsqueeze(1)).squeeze(1)

        policy_loss = -torch.mean(log_probs_for_actions * returns_tensor)

        policy_loss.backward()
        self.optimizer.step()

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []



Czas przygotowaÄ model sieci, ktĂłra bÄdzie siÄ uczyĹa dziaĹania w Ĺrodowisku [_CartPool_](https://gym.openai.com/envs/CartPole-v0/):


In [123]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

model = nn.Sequential(
    nn.Linear(state_size, 32),
    nn.ReLU(),
    nn.Linear(32, action_size)
)

Przygotuj funkcjÄ obliczajÄcÄ wartoĹÄ nagrody skumulowanej:


Czas nauczyÄ agenta gry w Ĺrodowisku _CartPool_:


In [125]:
agent = REINFORCEAgent(learning_rate, state_size, action_size, model)
batch_size = 64

def generate_session(t_max=1000):
    """play env with REINFORCE agent and train at the session end"""

    reward = 0

    s = env.reset()
    s = s[0]

    for t in range(t_max):

        # chose action
        a = agent.get_action(s)

        new_s, r, done, info, _ = env.step(a)

        # record session history to train later
        agent.remember(s, a, r)

        reward += r

        s = new_s
        if done: break

    agent.replay(batch_size)

    return reward


for i in range(100):

    rewards = [generate_session() for _ in range(100)]  # generate new sessions

    print("mean reward:%.3f" % (np.mean(rewards)))

    if np.mean(rewards) > 300:
        print("You Win!")
        break



  probs = nn.functional.softmax(self.model(torch.tensor(state, dtype=torch.float32)))


mean reward:18.840
mean reward:24.240
mean reward:25.260
mean reward:24.530
mean reward:31.050
mean reward:34.800
mean reward:39.640
mean reward:57.640
mean reward:85.820
mean reward:110.640
mean reward:138.090
mean reward:169.640
mean reward:185.700
mean reward:224.660
mean reward:194.070
mean reward:153.060
mean reward:257.020
mean reward:241.990
mean reward:312.330
You Win!
