<a href="https://colab.research.google.com/github/kahchanlow/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/notebooks/DRL_06_07_Cross_Entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DEEP REINFORCEMENT LEARNING EXPLAINED - 06
# **Solving Frozen-Lake Environment With Cross-Entropy Method**
## Agent Creation Using Deep Neural Networks

 

## The Environment

In [None]:
import numpy as np

import torch
import torch.nn as nn

In [None]:
import gym
import gym.spaces

# Base FrozenLake environment created with is_slippery=False; it is wrapped for
# one-hot observations in the next code cell.
# NOTE(review): newer gym releases appear to register this task as
# 'FrozenLake-v1' - confirm against the installed gym version.
env = gym.make('FrozenLake-v0', is_slippery=False)

In [None]:
class OneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that turns a discrete state index into a one-hot vector.

    The wrapped environment must expose ``observation_space.n``; the new
    observation space is a ``Box`` in [0.0, 1.0] with one entry per state.
    """

    def __init__(self, env):
        super().__init__(env)
        n_states = env.observation_space.n
        self.observation_space = gym.spaces.Box(0.0, 1.0, (n_states,), dtype=np.float32)

    def observation(self, observation):
        """Return a copy of the space's lower bound with 1.0 written at index ``observation``."""
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot


# Rebind ``env`` to the wrapped environment; the cells below use this instance.
env = OneHotWrapper(env)

## The Agent
 ### The Model

In [None]:
# Layer sizes are read from the (one-hot wrapped) environment.
obs_size = env.observation_space.shape[0]  # 16
n_actions = env.action_space.n  # 4
HIDDEN_SIZE = 32

# One hidden layer with a sigmoid activation. The final Linear layer emits one
# raw score per action; no softmax is applied inside the network itself.
net = nn.Sequential(
    nn.Linear(obs_size, HIDDEN_SIZE),
    nn.Sigmoid(),
    nn.Linear(HIDDEN_SIZE, n_actions),
)

### Get an Action

In [None]:
# Turns the network's raw action scores into probabilities along the action axis.
sm = nn.Softmax(dim=1)


def select_action(state):
    """Sample an action index from the policy defined by the global ``net``.

    Args:
        state: a single observation (1-D array-like of floats) as returned by
            the wrapped environment.

    Returns:
        The sampled action index, drawn with ``np.random.choice`` according to
        the softmax of the network's scores for ``state``.
    """
    # Convert through one contiguous float32 array first: building a tensor
    # from a Python list that holds an ndarray takes PyTorch's slow path.
    state_t = torch.tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)

    # Action selection is pure inference, so skip autograd bookkeeping.
    with torch.no_grad():
        act_probs_t = sm(net(state_t))

    act_probs = act_probs_t.numpy()[0]
    return np.random.choice(len(act_probs), p=act_probs)

### Optimizer and Loss function

In [None]:
import torch.optim as optim

# Loss between the network's raw action scores and the indices of the actions
# that were taken; `net` ends in a Linear layer, so it is fed unnormalised scores.
objective = nn.CrossEntropyLoss()
# Adam over every parameter of `net`, learning rate 0.001.
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

## Training the Agent

In [None]:
# Number of finished episodes collected before each elite-selection / training step.
BATCH_SIZE = 100

# Discount base: an episode is scored as reward * GAMMA ** number_of_steps.
GAMMA = 0.9

# Percentile of the batch's discounted scores used as the (strict) elite threshold.
PERCENTILE = 30
# Training stops once the mean reward of a batch reaches this value.
REWARD_GOAL = 0.8

from collections import namedtuple

# One finished episode: its total reward and the list of its EpisodeStep records.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One environment step: the observation the action was chosen from, and that action.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])


Summary of algorithm:
1. Play some episodes (100).
2. Pick episodes where the rewards are good. These are the elite episodes.
3. Take all the (state, action) pairs of these elite episodes and train the NN for one round, using the state as the input and the action as the ground truth.
4. Update NN weights and repeat step 1 until performance is good enough!

In [None]:
# Cross-entropy method training loop.
#
# Episodes are played with the current policy. Every BATCH_SIZE finished
# episodes, those whose discounted score is strictly above the PERCENTILE-th
# percentile of the batch are kept as "elite", and the network takes one
# optimizer step on their (observation, action) pairs. The loop ends once the
# mean (undiscounted) reward of a batch reaches REWARD_GOAL.
iter_no = 0
reward_mean = 0
batch = []            # finished episodes collected since the last update
episode_steps = []    # steps of the episode currently being played
episode_reward = 0.0
state = env.reset()

while reward_mean < REWARD_GOAL:
    action = select_action(state)
    next_state, reward, episode_is_done, _ = env.step(action)

    episode_steps.append(EpisodeStep(observation=state, action=action))
    episode_reward += reward

    if episode_is_done:
        batch.append(Episode(reward=episode_reward, steps=episode_steps))
        next_state = env.reset()
        episode_steps = []
        episode_reward = 0.0

        if len(batch) == BATCH_SIZE:
            reward_mean = float(np.mean([episode.reward for episode in batch]))

            # Score each episode; shorter successful episodes score higher.
            returnG = [episode.reward * (GAMMA ** len(episode.steps)) for episode in batch]
            reward_bound = np.percentile(returnG, PERCENTILE)

            # Training data gets its own names so the rollout variable
            # `state` is never overwritten while an update is prepared.
            train_obs = []
            train_act = []
            for episode, discounted_reward in zip(batch, returnG):
                if discounted_reward > reward_bound:
                    train_obs.extend(step.observation for step in episode.steps)
                    train_act.extend(step.action for step in episode.steps)

            if train_obs:  # no episode may clear the bound in a given batch
                # Stack into one ndarray first; a tensor built from a list of
                # ndarrays goes through PyTorch's slow conversion path.
                obs_t = torch.FloatTensor(np.array(train_obs))
                acts_t = torch.LongTensor(train_act)

                optimizer.zero_grad()
                action_scores_t = net(obs_t)
                loss_t = objective(action_scores_t, acts_t)
                loss_t.backward()
                optimizer.step()
                print("%d: loss=%.3f, reward_mean=%.3f" % (iter_no, loss_t.item(), reward_mean))
                iter_no += 1
            batch = []

    state = next_state


0: loss=1.386, reward_mean=0.020
1: loss=1.362, reward_mean=0.020
2: loss=1.355, reward_mean=0.050
3: loss=1.393, reward_mean=0.020
4: loss=1.385, reward_mean=0.010
5: loss=1.338, reward_mean=0.020
6: loss=1.336, reward_mean=0.020
7: loss=1.239, reward_mean=0.010
8: loss=1.353, reward_mean=0.030
9: loss=1.387, reward_mean=0.010
10: loss=1.334, reward_mean=0.030
11: loss=1.353, reward_mean=0.030
12: loss=1.254, reward_mean=0.030
13: loss=1.321, reward_mean=0.020
14: loss=1.239, reward_mean=0.010
15: loss=1.311, reward_mean=0.050
16: loss=1.275, reward_mean=0.050
17: loss=1.304, reward_mean=0.060
18: loss=1.340, reward_mean=0.020
19: loss=1.366, reward_mean=0.050
20: loss=1.215, reward_mean=0.060
21: loss=1.216, reward_mean=0.020
22: loss=1.278, reward_mean=0.050
23: loss=1.244, reward_mean=0.060
24: loss=1.227, reward_mean=0.040
25: loss=1.258, reward_mean=0.040
26: loss=1.215, reward_mean=0.030
27: loss=1.286, reward_mean=0.030
28: loss=1.225, reward_mean=0.050
29: loss=1.181, reward_m

## Test the Agent

In [None]:
# Separate non-slippery environment for evaluation, wrapped like the training one.
test_env = OneHotWrapper(gym.make('FrozenLake-v0', is_slippery=False))
state = test_env.reset()
test_env.render()

# Play a single episode, sampling each action from the policy and rendering
# the grid after every move.
is_done = False
while not is_done:
    state, reward, is_done, _ = test_env.step(select_action(state))
    test_env.render()

print("reward = ", reward)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
reward =  1.0


----

DEEP REINFORCEMENT LEARNING EXPLAINED - 07
# **Cross-Entropy Method Performance Analysis**
## Implementation of the Cross-Entropy Training Loop

In [None]:
%load_ext tensorboard

In [None]:
from torch.utils.tensorboard import SummaryWriter

def train_loop():
    """Train the global ``net`` with the cross-entropy method, logging to TensorBoard.

    Episodes are played on the global ``env`` using ``select_action``. Every
    ``BATCH_SIZE`` finished episodes, each one is scored as
    ``reward * GAMMA ** n_steps``; episodes scoring strictly above the
    ``PERCENTILE``-th percentile of the batch are kept as elite and one
    optimizer step is taken on their (observation, action) pairs. ``loss`` and
    ``reward_mean`` are written to the SummaryWriter for every update.

    The function returns once the mean (undiscounted) reward of a batch
    reaches ``REWARD_GOAL``. ``env``, ``net``, ``objective`` and ``optimizer``
    are looked up in the global namespace at call time, so rebinding them
    before calling changes what gets trained.
    """
    writer = SummaryWriter(comment="-Frozen-Lake-nonslippery")
    try:
        iter_no = 0
        reward_mean = 0
        batch = []            # finished episodes collected since the last update
        episode_steps = []    # steps of the episode currently being played
        episode_reward = 0.0
        state = env.reset()

        while reward_mean < REWARD_GOAL:
            action = select_action(state)
            next_state, reward, episode_is_done, _ = env.step(action)

            episode_steps.append(EpisodeStep(observation=state, action=action))
            episode_reward += reward

            if episode_is_done:
                batch.append(Episode(reward=episode_reward, steps=episode_steps))
                next_state = env.reset()
                episode_steps = []
                episode_reward = 0.0

                if len(batch) == BATCH_SIZE:
                    reward_mean = float(np.mean([episode.reward for episode in batch]))

                    returnG = [episode.reward * (GAMMA ** len(episode.steps)) for episode in batch]
                    reward_bound = np.percentile(returnG, PERCENTILE)

                    # Training data gets its own names so the rollout
                    # variable `state` is never overwritten.
                    train_obs = []
                    train_act = []
                    for episode, discounted_reward in zip(batch, returnG):
                        if discounted_reward > reward_bound:
                            train_obs.extend(step.observation for step in episode.steps)
                            train_act.extend(step.action for step in episode.steps)

                    if train_obs:  # no episode may clear the bound in a given batch
                        # Stack into one ndarray before building the tensor.
                        obs_t = torch.FloatTensor(np.array(train_obs))
                        acts_t = torch.LongTensor(train_act)

                        optimizer.zero_grad()
                        action_scores_t = net(obs_t)
                        loss_t = objective(action_scores_t, acts_t)
                        loss_t.backward()
                        optimizer.step()
                        writer.add_scalar("loss", loss_t.item(), iter_no)
                        writer.add_scalar("reward_mean", reward_mean, iter_no)
                        iter_no += 1
                    batch = []

            state = next_state
    finally:
        # Close the writer even when the loop is interrupted or raises.
        writer.close()


### Baseline

In [None]:
# Baseline run: one hidden layer of 32 sigmoid units.
# train_loop() and select_action() read `net`, `objective` and `optimizer`
# from the global namespace, so all three are rebuilt here before training.
HIDDEN_SIZE = 32
net = nn.Sequential(
    nn.Linear(obs_size, HIDDEN_SIZE),
    nn.Sigmoid(),
    nn.Linear(HIDDEN_SIZE, n_actions),
)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

train_loop()

In [None]:
tensorboard  --logdir=runs

### More complex Neural Network



In [None]:
# Same architecture as the baseline, but with the hidden layer widened to 128 units.
# The globals `net`, `objective` and `optimizer` are rebuilt because
# train_loop() and select_action() look them up at call time.
HIDDEN_SIZE = 128
net = nn.Sequential(
    nn.Linear(obs_size, HIDDEN_SIZE),
    nn.Sigmoid(),
    nn.Linear(HIDDEN_SIZE, n_actions),
)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

train_loop()

In [None]:
tensorboard  --logdir=runs

### ReLU activation function

In [None]:
# 128 hidden units again, with the sigmoid activation swapped for ReLU.
# The globals `net`, `objective` and `optimizer` are rebuilt because
# train_loop() and select_action() look them up at call time.
HIDDEN_SIZE = 128
net = nn.Sequential(
    nn.Linear(obs_size, HIDDEN_SIZE),
    nn.ReLU(),
    nn.Linear(HIDDEN_SIZE, n_actions),
)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

train_loop()

In [None]:
tensorboard  --logdir=runs

### Improving Cross-Entropy Algorithm

In [None]:
def improved_train_loop():
    """Cross-entropy training that carries elite episodes over between batches.

    Episodes are played on the global ``env`` using ``select_action``. Every
    ``BATCH_SIZE`` finished episodes, the new batch is pooled with the elite
    episodes kept from the previous update. Each pooled episode is scored as
    ``reward * GAMMA ** n_steps``; those scoring strictly above the
    ``PERCENTILE``-th percentile of the pool become the new elite set, and one
    optimizer step is taken on their (observation, action) pairs. ``loss`` and
    ``reward_mean`` are written to the SummaryWriter for every update.

    The function returns once the mean (undiscounted) reward of a freshly
    played batch reaches ``REWARD_GOAL``. ``env``, ``net``, ``objective`` and
    ``optimizer`` are looked up in the global namespace at call time.
    """
    writer = SummaryWriter(comment="-Frozen-Lake-nonslippery")
    try:
        iter_no = 0
        reward_mean = 0
        elite_batch = []      # elite episodes retained from the previous update
        batch = []            # finished episodes collected since the last update
        episode_steps = []    # steps of the episode currently being played
        episode_reward = 0.0
        state = env.reset()

        while reward_mean < REWARD_GOAL:
            action = select_action(state)
            next_state, reward, episode_is_done, _ = env.step(action)

            episode_steps.append(EpisodeStep(observation=state, action=action))
            episode_reward += reward

            if episode_is_done:
                batch.append(Episode(reward=episode_reward, steps=episode_steps))
                next_state = env.reset()
                episode_steps = []
                episode_reward = 0.0

                if len(batch) == BATCH_SIZE:
                    # The stopping criterion only looks at the fresh batch.
                    reward_mean = float(np.mean([episode.reward for episode in batch]))

                    # Previously selected elite episodes compete with the new ones.
                    elite_candidates = elite_batch + batch
                    returnG = [
                        episode.reward * (GAMMA ** len(episode.steps))
                        for episode in elite_candidates
                    ]
                    reward_bound = np.percentile(returnG, PERCENTILE)

                    elite_batch = [
                        episode
                        for episode, discounted_reward in zip(elite_candidates, returnG)
                        if discounted_reward > reward_bound
                    ]
                    # Training data gets its own names so the rollout
                    # variable `state` is never overwritten.
                    train_obs = [step.observation for episode in elite_batch for step in episode.steps]
                    train_act = [step.action for episode in elite_batch for step in episode.steps]

                    if elite_batch:  # no episode may clear the bound in a given batch
                        # Stack into one ndarray before building the tensor.
                        obs_t = torch.FloatTensor(np.array(train_obs))
                        acts_t = torch.LongTensor(train_act)

                        optimizer.zero_grad()
                        action_scores_t = net(obs_t)
                        loss_t = objective(action_scores_t, acts_t)
                        loss_t.backward()
                        optimizer.step()
                        writer.add_scalar("loss", loss_t.item(), iter_no)
                        writer.add_scalar("reward_mean", reward_mean, iter_no)
                        iter_no += 1
                    batch = []

            state = next_state
    finally:
        # Close the writer even when the loop is interrupted or raises.
        writer.close()

In [None]:
# 128-unit ReLU network, trained with the elite-retaining loop.
# The globals `net`, `objective` and `optimizer` are rebuilt because
# improved_train_loop() and select_action() look them up at call time.
HIDDEN_SIZE = 128
net = nn.Sequential(
    nn.Linear(obs_size, HIDDEN_SIZE),
    nn.ReLU(),
    nn.Linear(HIDDEN_SIZE, n_actions),
)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

improved_train_loop()

In [None]:
tensorboard  --logdir=runs

In [None]:
# Slippery variant of FrozenLake. OneHotWrapper is the class declared in the
# environment section at the top of the notebook, so it is reused here rather
# than declared a second time.
slippery_env = gym.make('FrozenLake-v0', is_slippery=True)

# The training loop and select_action() read `env`, `net`, `objective` and
# `optimizer` as globals, so rebinding them retargets the run at this environment.
env = OneHotWrapper(slippery_env)

HIDDEN_SIZE = 128
net = nn.Sequential(
    nn.Linear(obs_size, HIDDEN_SIZE),
    nn.ReLU(),
    nn.Linear(HIDDEN_SIZE, n_actions),
)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

improved_train_loop()

In [None]:
tensorboard  --logdir=runs