# Baseline Cross Entropy Method

### Things to install
```bash
pip install gym
pip install box2d
pip install ffmpeg
pip install imageio-ffmpeg
pip install pygame
```

### References
* [Medium Article](https://medium.com/coinmonks/landing-a-rocket-with-simple-reinforcement-learning-3a0265f8b58c) and [Source Code](https://github.com/djbyrne/Landing-A-Rocket-With-Simple-Reinforcement-Learning/blob/master/Landing%20A%20Rocket%20With%20Simple%20Reinforcement%20Learning.ipynb)
* [Other Article](https://towardsdatascience.com/solving-a-reinforcement-learning-problem-using-cross-entropy-method-23d9726a737)
* [Link between Cross Entropy and Policy Gradient](https://medium.com/intro-to-artificial-intelligence/a-link-between-cross-entropy-and-policy-gradient-expression-b2b308511867)
* [RL tutorial with Mujoco](https://medium.com/swlh/getting-started-with-reinforcement-learning-mujoco-and-openai-gym-67243b78b599)
* [Mujoco Tutorial](https://www.youtube.com/watch?v=j1nCeqtfySQ) and [Installing on Ubuntu](https://www.youtube.com/watch?v=Wnb_fiStFb8)
* [Mujoco and Unity](https://www.youtube.com/watch?v=eyzzsGJ1iic)
* [Rocket lander](https://github.com/EmbersArc/gym-rocketlander)

In [None]:
import gym
import gym.wrappers
from collections import namedtuple
import numpy as np
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


session_size = 500

# Filter for top 20 experiences
percentile = 80

# Training hyperparameters
batch_size = 100
hidden_size = 200
learning_rate = 0.01

# Each environment will have a different completion score
completion_score = 200

### Create Environment

In [2]:
env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
print('Number of states:', n_states)
print('Number of actions:', n_actions)

  and should_run_async(code)
  logger.warn(


### Policy Function Approximation
This network will learn a policy, the input will be a state, and the output the logits that will represent the actions. 
$$\Pi(a|s)_\theta$$

In [3]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_actions)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

### Generate Data and Filter Batches
The idea here is to make the agent play some games, then return a batch of experiences. Later those experiences will be filter out 

In [4]:
def gather_experience_on_environment(env, policy_net, batch_size, t_max=5000):
    
    activation = nn.Softmax(dim=1)
    batch_actions,batch_states, batch_rewards = [],[],[]
    
    for b in range(batch_size):
        states,actions = [],[]
        total_reward = 0
        s = env.reset()
        for t in range(t_max):
            s_v = torch.FloatTensor([s])
            act_probs_v = activation(policy_net(s_v))
            act_probs = act_probs_v.data.numpy()[0]
            # Explore/Exploit a bit 
            a = np.random.choice(len(act_probs), p=act_probs)
            # Act on the environment
            new_s, r, done, info = env.step(a)
            
            # Save experience (State,Action,Reward)
            states.append(s)
            actions.append(a)
            total_reward += r
            s = new_s
            
            # Game finished
            if done:
                batch_actions.append(actions)
                batch_states.append(states)
                batch_rewards.append(total_reward)
                break
                
    return batch_states, batch_actions, batch_rewards


def filter_batch(states_batch,actions_batch,rewards_batch,percentile=50):
    reward_threshold = np.percentile(rewards_batch, percentile)
    elite_states = []
    elite_actions = []
    for i in range(len(rewards_batch)):
        # Filter experience with "good" reward (ie: top 20 or 80% percentile)
        if rewards_batch[i] > reward_threshold:
            for j in range(len(states_batch[i])):
                elite_states.append(states_batch[i][j])
                elite_actions.append(actions_batch[i][j])
    
    return elite_states,elite_actions

def save_best_model(model, file='cem_policy_best.pth.tar'):
    # save the model
    torch.save(model, file)

### Instantiate Neural Network/Optimiser and Cross Entropy loss function

In [5]:
#neural network (Function approximation)
policy_net = Net(n_states, hidden_size, n_actions)

# Cross Entropy Loss
ce_loss = nn.CrossEntropyLoss()

#optimisation function
optimizer = optim.Adam(params=policy_net.parameters(), lr=learning_rate)

### Go learn Mr. robot!
It will take some time but eventually the policy will be learnt.

In [6]:
best_mean_reward = 0

for i in range(session_size):
    #generate new sessions
    batch_states,batch_actions,batch_rewards = gather_experience_on_environment(
        env, policy_net, batch_size, t_max=5000)

    # Filter a batch of good experiences
    elite_states, elite_actions = filter_batch(batch_states,batch_actions,batch_rewards,percentile)
    
    # Optimise the network a bit
    optimizer.zero_grad()
    tensor_states = torch.FloatTensor(elite_states)
    tensor_actions = torch.LongTensor(elite_actions)
    action_scores_v = policy_net(tensor_states)
    loss_v = ce_loss(action_scores_v, tensor_actions)
    loss_v.backward()
    optimizer.step()

    #show results
    mean_reward, threshold = np.mean(batch_rewards), np.percentile(batch_rewards, percentile)
    print("%d: loss=%.3f, reward_mean=%.1f, reward_threshold=%.1f" % (
            i, loss_v.item(), mean_reward, threshold))
    
    if mean_reward > best_mean_reward:
        print('Save best reward:', mean_reward)
        save_best_model(policy_net)
        best_mean_reward = mean_reward
    
    # If the mean reward is bigger than the completion score threshold we stop
    if mean_reward > completion_score:
        print("Environment has been successfullly completed!")

0: loss=1.386, reward_mean=-168.8, reward_threshold=-97.2
1: loss=1.352, reward_mean=-203.7, reward_threshold=-113.1
2: loss=1.316, reward_mean=-192.7, reward_threshold=-88.7
3: loss=1.267, reward_mean=-232.0, reward_threshold=-85.8
4: loss=1.233, reward_mean=-238.9, reward_threshold=-114.1
5: loss=1.211, reward_mean=-189.0, reward_threshold=-74.9
6: loss=1.224, reward_mean=-161.5, reward_threshold=-68.3
7: loss=1.216, reward_mean=-130.8, reward_threshold=-55.6
8: loss=1.231, reward_mean=-135.2, reward_threshold=-67.0
9: loss=1.231, reward_mean=-100.5, reward_threshold=-42.6
10: loss=1.226, reward_mean=-94.4, reward_threshold=-57.3
11: loss=1.197, reward_mean=-77.3, reward_threshold=-27.4
12: loss=1.199, reward_mean=-83.4, reward_threshold=-30.0
13: loss=1.164, reward_mean=-87.2, reward_threshold=-28.6
14: loss=1.125, reward_mean=-101.1, reward_threshold=-19.1
15: loss=1.114, reward_mean=-116.6, reward_threshold=-15.0
16: loss=1.108, reward_mean=-168.8, reward_threshold=-43.7
17: loss=

KeyboardInterrupt: 

### Load model
Observe that we never fix the model (.eval()) we really want to always keep learning.

In [None]:
policy_net = torch.load('cem_policy_best.pth.tar')

### Try it out the best model
You can locally run on the enviornment but this command will generate a video, which is better to run from Databricks.

In [10]:
env = gym.wrappers.RecordVideo(gym.make("LunarLander-v2"), 'video')
gather_experience_on_environment(env, policy_net, 1, t_max=5000)
env.close()