# Install Dependencies

In [12]:
import sys
sys.version

'3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]'

In [None]:
# Refer to the requirements document for instructions on installing the required libraries.

In [1]:
import gym
from collections import namedtuple
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Neural Network
#The class Net is derived from nn.Module.
#Here we initialise the two fully connected layers that make up the core of our neural network.
#The first fully connected layer (fc1) is our input layer: it takes in a tensor the same size as our state (an 8-dimensional vector) and outputs a tensor the size of our hidden layer (in this case 200 nodes).
#The second fully connected layer (fc2) is our output layer: it takes in the output of the previous layer and outputs a tensor the size of our action space (4 actions), i.e. one number for each possible action our agent can take.
#The class also contains a forward method, which is called automatically whenever data is passed through the network object.
#In forward we pass the state through the first layer of our neural network and apply a ReLU activation function to the output of fc1. Next we take that output and pass it through our second layer. This value is then returned as the output of the whole network.
#Usually we would add an activation layer after the final output layer, such as the softmax function. The softmax function takes the numbers that were output for each possible action and normalises them so that they all add up to 1. Here the network returns the raw scores instead, and the softmax is applied outside the network, in generate_batch.

In [2]:
class Net(nn.Module):
    """Two-layer fully connected policy network.

    Maps an observation vector of length ``obs_size`` to ``n_actions`` raw
    (un-normalised) action scores through one hidden layer of
    ``hidden_size`` units with a ReLU non-linearity.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Input -> hidden projection.
        self.fc1 = nn.Linear(obs_size, hidden_size)
        # Hidden -> per-action score projection.
        self.fc2 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Return the raw action scores for the observation batch ``x``."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        scores = self.fc2(activated)
        return scores


# Generate Sessions
#This is where we generate our batch of episodes. The agent will play through N episodes and gather the actions/states for each step so we can train our agent.
#We define a softmax activation function which turns the raw scores output by the feedforward net defined in the previous cell into a vector of action probabilities.
#We define three lists to store our episode data: batch_states, batch_actions (these two are lists of lists) and batch_rewards.
#For example, batch_states is a list of state sequences [[],[],.......[],[]]
#and batch_actions is a list of action sequences [[],[],......[],[]].
#batch_rewards stores the total reward achieved during each episode [ .....]; it is a flat list with one entry per episode in the batch.
#Iterate through  batch size, running an episode for each iteration. 
#In our first loop we initialise two empty lists to store our actions/states for this episode. 
#We also create a variable to count the total reward of the episode.
#These are our data variables.
#Finally we initialise our state variable 's' with a fresh episode by calling env.reset(), this will start a new game
#Now we run a second loop that carries out a single step in the game environment per iteration, up until we reach our time limit for that episode.
#First we need to get our current state and pass it through our network. 
#To do this we need to turn our state s into a torch float tensor so we can give it into the network. 
#Next we get the action probability from our network. 
#Remember we have to apply our activation function to the prediction in order for the probabilities of the actions to all add up to 1 and be usable.
#Once we have retrieved our probability distribution we can decide what action to take. This is done by using numpys random.choice function. 
#It will choose a “random” action based on the probabilities given. 
#Once we have decided upon which action to take, the action is carried out in the environment. 
#This will return the new state, the reward received by taking that action, whether or not the episode is finished, and any additional information the environment might provide.
#Now that we have the information from our updated environment we need to add the state, action and reward to our data variables.
#Finally we update our current state.
#The last thing we need to do is check whether the episode has finished during this step.
#When done is True the episode is over: its actions, states and total reward go into their corresponding batch lists and we break out of the step loop.
#Once all episodes have been played we just return our batch data.

In [3]:
def generate_batch(env, batch_size, t_max=5000, policy=None):
    """Play ``batch_size`` episodes, sampling each action from a policy network.

    Args:
        env: Environment following the classic gym API used in this file:
            ``reset()`` returns the initial observation and ``step(a)``
            returns ``(observation, reward, done, info)``.
        batch_size: Number of episodes to play.
        t_max: Maximum number of steps per episode.
        policy: Callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of raw action scores. Defaults to the
            module-level ``net``.

    Returns:
        Tuple ``(batch_states, batch_actions, batch_rewards)``. The first two
        are lists with one per-step list per episode; ``batch_rewards`` holds
        the total reward of each episode.
    """
    if policy is None:
        policy = net  # fall back to the notebook's global network

    activation = nn.Softmax(dim=1)
    batch_states, batch_actions, batch_rewards = [], [], []

    for _ in range(batch_size):
        states, actions = [], []
        total_reward = 0
        s = env.reset()

        for _ in range(t_max):
            # Single observation -> batch of one, as float32 for the network.
            s_v = torch.as_tensor(np.asarray(s, dtype=np.float32)).unsqueeze(0)
            # Rollouts are never back-propagated through, so skip autograd.
            with torch.no_grad():
                act_probs = activation(policy(s_v)).numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)

            new_s, r, done, _ = env.step(a)

            states.append(s)
            actions.append(a)
            total_reward += r

            s = new_s
            if done:
                break

        # Record the episode whether it terminated or was cut off at t_max;
        # previously episodes hitting the step limit were silently dropped.
        if states:
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(total_reward)

    return batch_states, batch_actions, batch_rewards

# Filter Elite Episodes
#This method is used to select only the best episodes from the latest batch. 
#Find the reward threshold as a percentile of the batch rewards. The training cell below uses the 80th percentile (i.e. the top 20% of episodes), while the function's default is the 50th; feel free to play around with that number. Then just take the episode data from episodes with a reward ≥ our reward threshold.

In [4]:
def filter_batch(states_batch, actions_batch, rewards_batch, percentile=50):
    """Select the state/action pairs of the best ("elite") episodes.

    Args:
        states_batch: List of per-episode state sequences.
        actions_batch: List of per-episode action sequences, aligned with
            ``states_batch``.
        rewards_batch: Total reward of each episode.
        percentile: Percentile of ``rewards_batch`` used as the threshold.

    Returns:
        Tuple ``(elite_states, elite_actions)``: flat lists containing every
        step of every episode whose reward is >= the threshold, in episode
        order.
    """
    elite_states = []
    elite_actions = []

    # Nothing to rank; avoid asking numpy for the percentile of an empty list.
    if len(rewards_batch) == 0:
        return elite_states, elite_actions

    reward_threshold = np.percentile(rewards_batch, percentile)

    for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
        # ">=" (not ">") so episodes tied at the threshold still count;
        # otherwise a batch of equal rewards would yield no elites at all.
        if reward >= reward_threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)

    return elite_states, elite_actions
    

# Training
#batch_size: how many episodes to run at once
#session_size: how many training epochs. each epoch runs one batch
#percentile: used to determine our elite reward threshold
#learning_rate: denotes how much we update our network by during each training step (need to find a good middle ground for this one)
#completion_score: average reward over 100 episodes to be considered solved
#Initialise the network we made previously
#Choose a loss function
#Choose an optimiser.
#We run a loop for the number of sessions given. During each epoch(iteration) we run our generate_batch method to get our batch of episode data.
#Now we filter out the bad episodes and keep the elite ones by calling our filter batch method.
#Once we have the elite episodes that we want to train and we go through the process of passing data through our neural network.
#Before each training step we need to set the gradients tracked by our optimiser back to zero,
#i.e. we’re resetting the optimiser so gradients from the previous step do not accumulate.
#Next we turn our elite_states and elite_actions lists into torch tensors so they can be used with our network.
#We then pass all of the elite episode states into our network. 
#It goes through every state collected and predicts what the policy distribution should look like. 
#Next we compare these predictions to the actions that were carried out in our elite episodes. Ideally we want our networks predictions to be close to these.
#To find out how far off our network was (the loss) we use the objective function (CrossEntropyLoss).
#Once we have calculated the loss we use the backward method to calculate the gradients of our loss (backpropagation). 
#Finally optimizer updates our network by calling the step method.
#The last thing to do is show the results and check if we achieved an average score that is higher than the completion score

In [5]:
# Hyperparameters.
batch_size = 100
# NOTE: kept small for a quick demo run; a full training run uses ~500 sessions.
session_size = 5
percentile = 80
hidden_size = 200
completion_score = 200
learning_rate = 0.01

env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

#neural network
net = Net(n_states, hidden_size, n_actions)
#loss function (takes the raw network scores and the target action indices)
objective = nn.CrossEntropyLoss()
#optimisation function
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    #generate new sessions
    batch_states, batch_actions, batch_rewards = generate_batch(env, batch_size, t_max=5000)

    elite_states, elite_actions = filter_batch(batch_states, batch_actions, batch_rewards, percentile)

    # With no elite samples the loss would be NaN and the optimiser step
    # would corrupt the weights, so skip the update for this session.
    if not elite_states:
        print("%d: no elite episodes selected, skipping update" % i)
        continue

    optimizer.zero_grad()
    # Build the tensors from contiguous numpy arrays rather than from a
    # Python list of per-step observations.
    tensor_states = torch.as_tensor(np.asarray(elite_states, dtype=np.float32))
    tensor_actions = torch.as_tensor(np.asarray(elite_actions, dtype=np.int64))
    action_scores_v = net(tensor_states)
    loss_v = objective(action_scores_v, tensor_actions)
    loss_v.backward()
    optimizer.step()

    #show results
    mean_reward, threshold = np.mean(batch_rewards), np.percentile(batch_rewards, percentile)
    print("%d: loss=%.3f, reward_mean=%.1f, reward_threshold=%.1f" % (
            i, loss_v.item(), mean_reward, threshold))

    #check if the batch's mean reward beats the completion score; if so, stop
    #training so the solved policy is not changed by further updates
    if mean_reward > completion_score:
        print("Environment has been successfullly completed!")
        break

0: loss=1.378, reward_mean=-168.7, reward_threshold=-105.9
1: loss=1.359, reward_mean=-196.6, reward_threshold=-114.4
2: loss=1.348, reward_mean=-146.4, reward_threshold=-91.7
3: loss=1.333, reward_mean=-144.3, reward_threshold=-84.6
4: loss=1.302, reward_mean=-125.9, reward_threshold=-70.7


# Save our model

In [None]:
# save the model
# The module object `net` itself (not a state_dict) is passed to torch.save,
# and the file 'model_best.pth.tar' is written relative to the current
# working directory.
# NOTE(review): loading a file saved this way presumably requires the Net
# class definition to be available at load time — confirm against the
# PyTorch serialization documentation before relying on it.
torch.save(net, 'model_best.pth.tar')

# Results

In [None]:
#record sessions
import gym.wrappers
# Wrap a fresh environment so the episode played below is recorded into the
# "videos" directory (force=True is passed through to the Monitor wrapper).
env = gym.wrappers.Monitor(gym.make("LunarLander-v2"), directory="videos", force=True)
try:
    generate_batch(env, 1, t_max=5000)
finally:
    # Always close the wrapped environment, even if the rollout raises.
    env.close()

In [None]:
#load the model 