In [1]:
import torch as T
import torch.nn as nn                        #To handle layers.
import torch.nn.functional as F              #For value activation function.
import torch.optim as optim                  #For adam optimizer
import numpy as np

import gym

import matplotlib
import matplotlib.pyplot as plt

This implementation uses an Agent with a Deep Q-Learning network to learn the LunarLander-v2 environment.
It uses linear (fully connected) layers because the environment's observation is an eight-element vector: 
\[Horizontal Coordinate, Vertical Coordinate, Horizontal Speed, Vertical Speed, Angle, Angular Speed, 1 if first leg has contact else 0, 1 if second leg has contact, else 0]\

The landing pad is always at coordinates (0,0). The coordinates are the first two numbers in the state vector.
Reward for moving from the top of the screen to the landing pad and zero speed is about 100..140 points.
If the lander moves away from the landing pad it loses reward. The episode finishes if the lander crashes or
comes to rest, receiving an additional -100 or +100 points. Each leg with ground contact is +10 points.
Firing the main engine is -0.3 points each frame. Firing the side engine is -0.03 points each frame.
Solved is 200 points.
Landing outside the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
on its first attempt.

In [2]:
class DeepQNetwork(nn.Module):
    """Fully connected Q-value approximator: two ReLU hidden layers and a linear head.

    The module owns its Adam optimizer and MSE loss, and moves itself to the
    first CUDA device when one is available (CPU otherwise).

    Args:
        lr: learning rate handed to Adam.
        input_dims: sequence that is unpacked as the leading argument(s) of
            the first ``nn.Linear`` (e.g. ``[8]``).
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.
        n_actions: number of outputs (one value per action).
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super().__init__()

        # Keep the layer sizes on the instance for later inspection.
        self.input_dims, self.n_actions = input_dims, n_actions
        self.fc1_dims, self.fc2_dims = fc1_dims, fc2_dims

        # Layers are created in forward order: input -> fc1 -> fc2 -> output.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        # Training utilities live on the network itself.
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        # Prefer the first GPU if present, otherwise stay on the CPU.
        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) output of the final layer for ``state``."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        return self.fc3(hidden)

In [3]:
class Agent():
    """Epsilon-greedy Deep Q-Learning agent backed by a circular replay memory.

    Transitions ``(state, action, reward, new_state, done)`` are written into
    fixed-size NumPy arrays; ``learn`` samples a random mini-batch from them
    and performs one optimizer step on ``Q_eval``.

    Args:
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon: initial probability of choosing a random action.
        lr: learning rate forwarded to the Q-network.
        input_dims: observation shape as a sequence (e.g. ``[8]``).
        batch_size: number of transitions sampled per learning step.
        n_actions: number of discrete actions.
        max_mem_size: capacity of the replay memory.
        eps_end: lower bound for ``epsilon``.
        eps_dec: amount subtracted from ``epsilon`` after each learning step.
    """

    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, max_mem_size=1000000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0          # total transitions ever stored (not capped)

        # Evaluation network: used both to act and to build the TD target.
        self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256)

        # The TD update needs the current state, the next state, the action
        # taken, the reward received and whether the episode ended.
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # `np.bool` was a deprecated alias (removed in NumPy 1.24); use the
        # real NumPy boolean scalar type instead.
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the replay memory, overwriting the oldest slot when full."""
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Pick an action epsilon-greedily.

        With probability ``1 - epsilon`` the arg-max of the network output for
        ``observation`` is returned; otherwise an action is drawn uniformly
        from ``action_space``.
        """
        if np.random.random() > self.epsilon:
            # Cast to float32 so the input dtype matches the network weights,
            # and add the batch dimension without going through a Python list.
            state = T.as_tensor(np.asarray(observation, dtype=np.float32),
                                device=self.Q_eval.device).unsqueeze(0)
            with T.no_grad():      # pure inference, no graph needed
                actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        """Run one TD(0) update on a random mini-batch and decay epsilon.

        Does nothing until at least ``batch_size`` transitions have been stored.
        """
        if self.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device

        # Only sample from the part of the memory that has been filled.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        # Convert the sampled slices of the NumPy memory into tensors.
        state_batch = T.as_tensor(self.state_memory[batch], device=device)
        new_state_batch = T.as_tensor(self.new_state_memory[batch], device=device)
        reward_batch = T.as_tensor(self.reward_memory[batch], device=device)
        terminal_batch = T.as_tensor(self.terminal_memory[batch], device=device)
        # Index tensors are made explicitly int64, the dtype advanced indexing expects.
        action_batch = T.as_tensor(self.action_memory[batch], dtype=T.long, device=device)
        batch_index = T.arange(self.batch_size, device=device)

        # Value of the action that was actually taken in each sampled state.
        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]

        # The bootstrap target is treated as a constant: building it without
        # autograd keeps gradients from flowing through the next-state values.
        with T.no_grad():
            q_next = self.Q_eval.forward(new_state_batch)
            q_next[terminal_batch] = 0.0       # no future value after a terminal state
            q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        self.Q_eval.optimizer.zero_grad()
        loss = self.Q_eval.loss(q_eval, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear decay, clamped so epsilon never drops below its floor.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)
        

In [None]:
if __name__ == '__main__':
    # Training driver: play `n_games` episodes, learning after every step.
    n_games = 500
    scores, eps_history = [], []

    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
                  eps_end=0.01, input_dims=[8], lr=0.001)

    for i in range(n_games):
        # Only the first five and the last five episodes are drawn on screen.
        render_episode = i < 5 or i >= (n_games - 5)

        observation = env.reset()
        score, done = 0, False
        while not done:
            if render_episode:
                env.render()

            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)

            # Remember the transition, then take one learning step.
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()

            score += reward
            observation = observation_

        eps_history.append(agent.epsilon)
        scores.append(score)

        # Mean over (at most) the 100 most recent episodes.
        avg_score = np.mean(scores[-100:])

        print('episode ', i, 'score %.2f' % score,
              'average score %.2f' % avg_score,
              'epsilon %.2f' % agent.epsilon)
        

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


episode  0 score -141.71 average score -141.71 epsilon 0.99
episode  1 score -234.72 average score -188.22 epsilon 0.93
episode  2 score -37.81 average score -138.08 epsilon 0.88
episode  3 score -246.12 average score -165.09 epsilon 0.85
episode  4 score -164.31 average score -164.93 epsilon 0.78
episode  5 score 10.21 average score -135.74 epsilon 0.73
episode  6 score -327.83 average score -163.18 epsilon 0.68
episode  7 score 24.63 average score -139.71 epsilon 0.65
episode  8 score -491.37 average score -178.78 epsilon 0.60
episode  9 score -154.98 average score -176.40 epsilon 0.55
episode  10 score -159.84 average score -174.90 epsilon 0.48
episode  11 score -61.84 average score -165.47 epsilon 0.43
episode  12 score -85.78 average score -159.34 epsilon 0.38
episode  13 score -113.39 average score -156.06 epsilon 0.28
episode  14 score -188.23 average score -158.21 epsilon 0.22
episode  15 score -82.74 average score -153.49 epsilon 0.08
episode  16 score -196.73 average score -1

episode  136 score -78.36 average score -107.76 epsilon 0.01
episode  137 score -75.95 average score -107.82 epsilon 0.01
episode  138 score -58.10 average score -107.20 epsilon 0.01
episode  139 score -74.32 average score -105.44 epsilon 0.01
episode  140 score -31.32 average score -103.21 epsilon 0.01
episode  141 score -30.38 average score -98.85 epsilon 0.01
episode  142 score -47.61 average score -99.11 epsilon 0.01
episode  143 score -58.12 average score -99.10 epsilon 0.01
episode  144 score -126.18 average score -98.91 epsilon 0.01
episode  145 score -56.28 average score -97.97 epsilon 0.01
episode  146 score -60.85 average score -96.19 epsilon 0.01
episode  147 score -104.57 average score -96.35 epsilon 0.01
episode  148 score -300.18 average score -101.57 epsilon 0.01
episode  149 score -322.57 average score -101.79 epsilon 0.01
episode  150 score -74.48 average score -99.92 epsilon 0.01
episode  151 score -25.22 average score -96.49 epsilon 0.01
episode  152 score -59.52 ave

In [None]:
# Play 20 rendered episodes with the agent built in the training cell.
# Nothing is stored or learned here; actions still come from `choose_action`,
# so the agent's current epsilon applies.
for i in range(20):
    observation = env.reset()
    done = False
    while not done:
        env.render()
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        observation = observation_

In [None]:
# Close the Gym environment once training and playback are finished.
env.close()