In [97]:
import os
import torch as pyTorch
import torch.nn as nn
import torch.nn.functional as function
import torch.optim as optim
import numpy as np
import gym
from utils import plotLearning
from gym import wrappers

In [98]:
# Deep Q Network
class DeepQNetwork(nn.Module):
    def __init__(self, learning_rate, input_dimensions, fc1_dimensions, fc2_dimensions, num_actions):
        super(DeepQNetwork, self).__init__()
        self.input_dimensions = input_dimensions
        self.fc1_dimensions = fc1_dimensions
        self.fc2_dimensions = fc2_dimensions
        self.num_actions = num_actions
        self.fc1 = nn.Linear(*self.input_dimensions, self.fc1_dimensions)
        self.fc2 = nn.Linear(self.fc1_dimensions, self.fc2_dimensions)
        self.fc3 = nn.Linear(self.fc2_dimensions, self.num_actions)
        self.optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        self.loss = nn.MSELoss()
        
        #CPU usage
        self.device = pyTorch.device('cpu')
        self.to(self.device)
        
    
    def forward(self, observation):
        # For this Open AI Gym environment, we don't have to do flattening
        state = pyTorch.Tensor(observation).to(self.device)
        x = function.relu(self.fc1(state))
        x = function.relu(self.fc2(x))
        actions = self.fc3(x)
        
        return actions    

In [99]:
# What makes DQN powerful are the max memory and batch size.
# As the agent plays the game, it stores all its actions, state, 
# reward, new station transitions and done flags in its memory
# This allows it to sample from that memory to collect all of the experiences its used
# to update its values for these actions

#params
# gamma - discount factor of future rewards
# epsilon - how greedy
# eps_end - Final epsilon threshold we want to reach\
# eps_dec - Decrease rate for epsilon

class Agent(object):
    
    def __init__(self, gamma, epsilon, learning_rate, input_dimensions, batch_size, number_actions, 
                     max_memory_size=1000000, eps_end=0.01, eps_dec=0.996):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.learning_rate = learning_rate
        self.input_dimensions = input_dimensions
        self.batch_size = batch_size
        self.number_actions = number_actions
        self.action_space = [i for i in range(number_actions)]
        self.max_memory_size = max_memory_size
        self.Q_eval = DeepQNetwork(learning_rate, input_dimensions, num_actions=self.number_actions,
                             fc1_dimensions=256, fc2_dimensions=256)
        self.state_memory = np.zeros((self.max_memory_size, *input_dimensions))
        self.new_state_memory = np.zeros((self.max_memory_size, *input_dimensions))
        self.action_memory = np.zeros((self.max_memory_size, self.number_actions), dtype=np.bool)
        self.reward_memory = np.zeros(self.max_memory_size)
        self.terminal_memory = np.zeros(self.max_memory_size, dtype=np.bool)
        self.memory_counter = 0
        
    def store_transition(self, state, action, reward, state_, terminal):
            index = self.memory_counter % self.max_memory_size
            self.state_memory[index] = state
            actions = np.zeros(self.number_actions)
            actions[action] = 1.0
            self.action_memory[index] = actions
            self.reward_memory[index] = reward
            self.terminal_memory[index] = 1 - terminal
            self.new_state_memory[index] = state_
            self.memory_counter = self.memory_counter + 1
            
    def choose_action(self, observation):
            rand = np.random.random()
            if rand < self.epsilon:
                action = np.random.choice(self.action_space)
            else:
                actions = self.Q_eval.forward(observation)
                action = pyTorch.argmax(actions).item()
            return action

    def learn(self):
            if self.memory_counter > self.batch_size:
                self.Q_eval.optimizer.zero_grad()
                max_memory = self.memory_counter if self.memory_counter < self.max_memory_size else self.max_memory_size
            
                batch = np.random.choice(max_memory, self.batch_size)
                state_batch = self.state_memory[batch]
                action_batch = self.action_memory[batch]
                reward_batch = self.reward_memory[batch]
                terminal_batch = self.terminal_memory[batch]
                new_state_batch = self.new_state_memory[batch]
                action_values = np.array(self.action_space, dtype=np.bool)
                action_indices = np.dot(action_batch, action_values)

                reward_batch = pyTorch.Tensor(reward_batch).to(self.Q_eval.device)
                terminal_batch = pyTorch.Tensor(terminal_batch)

                q_eval = self.Q_eval.forward(state_batch).to(self.Q_eval.device)
                q_target = self.Q_eval.forward(state_batch).to(self.Q_eval.device)

                q_next = self.Q_eval.forward(new_state_batch).to(self.Q_eval.device)

                batch_index = np.arange(self.batch_size, dtype=np.int32)
                
                target[batch_index, action_indices] = (reward_batch + self.gamma) * pyTorch.max(q_next, dim = 1)[0]*terminal_batch
                self.epsilon = self.epsilon* self.eps_dec if self.epsilon > self.eps_min else self.eps_min

                loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)

                loss.backward()
                self.Q_eval.optimizer.step()

In [100]:
if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    brain = Agent(gamma = 0.99, epsilon = 1.0, batch_size = 64, number_actions = 4, input_dimensions = [8],
                             learning_rate = 0.003)
    scores = []
    episode_history = []
    number_games = 500
    score = 0
    
    for i in range(number_games):
        if i % 10 == 0 and i > 0:
            average_score = np.mean(scores[max(0, i - 10): (i + 1)])
            print('episode', i, 'score: ', score, 'average score %.3f' % average_score,
                 'epsilon %.3f' % brain.epsilon)
        else:
            print('episode', i, 'score', score)
            score = 0
            episode_history.append(brain.epsilon)
            observation = env.reset()
            done = False
            while not done:
                action = brain.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                score += reward
                brain.store_transition(observation, action, reward, observation_, done)
                brain.learn()
                observation = observation_
            scores.append(score)
    x = [i + 1 for i in range(number_games)]
    filename = 'lunar_lander.png'
    plotLearning(x, scores, eps_history, filename)

episode 0 score 0
episode 1 score -67.21117215505978
episode 2 score -157.21560096732046
episode 3 score -175.9185699624876
episode 4 score -332.0710388727816
episode 5 score -576.2349814715928
episode 6 score -191.21855497220469
episode 7 score -379.5048136468788
episode 8 score -372.0514379968254
episode 9 score -357.1679878542182
episode 10 score:  19.737969724871718 average score -258.886 epsilon 0.022
episode 11 score 19.737969724871718
episode 12 score -195.94421505100905
episode 13 score -228.7999218135468
episode 14 score -195.12377805433235
episode 15 score -256.53299405303693
episode 16 score -456.9358690377209
episode 17 score -449.0479503552436
episode 18 score -301.93641669647195
episode 19 score -448.02980746525026
episode 20 score:  -156.87895007352 average score -298.803 epsilon 0.010
episode 21 score -156.87895007352
episode 22 score -364.0007008652012
episode 23 score -584.77735406636
episode 24 score -378.0427975793806
episode 25 score -13.084852918151498
episode 26 

KeyboardInterrupt: 