## Learn to Play Lunar Lander

I experimented with some hyper-parameter settings but didn't get satisfactory results, so this is just the raw code. Feel free to play around with different settings and see whether your rewards/losses converge to a good point.

In [None]:
# Install Pytorch
# Builds the wheel URL for torch 0.4.1 from the running interpreter's tags and
# the CUDA runtime found on the machine, then installs it with pip.
from os.path import exists
# NOTE(review): `wheel.pep425tags` is presumably unavailable in newer `wheel`
# releases -- confirm against the installed version before running this cell.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel filename tag in the form "<impl><version>-<abi>".
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# IPython shell capture: rewrites the trailing ".<major>.<minor>" of each
# cudart.so entry listed by ldconfig into "cu<major><minor>".
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the detected CUDA tag only when an NVIDIA device node exists; else 'cpu'.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

In [None]:
# Needed to get the gym environment working
# Install swig 3.0 and expose it under the plain `swig` name before
# installing the Box2D Python packages.
# NOTE(review): presumably the box2d packages invoke `swig` at build time -- confirm.
!apt-get install swig3.0
!ln -s /usr/bin/swig3.0 /usr/bin/swig
!pip3 install box2d box2d-kengz

In [None]:
!pip install gym

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import gym
import copy

from tqdm import tqdm

import matplotlib.pyplot as plt
from IPython import display
plt.style.use('seaborn')

from collections import deque

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

np.random.seed(1)

In [None]:
# Shared environment instance used by every agent in this notebook.
ENV = gym.make('LunarLander-v2')
NS = ENV.observation_space.shape[0]  # Length of one observation vector
NA = 1  # Columns used to store one action in the replay buffer
# Keys read by Buffer_Filler / Minibatcher:
#   'bn'    - number of mini-batches per dataset (buffer size is bn * bs)
#   'bs'    - DataLoader batch size
#   'gamma' - discount factor applied to the next-state value
#   'ns'    - width of a stored state row
#   'na'    - width of a stored action row
DEFAULT_PARAMS = {'bn':10, 'bs':256, 'gamma':0.9, 'ns':NS, 'na':NA}
LR = 1e-4  # Learning rate passed to Learner.train in the cells below

In [None]:
print(ENV.observation_space, ENV.action_space)

In [None]:
ENV.reset()

In [None]:
class RandomAgent():
    '''Baseline agent that samples actions uniformly from the action space.

    Subclasses override `pi` to supply a real policy. `playPol` runs one
    episode with whatever `pi` returns and records every transition in
    `self.history`.
    '''

    def __init__(self, env, clip_reward = False):
        '''Store the environment and the episode settings.

        env: environment exposing `reset`, `step` (returning the 4-tuple
            `(state, reward, done, info)`), `render`, `close` and
            `action_space.sample`.
        clip_reward: if True, each step reward is clipped to [-1, 1] before
            it is added to the episode total.
        '''
        self.env = env
        self.max_rounds = int(1e4)  # Hard cap on the steps of one episode
        self.clip_reward = clip_reward
        self.n = 0  # Number of exploration rounds
        self.history = deque()  # Transitions of the most recent episode

    def randomAction(self):
        '''Return an action sampled from the environment's action space.'''
        return self.env.action_space.sample()

    def pi(self, state, explore):
        '''Policy: ignores both arguments and returns a random action.'''
        return self.randomAction()

    def playPol(self, save_frames = False, explore = False):
        '''Plays a game from start to finish.

        save_frames: if True, render an RGB frame before every step and
            close the environment's viewer once the episode ends.
        explore: forwarded to `pi`; also increments the exploration counter
            `self.n` when True.

        Returns `(total_reward, frames)`. `self.history` is replaced by a
        deque of `(state, action, reward, done, next_state)` tuples; the
        reward stored there is the raw (unclipped) one.
        '''
        if explore:
            self.n += 1

        state = self.env.reset()
        frames = []
        reward = 0
        self.history = deque()
        for _ in range(self.max_rounds):
            if save_frames:
                frames.append(self.env.render(mode = 'rgb_array'))
            action = self.pi(state, explore)
            newstate, r, done, _info = self.env.step(action)
            self.history.append((state, action, r, done, newstate))
            if self.clip_reward:
                r = max(min(r, 1), -1)
            reward += r
            if done:
                break
            # Advance to the observation just returned; without this the
            # policy would keep acting on the reset state and every stored
            # transition would start from it.
            state = newstate
        if save_frames:
            self.env.close()
        return (reward, frames)

In [None]:
random_guy = RandomAgent(ENV)

In [None]:
random_guy.playPol()

In [None]:
random_guy.history[3]

## Create Neural Net Architecture

The task is pretty simple: take the 8 state inputs and return the Q function (the estimated value of each of the 4 actions). That is a neural network with 8 inputs, 4 outputs, and as many hidden layers as we want.

In [None]:
# Q-network: 8 observation features in, one Q-value per action (4) out.
arch = nn.Sequential(nn.BatchNorm1d(8, affine = False),  # normalise raw inputs, no learnable scale/shift
                    nn.Linear(8, 50), 
                    nn.LeakyReLU(inplace = True), 
                    nn.BatchNorm1d(50), 
                    nn.Linear(50, 4))           

In [None]:
# Sanity check: push one freshly reset observation through the untrained net.
state = torch.tensor(ENV.reset()[None])  # [None] adds the leading batch axis
arch.eval()  # eval mode so the BatchNorm layers accept a batch of one
with torch.no_grad():
    print(arch(state).numpy()[0])  # Q-value row for the single input state

## Create Deep Q Agent

In [None]:
class DQAgent(RandomAgent):
    '''Agent that acts greedily on a Q-network, with optional random exploration.'''

    def __init__(self, env, arch):
        '''Keep a reference to the Q-network `arch` on top of the base setup.'''
        super().__init__(env)
        self.arch = arch

    def getQ(self, state):
        '''Return the network's Q-value row for a single state.

        The network is switched to eval mode and evaluated without gradient
        tracking. Assumes `state` supports `state[None]` (e.g. a NumPy
        array) -- TODO confirm for other environments.
        '''
        self.arch.eval()
        with torch.no_grad():
            batch = torch.tensor(state[None])  # add the batch axis
            q_values = self.arch(batch)
            return q_values.numpy()[0]

    def getOptimalAction(self, state):
        '''Return the index of the largest Q-value for `state`.'''
        q_row = self.getQ(state)
        return np.argmax(q_row)

    def get_exp_frac(self, low_lim = 0.05, high_lim = 0.9999, n_taper = 1e-4):
        '''Set `self.exp_frac`, the exploration probability for this round.

        The value decays linearly in the exploration counter `self.n`,
        starting at `high_lim`, and is clamped to [low_lim, high_lim].
        '''
        rounds = self.n
        decayed = high_lim - (high_lim - low_lim) * rounds * n_taper
        capped = min(decayed, high_lim)
        self.exp_frac = max(capped, low_lim)

    def playPol(self, save_frames = False, explore = False):
        '''Refresh the exploration probability, then play one episode.'''
        self.get_exp_frac()
        return super().playPol(save_frames, explore)

    def do_exploration(self):
        '''Draw a uniform random number and report whether to explore.'''
        draw = np.random.rand()
        return draw < self.exp_frac

    def pi(self, state, explore = False):
        '''Policy: random action when exploring, greedy action otherwise.'''
        # The short-circuit keeps the RNG untouched when explore is False.
        if explore and self.do_exploration():
            return self.randomAction()
        return self.getOptimalAction(state)

In [None]:
dqn_guy = DQAgent(ENV, arch)
dqn_guy.playPol()

## Create a buffer of experience replay

In [None]:
class Buffer_Filler():
    '''Collects transitions from an agent's episodes into fixed-size arrays.

    `self.buffer` is always the tuple
    `(states, actions, rewards, dones, next_states)`; every element is a
    NumPy array with `buffer_size` rows.
    '''

    def __init__(self, agent, buffer_size = 200, params = DEFAULT_PARAMS):
        '''Remember the agent and the buffer geometry.

        agent: object with `playPol(explore=...)` that leaves a deque of
            `(state, action, reward, done, next_state)` tuples in
            `agent.history`.
        buffer_size: number of transitions to collect.
        params: mapping providing 'ns' (state width) and 'na' (action width).
        '''
        self.buffer_size = buffer_size
        self.agent = agent
        self.ns = params['ns']
        self.na = params['na']

    def init_empty(self):
        '''Allocate zero-filled arrays for a fresh buffer.'''
        states = np.zeros((self.buffer_size, self.ns))
        actions = np.zeros((self.buffer_size, self.na))
        rewards = np.zeros((self.buffer_size, 1))
        dones = np.zeros((self.buffer_size, 1))
        next_states = np.zeros_like(states)
        # Field order must match what fill_buffer unpacks and stores.
        self.buffer = (states, actions, rewards, dones, next_states)

    def fill_buffer(self):
        '''Play exploratory episodes until the buffer holds `buffer_size` rows.

        Transitions are consumed from the front of `agent.history`; whatever
        is left over from the final episode stays in the agent's history.

        Raises RuntimeError if an episode produces no transitions, since the
        buffer could then never be filled.
        '''
        self.init_empty()
        states, actions, rewards, dones, next_states = self.buffer

        filled = 0
        while filled < self.buffer_size:
            self.agent.playPol(explore = True)
            history = self.agent.history
            if not history:
                raise RuntimeError('episode produced no transitions')
            while history and filled < self.buffer_size:
                s, a, r, d, nst = history.popleft()
                states[filled, :] = s
                actions[filled, :] = a
                rewards[filled, :] = r
                dones[filled, :] = d
                next_states[filled, :] = nst
                filled += 1
        self.buffer = (states, actions, rewards, dones, next_states)

In [None]:
bf = Buffer_Filler(dqn_guy)

In [None]:
bf.fill_buffer()

In [None]:
bf.buffer[0][13], bf.buffer[1][13], bf.buffer[2][13], bf.buffer[3][13], bf.buffer[4][13], 

## Create a mini-batch of experience

In [None]:
class replay_Dataset(Dataset):
    '''Dataset of aligned (state, action, target) rows for replay training.'''

    def __init__(self, states, actions, y):
        '''Store the three row-aligned containers.

        All three arguments must expose `.shape` and have the same number of
        rows; an AssertionError is raised otherwise.
        '''
        super().__init__()

        n_rows = states.shape[0]
        assert n_rows == actions.shape[0]
        assert n_rows == y.shape[0]

        self.size = n_rows
        self.states, self.actions, self.y = states, actions, y

    def __len__(self):
        '''Number of stored rows.'''
        return self.size

    def __getitem__(self, idx):
        '''Return the `(state, action, target)` triple at position `idx`.'''
        columns = (self.states, self.actions, self.y)
        return tuple(column[idx] for column in columns)

In [None]:
class Minibatcher(Buffer_Filler):
    '''Turns a freshly filled replay buffer into a shuffled DataLoader.

    Targets are computed with `eval_arch`, a separate copy of the agent's
    network. That copy is re-synchronised with the live network every
    `params['target_update']` datasets (default 10) so the TD targets do not
    stay pinned to the initial random weights. Set 'target_update' to 0 or
    None to keep the copy frozen.
    '''

    ## Later: 10 batches of 128

    def __init__(self, agent, params = DEFAULT_PARAMS):
        '''Size the buffer as bn * bs transitions and clone the target net.

        params: mapping with 'bn', 'bs', 'gamma', 'ns', 'na' and, optionally,
            'target_update'.
        '''
        self.bn = params['bn']
        self.bs = params['bs']
        super().__init__(agent, self.bn*self.bs, params)

        self.gamma = params['gamma']
        self.target_update = params.get('target_update', 10)
        self._datasets_since_sync = 0
        self.eval_arch = copy.deepcopy(self.agent.arch)
        self.eval_arch.eval()

    def update_target(self):
        '''Copy the agent network's current weights into the target network.'''
        self.eval_arch.load_state_dict(self.agent.arch.state_dict())
        self.eval_arch.eval()
        self._datasets_since_sync = 0

    def getX(self):
        '''Return `(states, actions)` as float32 and int64 tensors.'''
        return (torch.tensor(self.buffer[0], dtype = torch.float32),
                torch.tensor(self.buffer[1], dtype = torch.int64))

    def getY(self):
        '''Return the one-step TD target for every buffered transition.

        target = reward + gamma * max_a Q_target(next_state, a), where the
        bootstrap term is dropped for transitions flagged as done.
        '''
        __, __, rewards, dones, next_states = self.buffer
        next_states = torch.tensor(next_states, dtype = torch.float32)
        rewards = torch.tensor(rewards, dtype = torch.float32).view(-1)
        # 1.0 for non-terminal rows, 0.0 for terminal ones. Multiplying by
        # this replaces byte-mask indexing and works for a single-row buffer.
        not_done = 1.0 - torch.tensor(dones, dtype = torch.float32).view(-1)

        with torch.no_grad():
            next_values = self.eval_arch(next_states).max(1)[0]

        expected_values = rewards + self.gamma * next_values * not_done

        return expected_values

    def createDS(self):
        '''Refill the buffer and wrap it (with its targets) in a dataset.'''
        if self.target_update and self._datasets_since_sync >= self.target_update:
            self.update_target()
        self._datasets_since_sync += 1

        self.fill_buffer()
        states, actions = self.getX()
        y = self.getY()
        self.ds = replay_Dataset(states, actions, y)

    def createDL(self):
        '''Build a new dataset and a shuffled DataLoader over it.'''
        self.createDS()
        self.dl = DataLoader(self.ds, batch_size = self.bs, shuffle=True,
                            num_workers = 4)

In [None]:
mb = Minibatcher(dqn_guy)

In [None]:
mb.createDL()

In [None]:
len(mb.ds)

In [None]:
mb.ds.__getitem__(3)

## Calculate Loss

In [None]:
def loss_batch(states, actions, ys, model, opt=None):

    '''Calculates the loss for a minibatch, and (if opt is given) updates parameters.

    states: float tensor of shape (batch, n_features) fed to `model`.
    actions: int64 tensor of shape (batch, 1) holding the action taken in
        each row; used to pick that action's Q-value.
    ys: float tensor of shape (batch,) with the regression targets.
    model: network mapping states to one Q-value per action.
    opt: optional optimizer; when given, one gradient step is taken.

    Returns `(loss_value, batch_size)`.
    '''

    loss_func = nn.MSELoss()

    q_values = model(states)
    # squeeze(1) rather than squeeze() so a batch of one keeps shape (1,).
    predicted = torch.gather(q_values, 1, actions).squeeze(1)
    # Compare on whatever device the model ran on instead of forcing CUDA,
    # which fails on CPU-only machines.
    loss = loss_func(predicted, ys.to(predicted.device))

    if opt is not None:  # Update parameters
        opt.zero_grad()
        loss.backward()
        opt.step()

    return loss.item(), len(ys)

In [None]:
states, actions, ys = next(iter(mb.dl))

In [None]:
loss_batch(states, actions, ys, arch)

## Create a trainer

In [None]:
def run_avg(x, beta = 0.95, bias_correct = True):
    '''Return the exponential moving average of `x` as a NumPy array.

    beta: weight kept from the previous average at every step.
    bias_correct: if True, divide entry i by (1 - beta**(i+1)) to undo the
        bias caused by starting the average at zero.
    '''
    smoothed = np.zeros(len(x))
    acc = 0
    for pos, value in enumerate(x):
        acc = beta*acc + (1-beta)*value
        smoothed[pos] = acc
    if not bias_correct:
        return smoothed
    steps = np.arange(len(x)) + 1
    return smoothed / (1 - beta ** steps)

In [None]:
class Learner(Minibatcher):

    '''This class trains our policy.

    Each epoch gathers a fresh replay dataset, runs one pass of SGD over it,
    then plays one exploratory episode and records its reward.
    '''

    def __init__(self, agent, params = DEFAULT_PARAMS):
        '''Set up the replay machinery and empty training logs.'''
        super().__init__(agent, params = params)
        self.epoch = 0  # The number of epochs for training.
        self.rewards = []  # Reward of the episode played after each epoch
        self.losses = []  # Loss of every mini-batch seen so far
        self.opt = None  # Optimizer, created on first use and then reused
        self._opt_model = None  # Model whose parameters self.opt holds

    def train(self, epochs, lr):
        '''Trains our policy for epochs rounds with learning rate lr.'''
        for epoch in tqdm(range(epochs), position = 0):
            if (epoch) % 10 == 0:  # Update plot every 10 rounds
                fig = self.plot_reward()
                display.clear_output(wait=True)
                display.display(fig)
                # Close right away so figures do not pile up over long runs.
                plt.close(fig)
            self.train_episode(lr)
        plt.close('all')

    def _get_optimizer(self, model, lr):
        '''Return the SGD optimizer for `model`, reusing it across epochs.

        Reuse keeps the momentum buffers alive between epochs. The optimizer
        is rebuilt only if the agent's model object changed; otherwise just
        the learning rate is updated.
        '''
        if self.opt is None or self._opt_model is not model:
            self.opt = optim.SGD(model.parameters(), lr=lr, momentum=0.95,
                                 weight_decay=1e-3)
            self._opt_model = model
        else:
            for group in self.opt.param_groups:
                group['lr'] = lr
        return self.opt

    def train_episode(self, lr):
        '''Trains our policy for a single epoch and returns the episode reward.'''

        self.epoch += 1
        self.createDL()   # Creates minibatch of replays to train on

        model = self.agent.arch
        opt = self._get_optimizer(model, lr)

        # Train Neural Net
        model.train()
        for sb, ab, yb in self.dl:
            self.losses.append(loss_batch(sb, ab, yb, model, opt)[0])
        model.eval()

        # Play once; calculate and return  reward
        rew, _ = self.agent.playPol(explore = True)
        self.rewards.append(rew)
        return rew

    def plot_reward(self):
        '''Plot smoothed rewards (left) and losses (right); return the figure.'''
        f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5),
                                    sharex = False, sharey = False)
        ax2.set_yscale('linear')
        ax1.plot(run_avg(self.rewards, beta = 0.99), '-')
        ax2.plot(run_avg(self.losses, beta = 0.999), '-')
        return f

In [None]:
# Rebuild the network from scratch so training starts from untrained weights,
# then rebind the agent and create the learner around it.
arch = nn.Sequential(nn.BatchNorm1d(8, affine = False),
                    nn.Linear(8, 50), 
                    nn.LeakyReLU(inplace = True), 
                    nn.BatchNorm1d(50), 
                    nn.Linear(50, 4))     

dqn_guy = DQAgent(ENV, arch)
# Learner clones `arch` internally for its target network (see Minibatcher).
learn = Learner(dqn_guy)

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])

In [None]:
learn.train(1, LR)

In [None]:
learn.train(10, LR)

In [None]:
learn.agent.exp_frac

In [None]:
learn.train(100, LR)

In [None]:
learn.agent.exp_frac

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])

In [None]:
learn.train(100, LR)

In [None]:
learn.agent.exp_frac

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])

In [None]:
learn.train(100, LR)

In [None]:
learn.agent.exp_frac

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])

In [None]:
learn.train(100, LR)

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])

In [None]:
learn.train(200, LR)

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])

In [None]:
learn.train(300, LR)

In [None]:
learn.train(300, LR)

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])

In [None]:
learn.train(500, LR)

In [None]:
learn.train(500, LR/100)

In [None]:
np.mean([dqn_guy.playPol()[0] for i in range(100)])