### Mountain Car from OpenAI Gym

Rewritten code from https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c 

Basic Deep-Q idea: 

- initialize replay memory
- initialize the action-value function $Q$ with random weights 
- observe initial state $s$ 
- select an action $a$: with probability $\varepsilon$ pick a random action, otherwise $a = \arg\max_{a^{\prime}} Q(s, a^{\prime})$
- carry out $a$
- observe reward $r$ and new state $s^{\prime}$
- store experience <$s,a,r,s^{\prime}$> into replay memory
    
- sample random transitions <$ss,aa,rr,ss^{\prime}$> from replay memory
- calculate the target for each minibatch transition: if $ss^{\prime}$ is a terminal state then $tt = rr$, otherwise $tt = rr + \gamma \max_{a^{\prime}} Q(ss^{\prime}, a^{\prime})$
- train the Q function using $(tt - Q(ss, aa))^{2}$ as loss 


### Init and Settings 

In [2]:
import gym
import numpy as np
import keras 
from keras.layers import Dense, Flatten
from keras import Sequential 
from keras.optimizers import Adam
from collections import deque
import random
from tqdm import tqdm

Using TensorFlow backend.


In [3]:
# OpenAI gym mountain car environment
env = gym.make("MountainCar-v0")
# Length of a single observation vector; the network input is built from
# two stacked observations, i.e. 2 * n_in values.
n_in = env.observation_space.shape[0]
# Number of discrete actions; the network emits one Q-value per action.
n_out = env.action_space.n

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [4]:
learning_rate = 0.005       # learning rate of the Adam optimizer used to fit the Q network
memory_len = 2000           # replay memory capacity (maxlen of the deque)
max_iter = 10000            # number of training episodes (1 episode = 200 pushes or car successfully got to the top)
epsilon = 1.0               # probability of taking a random action (exploration); decayed on every step
epsilon_min = 0.05          # lower bound that epsilon is clamped to while decaying
epsilon_decay = 0.998       # multiplicative decay applied to epsilon on every environment step
batch_size = 32             # number of transitions sampled from replay memory per training update
gamma = 0.85                # time discount factor applied to the bootstrapped next-state value
tau = 0.125                 # soft-update factor: target <- tau * model + (1 - tau) * target

In [None]:
def make_model():
    """Build and compile the Q-network.

    The network maps a state made of two stacked observations
    (``2 * n_in`` values) to ``n_out`` Q-values, one per discrete action.
    It is compiled with mean-squared-error loss and an Adam optimizer
    using the module-level ``learning_rate``.

    Returns:
        A compiled ``keras.Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(32, activation="relu", input_shape=(2 * n_in,)))
    model.add(Dense(16, activation="relu"))
    model.add(Dense(8, activation="relu"))
    # The Q-value head must be linear: Q-values are unbounded and, with the
    # negative per-step reward of this task, negative. A ReLU here would
    # clamp every prediction to >= 0 and make the targets unreachable.
    model.add(Dense(n_out, activation="linear"))
    model.compile(loss="mse", optimizer=Adam(lr=learning_rate))
    return model

In [None]:
# Online Q-network: used for action selection and fitted on replay batches.
model = make_model()
# Second network with the same architecture, built by a separate
# make_model() call (its weights are not copied from `model` here); during
# training its weights are blended towards `model` with factor tau.
target_model = make_model()

In [None]:
# Replay memory of [s, a, reward, s_new, done] transitions; being a bounded
# deque, it discards the oldest entry once memory_len items are stored.
memory = deque(maxlen = memory_len)

### Agent Training 

In [None]:
# Train the agent with epsilon-greedy exploration, experience replay and a
# softly-updated target network. One outer iteration is one episode.
for i in tqdm(range(max_iter)):
    total_reward = 0
    done = False
    obs = env.reset()
    # A state is the two most recent observations stacked; right after a
    # reset there is only one observation, so it is duplicated.
    s = np.concatenate([obs, obs]).reshape(1, 2 * n_in)

    while not done:
        # Exploration rate decays on every environment step, down to epsilon_min.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Select action (epsilon-greedy), execute it and store the result in memory.
        if np.random.uniform() < epsilon:
            a = env.action_space.sample()
        else:
            a = np.argmax(model.predict(s)[0])

        obs, reward, done, _ = env.step(a)
        # Drop the older observation from the state and append the new one.
        s_new = np.concatenate([s[0], obs])[n_in:].reshape(1, 2 * n_in)
        memory.append([s, a, reward, s_new, done])
        total_reward += reward
        # Advance the agent's state; otherwise every action and every stored
        # transition would be based on the reset state.
        s = s_new

        # Update Q from replay once the buffer can supply a full batch.
        if len(memory) >= batch_size:
            batch = random.sample(memory, batch_size)
            # Index names here are distinct from the episode counter `i`.
            states, states_new = [
                np.concatenate([b[k] for b in batch]) for k in (0, 3)
            ]
            actions, rewards, dones = [
                np.array([b[k] for b in batch]) for k in (1, 2, 4)
            ]

            # Bootstrap value max_a' Q_target(s', a') from the target network.
            max_Q = np.max(target_model.predict(states_new), axis=1)

            # Start from the current predictions so that only the Q-value of
            # the action actually taken contributes to the MSE loss.
            targets = model.predict(states)
            # (1 - dones) removes the bootstrap term for terminal transitions.
            # NOTE(review): gym's episode time limit presumably also sets
            # `done`, so timed-out episodes are treated as terminal - confirm
            # this is intended.
            targets[np.arange(batch_size), actions] = (
                rewards + (1 - dones) * gamma * max_Q
            )
            model.fit(states, targets, verbose=0)

            # Soft update: move the target network a fraction tau towards
            # the online network.
            weights = model.get_weights()
            target_weights = target_model.get_weights()
            target_model.set_weights([
                w * tau + tw * (1 - tau)
                for w, tw in zip(weights, target_weights)
            ])

    # Report only episodes that ended before exhausting the 200-step budget
    # (each step costs -1), i.e. the car reached the top.
    if total_reward > -200:
        print(total_reward)

  0%|          | 16/100000 [00:30<52:22:04,  1.89s/it]