In [None]:
import numpy as np
import gym
import keras
from keras.initializers import VarianceScaling
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import Adam, RMSprop
import matplotlib.pyplot as plt
from collections import deque
import itertools
from sklearn.preprocessing import StandardScaler
% matplotlib inline

In [None]:
def build_model(n_inputs, n_outputs, n_hidden=48, lr=0.001):
    """Build and compile the fully connected network used as Q-function.

    The network has two ReLU hidden layers of 256 units each (He-uniform
    initialised) followed by a linear output layer, and is compiled with
    RMSprop (gradient values clipped to 1.0) and a mean-squared-error loss.

    Args:
        n_inputs: Size of the input vector.
        n_outputs: Number of linear output units.
        n_hidden: Accepted but not used; the hidden layers are fixed at
            256 units regardless of this value.
        lr: Learning rate passed to RMSprop.

    Returns:
        The compiled Sequential model.
    """
    hidden_units = 256

    net = Sequential()
    net.add(Dense(hidden_units, input_shape=(n_inputs,), activation="relu",
                  kernel_initializer='he_uniform'))
    net.add(Dense(hidden_units, activation="relu", kernel_initializer='he_uniform'))
    net.add(Dense(n_outputs, activation="linear"))

    optimizer = RMSprop(lr=lr, clipvalue=1.0)
    net.compile(optimizer=optimizer, loss="mse")
    return net

In [None]:
def transfer_weights(model, learner, smooth=False, tau=0.125):
    """Copy (or blend) the weights of `learner` into `model`.

    Args:
        model: Network whose weights are overwritten; must expose
            `get_weights()` / `set_weights()`.
        learner: Network the weights are taken from; must expose
            `get_weights()`.
        smooth: When False the learner weights are copied verbatim. When True
            each weight tensor becomes
            `tau * learner_weight + (1 - tau) * model_weight`.
        tau: Blending factor used when `smooth` is True.

    Returns:
        `model`, after its weights have been updated in place.
    """
    if smooth:
        current = model.get_weights()
        incoming = learner.get_weights()
        weights = [lw * tau + w * (1 - tau) for w, lw in zip(current, incoming)]
    else:
        weights = learner.get_weights()
    model.set_weights(weights)
    return model

In [None]:
def report_results(results, episode, episodes, solved, epsilon, rolling, prev_max, step_count):
    """Print a one-line progress summary for the training loop.

    Args:
        results: Sequence of per-episode records; the largest value found
            anywhere in it is reported as "global max" (-1 when empty).
        episode: Index of the episode about to start.
        episodes: Total number of episodes requested.
        solved: Accepted for interface compatibility; not used in the output.
        epsilon: Current exploration rate.
        rolling: Recent per-episode maxima; their mean is reported as
            "rolling average" (-1 when empty).
        prev_max: Maximum reached in the previous episode.
        step_count: Step counter of the previous episode.
    """
    best = np.max(results) if len(results) > 0 else -1
    window = rolling if len(rolling) > 0 else [-1]
    roll = np.mean(window)
    # Round only for display; three decimals keep the line readable.
    prev_max, best, roll, epsilon = (round(x, 3) for x in (prev_max, best, roll, epsilon))
    print("episodes: {}/{} - prev steps: {} - prev max: {} - global max: {} - rolling average: {} - epsilon: {}".format(
        episode, episodes, step_count, prev_max, best, roll, epsilon))

In [None]:
def fit_learner(model, replay_buffer, gamma=0.95, batch_size=16):
    """Run one Q-learning update on a random minibatch from the replay buffer.

    Each buffer row is laid out as
    `[state..., action, reward, done, next_state...]`; the state width is
    inferred from the row length, so the 7-column rows produced by `play`
    give a state width of 2.

    For every sampled transition the regression target equals the network's
    current prediction for the state, except for the taken action, whose
    value is replaced by `reward` (terminal transition) or
    `reward + gamma * max(Q(next_state))` (otherwise).

    Args:
        model: Network to train; must expose `predict(x)` and `fit(x, y, ...)`.
        replay_buffer: Indexable sequence (list or deque) of transition rows.
        gamma: Discount factor applied to the best next-state value.
        batch_size: Number of transitions drawn without replacement. Nothing
            is trained until the buffer holds at least this many rows.

    Returns:
        `model` (trained in place, or untouched when the buffer is too small).
    """
    n_priority = 2  # how many terminal transitions get replayed a second time

    if len(replay_buffer) < batch_size:
        return model

    # Gather only the sampled rows instead of converting the whole buffer to
    # an array on every call.
    picks = np.random.choice(len(replay_buffer), batch_size, replace=False)
    batch = np.array([replay_buffer[int(i)] for i in picks], dtype=float)

    state_dim = (batch.shape[1] - 3) // 2
    done_col = state_dim + 2

    # Terminal transitions are emphasised: when the batch contains enough of
    # them, a random `n_priority` of them are appended once more.
    terminal_rows = batch[batch[:, done_col].astype(bool)]
    if len(terminal_rows) >= n_priority:
        chosen = np.random.choice(len(terminal_rows), n_priority, replace=False)
        batch = np.concatenate([batch, terminal_rows[chosen]])

    states = batch[:, :state_dim]
    actions = batch[:, state_dim].astype(int)
    rewards = batch[:, state_dim + 1]
    dones = batch[:, done_col].astype(bool)
    next_states = batch[:, done_col + 1:]

    # Two batched forward passes replace two passes per transition.
    targets = np.array(model.predict(states, verbose=0), dtype=float)
    best_next = np.max(model.predict(next_states, verbose=0), axis=1)
    returns = np.where(dones, rewards, rewards + gamma * best_next)
    targets[np.arange(len(batch)), actions] = returns

    model.fit(states, targets, epochs=1, verbose=0)

    return model

In [None]:
def play(model, learner, episodes=500, steps=200, epsilon=1.0, gamma=0.99, render=False):
    """Train on MountainCar-v0 with epsilon-greedy exploration and replay.

    `model` chooses the greedy actions and is refreshed from `learner` after
    every episode; `learner` is trained after every environment step. Only
    two of the environment's actions are used: network output 0 maps to
    environment action 0 and output 1 maps to environment action 2.

    The code relies on the classic gym interface in which `env.reset()`
    returns the observation and `env.step()` returns a 4-tuple.

    Args:
        model: Acting network (2 inputs, 2 outputs).
        learner: Network trained by `fit_learner`.
        episodes: Maximum number of episodes to run.
        steps: Maximum number of steps per episode.
        epsilon: Initial exploration rate; reduced by 0.005 per episode down
            to a floor of 0.1.
        gamma: Discount factor forwarded to `fit_learner`.
        render: Whether to render the environment at every step.

    Returns:
        `(model, learner, results)` where `results` holds one
        `(highest_position_so_far, episode_max_position)` tuple per episode.
        Training stops early once 10 episodes have been solved.
    """
    # An episode counts as solved when the environment reports `done` before
    # this step index, i.e. before the 200-step cut-off ends it.
    last_step_index = 199

    env = gym.make("MountainCar-v0")
    results = []
    rolling = deque(maxlen=8)
    replay_buffer = deque(maxlen=100000)
    highest, episode_max, step_count, n_solved = -1, -1, 0, 0

    try:
        for episode in range(episodes):
            state = env.reset()
            report_results(results, episode, episodes, n_solved, epsilon, rolling, episode_max, step_count)
            episode_memory = np.zeros((steps, 7))
            episode_max = -1
            done = False
            step_count = 0

            for step in range(steps):
                # Track the step index every iteration so the value is never
                # left over from an earlier episode.
                step_count = step
                if render:
                    env.render()

                if np.random.random() < epsilon:
                    action = np.random.randint(2)
                else:
                    action = np.argmax(model.predict(state.reshape(-1, 2)).flatten())
                next_state, reward, done, _ = env.step(action * 2)  # 0 for 0, 2 for 1

                # Row layout: [state(2), action, reward, done, next_state(2)].
                episode_memory[step, :2] = state
                episode_memory[step, 2] = action
                episode_memory[step, 3] = reward
                episode_memory[step, 4] = done
                episode_memory[step, 5:] = next_state

                replay_buffer.append(episode_memory[step])
                state = next_state

                episode_max = np.max([episode_max, state[0]])
                if state[0] > highest:
                    highest = state[0]

                learner = fit_learner(learner, replay_buffer, gamma)

                if done:
                    break

            rolling.append(episode_max)
            results.append((highest, episode_max))

            model = transfer_weights(model, learner)
            epsilon = np.max([epsilon - 0.005, 0.1])

            # Require `done` so that an episode cut short by `steps` is not
            # mistaken for a solved one.
            if done and step_count < last_step_index:
                n_solved += 1
                print("solved ({}).".format(n_solved))
                if n_solved > 9:
                    break
    finally:
        # Release the environment (and any render window) on every exit path.
        env.close()

    return model, learner, results

In [None]:
# Two identically shaped networks (2 inputs, 2 outputs): `model` acts in the
# environment, `learner` receives the gradient updates.
model, learner = build_model(2, 2), build_model(2, 2)
all_results = list()
model, learner, results = play(
    model,
    learner,
    episodes=250,
    epsilon=0.99,
    render=False,
)

In [None]:
# Rebuild the history from `results` rather than appending to it, so that
# re-running this cell does not duplicate episodes in the plot.
all_results = list(results)
global_max = [r[0] for r in all_results]
episode_max = [r[1] for r in all_results]

# Trailing mean over the last `window` episodes. It always has the same length
# as the other curves, including for runs shorter than the window.
window = 5
rolling = [
    np.mean(episode_max[max(0, i - window + 1):i + 1])
    for i in range(len(episode_max))
]

episode_idx = range(len(global_max))
plt.rcParams["figure.figsize"] = (12, 9)
fig, ax = plt.subplots()
ax.plot(episode_idx, global_max, label="highest position so far")
ax.plot(episode_idx, episode_max, label="episode max position")
ax.plot(episode_idx, rolling, label="rolling mean of episode max ({} episodes)".format(window))
ax.set_xlabel("episode")
ax.set_ylabel("car position")
ax.set_title("MountainCar-v0 training progress")
ax.legend()
plt.show()

In [None]:
# Run the trained `model` greedily and stop at the first episode that ends
# before the 200-step cut-off.
env = gym.make("MountainCar-v0")
s = env.reset()
sm = 0
d = False
step = 0  # steps taken in the current episode
try:
    for i in range(10000):
        # Network output 0/1 maps to environment action 0/2.
        a = np.argmax(model.predict(s.reshape(-1, 2))) * 2
        s, r, d, _ = env.step(a)
        env.render()
        # Print every new best position beyond 0.3.
        if s[0] > 0.3 and s[0] > sm:
            sm = s[0]
            print(sm)
        step += 1
        if d:
            # Same criterion as `play`: the episode ended in fewer than 200 steps.
            if step < 200:
                print("done in {} steps".format(step))
                break
            s = env.reset()
            step = 0  # restart the per-episode counter for the next attempt
finally:
    # Release the environment (and its render window) on every exit path.
    env.close()