In [None]:
from keras import backend as K
# Sanity check run before any model is built: as the last expression of the
# cell, the return value is displayed in the cell output.
# NOTE(review): `_get_available_gpus` is a private (underscore-prefixed) helper
# of the Keras TensorFlow backend; presumably it lists the GPUs the backend can
# see - confirm it exists in the installed Keras version.
K.tensorflow_backend._get_available_gpus()

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import gym
import time

In [None]:
def build_network(input_size,output_size,learning_rate = 0.001):
    """
    Build and compile the fully connected network used as Q-function approximator.

    Architecture: two hidden ReLU layers of 24 units each, followed by a linear
    output layer with one unit per action.

    Parameters:
    input_size -> number of input features (used as input_dim of the first layer)
    output_size -> number of units of the linear output layer
    learning_rate -> learning rate handed to the Adam optimizer

    Returns the compiled Sequential model (MSE loss, Adam optimizer).
    """
    layers = [
        # First hidden layer; it also declares the input dimension.
        Dense(24, input_dim=input_size, activation='relu'),
        # Second hidden layer.
        Dense(24, activation='relu'),
        # Linear head: raw (unbounded) value estimates, one per output unit.
        Dense(output_size, activation='linear'),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer=Adam(lr=learning_rate))
    return network

In [None]:
# %load "../statistics.py"
def ma(ts, q):
    """
    Centered moving average of a series.

    Each output point is the mean of the 2*q + 1 samples centered on index i,
    i.e. ts[i - q] ... ts[i + q] (both ends included). Points closer than q to
    either border are skipped, so the result has len(ts) - 2*q elements
    (an empty list when the series is shorter than one full window).

    Parameters:
    ts -> sequence of numbers (list or 1-D array)
    q -> half-width of the window (non-negative integer)
    """
    window = 2 * q + 1
    # The slice end is i + q + 1 so that the sample at i + q is part of the
    # window; the divisor (2*q + 1) then matches the number of summed samples.
    return [sum(ts[i - q:i + q + 1]) / window for i in range(q, len(ts) - q)]

def accuracy(results):
    """
    Percentage of victories over all recorded outcomes.

    Parameters:
    results -> indexable pair where [0] is the number of defeats and
               [1] is the number of victories
    """
    defeats, victories = results[0], results[1]
    return victories / (defeats + victories) * 100

In [None]:
# %load "../qlearning.py"
import numpy as np
import numpy.random as rn

def updateQ(Q, state, new_state, action, reward, alpha, gamma):
    """
    Apply one Q-Learning update to a 2-dimensional Q table, in place.

    Parameters:
    Q -> Q matrix indexed as Q[state, action]
    state -> state at time t
    new_state -> state at time t+1
    action -> action taken at time t
    reward -> reward received for the transition
    alpha -> learning rate
    gamma -> discount factor

    Returns the same Q object that was passed in, after the update.
    """
    # Greedy action at time t+1, used to bootstrap the target.
    best_next = np.argmax(Q[new_state])
    td_target = reward + gamma*Q[new_state, best_next]
    Q[state, action] = (1 - alpha)*Q[state, action] + alpha * td_target
    return Q

def updateQ_tensor(Q, state, new_state, action, reward, alpha, gamma):
    """
    Apply one Q-Learning update to a 3-dimensional Q table, in place.
    States are pairs of indices, so the table is addressed as
    Q[state[0], state[1], action]. It is used in MountainCar-v0 environment.

    Parameters:
    Q -> Q tensor
    state -> pair of indices for the state at time t
    new_state -> pair of indices for the state at time t+1
    action -> action taken at time t
    reward -> reward received for the transition
    alpha -> learning rate
    gamma -> discount factor

    Returns the same Q object that was passed in, after the update.
    """
    row, col = state[0], state[1]
    next_row, next_col = new_state[0], new_state[1]
    # Greedy action at time t+1, used to bootstrap the target.
    best_next = np.argmax(Q[next_row, next_col])
    td_target = reward + gamma*Q[next_row, next_col, best_next]
    Q[row, col, action] = (1 - alpha)*Q[row, col, action] + alpha * td_target
    return Q

def next_action1(state):
    """
    Greedy action selection with random tie-breaking.

    Parameters:
    state -> 1-D array with the value of every action available in the
             current state.

    Returns the index of a maximal entry; when several entries share the
    maximum, one of them is picked at random.
    """
    best_value = np.max(state)
    candidates = np.flatnonzero(state == best_value)
    # Shuffle so that ties are not always resolved in favour of the lowest index.
    rn.shuffle(candidates)
    return candidates[0]

def next_action2(state,i_episode):
    """
    Greedy action selection on noise-perturbed action values.

    Standard normal noise, scaled by 1/(i_episode + 1), is added to the action
    values before taking the argmax, so the perturbation shrinks as the
    episode index grows.

    Parameters:
    state -> 1-D array with the value of every action available in the
             current state.
    i_episode -> index of the current episode
    """
    noise = np.random.randn(1,len(state))
    scale = 1./(i_episode+1)
    return np.argmax(state + noise*scale)

def next_action3(state,epsilon):
    """
    Epsilon-greedy action selection.

    A uniform number in [0, 1) is drawn: when it is greater than epsilon the
    best action is returned (ties broken at random), otherwise a random action
    is returned.

    Parameters:
    state -> 1-D array with the value of every action available in the
             current state.
    epsilon -> exploration threshold; larger values make random actions
               more likely.
    """
    if np.random.uniform() > epsilon:
        max_value = np.amax(state)
        max_indexes = np.arange(len(state))[state == max_value]
        # Shuffle so that ties are not always resolved in favour of the lowest index.
        rn.shuffle(max_indexes)
        return max_indexes[0]
    # Explore: draw among as many actions as the state actually offers, so the
    # returned index is always valid (the count used to be hard-coded to 4).
    return np.argmax(np.random.uniform(0,1, size=len(state)))

def get_epsilon(k,n):
    """
    Linearly decaying exploration rate with a floor of 0.01.

    Parameters:
    k -> current step (e.g. the episode index)
    n -> total number of steps

    Returns (n - k) / n, or 0.01 when that value is below 0.01.
    """
    remaining_fraction = (n - k) / n
    return 0.01 if remaining_fraction < 0.01 else remaining_fraction


def get_epsilon_exp(n):
    """
    Exploration rate decaying as 1 / (n + 1), with a floor of 0.01.

    Parameters:
    n -> current step (e.g. the episode index)

    Returns 1 / (n + 1), or 0.01 when that value is below 0.01.
    """
    inverse = 1 / (n + 1)
    return 0.01 if inverse < 0.01 else inverse
    

In [None]:
import gym
import time
import random as ran

def _replay_minibatch(Q, memory, batch_size, gamma, n_features):
    """
    Fit Q on a random minibatch of stored transitions, one sample per fit call.

    Parameters:
    Q -> Keras model being trained
    memory -> sequence of (state, action, reward, next_state, end) tuples
    batch_size -> number of transitions to sample
    gamma -> discount factor
    n_features -> size of a single observation
    """
    if len(memory) < batch_size:
        return
    # Dedicated names are used for the sampled transition so that nothing from
    # the caller's running episode (current state, last reward, ...) is touched.
    for s, a, r, s_next, done in ran.sample(memory, batch_size):
        s_input = np.reshape(s, [1, n_features])
        if done:
            target = r
        else:
            # Bootstrap from the successor stored WITH this transition.
            target = r + gamma * np.amax(Q.predict(np.reshape(s_next, [1, n_features])))

        # Only the output of the action actually taken is moved towards the
        # target; the other outputs keep their current prediction.
        q_values = Q.predict(s_input)
        q_values[:, a] = target
        Q.fit(s_input, q_values, epochs = 1, verbose = 0)


def experiment(alpha = 0.01, gamma = 0.5, n_episodes = 5000, max_action = 100000, final_pun = 0.5, step_pun = 0.07, default_policy = False, policy = np.zeros(64), render = False):
    """
    Execute an experiment on MountainCar-v0 given a configuration.

    A fresh network is built with build_network and trained with experience
    replay: every 32 steps of an episode, 32 transitions are sampled from a
    memory holding the last 200 transitions.

    Parameters:
    alpha -> learning rate; accepted for interface compatibility but not used
             (the network is built with build_network's default learning rate)
    gamma -> discount factor
    n_episodes -> number of completed/failed plays
    max_action -> maximum number of actions per episode
    final_pun -> amount subtracted from the final reward when that reward is -1
    step_pun -> punishment subtracted from the score at each non-final step
    default_policy -> when True, actions are chosen greedily from `policy`
                      instead of epsilon-greedily from the network being trained
    policy -> object exposing predict(); only used when default_policy is True
    render -> when True, the environment is rendered at every step

    Returns a dict with:
    "results" -> array [losses, victories]; an episode ending on its last
                 allowed step counts as a loss
    "steps" -> zero-based index of the final step of each finished episode
    "scores" -> cumulative (adjusted) reward of each episode
    "Q" -> the network trained during this call
    """
    from tqdm import tqdm
    from collections import deque

    batch_size = 32 # replay period (in steps) and number of sampled transitions

    Res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory}
    Scores = [] # Cumulative rewards
    Steps = [] # Steps per episode

    env = gym.make('MountainCar-v0')
    try:
        env._max_episode_steps = max_action
        # Set seeds (the `random` module drives the replay sampling, so it is
        # seeded as well to make runs repeatable)
        np.random.seed(88)
        ran.seed(88)
        env.seed(88)

        n_features = env.observation_space.shape[0]
        n_actions = env.action_space.n

        Q = build_network(n_features, n_actions)
        memory = deque(maxlen = 200)
        for i_episode in tqdm(range(n_episodes), desc="Episode"):
            state = env.reset()
            cumulative_reward = 0

            for t in range(max_action):
                if render:
                    env.render()

                observation = np.reshape(state, [1, n_features])
                if default_policy:
                    next_action = np.argmax(policy.predict(observation))
                else:
                    epsilon = get_epsilon(i_episode, n_episodes)
                    if np.random.uniform() > epsilon:
                        next_action = np.argmax(Q.predict(observation))
                    else:
                        # Random action among the ones the environment offers.
                        next_action = np.argmax(np.random.uniform(0,1, size=n_actions))

                new_state, reward, end, info = env.step(next_action)
                memory.append((state, next_action, reward, new_state, end))

                if end:
                    # Ending on the very last allowed step is a defeat.
                    if t == max_action - 1:
                        Res[0] += 1
                    else:
                        Res[1] += 1

                    if reward == -1:
                        reward = reward - final_pun

                    Steps.append(t)
                    break

                state = new_state
                cumulative_reward += reward - step_pun

                if (t + 1) % batch_size == 0:
                    _replay_minibatch(Q, memory, batch_size, gamma, n_features)

            # The final reward (possibly adjusted by final_pun) closes the score.
            cumulative_reward += reward
            Scores.append(cumulative_reward)
    finally:
        # Release the environment (and its render window) even on failure.
        env.close()
    return {"results": np.array(Res), "steps": np.array(Steps), "scores": np.array(Scores), "Q": Q}

In [None]:
# Training run: 1000 episodes of at most 250 actions, no reward shaping.
config = {
    "alpha": 0.8,
    "gamma": 0.95,
    "n_episodes": 1000,
    "max_action": 250,
    "final_pun": 0,
    "step_pun": 0,
    "render": False,
}
res = experiment(**config)

In [None]:
# Half-width of the moving-average window passed to ma().
q = 20

import matplotlib.pyplot as plt
%matplotlib inline

# Scores: smoothed cumulative reward per episode.
# ma() yields len(ts) - 2*q points, so the x axis is shortened accordingly.
x = range(len(res["scores"])-2*q)
plt.figure(figsize=(15,5))
plt.plot(x, ma(res["scores"], q))
#plt.errorbar(x, res["scores"], fmt='ro', label="data", xerr=0.75, ecolor='black')

# Steps: smoothed index of the final step of each episode (same x-axis trimming).
x = range(len(res["steps"])-2*q)
plt.figure(figsize=(15,5))
plt.plot(x, ma(res["steps"],q))

# Steps distribution: normalized histogram (density=True) over 40 bins.
plt.figure(figsize=(15,5))
kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=40)
plt.hist(res["steps"],**kwargs)
#plt.hist(res["steps"], len(res["steps"]), density=0, facecolor='green')

In [None]:
# Evaluation run: 3 rendered episodes driven greedily by the network trained above.
config = {
    "alpha": 0.8,
    "gamma": 0.95,
    "n_episodes": 3,
    "max_action": 500,
    "final_pun": 0,
    "step_pun": 0,
    "default_policy": True,
    "policy": res["Q"],
    "render": True,
}
res2 = experiment(**config)

In [None]:
# First line: victory percentage of the evaluation run; second line: its mean score.
print(accuracy(res2["results"]), np.mean(res2["scores"]), sep="\n")

In [None]:
# `learnt_policy` is not created by any earlier cell of this notebook, so on a
# fresh kernel this cell used to fail with a NameError. When the name is
# missing, fall back to the network trained above; an already defined
# `learnt_policy` is left untouched.
try:
    learnt_policy
except NameError:
    learnt_policy = res["Q"]

# NOTE: this rebinds `res`, replacing the training results with the results of
# this short evaluation run.
config = {"alpha": 0.8, "gamma": .95, "n_episodes": 2, "max_action": 200, "final_pun": 0, "step_pun": 0, "default_policy": True, "policy": learnt_policy, "render": True}
res = experiment(**config)

| Bins  | Train Mean Score    | Test Mean Score | Accuracy |
|-------|---------------------|-----------------|----------|
| 70    | -166.78             | -152.97         | 98%      |
| 80    | -162.06             | -147.19         | 100%     |
| 90    | -158.856            | -130.3          | 100%     |
| 100   | -158.567            | -169.68         | 100%     |
| 130   | -162.172            | -132.06         | 100%     |
| 150   | -169.692            | -129.28         | 100%     |
| 180   | -179.890            | -141.28         | 100%     |
| myalg | -198.66             | -244.71         | 26%      |