In [None]:
import gym
import numpy as np
import random
import tensorflow as tf
import os
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# assigning it here presumably only affects child processes - confirm whether it
# has to be exported before the kernel is launched instead.
os.environ['PYTHONHASHSEED'] = '0'

# Single seed shared by every random number generator seeded in this cell.
seed = 51
# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(seed)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

random.seed(seed)

# Force TensorFlow to use a single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(seed)

# Create the single-threaded session on the default graph and register it as
# the session used by the Keras backend.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard

from keras.optimizers import Adam
from keras.optimizers import RMSprop
import gym
from time import time

In [None]:
from keras import backend as K


# TensorBoard log folder: set log_dir to the name of the network being trained.
# Convention for the folder name: list the number of neurons of each layer.
# For example, a network with 2 layers, the first one with 10 neurons and the
# second one with 20 neurons, logs to:
# log_dir='./Monitoring/Test_10_20'

tensorboard = TensorBoard(log_dir='./Monitoring/Test_5', histogram_freq=0, write_graph=True, write_images=True)
# Note: pass in_keras=False to use this function with raw numbers or numpy arrays for testing
def huber_loss(a, b, in_keras=True):
    """
    Huber loss with threshold 1 between a and b.
    The error a - b is penalised quadratically (error^2 / 2) while its absolute
    value is at most 1, and linearly (|error| - 1/2) beyond that.
    Parameters:
    a, b -> values to compare (Keras tensors when in_keras is True)
    in_keras -> when True the boolean mask is cast to float32 through the Keras backend
    """
    diff = a - b
    magnitude = abs(diff)
    beyond_threshold = (magnitude > 1.0)
    if in_keras:
        # Keras won't let us multiply floats by booleans, so the mask becomes a float tensor first
        beyond_threshold = K.cast(beyond_threshold, 'float32')
    linear_branch = magnitude - 1/2
    quadratic_branch = diff*diff / 2
    return beyond_threshold * linear_branch + (1-beyond_threshold) * quadratic_branch

In [None]:
def build_network(input_size, output_size, learning_rate = 0.001, compile = True, hidden_units = (5,)):
    """
    Build the feed-forward Keras network used to approximate the Q function.
    Parameters:
    input_size -> number of inputs of the first layer (size of the state vector)
    output_size -> number of outputs (one value per action)
    learning_rate -> learning rate handed to the RMSprop optimizer
    compile -> when False the model is returned without calling model.compile
    hidden_units -> neurons of each hidden ReLU layer, in order; an int is
                    accepted as a single hidden layer. The default (5,) is the
                    layout that used to be hard-coded here.
    Returns the Sequential model.
    """
    if isinstance(hidden_units, int):
        hidden_units = (hidden_units,)
    # Sequential() creates the foundation of the layers.
    model = Sequential()
    # Only the first layer added to the model declares the input size.
    pending_input = {'input_dim': input_size}
    for units in hidden_units:
        model.add(Dense(units, activation='relu', **pending_input))
        pending_input = {}
    # Output layer: one unit per action, no activation argument is passed.
    model.add(Dense(output_size, **pending_input))
    if compile:
        # Mean squared error with RMSprop. Alternatives tried previously and
        # left out: huber_loss with Adam, categorical cross-entropy, plain SGD.
        model.compile(loss='mean_squared_error', optimizer=RMSprop(lr=learning_rate))
    return model

In [None]:
# %load "../statistics.py"
def ma(ts, q):
    """
    Centered moving average of a series.
    Parameters:
    ts -> indexable sequence of numbers (list or 1-D array)
    q -> half-width of the window: each average covers the 2*q + 1 values
         ts[i-q] ... ts[i+q]
    Returns a list with one average for every index that has q neighbours on
    both sides (len(ts) - 2*q values, empty when the series is too short).
    """
    window = 2 * q + 1
    res = []
    for i in range(q, len(ts) - q):
        acc = 0
        # The window is symmetric around i, so the right edge i + q is included;
        # stopping at i + q - 1 would sum 2*q values while dividing by 2*q + 1.
        for j in range(i - q, i + q + 1):
            acc += ts[j]
        res.append(acc / window)
    return res

def accuracy(results):
    """
    Percentage of victories over all recorded outcomes.
    Parameters:
    results -> two-element sequence: results[0] is the number of defeats,
               results[1] the number of victories
    """
    defeats = results[0]
    victories = results[1]
    return victories / (defeats + victories) * 100

In [None]:
# %load "../qlearning.py"
import numpy as np
import numpy.random as rn

def updateQ(Q, state, new_state, action, reward, alpha, gamma):
    """
    Apply the Q-Learning update rule to one entry of a 2-dimensional Q matrix.
    The matrix is modified in place and also returned.
    Parameters:
    Q -> Q matrix indexed as Q[state, action]
    state -> current state (time t)
    new_state -> next state (time t+1)
    action -> action taken in the current state
    reward -> reward received
    alpha -> weight given to the new estimate
    gamma -> weight given to the best value of the next state
    """
    best_next_action = np.argmax(Q[new_state]) # greedy action at time t+1
    new_estimate = reward + gamma*Q[new_state, best_next_action]
    Q[state, action] = (1 - alpha)*Q[state, action] + alpha * new_estimate
    return Q

def updateQ_tensor(Q, state, new_state, action, reward, alpha, gamma):
    """
    Apply the Q-Learning update rule to a 3-dimensional Q matrix, where a state
    is addressed by its first two components. It is used in MountainCar-v0 environment.
    The matrix is modified in place and also returned.
    Parameters:
    Q -> Q matrix indexed as Q[state[0], state[1], action]
    state -> current state (time t)
    new_state -> next state (time t+1)
    action -> action taken in the current state
    reward -> reward received
    alpha -> weight given to the new estimate
    gamma -> weight given to the best value of the next state
    """
    row, col = state[0], state[1]
    next_row, next_col = new_state[0], new_state[1]
    best_next_action = np.argmax(Q[next_row, next_col]) # greedy action at time t+1
    new_estimate = reward + gamma*Q[next_row, next_col, best_next_action]
    Q[row, col, action] = (1 - alpha)*Q[row, col, action] + alpha * new_estimate
    return Q

def next_action1(state):
    """
    Pick the index of the largest value, breaking ties at random.
    Parameters:
    state -> numpy array with one value per action available in the current state
    """
    best_value = np.amax(state)
    all_indexes = np.arange(len(state))
    tied_indexes = all_indexes[state == best_value]
    # Shuffling the tied candidates makes the first one a uniform random pick
    rn.shuffle(tied_indexes)
    return tied_indexes[0]

def next_action2(state,i_episode):
    """
    Pick the index of the largest value after adding gaussian noise to every value.
    The noise is scaled by 1/(i_episode+1), so it shrinks as episodes go by.
    Parameters:
    state -> array with one value per action available in the current state
    i_episode -> index of the current episode
    """
    noise_scale = 1./(i_episode+1)
    noise = np.random.randn(1,len(state))*noise_scale
    return np.argmax(state + noise)

def next_action3(state,epsilon):
    """
    Epsilon-greedy choice of the next action.
    With probability (1 - epsilon) it returns the index of the largest value,
    breaking ties at random; otherwise it returns a random action index.
    Parameters:
    state -> numpy array with one value per action available in the current state
    epsilon -> probability of taking a random action
    """
    if np.random.uniform() > epsilon:
        max_value = np.amax(state)
        max_indexes = np.arange(len(state))[state == max_value]
        rn.shuffle(max_indexes)
        return max_indexes[0]
    # Draw among the actions actually available: a fixed size of 4 returned
    # invalid indexes for smaller action sets and never explored actions
    # beyond the fourth for larger ones.
    return np.argmax(np.random.uniform(0,1, size=len(state)))

def get_epsilon(k,n):
    """
    Linearly decaying epsilon: (n - k) / n, never lower than 0.01.
    Parameters:
    k -> current step (e.g. the episode index)
    n -> number of steps over which epsilon goes from 1 down to 0
    """
    remaining_fraction = (n - k) / n
    return 0.01 if remaining_fraction < 0.01 else remaining_fraction


def get_epsilon_exp(n):
    """
    Decaying epsilon computed as 1 / (n + 1), never lower than 0.01.
    Parameters:
    n -> current step (e.g. the episode index)
    """
    inverse = 1 / (n + 1)
    return 0.01 if inverse < 0.01 else inverse
    

In [None]:
import gym
import time
import random as ran
from tensorflow import set_random_seed
import tensorflow as tf
#from keras.backend import stop_gradient

def copy_weights(Q, Q_noob):
    """
    Overwrite the weights of Q_noob with the current weights of Q.
    Parameters:
    Q -> model whose weights are read (get_weights)
    Q_noob -> model whose weights are replaced (set_weights)
    """
    Q_noob.set_weights(Q.get_weights())

def replay(Q, random_pick, gamma, Q_noob):
    """
    Train Q on a batch of stored transitions, one fit call per transition.
    Parameters:
    Q -> online network: it is the one being fitted
    random_pick -> iterable of (state, action, reward, new_state, end) tuples
    gamma -> discount factor applied to the best value of the next state
    Q_noob -> target network: only used to evaluate the next state
    Returns a list with one entry per transition: the mean of the losses of
    all the fits done so far in this call.
    Relies on the module-level `tensorboard` callback.
    """
    Loss = []
    Local_loss = []
    for state, next_action, _reward, new_state, end in random_pick:
        if not end:
            # Bootstrap from the target network for non-terminal transitions
            _reward = _reward + gamma * np.amax(Q_noob.predict(new_state)[0])

        # Start from the online network's own prediction so that only the
        # action actually taken contributes to the error. Starting from the
        # target network's prediction also pulled every other output of Q
        # towards the (stale) target network on each fit.
        new_prediction = Q.predict(state)
        new_prediction[0][next_action] = _reward
        Local_loss.append(Q.fit(state, new_prediction, epochs = 1, verbose = 0, callbacks = [tensorboard]).history['loss'][0])
        Loss.append(np.mean(Local_loss))
    return Loss
           
            
def my_f(epsilon):
    """
    Geometric decay step for epsilon: multiply by 0.95 and never go below 0.01.
    Parameters:
    epsilon -> current value of epsilon
    """
    decayed = epsilon * 0.95
    return max(0.01, decayed)
    
def experiment(alpha = 0.01, gamma = 0.5, n_episodes = 5000, max_action = 100000, final_pun = 0.5, step_pun = 0.07, default_policy = False, policy = np.zeros(64), render = False):
    
    """
    Play n_episodes episodes of MountainCar-v0, either training a Q network
    with experience replay (default_policy=False) or just acting with a given
    model (default_policy=True).
    Parameters:
    alpha -> learning rate (accepted but not read in this function)
    gamma -> discount factor, forwarded to replay()
    n_episodes -> number of episodes to play
    max_action -> only compared with the index of the final step to decide
                  whether an episode counts as a defeat; it does not bound the
                  number of steps
    final_pun -> adjustment for the final reward (accepted but not read)
    step_pun -> punishment for each step (accepted but not read)
    default_policy -> when True, actions come from policy.predict and no
                      training is done
    policy -> model used when default_policy is True; the default value has no
              predict method, so a model must be passed in that case
    render -> when True, env.render() is called at every step
    Returns a dict with:
    "results" -> array [defeats, victories]
    "steps" -> index of the final step of each episode
    "scores" -> cumulative reward of each episode
    "Q", "Q_noob" -> the two networks built here
    "loss" -> the list returned by replay() for every training step
    "actions" -> the greedy actions chosen with Q
    """

    with tf.device('/cpu:0'):
        Res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory}
        Accuracy_res = [0,0]  # never read afterwards
        Scores = [] # Cumulative rewards
        Steps = [] # Steps per episode
        Loss = []
        Actions = []
        # NOTE: in this function epsilon is the probability of acting GREEDILY
        # (see the comparison below). It starts at 0, i.e. fully random, and is
        # not reset between episodes.
        epsilon = 0
        
        from gym import wrappers
        from tqdm import tqdm
        from collections import deque
        env = gym.make('MountainCar-v0')
        # Presumably lifts gym's default step limit so that an episode only
        # ends when the environment itself reports it - TODO confirm.
        env._max_episode_steps = 1000000
        # Set seeds
        env.seed(seed)
        #env = wrappers.Monitor(env, '/tmp/frozenlake-experiment-1', force=True)
        # Q is the network being trained; Q_noob is built uncompiled and only
        # receives weights through copy_weights().
        Q = build_network(env.observation_space.shape[0], env.action_space.n)
        Q_noob = build_network(env.observation_space.shape[0], env.action_space.n, compile=False)
        # Replay memory shared by all episodes; the oldest transitions are
        # dropped once 3000 are stored.
        memory = deque(maxlen = 3000)
        batch_size = 32
        for i_episode in tqdm(range(n_episodes), desc="Episode"):
            state = env.reset()
            cumulative_reward = 0

            # The networks are fed one row at a time: shape (1, 2)
            state = np.reshape(state,[1,2])
            
            t = 0
            # The loop is only left when the environment reports the end of
            # the episode; max_action does not stop it.
            while True:
            #for t in range(max_action):
                if (t % 1000) == 0:
                    print("t is:",t)
                if (render):
                    env.render()
                    #time.sleep(1)

                if (default_policy):
                    # Evaluation mode: always follow the given model greedily
                    #if np.random.uniform() < 0.95:
                    prediction = policy.predict(state)
                    next_action = np.argmax(prediction[0])
                    #else:
                    #    next_action = np.argmax(np.random.uniform(0,1, size=3))
                else:
                    #epsilon = get_epsilon(i_episode, n_episodes)
                    #epsilon = get_epsilon_exp(i_episode)
                    #epsilon = my_f(epsilon)
                    # After the first 1000 steps of an episode the greedy
                    # probability grows by 0.0002 per step up to 1, and is
                    # forced to 0.9 once the episode is longer than 10000 steps.
                    if t > 1000:
                        #epsilon = epsilon + 0.0002 if epsilon < 0.9 else 0.9
                        epsilon = epsilon + 0.0002 if epsilon < 1 else 1
                        if t > 10000:
                            epsilon = 0.9
                    
                    if np.random.uniform() < epsilon:
                        # Greedy action according to the network being trained
                        prediction = Q.predict(state)
                        next_action = np.argmax(prediction[0])
                        Actions.append(next_action)
                    else:
                        # Random action among the 3 hard-coded here
                        next_action = np.argmax(np.random.uniform(0,1, size=3))
                        #next_action = np.random.randint(0, 3)
                        
                new_state, reward, end, _ = env.step(next_action)

                # The environment reward is replaced by the distance of the
                # first state component from -0.5.
                # NOTE(review): the original note says "r in [0, 1]" - confirm
                # against the bounds of the environment.
                reward = abs(new_state[0] - (-0.5))     # r in [0, 1]
                new_state = np.reshape(new_state,[1,2])
                
                memory.append((state, next_action, reward, new_state, end))

                if end:
                    # Defeat only when the episode ended exactly on step
                    # max_action - 1, victory otherwise
                    if t == max_action -1:
                        Res[0] += 1
                    else:
                        Res[1] += 1
                        #print("ENTRATO!,", t, "steps")   (Italian: "ENTERED")

                    Steps.append(t)
                    break
                else:
                    state = new_state
                    cumulative_reward += reward
                
                # Training only happens after the first 1000 steps of the
                # current episode: the target network is refreshed every 300
                # steps and one replay batch is fitted at every step.
                # NOTE(review): replay starts at t = 1001 but the first copy
                # happens at t = 1200, so until then Q_noob still has the
                # weights it had before - confirm this is intended.
                if t > 1000 and (t % 300) == 0 and default_policy == False:
                    copy_weights(Q,Q_noob)
                if t > 1000 and default_policy == False:   
                    random_pick = ran.sample(memory, batch_size)
                    Loss.append(replay(Q, random_pick, gamma, Q_noob))
                t += 1

            # The reward of the final step was skipped by the `end` branch
            cumulative_reward += reward
            Scores.append(cumulative_reward)
        env.close()
        return {"results": np.array(Res), "steps": np.array(Steps), "scores": np.array(Scores), "Q": Q, "Q_noob": Q_noob, "loss": np.array(Loss), "actions": np.array(Actions)}
    
    # TODO: document the relationship between RMSE and gradient descent optimisation

In [None]:
# Training run: 120 episodes with discount factor 0.90, no rendering.
# alpha, final_pun and step_pun are accepted by experiment() but never read by
# it; max_action is only used there to label an episode as a defeat.
config = {"alpha": 0.8, "gamma": .90, "n_episodes": 120, "max_action": 10000, "final_pun": 0, "step_pun": 0, "render": False}
res = experiment(**config)

In [None]:
# Save the network trained by the run above. `trained_model` used to be
# referenced here before any cell assigned it (it is only assigned later, by
# load_model), which fails on a fresh kernel; bind it to the trained network.
trained_model = res["Q"]
trained_model.save('model120episodesepsilonfix.h5')

In [None]:
# Load a previously saved model to use as the evaluation policy.
# NOTE(review): this file name differs from the one written by the save cell
# ('model120episodesepsilonfix.h5') - confirm which model is meant to be evaluated.
from keras.models import load_model
trained_model = load_model('model500episodes_double_layer.h5')

In [None]:
q = 2

import matplotlib.pyplot as plt
%matplotlib inline

# Scores
#x = range(len(res["scores"])-2*q)
#plt.figure(figsize=(15,5))
#plt.plot(x, ma(res["scores"], q))
#plt.errorbar(x, res["scores"], fmt='ro', label="data", xerr=0.75, ecolor='black')
x = range(len(res["scores"]))
plt.figure(figsize=(15,5))
plt.plot(x, res["scores"])


# Steps
#x = range(len(res["steps"])-2*q)
#plt.figure(figsize=(15,5))
#plt.plot(x, ma(res["steps"],q))
x = range(len(res["steps"]))
plt.figure(figsize=(15,5))
plt.plot(x, res["steps"])
# Steps distribution
plt.figure(figsize=(15,5))
kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=40)
plt.hist(res["steps"],**kwargs)
#plt.hist(res["steps"], len(res["steps"]), density=0, facecolor='green')

# Steps distribution
plt.figure(figsize=(15,5))
kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=40)
plt.hist(res["steps"],**kwargs)
#plt.hist(res["steps"], len(res["steps"]), density=0, facecolor='green')

# Loss function
x = range(len(res["loss"]))
plt.figure(figsize=(15,5))
plt.ylim(0,50)
plt.plot(x, res["loss"])


In [None]:
# Evaluation run: with default_policy=True experiment() picks every action from
# `policy` (here the loaded trained_model) and does no training; render=True
# calls env.render() at every step.
config = {"alpha": 0.8, "gamma": .90, "n_episodes": 5, "max_action": 10000, "final_pun": 0, "step_pun": 0, "default_policy": True, "policy": trained_model, "render": True}
res2 = experiment(**config)

In [None]:
# Summary of the evaluation run, in order: percentage of episodes counted as
# victories, mean cumulative reward, and the final step index of each episode.
print(accuracy(res2["results"]))
print(np.mean(res2["scores"]))
print(res2["steps"])

In [None]:
# Second evaluation run (2 episodes, gamma 0.95); it overwrites res2.
# NOTE(review): `learnt_policy` is not assigned anywhere in the visible
# notebook, so this cell presumably fails on a fresh kernel - confirm which
# model it should refer to (it looks like a leftover from another version).
config = {"alpha": 0.8, "gamma": .95, "n_episodes": 2, "max_action": 200, "final_pun": 0, "step_pun": 0, "default_policy": True, "policy": learnt_policy, "render": True}
res2 = experiment(**config)

| Bins  | Train Mean Score    | Test Mean Score | Accuracy |
|-------|---------------------|-----------------|----------|
| 70    | -166.78             | -152.97         | 98%      |
| 80    | -162.06             | -147.19         | 100%     |
| 90    | -158.856            | -130.3          | 100%     |
| 100   | -158.567            | -169.68         | 100%     |
| 130   | -162.172            | -132.06         | 100%     |
| 150   | -169.692            | -129.28         | 100%     |
| 180   | -179.890            | -141.28         | 100%     |
| myalg | -198.66             | -244.71         | 26%      |