In [1]:
import tensorflow as tf
import numpy as np
import random
from constants import TASK_LIST
from constants import INITIAL_ALPHA
from constants import INITIAL_GAMMA
from constants import EPSILON
from constants import NUM_TRA_EPISODES
from scene_loader import THORDiscreteEnvironment as Environment
from RLalgs.utils import epsilon_greedy


def QLearning(env, num_episodes, gamma, lr, e):
    """
    Implement the Q-learning algorithm following the epsilon-greedy exploration.

    Inputs:
    env: THORDiscreteEnvironment
    num_episodes: int
            Number of episodes of training
    gamma: float
            Discount factor.
    lr: float
            Learning rate.
    e: float
            Epsilon value used in the epsilon-greedy method.

    Outputs:
    Q: numpy.ndarray
    """

    Q = np.zeros((env.n_locations, env.nA))
    
    

    for i in range(num_episodes):
        env.reset()
        current_state = env.current_state_id
        episode_reward = 0
        while env.terminal==False:
            action = epsilon_greedy(Q[current_state], e)
            S = env.step(action)
            if env.terminal == True:
                R = 10
            elif env.collided == True:
                R = -0.1
            else: R = -0.01
            episode_reward += R
            Q[current_state, action] = Q[current_state, action] + lr * (R + gamma * np.max(Q[S]) - Q[current_state, action])
            current_state = env.current_state_id
        print ("episode reward: ", episode_reward)

    

    return Q






RLalgs is a package containing Reinforcement Learning algorithms Epsilon-Greedy, Policy Iteration, Value Iteration, Q-Learning, and SARSA.


In [2]:
# #Q Learning for realizing the navigation
# if __name__ == '__main__':

list_of_tasks = TASK_LIST
scene_scopes = list_of_tasks.keys()

initial_learning_rate = INITIAL_ALPHA
gamma = INITIAL_GAMMA

branches = []
for scene in scene_scopes:
    for task in list_of_tasks[scene]:
        branches.append((scene, task))

scene, task = branches[0]


env = Environment({
    'scene_name': scene,
    'terminal_state_id': int(task)
})
env.reset()

Q = QLearning(env, NUM_TRA_EPISODES, gamma, initial_learning_rate, EPSILON)
print (Q)



episode reward:  3.7000000000000535
episode reward:  0.8600000000001309
episode reward:  -15.030000000001099
episode reward:  -3.989999999999954
episode reward:  9.709999999999999
episode reward:  -10.23999999999992
episode reward:  -318.329999999999
episode reward:  -0.7199999999998461
episode reward:  -2.7899999999997913
episode reward:  0.3200000000001051
episode reward:  -2.7999999999997893
episode reward:  1.0600000000000414
episode reward:  1.8700000000001147
episode reward:  -5.949999999999843
episode reward:  9.25
episode reward:  -7.929999999999829
episode reward:  5.889999999999998
episode reward:  -412.0700000000035
episode reward:  8.9
episode reward:  -33.29999999999983
episode reward:  -21.27000000000003
episode reward:  9.87
episode reward:  9.24
episode reward:  2.9300000000000805
episode reward:  -21.54000000000144
episode reward:  4.590000000000033
episode reward:  -12.640000000000612
episode reward:  8.7
episode reward:  9.41
episode reward:  -3.9399999999998236
epis

episode reward:  -2.7899999999998837
episode reward:  -4.2699999999998575
episode reward:  9.209999999999999
episode reward:  3.3900000000000663
episode reward:  8.86
episode reward:  9.91
episode reward:  5.080000000000001
episode reward:  -22.23000000000036
episode reward:  9.34
episode reward:  -17.820000000001254
episode reward:  -6.879999999999843
episode reward:  8.649999999999999
episode reward:  8.75
episode reward:  -2.5699999999998546
episode reward:  8.669999999999998
episode reward:  3.1700000000000275
episode reward:  1.0700000000000855
episode reward:  -24.3000000000006
episode reward:  8.29
episode reward:  -10.129999999999956
episode reward:  -28.99000000000205
episode reward:  6.0699999999999985
episode reward:  -0.1399999999999384
episode reward:  7.670000000000005
episode reward:  8.129999999999999
episode reward:  0.21000000000009145
episode reward:  5.730000000000047
episode reward:  -12.58000000000056
episode reward:  6.610000000000028
episode reward:  9.98
episod

episode reward:  -14.670000000001213
episode reward:  8.6
episode reward:  -7.449999999999829
episode reward:  6.690000000000024
episode reward:  7.8000000000000025
episode reward:  -20.710000000000704
episode reward:  -12.640000000000551
episode reward:  4.150000000000022
episode reward:  -0.4899999999999345
episode reward:  -31.49999999999963
episode reward:  9.58
episode reward:  -17.640000000001482
episode reward:  -14.130000000000294
episode reward:  -11.510000000000502
episode reward:  -13.240000000000478
episode reward:  3.640000000000085
episode reward:  2.8900000000000086
episode reward:  -3.109999999999806
episode reward:  -10.459999999999823
episode reward:  9.059999999999999
episode reward:  1.760000000000117
episode reward:  4.570000000000004
episode reward:  10
episode reward:  4.610000000000005
episode reward:  -19.81000000000052
episode reward:  6.510000000000002
episode reward:  1.7700000000001221
episode reward:  9.39
episode reward:  3.8100000000000165
episode reward

episode reward:  -9.049999999999766
episode reward:  -0.6199999999998269
episode reward:  7.369999999999999
episode reward:  -15.740000000000016
episode reward:  6.3400000000000345
episode reward:  9.99
episode reward:  -0.6799999999998434
episode reward:  -43.929999999998984
episode reward:  -11.01000000000042
episode reward:  4.940000000000007
episode reward:  -2.1599999999997888
episode reward:  -40.13999999999903
episode reward:  7.06
episode reward:  9.97
episode reward:  -1.389999999999901
episode reward:  -12.40999999999988
episode reward:  7.150000000000016
episode reward:  5.300000000000001
episode reward:  -3.159999999999771
episode reward:  9.27
episode reward:  8.87
episode reward:  2.0700000000000607
episode reward:  -1.719999999999855
episode reward:  0.060000000000060894
episode reward:  -2.3599999999998573
episode reward:  5.840000000000014
episode reward:  -33.25999999999963
episode reward:  -8.250000000000032
episode reward:  -8.410000000000199
episode reward:  -1.569

episode reward:  -1.8499999999998593
episode reward:  -16.39000000000103
episode reward:  4.190000000000046
episode reward:  9.94
episode reward:  4.160000000000013
episode reward:  7.32
episode reward:  6.250000000000034
episode reward:  8.349999999999998
episode reward:  9.76
episode reward:  -17.420000000000407
episode reward:  -0.4399999999998343
episode reward:  -68.36000000000058
episode reward:  3.4000000000000457
episode reward:  7.760000000000003
episode reward:  8.62
episode reward:  5.500000000000001
episode reward:  3.420000000000017
episode reward:  0.5400000000001448
episode reward:  4.5000000000000435
episode reward:  -11.20000000000034
episode reward:  8.62
episode reward:  7.0699999999999985
[[ 8.29589812e+02  8.61990000e+02  8.47576123e+02  8.28134234e+02]
 [ 8.02874678e+02  8.29199141e+02  8.61990000e+02  8.39393861e+02]
 [ 1.71922214e+02  1.76379374e+02  8.51265062e+02  4.05491657e+02]
 [ 8.61407614e+02  8.02621127e+02  8.15595843e+02  8.08793069e+02]
 [-1.00000000e