# Import Libraries and Files

In [1]:
from DiscreteDeepRobots import ThreeLinkRobot
from math import pi, log
import random
import numpy as np
import copy
import matplotlib.pyplot as plt

# Q-Learning and Auxiliary Functinos

In [2]:
def Qlearner(num_robots, alpha, gamma, epsilon, theta_lower, theta_upper, theta_interval,
             a1_lower, a1_upper, a1_interval, a2_lower, a2_upper, a2_interval, a_lower, a_upper, a_interval):
    """
    :param num_robots: number of robots
    :param alpha: learning rate
    :param gamma: discount rate
    :param epsilon: probability of choosing random action while learning
    :param theta_lower: lower limit of theta in state space
    :param theta_upper: upper limit of theta in state space
    :param theta_interval: interval of theta values in state space
    :param a1_lower: lower limit of a1 in state space
    :param a1_upper: upper limit of a1 in state space
    :param a1_interval: interval of a1 values in state space
    :param a2_lower: lower limit of a2 in state space
    :param a2_upper: upper limit of a2 in state space
    :param a2_interval: interval of a2 values in state space
    :param a_lower: lower limit of action space
    :param a_upper: upper limit of action space
    :param a_interval: interval of discretized action space
    :return: a tuple of learned q-values, states, actions
    """

    # initialize state space, action space, and Qvalues
    print('loading state space')
    states = get_state_space(theta_lower, theta_upper, theta_interval,a1_lower,
                             a1_upper, a1_interval, a2_lower, a2_upper, a2_interval)  # state = (theta, a1, a2)
    print(len(states), 'states loaded')
    print('loading action space')
    actions = get_action_space(a_lower, a_upper, a_interval)  # action = (a1dot, a2dot)
    print(len(actions), 'actions loaded')
    Qvalues = {}
    print('initializing Qvalues')
    for state in states:
        for action in actions:
            Qvalues[(state, action)] = 0
    print(len(Qvalues.keys()), 'q-values loaded')

    # learn q-values
    print('\n\n')
    
    for j in range(num_robots):
        state = random.choice(states)
        robot = ThreeLinkRobot(x=0, 
                               y=0, 
                               theta=state[0], 
                               a1=state[1],
                               a2=state[2],
                               link_length=2, 
                               t_interval=0.01, 
                               a_interval=pi/32)
        i = 0
        
        # plotting
        xs = [robot.x]
        a1s = [robot.a1]
        a2s = [robot.a2]
        timesteps = [i]
        
        while True:
            i += 1
            print('For', j, 'th robot', 'In', i, 'th iteration the initial state is: ', state)
            # employ an epsilon-greedy strategy for exploration vs exploitation
            best_actions = []
            if random.random() < epsilon:

                # choose a random action
                best_actions = actions
            else:

                # find the best actions (the ones with largest q-value)
                maxQ = -float("inf")
                for action in actions:
                    Q = Qvalues[(state, action)]
                    if Q > maxQ:
                        best_actions = [action]
                        maxQ = Q
                    elif Q == maxQ:
                        best_actions.append(action)

            # randomly select a tie-breaking, valid action
            while True:
                best_action = random.choice(best_actions)
                print('The action randomly chosen is', best_action)
                temp_robot = copy.deepcopy(robot)
                temp_robot.move(best_action[0], best_action[1], 1)
                if (temp_robot.theta, temp_robot.a1, temp_robot.a2) in states:
                    break
            robot = temp_robot
            print('In', i, 'th iteration the best action is: ', best_action)

            # transition to new state
            new_state = robot.state
            print('In', i, 'th iteration the new state is: ', new_state)
            
            # add values to lists
            xs.append(robot.x)
            a1s.append(robot.a1)
            a2s.append(robot.a2)
            timesteps.append(i)

            # plotting
            plt.title('Q-learning Monitor for robot ' + str(j))
            plt.subplot(3,1,1)
            plt.plot(timesteps, xs, '.-')
            plt.ylabel('x')

            plt.subplot(3,1,2)
            plt.plot(timesteps, a1s, '.-')
            plt.ylabel('a1')

            plt.subplot(3,1,3)
            plt.plot(timesteps, a2s, '.-')
            plt.ylabel('a2')

            plt.xlabel('timestep')
            plt.tight_layout()
            plt.pause(0.000000000000001)

            # find the maximum Q value for new state
            Q = -float("inf")
            for action in actions:
                Q = max(Q, Qvalues[(new_state, action)])

            # find the reward of this transition
            reward = 0
            a1, a2, R, v, a1dot, a2dot = robot.a1, robot.a2, robot.R, robot.body_v[0], robot.a1dot, robot.a2dot
            
            '''
            # penalize according to joint proximity
            if a1 == a2:
                reward += -10*R
            
            else:
                print('In ', i, 'th iteration the penalty for joint angle proximity is: ', log(a1-a2), 'for joint angles: ', a1, a2)
                reward += log(a1-a2)
            '''
            
            print('In ', i, 'th iteration the reward for x- velocity is: ', v/(a1dot**2 + a2dot**2), 'for velocity, a1dot, a2dot: ', v, a1dot, a2dot)
            reward += v/(a1dot**2 + a2dot**2)

            # TD update
            sample = gamma * Q
            old_Q = Qvalues[(state, best_action)]
            print('In ', i, 'th iteration the Q value before update is: ', old_Q)
            new_Q = (1 - alpha) * old_Q + alpha * (reward + sample)
            Qvalues[(state, best_action)] = new_Q
            print('In ', i, 'th iteration the Q value after update is: ', new_Q)
            state = new_state

            # check for convergence
            if old_Q == 0:
                pass
            elif abs((new_Q-old_Q)/old_Q) <= 0.05:
                print('algorithm converged')
                break
            print('\n')
        
        # show and close plot
        plt.show()
        plt.close()
        
    return Qvalues, states, actions

In [3]:
def get_action_space(lower_limit, upper_limit, interval):
    """
    auxiliary function used by Qlearner() to get action space
    :return: a list of action space values in tuple format (a1dot, a2dot)
    """
    upper_limit += (interval/10)  # to ensure the range covers the rightmost value in the loop
    r = np.arange(lower_limit, upper_limit, interval)
    space = [(rnd(i), rnd(j)) for i in r for j in r]

    # remove a1dot = 0, a2dot = 0 from action space
    space.remove((0,0))

    return space


def get_state_space(theta_lower, theta_upper, theta_interval,
                    a1_lower, a1_upper, a1_interval, a2_lower, a2_upper, a2_interval):
    """
    auxiliary function used by Qlearner() to get action space
    :return: a list of state space values in tuple format (a1, a2, theta)
    """
    # to ensure the range covers the rightmost value in the loop
    theta_upper += (theta_interval/10)
    a1_upper += (a1_interval/10)
    a2_upper += (a2_interval/10)

    theta_range = np.arange(theta_lower, theta_upper, theta_interval)
    a1_range = np.arange(a1_lower, a1_upper, a1_interval)
    a2_range = np.arange(a2_lower, a2_upper, a2_interval)
    space = [(rnd(theta), rnd(a1), rnd(a2)) for theta in theta_range for a1 in a1_range for a2 in a2_range]

    return space

In [4]:
def rnd(number):
    return round(number, 8)

# Policy and Q-value Testing Functions

In [5]:
def extract_policy(Qvalues, states, actions):
    policy = {}
    for state in states:
        maxQ = -float("inf")
        best_action = None
        for action in actions:
            Q = Qvalues[(state, action)]
            maxQ = max(Q, maxQ)
            if Q == maxQ:
                best_action = action
        policy[state] = best_action
    return policy


def test_policy(robot, policy, timestep=20):
    dx = 0
    dxs = []
    a1s = []
    a2s = []
    timesteps = []
    plt.title('Policy Rollout')
    for i in range(timestep):
        
        # rollout
        initial_state = robot.state
        print('In', i, 'th iteration the initial state is: ', initial_state)
        old_x = robot.x
        action = policy[initial_state]
        print('In', i, 'th iteration the chosen action is: ', action)
        robot.move(action[0], action[1], 1)
        new_x = robot.x
        print('In ', i, 'th iteration, the robot moved ', new_x - old_x, ' in x direction')
        dx += (new_x-old_x)
        
        # add values to lists
        dxs.append(dx)
        a1s.append(robot.a1)
        a2s.append(robot.a2)
        timesteps.append(i)
        
        # plotting
        plt.subplot(3,1,1)
        plt.plot(timesteps, dxs, '.-')
        plt.ylabel('dx')
        
        plt.subplot(3,1,2)
        plt.plot(timesteps, a1s, '.-')
        plt.ylabel('a1')
        
        plt.subplot(3,1,3)
        plt.plot(timesteps, a2s, '.-')
        plt.ylabel('a2')

        plt.xlabel('timestep')
        plt.tight_layout()
        plt.pause(0.05)
        
    plt.show()
    return dx

# Testing

In [6]:
# learn Q values
Qvalues, states, actions = Qlearner(num_robots = 2,
                                    alpha=0.5,
                                    gamma=0.9,
                                    epsilon=0.5,
                                    theta_lower=-pi,
                                    theta_upper=pi,
                                    theta_interval=pi/32,
                                    a1_lower=pi/32,
                                    a1_upper=pi/8,
                                    a1_interval=pi/32,
                                    a2_lower=-pi/8,
                                    a2_upper=-pi/32,
                                    a2_interval=pi/32,
                                    a_lower=-pi/8,
                                    a_upper=pi/8,
                                    a_interval=pi/32)

loading state space
1040 states loaded
loading action space
80 actions loaded
initializing Qvalues
83200 q-values loaded



For 0 th robot In 1 th iteration the initial state is:  (2.6507187999999999, 0.19634953999999999, -0.39269907999999998)
The action randomly chosen is (-0.098174769999999995, -0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, 0.39269907999999998)
The action randomly chosen is (0.29452431000000001, -0.19634953999999999)
The action randomly chosen is (-0.098174769999999995, -0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, 0.29452431000000001)
In 1 th iteration the best action is:  (-0.098174769999999995, 0.29452431000000001)
In 1 th iteration the new state is:  (2.6507188, 0.09817477, -0.09817477)
In  1 th iteration the reward for x- velocity is:  -3.71192361873 for velocity, a1dot, a2dot:  -0.3577657946 -0.09817477 0.29452431
In  1 th iteration the Q value before update is:  0
In  1 th iteration the Q value after u



In  3 th iteration the reward for x- velocity is:  -17.5447022231 for velocity, a1dot, a2dot:  -0.169100848417 0.0 0.09817477
In  3 th iteration the Q value before update is:  0
In  3 th iteration the Q value after update is:  -8.77235111156


For 0 th robot In 4 th iteration the initial state is:  (2.55254403, 0.29452431, -0.19634954)
The action randomly chosen is (-0.29452431000000001, -0.098174769999999995)
The action randomly chosen is (0.098174769999999995, -0.39269907999999998)
The action randomly chosen is (0.19634953999999999, 0.098174769999999995)
The action randomly chosen is (-0.19634953999999999, -0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, 0.098174769999999995)
In 4 th iteration the best action is:  (-0.19634953999999999, 0.098174769999999995)
In 4 th iteration the new state is:  (2.6507188, 0.09817477, -0.09817477)
In  4 th iteration the reward for x- velocity is:  3.66393567823 for velocity, a1dot, a2dot:  0.176570289953 -0.19634954 0.098174

In  13 th iteration the reward for x- velocity is:  -8.90114280521 for velocity, a1dot, a2dot:  -0.343167021269 0.0 0.19634954
In  13 th iteration the Q value before update is:  0
In  13 th iteration the Q value after update is:  -4.4505714026


For 0 th robot In 14 th iteration the initial state is:  (2.6507188, 0.19634954, -0.19634954)
The action randomly chosen is (0.19634953999999999, 0.39269907999999998)
The action randomly chosen is (0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (0.39269907999999998, 0.39269907999999998)
The action randomly chosen is (0.39269907999999998, 0.29452431000000001)
The action randomly chosen is (-0.39269907999999998, 0.0)
The action randomly chosen is (0.29452431000000001, 0.0)
The action randomly chosen is (-0.098174769999999995, -0.29452431000000001)
The action randomly chosen is (-0.29452431000000001, -0.19634953999999999)
The action randomly chosen is (-0.098174769999999995, 0.0)
In 14 th iteration the best action is:  (-0

In  25 th iteration the reward for x- velocity is:  -8.15913651992 for velocity, a1dot, a2dot:  -0.786400869233 0.29452431 0.09817477
In  25 th iteration the Q value before update is:  0
In  25 th iteration the Q value after update is:  -4.07956825996


For 0 th robot In 26 th iteration the initial state is:  (2.55254403, 0.39269908, -0.29452431)
The action randomly chosen is (-0.098174769999999995, 0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, -0.098174769999999995)
In 26 th iteration the best action is:  (-0.19634953999999999, -0.098174769999999995)
In 26 th iteration the new state is:  (2.74889357, 0.19634954, -0.39269908)
In  26 th iteration the reward for x- velocity is:  8.28619166682 for velocity, a1dot, a2dot:  0.399323403494 -0.19634954 -0.09817477
In  26 th iteration the Q value before update is:  0
In  26 th iteration the Q value after update is:  4.14309583341


For 0 th robot In 27 th iteration the initial state is:  (2.74889357, 0.19634954, -0.

In  37 th iteration the reward for x- velocity is:  2.0538768168 for velocity, a1dot, a2dot:  0.197958510694 -0.29452431 0.09817477
In  37 th iteration the Q value before update is:  0
In  37 th iteration the Q value after update is:  1.0269384084


For 0 th robot In 38 th iteration the initial state is:  (2.74889357, 0.09817477, -0.29452431)
The action randomly chosen is (-0.19634953999999999, 0.29452431000000001)
The action randomly chosen is (0.19634953999999999, 0.098174769999999995)
In 38 th iteration the best action is:  (0.19634953999999999, 0.098174769999999995)
In 38 th iteration the new state is:  (2.6507188, 0.29452431, -0.19634954)
In  38 th iteration the reward for x- velocity is:  -15.4134314051 for velocity, a1dot, a2dot:  -0.742795259355 0.19634954 0.09817477
In  38 th iteration the Q value before update is:  0
In  38 th iteration the Q value after update is:  -7.70671570257


For 0 th robot In 39 th iteration the initial state is:  (2.6507188, 0.29452431, -0.19634954)


In  47 th iteration the reward for x- velocity is:  17.5447022231 for velocity, a1dot, a2dot:  0.169100848417 0.0 -0.09817477
In  47 th iteration the Q value before update is:  0
In  47 th iteration the Q value after update is:  8.77235111156


For 0 th robot In 48 th iteration the initial state is:  (2.84706834, 0.29452431, -0.39269908)
The action randomly chosen is (-0.29452431000000001, -0.29452431000000001)
The action randomly chosen is (0.39269907999999998, 0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, -0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, 0.39269907999999998)
The action randomly chosen is (0.098174769999999995, -0.19634953999999999)
The action randomly chosen is (-0.19634953999999999, -0.29452431000000001)
The action randomly chosen is (0.39269907999999998, 0.098174769999999995)
The action randomly chosen is (-0.098174769999999995, 0.19634953999999999)
In 48 th iteration the best action is:  (-0.098174769999999995

In  57 th iteration the reward for x- velocity is:  7.11747233064 for velocity, a1dot, a2dot:  0.343001150544 0.09817477 -0.19634954
In  57 th iteration the Q value before update is:  0
In  57 th iteration the Q value after update is:  3.55873616532


For 0 th robot In 58 th iteration the initial state is:  (3.14159265, 0.19634954, -0.39269908)
The action randomly chosen is (0.0, -0.098174769999999995)
The action randomly chosen is (0.39269907999999998, 0.29452431000000001)
The action randomly chosen is (-0.39269907999999998, 0.19634953999999999)
The action randomly chosen is (0.098174769999999995, -0.29452431000000001)
The action randomly chosen is (-0.29452431000000001, -0.29452431000000001)
The action randomly chosen is (-0.29452431000000001, 0.0)
The action randomly chosen is (-0.39269907999999998, 0.39269907999999998)
The action randomly chosen is (0.19634953999999999, 0.0)
In 58 th iteration the best action is:  (0.19634953999999999, 0.0)
In 58 th iteration the new state is:  (3.

In  64 th iteration the reward for x- velocity is:  19.5767599703 for velocity, a1dot, a2dot:  0.188686401065 -0.09817477 0.0
In  64 th iteration the Q value before update is:  0
In  64 th iteration the Q value after update is:  9.78837998517


For 0 th robot In 65 th iteration the initial state is:  (3.14159265, 0.19634954, -0.19634954)
The action randomly chosen is (0.0, -0.098174769999999995)
In 65 th iteration the best action is:  (0.0, -0.098174769999999995)
In 65 th iteration the new state is:  (3.14159265, 0.19634954, -0.29452431)
In  65 th iteration the reward for x- velocity is:  26.1056426074 for velocity, a1dot, a2dot:  0.251613635686 0.0 -0.09817477
In  65 th iteration the Q value before update is:  0
In  65 th iteration the Q value after update is:  13.6320436995


For 0 th robot In 66 th iteration the initial state is:  (3.14159265, 0.19634954, -0.29452431)
The action randomly chosen is (-0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (0.098174769

In  74 th iteration the reward for x- velocity is:  2.77598181814 for velocity, a1dot, a2dot:  0.3478241677 0.19634954 -0.29452431
In  74 th iteration the Q value before update is:  0
In  74 th iteration the Q value after update is:  1.80452583628


For 0 th robot In 75 th iteration the initial state is:  (3.04341788, 0.39269908, -0.39269908)
The action randomly chosen is (-0.19634953999999999, 0.098174769999999995)
In 75 th iteration the best action is:  (-0.19634953999999999, 0.098174769999999995)
In 75 th iteration the new state is:  (3.14159265, 0.19634954, -0.29452431)
In  75 th iteration the reward for x- velocity is:  1.85126634316 for velocity, a1dot, a2dot:  0.0892151674314 -0.19634954 0.09817477
In  75 th iteration the Q value before update is:  0.925633171579
In  75 th iteration the Q value after update is:  1.96767215319


For 0 th robot In 76 th iteration the initial state is:  (3.14159265, 0.19634954, -0.29452431)
The action randomly chosen is (-0.39269907999999998, -0.29

In  85 th iteration the reward for x- velocity is:  -4.81983844287 for velocity, a1dot, a2dot:  -0.232274894027 0.19634954 -0.09817477
In  85 th iteration the Q value before update is:  0
In  85 th iteration the Q value after update is:  -2.40991922143


For 0 th robot In 86 th iteration the initial state is:  (3.04341788, 0.39269908, -0.29452431)
The action randomly chosen is (-0.098174769999999995, 0.098174769999999995)
In 86 th iteration the best action is:  (-0.098174769999999995, 0.098174769999999995)
In 86 th iteration the new state is:  (3.04341788, 0.29452431, -0.19634954)
In  86 th iteration the reward for x- velocity is:  -0.88426067457 for velocity, a1dot, a2dot:  -0.0170455136132 -0.09817477 0.09817477
In  86 th iteration the Q value before update is:  -0.442130337285
In  86 th iteration the Q value after update is:  3.7415754874


For 0 th robot In 87 th iteration the initial state is:  (3.04341788, 0.29452431, -0.19634954)
The action randomly chosen is (-0.098174769999999

In  95 th iteration the reward for x- velocity is:  0.755468980917 for velocity, a1dot, a2dot:  0.0145628513954 0.09817477 -0.09817477
In  95 th iteration the Q value before update is:  0
In  95 th iteration the Q value after update is:  1.26318695939


For 0 th robot In 96 th iteration the initial state is:  (3.04341788, 0.39269908, -0.39269908)
The action randomly chosen is (-0.29452431000000001, -0.39269907999999998)
The action randomly chosen is (-0.29452431000000001, 0.0)
The action randomly chosen is (-0.39269907999999998, 0.0)
The action randomly chosen is (0.39269907999999998, 0.29452431000000001)
The action randomly chosen is (-0.39269907999999998, -0.29452431000000001)
The action randomly chosen is (0.39269907999999998, 0.39269907999999998)
The action randomly chosen is (-0.39269907999999998, -0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, 0.19634953999999999)
In 96 th iteration the best action is:  (-0.19634953999999999, 0.19634953999999999)
In 96 

In  108 th iteration the reward for x- velocity is:  -17.041867237 for velocity, a1dot, a2dot:  -0.657017525116 0.19634954 0.0
In  108 th iteration the Q value before update is:  0
In  108 th iteration the Q value after update is:  5.78182840905


For 0 th robot In 109 th iteration the initial state is:  (3.04341788, 0.29452431, -0.19634954)
The action randomly chosen is (-0.098174769999999995, 0.0)
In 109 th iteration the best action is:  (-0.098174769999999995, 0.0)
In 109 th iteration the new state is:  (3.14159265, 0.19634954, -0.19634954)
In  109 th iteration the reward for x- velocity is:  19.5767599703 for velocity, a1dot, a2dot:  0.188686401065 -0.09817477 0.0
In  109 th iteration the Q value before update is:  31.7839156168
In  109 th iteration the Q value after update is:  40.3673486828


For 0 th robot In 110 th iteration the initial state is:  (3.14159265, 0.19634954, -0.19634954)
The action randomly chosen is (0.098174769999999995, 0.29452431000000001)
The action randomly 

In  118 th iteration the reward for x- velocity is:  9.65958829151 for velocity, a1dot, a2dot:  0.372407477695 -0.19634954 0.0
In  118 th iteration the Q value before update is:  0
In  118 th iteration the Q value after update is:  5.64183077208


For 0 th robot In 119 th iteration the initial state is:  (3.14159265, 0.19634954, -0.09817477)
The action randomly chosen is (-0.29452431000000001, 0.098174769999999995)
The action randomly chosen is (0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (0.0, 0.29452431000000001)
The action randomly chosen is (0.29452431000000001, -0.39269907999999998)
The action randomly chosen is (0.39269907999999998, 0.39269907999999998)
The action randomly chosen is (-0.39269907999999998, 0.19634953999999999)
The action randomly chosen is (0.39269907999999998, 0.19634953999999999)
The action randomly chosen is (-0.19634953999999999, -0.39269907999999998)
The action randomly chosen is (-0.19634953999999999, -0.29452431000000001)
The act

In  126 th iteration the reward for x- velocity is:  -7.49466683873 for velocity, a1dot, a2dot:  -0.288942953813 0.0 0.19634954
In  126 th iteration the Q value before update is:  0
In  126 th iteration the Q value after update is:  -1.20850957193


For 0 th robot In 127 th iteration the initial state is:  (2.94524311, 0.39269908, -0.09817477)
The action randomly chosen is (0.19634953999999999, 0.0)
The action randomly chosen is (-0.19634953999999999, -0.39269907999999998)
The action randomly chosen is (0.29452431000000001, -0.098174769999999995)
The action randomly chosen is (-0.29452431000000001, -0.19634953999999999)
The action randomly chosen is (0.39269907999999998, 0.098174769999999995)
The action randomly chosen is (0.19634953999999999, 0.0)
The action randomly chosen is (0.39269907999999998, -0.29452431000000001)
The action randomly chosen is (0.19634953999999999, 0.0)
The action randomly chosen is (-0.098174769999999995, -0.39269907999999998)
The action randomly chosen is (0.1

In  136 th iteration the reward for x- velocity is:  0.501612598269 for velocity, a1dot, a2dot:  0.00966937082947 0.09817477 -0.09817477
In  136 th iteration the Q value before update is:  0
In  136 th iteration the Q value after update is:  0.250806299135


For 0 th robot In 137 th iteration the initial state is:  (3.14159265, 0.29452431, -0.29452431)
The action randomly chosen is (-0.39269907999999998, -0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, -0.19634953999999999)
The action randomly chosen is (-0.29452431000000001, 0.0)
The action randomly chosen is (-0.19634953999999999, -0.098174769999999995)
The action randomly chosen is (0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (0.098174769999999995, 0.39269907999999998)
The action randomly chosen is (0.29452431000000001, 0.39269907999999998)
The action randomly chosen is (0.19634953999999999, -0.19634953999999999)
The action randomly chosen is (-0.29452431000000001, 0.39269907999

In  146 th iteration the reward for x- velocity is:  34.8355480636 for velocity, a1dot, a2dot:  0.335754956551 0.0 -0.09817477
In  146 th iteration the Q value before update is:  33.4057446461
In  146 th iteration the Q value after update is:  34.8680646913
algorithm converged
For 1 th robot In 1 th iteration the initial state is:  (0.39269907999999998, 0.19634953999999999, -0.19634953999999999)
The action randomly chosen is (0.39269907999999998, 0.0)
The action randomly chosen is (-0.29452431000000001, -0.29452431000000001)
The action randomly chosen is (0.0, 0.19634953999999999)
The action randomly chosen is (0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, -0.39269907999999998)
The action randomly chosen is (-0.29452431000000001, -0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, 0.19634953999999999)
The action randomly chosen is (-0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (-0.392699

In  9 th iteration the reward for x- velocity is:  25.6040300091 for velocity, a1dot, a2dot:  0.493557900542 -0.09817477 -0.09817477
In  9 th iteration the Q value before update is:  0
In  9 th iteration the Q value after update is:  12.8020150046


For 1 th robot In 10 th iteration the initial state is:  (0.58904862, 0.09817477, -0.29452431)
The action randomly chosen is (-0.19634953999999999, -0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, 0.0)
The action randomly chosen is (-0.19634953999999999, 0.19634953999999999)
The action randomly chosen is (-0.19634953999999999, 0.0)
The action randomly chosen is (0.098174769999999995, -0.29452431000000001)
The action randomly chosen is (0.29452431000000001, 0.098174769999999995)
In 10 th iteration the best action is:  (0.29452431000000001, 0.098174769999999995)
In 10 th iteration the new state is:  (0.39269908, 0.39269908, -0.19634954)
In  10 th iteration the reward for x- velocity is:  -10.2420986043 for velocity, 

In  19 th iteration the reward for x- velocity is:  -26.3594989901 for velocity, a1dot, a2dot:  -0.254060375969 0.0 0.09817477
In  19 th iteration the Q value before update is:  0
In  19 th iteration the Q value after update is:  -13.179749495


For 1 th robot In 20 th iteration the initial state is:  (0.68722339, 0.09817477, -0.19634954)
The action randomly chosen is (-0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, 0.098174769999999995)
The action randomly chosen is (-0.098174769999999995, 0.098174769999999995)
The action randomly chosen is (-0.19634953999999999, 0.29452431000000001)
The action randomly chosen is (0.39269907999999998, 0.0)
The action randomly chosen is (0.098174769999999995, 0.39269907999999998)
The action randomly chosen is (-0.29452431000000001, -0.19634953999999999)
The action randomly chosen is (-0.098174769999999995, -0.19634953999999999)
The action randomly chosen is (0.39269907999999998, 0.39269907999999998)
The a

In  29 th iteration the reward for x- velocity is:  51.9598736079 for velocity, a1dot, a2dot:  0.500804094535 0.0 -0.09817477
In  29 th iteration the Q value before update is:  0
In  29 th iteration the Q value after update is:  25.9799368039


For 1 th robot In 30 th iteration the initial state is:  (0.78539816, 0.09817477, -0.19634954)
The action randomly chosen is (-0.098174769999999995, 0.29452431000000001)
The action randomly chosen is (-0.39269907999999998, 0.0)
The action randomly chosen is (0.19634953999999999, -0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, -0.098174769999999995)
The action randomly chosen is (-0.39269907999999998, 0.39269907999999998)
The action randomly chosen is (0.0, -0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, -0.19634953999999999)
The action randomly chosen is (-0.098174769999999995, 0.19634953999999999)
The action randomly chosen is (-0.098174769999999995, -0.19634953999999999)
The action randoml

In  39 th iteration the reward for x- velocity is:  4.07389827487 for velocity, a1dot, a2dot:  0.510450128847 0.19634954 -0.29452431
In  39 th iteration the Q value before update is:  0
In  39 th iteration the Q value after update is:  2.03694913743


For 1 th robot In 40 th iteration the initial state is:  (0.88357293, 0.29452431, -0.39269908)
The action randomly chosen is (0.29452431000000001, -0.098174769999999995)
The action randomly chosen is (0.39269907999999998, 0.098174769999999995)
The action randomly chosen is (0.098174769999999995, -0.098174769999999995)
The action randomly chosen is (0.0, 0.29452431000000001)
In 40 th iteration the best action is:  (0.0, 0.29452431000000001)
In 40 th iteration the new state is:  (0.78539816, 0.29452431, -0.09817477)
In  40 th iteration the reward for x- velocity is:  -5.08230568825 for velocity, a1dot, a2dot:  -0.440862417374 0.0 0.29452431
In  40 th iteration the Q value before update is:  0
In  40 th iteration the Q value after update is:

In  49 th iteration the reward for x- velocity is:  -21.3452813195 for velocity, a1dot, a2dot:  -0.205731914678 0.0 0.09817477
In  49 th iteration the Q value before update is:  0
In  49 th iteration the Q value after update is:  -10.6726406597


For 1 th robot In 50 th iteration the initial state is:  (1.27627202, 0.09817477, -0.29452431)
The action randomly chosen is (-0.19634953999999999, 0.0)
The action randomly chosen is (-0.098174769999999995, -0.29452431000000001)
The action randomly chosen is (0.0, 0.39269907999999998)
The action randomly chosen is (-0.29452431000000001, 0.0)
The action randomly chosen is (0.19634953999999999, -0.39269907999999998)
The action randomly chosen is (-0.19634953999999999, 0.098174769999999995)
The action randomly chosen is (0.19634953999999999, 0.29452431000000001)
The action randomly chosen is (0.29452431000000001, -0.39269907999999998)
The action randomly chosen is (0.29452431000000001, -0.098174769999999995)
In 50 th iteration the best action is:

In  57 th iteration the reward for x- velocity is:  -10.04101398 for velocity, a1dot, a2dot:  -0.38711263637 0.19634954 0.0
In  57 th iteration the Q value before update is:  0
In  57 th iteration the Q value after update is:  -5.02050698998


For 1 th robot In 58 th iteration the initial state is:  (1.27627202, 0.29452431, -0.39269908)
The action randomly chosen is (-0.29452431000000001, -0.29452431000000001)
The action randomly chosen is (0.0, 0.29452431000000001)
In 58 th iteration the best action is:  (0.0, 0.29452431000000001)
In 58 th iteration the new state is:  (1.17809725, 0.29452431, -0.09817477)
In  58 th iteration the reward for x- velocity is:  -5.08230568825 for velocity, a1dot, a2dot:  -0.440862417374 0.0 0.29452431
In  58 th iteration the Q value before update is:  0
In  58 th iteration the Q value after update is:  -2.54115284413


For 1 th robot In 59 th iteration the initial state is:  (1.17809725, 0.29452431, -0.09817477)
The action randomly chosen is (0.0, -0.39269

In  67 th iteration the reward for x- velocity is:  5.25872695799 for velocity, a1dot, a2dot:  0.456166004411 -0.29452431 0.0
In  67 th iteration the Q value before update is:  0
In  67 th iteration the Q value after update is:  2.629363479


For 1 th robot In 68 th iteration the initial state is:  (1.47262156, 0.09817477, -0.19634954)
The action randomly chosen is (-0.098174769999999995, 0.19634953999999999)
The action randomly chosen is (-0.19634953999999999, -0.29452431000000001)
The action randomly chosen is (0.39269907999999998, 0.0)
The action randomly chosen is (0.39269907999999998, -0.19634953999999999)
The action randomly chosen is (-0.39269907999999998, 0.19634953999999999)
The action randomly chosen is (0.19634953999999999, 0.0)
In 68 th iteration the best action is:  (0.19634953999999999, 0.0)
In 68 th iteration the new state is:  (1.37444679, 0.29452431, -0.19634954)
In  68 th iteration the reward for x- velocity is:  -17.041867237 for velocity, a1dot, a2dot:  -0.657017525

In  76 th iteration the reward for x- velocity is:  0.755468980917 for velocity, a1dot, a2dot:  0.0145628513954 0.09817477 -0.09817477
In  76 th iteration the Q value before update is:  0
In  76 th iteration the Q value after update is:  0.377734490458


For 1 th robot In 77 th iteration the initial state is:  (1.37444679, 0.39269908, -0.39269908)
The action randomly chosen is (0.29452431000000001, 0.098174769999999995)
The action randomly chosen is (0.19634953999999999, -0.39269907999999998)
The action randomly chosen is (-0.39269907999999998, 0.19634953999999999)
The action randomly chosen is (-0.39269907999999998, 0.0)
The action randomly chosen is (0.39269907999999998, -0.39269907999999998)
The action randomly chosen is (-0.19634953999999999, -0.39269907999999998)
The action randomly chosen is (-0.29452431000000001, 0.29452431000000001)
In 77 th iteration the best action is:  (-0.29452431000000001, 0.29452431000000001)
In 77 th iteration the new state is:  (1.47262156, 0.09817477, 

In  85 th iteration the reward for x- velocity is:  10.4920551181 for velocity, a1dot, a2dot:  0.505627111691 0.09817477 -0.19634954
In  85 th iteration the Q value before update is:  7.86904133859
In  85 th iteration the Q value after update is:  9.18054822835


For 1 th robot In 86 th iteration the initial state is:  (1.47262156, 0.19634954, -0.29452431)
The action randomly chosen is (0.39269907999999998, 0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, 0.29452431000000001)
The action randomly chosen is (-0.39269907999999998, -0.39269907999999998)
The action randomly chosen is (0.39269907999999998, -0.19634953999999999)
The action randomly chosen is (-0.098174769999999995, 0.29452431000000001)
The action randomly chosen is (0.19634953999999999, 0.0)
In 86 th iteration the best action is:  (0.19634953999999999, 0.0)
In 86 th iteration the new state is:  (1.37444679, 0.39269908, -0.29452431)
In  86 th iteration the reward for x- velocity is:  -9.91530817649 fo

In  96 th iteration the reward for x- velocity is:  6.73919785781 for velocity, a1dot, a2dot:  0.259817251023 -0.19634954 0.0
In  96 th iteration the Q value before update is:  0
In  96 th iteration the Q value after update is:  3.3695989289


For 1 th robot In 97 th iteration the initial state is:  (1.76714587, 0.09817477, -0.39269908)
The action randomly chosen is (0.19634953999999999, 0.0)
In 97 th iteration the best action is:  (0.19634953999999999, 0.0)
In 97 th iteration the new state is:  (1.6689711, 0.29452431, -0.39269908)
In  97 th iteration the reward for x- velocity is:  -10.04101398 for velocity, a1dot, a2dot:  -0.38711263637 0.19634954 0.0
In  97 th iteration the Q value before update is:  0
In  97 th iteration the Q value after update is:  -3.50418747197


For 1 th robot In 98 th iteration the initial state is:  (1.6689711, 0.29452431, -0.39269908)
The action randomly chosen is (-0.19634953999999999, 0.0)
In 98 th iteration the best action is:  (-0.19634953999999999, 0.0

In  105 th iteration the reward for x- velocity is:  -17.041867237 for velocity, a1dot, a2dot:  -0.657017525116 0.19634954 0.0
In  105 th iteration the Q value before update is:  0
In  105 th iteration the Q value after update is:  -8.52093361849


For 1 th robot In 106 th iteration the initial state is:  (1.76714587, 0.29452431, -0.19634954)
The action randomly chosen is (0.098174769999999995, 0.098174769999999995)
In 106 th iteration the best action is:  (0.098174769999999995, 0.098174769999999995)
In 106 th iteration the new state is:  (1.6689711, 0.39269908, -0.09817477)
In  106 th iteration the reward for x- velocity is:  -20.2053007599 for velocity, a1dot, a2dot:  -0.389488913243 0.09817477 0.09817477
In  106 th iteration the Q value before update is:  0
In  106 th iteration the Q value after update is:  -10.10265038


For 1 th robot In 107 th iteration the initial state is:  (1.6689711, 0.39269908, -0.09817477)
The action randomly chosen is (0.29452431000000001, 0.39269907999999

In  116 th iteration the reward for x- velocity is:  -3.88143597384 for velocity, a1dot, a2dot:  -0.486335043067 0.29452431 -0.19634954
In  116 th iteration the Q value before update is:  0
In  116 th iteration the Q value after update is:  -1.94071798692


For 1 th robot In 117 th iteration the initial state is:  (1.86532064, 0.39269908, -0.29452431)
The action randomly chosen is (-0.098174769999999995, 0.098174769999999995)
In 117 th iteration the best action is:  (-0.098174769999999995, 0.098174769999999995)
In 117 th iteration the new state is:  (1.86532064, 0.29452431, -0.19634954)
In  117 th iteration the reward for x- velocity is:  -0.88426067457 for velocity, a1dot, a2dot:  -0.0170455136132 -0.09817477 0.09817477
In  117 th iteration the Q value before update is:  0
In  117 th iteration the Q value after update is:  -0.442130337285


For 1 th robot In 118 th iteration the initial state is:  (1.86532064, 0.29452431, -0.19634954)
The action randomly chosen is (0.0, -0.09817476999

In  126 th iteration the reward for x- velocity is:  33.832322867 for velocity, a1dot, a2dot:  0.326085585721 -0.09817477 0.0
In  126 th iteration the Q value before update is:  0
In  126 th iteration the Q value after update is:  16.9161614335


For 1 th robot In 127 th iteration the initial state is:  (2.06167018, 0.09817477, -0.09817477)
The action randomly chosen is (-0.39269907999999998, -0.29452431000000001)
The action randomly chosen is (0.29452431000000001, 0.098174769999999995)
The action randomly chosen is (-0.29452431000000001, 0.0)
The action randomly chosen is (0.0, 0.29452431000000001)
The action randomly chosen is (0.29452431000000001, 0.39269907999999998)
The action randomly chosen is (-0.098174769999999995, 0.0)
The action randomly chosen is (0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (-0.098174769999999995, -0.19634953999999999)
The action randomly chosen is (0.19634953999999999, 0.0)
In 127 th iteration the best action is:  (0.19634953999

In  135 th iteration the reward for x- velocity is:  16.0337642613 for velocity, a1dot, a2dot:  0.154537997022 -0.09817477 0.0
In  135 th iteration the Q value before update is:  0
In  135 th iteration the Q value after update is:  8.01688213064


For 1 th robot In 136 th iteration the initial state is:  (2.25801972, 0.19634954, -0.29452431)
The action randomly chosen is (-0.39269907999999998, 0.0)
The action randomly chosen is (0.39269907999999998, -0.39269907999999998)
The action randomly chosen is (0.19634953999999999, 0.0)
In 136 th iteration the best action is:  (0.19634953999999999, 0.0)
In 136 th iteration the new state is:  (2.15984495, 0.39269908, -0.29452431)
In  136 th iteration the reward for x- velocity is:  -9.91530817649 for velocity, a1dot, a2dot:  -0.382266282696 0.19634954 0.0
In  136 th iteration the Q value before update is:  0
In  136 th iteration the Q value after update is:  -4.95765408825


For 1 th robot In 137 th iteration the initial state is:  (2.15984495, 0

In  144 th iteration the reward for x- velocity is:  17.2920682283 for velocity, a1dot, a2dot:  0.666663559428 0.0 -0.19634954
In  144 th iteration the Q value before update is:  0
In  144 th iteration the Q value after update is:  8.64603411416


For 1 th robot In 145 th iteration the initial state is:  (2.55254403, 0.19634954, -0.29452431)
The action randomly chosen is (0.19634953999999999, -0.098174769999999995)
In 145 th iteration the best action is:  (0.19634953999999999, -0.098174769999999995)
In 145 th iteration the new state is:  (2.45436926, 0.39269908, -0.39269908)
In  145 th iteration the reward for x- velocity is:  -3.71470695476 for velocity, a1dot, a2dot:  -0.179017030236 0.19634954 -0.09817477
In  145 th iteration the Q value before update is:  0
In  145 th iteration the Q value after update is:  -1.73221493351


For 1 th robot In 146 th iteration the initial state is:  (2.45436926, 0.39269908, -0.39269908)
The action randomly chosen is (-0.29452431000000001, 0.196349539

In  155 th iteration the reward for x- velocity is:  -12.5545181198 for velocity, a1dot, a2dot:  -0.605020147541 0.09817477 0.19634954
In  155 th iteration the Q value before update is:  0
In  155 th iteration the Q value after update is:  -0.629215142439


For 1 th robot In 156 th iteration the initial state is:  (2.6507188, 0.19634954, -0.19634954)
The action randomly chosen is (0.0, 0.19634953999999999)
The action randomly chosen is (-0.29452431000000001, -0.19634953999999999)
The action randomly chosen is (-0.19634953999999999, 0.0)
The action randomly chosen is (-0.098174769999999995, 0.19634953999999999)
The action randomly chosen is (0.19634953999999999, -0.29452431000000001)
The action randomly chosen is (0.19634953999999999, 0.39269907999999998)
The action randomly chosen is (0.098174769999999995, -0.098174769999999995)
In 156 th iteration the best action is:  (0.098174769999999995, -0.098174769999999995)
In 156 th iteration the new state is:  (2.6507188, 0.29452431, -0.294524

In  162 th iteration the reward for x- velocity is:  -19.5767599703 for velocity, a1dot, a2dot:  -0.188686401065 0.09817477 0.0
In  162 th iteration the Q value before update is:  0
In  162 th iteration the Q value after update is:  -9.78837998517


For 1 th robot In 163 th iteration the initial state is:  (2.55254403, 0.39269908, -0.19634954)
The action randomly chosen is (0.098174769999999995, -0.19634953999999999)
The action randomly chosen is (-0.39269907999999998, 0.39269907999999998)
The action randomly chosen is (-0.19634953999999999, -0.19634953999999999)
In 163 th iteration the best action is:  (-0.19634953999999999, -0.19634953999999999)
In 163 th iteration the new state is:  (2.74889357, 0.19634954, -0.39269908)
In  163 th iteration the reward for x- velocity is:  8.26675667861 for velocity, a1dot, a2dot:  0.637418885875 -0.19634954 -0.19634954
In  163 th iteration the Q value before update is:  0
In  163 th iteration the Q value after update is:  4.1333783393


For 1 th rob

In  174 th iteration the reward for x- velocity is:  -17.5447022231 for velocity, a1dot, a2dot:  -0.169100848417 0.0 0.09817477
In  174 th iteration the Q value before update is:  0
In  174 th iteration the Q value after update is:  -8.77235111156


For 1 th robot In 175 th iteration the initial state is:  (2.74889357, 0.29452431, -0.19634954)
The action randomly chosen is (-0.098174769999999995, 0.098174769999999995)
In 175 th iteration the best action is:  (-0.098174769999999995, 0.098174769999999995)
In 175 th iteration the new state is:  (2.74889357, 0.19634954, -0.09817477)
In  175 th iteration the reward for x- velocity is:  -0.628540789593 for velocity, a1dot, a2dot:  -0.0121161111124 -0.09817477 0.09817477
In  175 th iteration the Q value before update is:  0
In  175 th iteration the Q value after update is:  -0.314270394797


For 1 th robot In 176 th iteration the initial state is:  (2.74889357, 0.19634954, -0.09817477)
The action randomly chosen is (-0.098174769999999995, -0.

In  183 th iteration the reward for x- velocity is:  -8.45127633927 for velocity, a1dot, a2dot:  -0.733102325079 0.29452431 0.0
In  183 th iteration the Q value before update is:  0
In  183 th iteration the Q value after update is:  -4.22563816963


For 1 th robot In 184 th iteration the initial state is:  (2.84706834, 0.39269908, -0.29452431)
The action randomly chosen is (0.19634953999999999, 0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, 0.098174769999999995)
In 184 th iteration the best action is:  (-0.19634953999999999, 0.098174769999999995)
In 184 th iteration the new state is:  (2.94524311, 0.19634954, -0.19634954)
In  184 th iteration the reward for x- velocity is:  2.29045819583 for velocity, a1dot, a2dot:  0.11038044968 -0.19634954 0.09817477
In  184 th iteration the Q value before update is:  0
In  184 th iteration the Q value after update is:  6.90613584998


For 1 th robot In 185 th iteration the initial state is:  (2.94524311, 0.19634954, -0.196

In  192 th iteration the reward for x- velocity is:  -13.179749495 for velocity, a1dot, a2dot:  -0.508120751938 0.0 0.19634954
In  192 th iteration the Q value before update is:  0
In  192 th iteration the Q value after update is:  3.38139096159


For 1 th robot In 193 th iteration the initial state is:  (3.14159265, 0.09817477, -0.09817477)
The action randomly chosen is (0.19634953999999999, 0.098174769999999995)
The action randomly chosen is (0.098174769999999995, 0.39269907999999998)
The action randomly chosen is (-0.29452431000000001, 0.29452431000000001)
The action randomly chosen is (-0.19634953999999999, 0.19634953999999999)
The action randomly chosen is (-0.29452431000000001, -0.19634953999999999)
The action randomly chosen is (-0.39269907999999998, -0.098174769999999995)
The action randomly chosen is (-0.29452431000000001, 0.19634953999999999)
The action randomly chosen is (-0.39269907999999998, -0.39269907999999998)
The action randomly chosen is (-0.19634953999999999, 0.09817

In  201 th iteration the reward for x- velocity is:  10.4169207748 for velocity, a1dot, a2dot:  0.401605024355 0.0 -0.19634954
In  201 th iteration the Q value before update is:  0
In  201 th iteration the Q value after update is:  5.20846038738


For 1 th robot In 202 th iteration the initial state is:  (3.04341788, 0.29452431, -0.39269908)
The action randomly chosen is (0.19634953999999999, 0.29452431000000001)
The action randomly chosen is (0.0, 0.29452431000000001)
In 202 th iteration the best action is:  (0.0, 0.29452431000000001)
In 202 th iteration the new state is:  (2.94524311, 0.29452431, -0.09817477)
In  202 th iteration the reward for x- velocity is:  -5.08230568825 for velocity, a1dot, a2dot:  -0.440862417374 0.0 0.29452431
In  202 th iteration the Q value before update is:  0
In  202 th iteration the Q value after update is:  -2.54115284413


For 1 th robot In 203 th iteration the initial state is:  (2.94524311, 0.29452431, -0.09817477)
The action randomly chosen is (-0.2

In  214 th iteration the reward for x- velocity is:  -26.1056426074 for velocity, a1dot, a2dot:  -0.251613635686 0.0 0.09817477
In  214 th iteration the Q value before update is:  -13.0528213037
In  214 th iteration the Q value after update is:  -10.8014141714


For 1 th robot In 215 th iteration the initial state is:  (3.04341788, 0.19634954, -0.09817477)
The action randomly chosen is (-0.098174769999999995, -0.098174769999999995)
In 215 th iteration the best action is:  (-0.098174769999999995, -0.098174769999999995)
In 215 th iteration the new state is:  (3.14159265, 0.09817477, -0.19634954)
In  215 th iteration the reward for x- velocity is:  34.2082296618 for velocity, a1dot, a2dot:  0.659417365436 -0.09817477 -0.09817477
In  215 th iteration the Q value before update is:  19.5062617425
In  215 th iteration the Q value after update is:  42.5478748133


For 1 th robot In 216 th iteration the initial state is:  (3.14159265, 0.09817477, -0.19634954)
The action randomly chosen is (0.0,

In [7]:
# extract policy
policy = extract_policy(Qvalues, states, actions)
for state in policy:
    print('state', rnd(state[0]),rnd(state[1]),rnd(state[2]), 'action', rnd(policy[state][0]),rnd(policy[state][1]))

state -3.14159265 0.09817477 -0.39269908 action 0.39269908 0.39269908
state -3.14159265 0.09817477 -0.29452431 action 0.39269908 0.39269908
state -3.14159265 0.09817477 -0.19634954 action 0.39269908 0.39269908
state -3.14159265 0.09817477 -0.09817477 action 0.39269908 0.39269908
state -3.14159265 0.19634954 -0.39269908 action 0.39269908 0.39269908
state -3.14159265 0.19634954 -0.29452431 action 0.39269908 0.39269908
state -3.14159265 0.19634954 -0.19634954 action 0.39269908 0.39269908
state -3.14159265 0.19634954 -0.09817477 action 0.39269908 0.39269908
state -3.14159265 0.29452431 -0.39269908 action 0.39269908 0.39269908
state -3.14159265 0.29452431 -0.29452431 action 0.39269908 0.39269908
state -3.14159265 0.29452431 -0.19634954 action 0.39269908 0.39269908
state -3.14159265 0.29452431 -0.09817477 action 0.39269908 0.39269908
state -3.14159265 0.39269908 -0.39269908 action 0.39269908 0.39269908
state -3.14159265 0.39269908 -0.29452431 action 0.39269908 0.39269908
state -3.14159265 0.

state -2.06167018 0.29452431 -0.29452431 action 0.39269908 0.39269908
state -2.06167018 0.29452431 -0.19634954 action 0.39269908 0.39269908
state -2.06167018 0.29452431 -0.09817477 action 0.39269908 0.39269908
state -2.06167018 0.39269908 -0.39269908 action 0.39269908 0.39269908
state -2.06167018 0.39269908 -0.29452431 action 0.39269908 0.39269908
state -2.06167018 0.39269908 -0.19634954 action 0.39269908 0.39269908
state -2.06167018 0.39269908 -0.09817477 action 0.39269908 0.39269908
state -1.96349541 0.09817477 -0.39269908 action 0.39269908 0.39269908
state -1.96349541 0.09817477 -0.29452431 action 0.39269908 0.39269908
state -1.96349541 0.09817477 -0.19634954 action 0.39269908 0.39269908
state -1.96349541 0.09817477 -0.09817477 action 0.39269908 0.39269908
state -1.96349541 0.19634954 -0.39269908 action 0.39269908 0.39269908
state -1.96349541 0.19634954 -0.29452431 action 0.39269908 0.39269908
state -1.96349541 0.19634954 -0.19634954 action 0.39269908 0.39269908
state -1.96349541 0.

state -1.17809725 0.19634954 -0.09817477 action 0.39269908 0.39269908
state -1.17809725 0.29452431 -0.39269908 action 0.39269908 0.39269908
state -1.17809725 0.29452431 -0.29452431 action 0.39269908 0.39269908
state -1.17809725 0.29452431 -0.19634954 action 0.39269908 0.39269908
state -1.17809725 0.29452431 -0.09817477 action 0.39269908 0.39269908
state -1.17809725 0.39269908 -0.39269908 action 0.39269908 0.39269908
state -1.17809725 0.39269908 -0.29452431 action 0.39269908 0.39269908
state -1.17809725 0.39269908 -0.19634954 action 0.39269908 0.39269908
state -1.17809725 0.39269908 -0.09817477 action 0.39269908 0.39269908
state -1.07992247 0.09817477 -0.39269908 action 0.39269908 0.39269908
state -1.07992247 0.09817477 -0.29452431 action 0.39269908 0.39269908
state -1.07992247 0.09817477 -0.19634954 action 0.39269908 0.39269908
state -1.07992247 0.09817477 -0.09817477 action 0.39269908 0.39269908
state -1.07992247 0.19634954 -0.39269908 action 0.39269908 0.39269908
state -1.07992247 0.

state -0.29452431 0.19634954 -0.19634954 action 0.39269908 0.39269908
state -0.29452431 0.19634954 -0.09817477 action 0.39269908 0.39269908
state -0.29452431 0.29452431 -0.39269908 action 0.39269908 0.39269908
state -0.29452431 0.29452431 -0.29452431 action 0.39269908 0.39269908
state -0.29452431 0.29452431 -0.19634954 action 0.39269908 0.39269908
state -0.29452431 0.29452431 -0.09817477 action 0.39269908 0.39269908
state -0.29452431 0.39269908 -0.39269908 action 0.39269908 0.39269908
state -0.29452431 0.39269908 -0.29452431 action 0.39269908 0.39269908
state -0.29452431 0.39269908 -0.19634954 action 0.39269908 0.39269908
state -0.29452431 0.39269908 -0.09817477 action 0.39269908 0.39269908
state -0.19634954 0.09817477 -0.39269908 action 0.39269908 0.39269908
state -0.19634954 0.09817477 -0.29452431 action 0.39269908 0.39269908
state -0.19634954 0.09817477 -0.19634954 action 0.39269908 0.39269908
state -0.19634954 0.09817477 -0.09817477 action 0.39269908 0.39269908
state -0.19634954 0.

state 0.58904862 0.19634954 -0.29452431 action 0.39269908 0.39269908
state 0.58904862 0.19634954 -0.19634954 action 0.39269908 0.39269908
state 0.58904862 0.19634954 -0.09817477 action 0.39269908 0.39269908
state 0.58904862 0.29452431 -0.39269908 action 0.39269908 0.39269908
state 0.58904862 0.29452431 -0.29452431 action 0.39269908 0.39269908
state 0.58904862 0.29452431 -0.19634954 action -0.09817477 -0.19634954
state 0.58904862 0.29452431 -0.09817477 action 0.09817477 -0.09817477
state 0.58904862 0.39269908 -0.39269908 action 0.39269908 0.39269908
state 0.58904862 0.39269908 -0.29452431 action 0.39269908 0.39269908
state 0.58904862 0.39269908 -0.19634954 action -0.29452431 0.09817477
state 0.58904862 0.39269908 -0.09817477 action -0.09817477 -0.29452431
state 0.68722339 0.09817477 -0.39269908 action 0.39269908 0.39269908
state 0.68722339 0.09817477 -0.29452431 action 0.39269908 0.39269908
state 0.68722339 0.09817477 -0.19634954 action 0.0 -0.09817477
state 0.68722339 0.09817477 -0.098

state 1.47262156 0.19634954 -0.39269908 action 0.39269908 0.39269908
state 1.47262156 0.19634954 -0.29452431 action 0.39269908 0.39269908
state 1.47262156 0.19634954 -0.19634954 action 0.0 -0.09817477
state 1.47262156 0.19634954 -0.09817477 action 0.39269908 0.39269908
state 1.47262156 0.29452431 -0.39269908 action 0.39269908 0.39269908
state 1.47262156 0.29452431 -0.29452431 action 0.39269908 0.39269908
state 1.47262156 0.29452431 -0.19634954 action 0.39269908 0.39269908
state 1.47262156 0.29452431 -0.09817477 action -0.19634954 0.0
state 1.47262156 0.39269908 -0.39269908 action 0.39269908 0.39269908
state 1.47262156 0.39269908 -0.29452431 action 0.39269908 0.39269908
state 1.47262156 0.39269908 -0.19634954 action 0.39269908 0.39269908
state 1.47262156 0.39269908 -0.09817477 action -0.09817477 -0.29452431
state 1.57079633 0.09817477 -0.39269908 action 0.39269908 0.39269908
state 1.57079633 0.09817477 -0.29452431 action 0.39269908 0.39269908
state 1.57079633 0.09817477 -0.19634954 acti

state 2.35619449 0.09817477 -0.19634954 action 0.39269908 0.39269908
state 2.35619449 0.09817477 -0.09817477 action 0.39269908 0.39269908
state 2.35619449 0.19634954 -0.39269908 action 0.39269908 0.39269908
state 2.35619449 0.19634954 -0.29452431 action 0.39269908 0.39269908
state 2.35619449 0.19634954 -0.19634954 action 0.39269908 0.39269908
state 2.35619449 0.19634954 -0.09817477 action 0.39269908 0.39269908
state 2.35619449 0.29452431 -0.39269908 action 0.39269908 0.39269908
state 2.35619449 0.29452431 -0.29452431 action 0.39269908 0.39269908
state 2.35619449 0.29452431 -0.19634954 action -0.19634954 -0.19634954
state 2.35619449 0.29452431 -0.09817477 action 0.39269908 0.39269908
state 2.35619449 0.39269908 -0.39269908 action 0.39269908 0.39269908
state 2.35619449 0.39269908 -0.29452431 action 0.39269908 0.39269908
state 2.35619449 0.39269908 -0.19634954 action 0.39269908 0.39269908
state 2.35619449 0.39269908 -0.09817477 action 0.0 -0.29452431
state 2.45436926 0.09817477 -0.3926990

In [8]:
robot2 = ThreeLinkRobot(x=0,
                       y=0,
                       theta=0,
                       a1=pi/16,
                       a2=-pi/16,
                       link_length=2,
                       t_interval=0.01,
                       a_interval=pi/32)
x_displacement = test_policy(robot=robot2, policy=policy)

In 0 th iteration the initial state is:  (0, 0.19634954, -0.19634954)
In 0 th iteration the chosen action is:  (0.39269907999999998, 0.39269907999999998)
In  0 th iteration, the robot moved  -1.9742316021690622  in x direction
In 1 th iteration the initial state is:  (-0.39269908, 0.58904862, 0.19634954)


KeyError: (-0.39269908, 0.58904862, 0.19634954)