# Q-Learning (Crawling Robot)

In [1]:
from agent import Q_Agent
import numpy as np
import math
from crawler_env import CrawlingRobotEnv
# --- Run configuration and counters -------------------------------------
num_iterations_train = 900000
total_reward = 0   # running sum of rewards, accumulated by the cells below
all_rewards = 0    # not referenced by the cells visible in this notebook

# --- Environment and agent ----------------------------------------------
# Reference crawler environment: its action space is 4 (simpler than the lego
# robot), while its state space is a bit more complex (velocities, arm
# positions, etc.).
env = CrawlingRobotEnv(render=False)
current_state = env.reset()
agent = Q_Agent(env, gamma=0.9, alpha=0.2)

### Action and State space

In [2]:
# Report the sizes of the action and observation spaces, plus the state
# returned by the initial reset.
n_actions = env.action_space.n
print('Robot action space:', n_actions)

# The observation space is iterated as a collection of sub-spaces; the total
# number of states is the product of their individual sizes.
n_states = np.prod([dim.n for dim in env.observation_space])
print('Robot state-space:', n_states)

print('Reset State:', current_state)

Robot action space: 4
Robot state-space: 91
Reset State: 45


In [15]:
# Display the agent's Q-value table (the saved output shows rows of 4 values,
# presumably one row per state and one value per action — confirm in agent.py).
# NOTE(review): the execution count of this cell (15) is higher than that of
# the training/evaluation cells below, so the saved output reflects a later
# kernel state; a fresh Restart & Run All would show the table as it is at
# this point in the notebook instead.
agent.q_val_table

[[11.049823912065587, 12.277582124517323, 11.049823912065587, 19],
 [15.382608129309352,
  16.159438674988447,
  17.091786810343727,
  13.094816537256268],
 [20.590124328477685,
  21.004888134298643,
  22.877915920530768,
  17.695336873386296],
 [26.053087095820352,
  26.16626641360053,
  28.947874550913838,
  22.578140452741025],
 [31.274519782505045,
  31.122963019341405,
  34.74946643080842,
  27.287277278219953],
 [35.88518043181768, 35.48723744547883, 39.87242506255739, 31.48219392526628],
 [39.3376716350436, 38.87730540050842, 44.029875108453645, 34.709695487602154],
 [40.75699850456008, 38.90118175351365, 47.03408100965877, 32.70881676574417],
 [35.31206693995191, 24.374345349847655, 48.2534598877342, 18.2369402464616],
 [16.570479322666692, 12.20028813813147, 44.03047734694619, 3.825136409235457],
 [4.297889584975276,
  0.5256416074238697,
  31.874102570438378,
  1.3663903630241037],
 [1.934387791987332, 0.0, 13.60186932915964, 0.0],
 [0.0, 0.0, 0.1934403165937519, 0.0],
 [14.9

#### Training

In [4]:
# Training: let the agent act in the environment and update its Q-table
# after every step.
EVAL_EVERY = 5000          # number of steps between progress reports
TARGET_AVG_REWARD = 1.2    # stop early once the running average exceeds this

# Start the accumulator at zero inside this cell: the step counter restarts
# here, so a reward sum left over from a previous run of this (or another)
# cell would otherwise inflate the running average and could trigger the
# early stop immediately.
total_reward = 0

for i in range(1, num_iterations_train + 1):
    action = agent.choose_action(current_state)
    # `done` and `info` are returned by the environment but not used here:
    # the loop never resets the environment.
    next_state, reward, done, info = env.step(action)
    agent.update_q_table(current_state, action, reward, next_state)
    current_state = next_state
    total_reward += reward

    # Evaluate
    if i % EVAL_EVERY == 0:
        # Running average over ALL steps taken so far (not a recent window).
        average_reward = total_reward / i
        print(f"average reward per step over the first {i} steps: {average_reward}")
        # Stop training if the average reward is big enough
        if average_reward > TARGET_AVG_REWARD:
            break
        # Kept from the original cell: re-asserts that rendering is off
        # (the training env was already created with render=False).
        env.render = False

average_reward in last 1 steps 0.37879887684800795
average_reward in last 1 steps 0.5666098552194944
average_reward in last 1 steps 0.7108542332899885
average_reward in last 1 steps 0.8220256857134491
average_reward in last 1 steps 0.8929182768743005
average_reward in last 1 steps 0.9545332921871967
average_reward in last 1 steps 0.9947200580750234
average_reward in last 1 steps 1.0238176085476296
average_reward in last 1 steps 1.0518133994487684
average_reward in last 1 steps 1.071263456580339
average_reward in last 1 steps 1.0848720500359188
average_reward in last 1 steps 1.0980634487285799
average_reward in last 1 steps 1.1103445887069516
average_reward in last 1 steps 1.1173599645889467
average_reward in last 1 steps 1.130140879085972
average_reward in last 1 steps 1.1417864195800853
average_reward in last 1 steps 1.1500251893814426
average_reward in last 1 steps 1.1543304254011484
average_reward in last 1 steps 1.1618896829601966
average_reward in last 1 steps 1.1648269957862536
a

#### Evaluate

In [5]:
# Evaluating: run the trained agent for a fixed number of steps in a fresh
# environment created with rendering enabled, and report the reward obtained.
NUM_EVAL_STEPS = 100

env = CrawlingRobotEnv(render=True)
current_state = env.reset()
total_reward = 0
# Force epsilon-greedy to always use the max Q
# NOTE(review): this setting is not restored afterwards, so re-running the
# training cell after this one would presumably train without exploration.
agent.e_greedy_prob = 0

for i in range(NUM_EVAL_STEPS):
    action = agent.choose_action(current_state)
    # No Q-table update here: the agent only acts, it does not learn.
    next_state, reward, done, info = env.step(action)
    current_state = next_state
    total_reward += reward

# Report the result; previously the accumulated reward was computed but never
# shown, so the evaluation cell produced no output.
print("evaluation total reward:", total_reward)
print("evaluation average reward per step:", total_reward / NUM_EVAL_STEPS)

#### Attention: possible bug when defining lists of lists (rows sharing one inner list)

In [11]:
# Build a 4x4 table of zeros in which every row is its own list object.
# Equivalent, fully written-out form:
#   test = [[0.,0.,0.,0.], [0.,0.,0.,0.], [0.,0.,0.,0.], [0.,0.,0.,0.]]
# Pitfall to avoid:
#   test = [[0.]*4]*4
# The outer `*4` repeats a reference to ONE inner list, so an assignment such
# as test[0][3] = x would show up in column 3 of every row.
test = [[0.0 for _col in range(4)] for _row in range(4)]

In [12]:
# Show the current contents of `test`.
test

[[0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0]]

In [13]:
# Overwrite a single element: row 0, column 3.
test[0][3] = 19.8

In [14]:
# Display `test` again to check which rows were affected by the write to test[0][3].
test

[[0.0, 0.0, 0.0, 19.8],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0]]