# Setup

In [1]:
# Import packages
import gym
import numpy as np
import random

In [2]:
# Make car_racing env
env = gym.make('CarRacing-v0')



# Q-Learing

In [3]:
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

Action Space Box(3,)
State Space Box(96, 96, 3)


In [4]:
actions = {
    0: [-.9,  0, 0],
    1: [-.45, 0, 0],
    2: [0,    0, 0],
    3: [.45,  0, 0],
    4: [.9,   0, 0],
    5: [-.9, .6,.1],
    6: [-.45,.6,.1],
    7: [0,   .6,.1],
    8: [.45, .6,.1],
    9: [.9,  .6,.1],
    10:[-.9, .9,.1],
    11:[-.45,.9,.1],
    12:[0,   .9,.1],
    13:[.45, .9,.1],
    14:[.9,  .9,.1]
}

In [5]:
state_num =  512 # 2ˆˆ9
q_table = np.zeros([state_num, len(actions)])

In [12]:
q_table.shape

(512, 15)

In [13]:
# Hyperparameters
alpha = 0.2
gamma = 0.9
epsilon = 0.1

# Optimize

In [None]:
total_reward = 0

# Outer loop is for multiple simulations (increase for learning over a long time)
for i in range(20):
    state = env.reset() # Reset to start a new simulation

    # Inner loop for simulating the environment:
    # 1. Observe 2. Take action 3. Read reward 4. Learn 1. Observe ...
    for i in range(1000):
        env.render() # For seeing the simulation step
        discrete_state = to_num(binarize(state))

        #epochs, penalties, reward, = 0, 0, 0
        
        if random.uniform(0, 1) < epsilon:
            action_idx = random.choice(list(actions.keys()))
            action = actions.get(action_idx)
        else:
            action_idx = np.argmax(q_table[discrete_state])
            action = actions.get(action_idx)
        
        next_state, reward, done, info = env.step(action)
        discrete_next_state = to_num(binarize(next_state))
        
        old_value = q_table[discrete_state][action_idx]
        next_max = np.max(q_table[discrete_next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[discrete_state][action_idx] = new_value
        
        state = next_state
                    
        # Pring the observations, rewards, whether the round has ended
        """
        print("Observation: ", discrete_next_state,
              "Reward: ", reward,
              "Done?: ", done,
              "Action taken: ", action)
        """
        
        if i < 1000: # Calculate reward for only first 2000 episodes
            total_reward += reward
        
        if done: # Car steered off stop the simulation and restart it
            break
            
print(total_reward)

Track generation: 1146..1447 -> 301-tiles track
Track generation: 1092..1369 -> 277-tiles track
Track generation: 1212..1519 -> 307-tiles track
Track generation: 1180..1479 -> 299-tiles track
Track generation: 1064..1334 -> 270-tiles track
Track generation: 1209..1515 -> 306-tiles track
Track generation: 1201..1509 -> 308-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1231..1543 -> 312-tiles track
Track generation: 1089..1365 -> 276-tiles track
Track generation: 1072..1344 -> 272-tiles track
Track generation: 1151..1443 -> 292-tiles track
Track generation: 1092..1369 -> 277-tiles track
Track generation: 1150..1441 -> 291-tiles track
Track generation: 1213..1520 -> 307-tiles track
Track generation: 1089..1366 -> 277-tiles track
Track generation: 1084..1358 -> 274-tiles track
Track generation: 1349..1691 -> 342-tiles track
Track generation: 1100..1379 -> 279-tiles track


RecursionError: maximum recursion depth exceeded while calling a Python object

In [8]:
total_reward = 0

# Outer loop is for multiple simulations (increase for learning over a long time)
for i in range(10):
    env.reset() # Reset to start a new simulation
    
    # Inner loop for simulating the environment:
    # 1. Observe 2. Take action 3. Read reward 4. Learn 1. Observe ...
    for i in range(100):
        state = i
        env.render() # For seeing the simulation step
        
        #epochs, penalties, reward, = 0, 0, 0
        
        if random.uniform(0, 1) < epsilon:
            action_idx = random.choice(list(actions.keys()))
            action = actions.get(action_idx)
        else:
            action_idx = np.argmax(q_table[state])
            action = actions.get(action_idx)
        
        observation, reward, done, info = env.step(action)
        
        old_value = q_table[state][action_idx]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state][action_idx] = new_value
        
        state = next_state
                    
        # Pring the observations, rewards, whether the round has ended
        print("Observation: ", observation.shape,
              "Reward: ", reward,
              "Done?: ", done)
        
        if i < 1000: # Calculate reward for only first 2000 episodes
            total_reward += reward
        
        if done: # Car steered off stop the simulation and restart it
            break
            
print(total_reward)

Track generation: 1094..1379 -> 285-tiles track


NameError: name 'next_state' is not defined

In [7]:
def discrete(elem):
    res = [0]
    am = np.argmax(elem)
    if am < 1:
        res[0] = 1
    return res

def binarize(obs):
    # Binarize
    res = ds = np.array(
        list(
            map(discrete, obs.reshape(96 * 96, 3))
        )
    ).reshape(96, 96, 1)
    
    res = res[15:-15, 15:-15, :]
    
    # Downsample
    acc = res[0::22, 0::22, 0]
    for i in range(1,22):
        for j in range(1,22):
            acc += res[i::22, j::22, 0]
    
    acc = (acc / (22 * 22)) > 0.3

    #acc = acc[:-1,:]
    
    return np.array(acc, dtype=int)

def to_num(obs):
    rav = obs.ravel()
    res = 0
    for i in range(len(rav)):
        res = res + rav[i] * 2**i
    return res