Copied from:  
    - https://scientific-python.readthedocs.io/en/latest/notebooks_rst/6_Machine_Learning/04_Exercices/02_Practical_Work/02_RL_2_CarRacing.html  
    - which is based on: https://gist.github.com/lmclupr/b35c89b2f8f81b443166e88b787b03ab


In [1]:
import gym
import numpy as np 
import numpy.random
import matplotlib.pyplot as plt
import pyvirtualdisplay # only needed if running on server/headless
import cv2

from collections import deque
from pyglet.window import key # might need to comment out on server/headless
import time

from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D, Dropout, Activation, BatchNormalization, MaxPool2D, Input, concatenate, Reshape, Flatten
from keras.activations import softmax, sigmoid
from keras.optimizers import SGD, Adam, Adamax

Using TensorFlow backend.


In [2]:
# Hand-crafted image processing: splits one frame into the dashboard strip,
# a 10*10 binarised track image and four column averages around the car.
def transform(state):
    """Extract three hand-crafted features from a rendered frame.

    Parameters
    ----------
    state : RGB image array indexed as ``state[row, col]``.
        # assumes the 96x96x3 CarRacing frame -- TODO confirm against the env

    Returns
    -------
    tuple ``(bottom_black_bar_bw, upper_field_bw, car_field_t)``:
      * binarised (0/255) dashboard strip taken from rows 84+ and columns 12+,
        resized to width 84, height 12;
      * binarised track view from rows 0..83, columns 6..89, resized to
        10x10 and scaled to floats in {0.0, 1.0};
      * list of four floats: mean of binarised columns 3..6 of the patch
        ``state[66:78, 43:53]``, each divided by 255.
    """
    def _to_mask(rgb_patch, cutoff):
        # Colour is irrelevant here, so grayscale first and then binarise:
        # pixels strictly above `cutoff` become 255, everything else 0.
        gray = cv2.cvtColor(rgb_patch, cv2.COLOR_RGB2GRAY)
        _, mask = cv2.threshold(gray, cutoff, 255, cv2.THRESH_BINARY)
        return mask

    # NOTE: numpy images are indexed img[y: y + h, x: x + w], not img[x, y].
    # Dashboard strip (steering, speed, abs and gyro indicators). The first
    # 12 columns are dropped; per the original author's note they hold
    # digits that are illegible even for a learner.
    dashboard = _to_mask(state[84:, 12:], 1)
    dashboard = cv2.resize(dashboard, (84, 12), interpolation=cv2.INTER_NEAREST)

    # Track view: side margins are cropped as they carry little information.
    track = _to_mask(state[:84, 6:90], 120)
    track = cv2.resize(track, (10, 10), interpolation=cv2.INTER_NEAREST)  # rescaled to 10*10 pixels
    track = track.astype('float') / 255

    # Small patch around the car; only columns 3..6 are summarised.
    # NOTE(review): the meaning of these four values is undocumented upstream.
    car_patch = _to_mask(state[66:78, 43:53], 80)
    car_columns = [car_patch[:, col].mean() / 255 for col in range(3, 7)]

    return dashboard, track, car_columns

# This function uses the black bar at the window bottom to extract steering setting, speed, gyro and ABS data
def compute_steering_speed_gyro_abs(bottom_black_bar_bw):
    """Read the dashboard indicators from the binarised bottom bar.

    Parameters
    ----------
    bottom_black_bar_bw : 2-D array of 0/255 values, indexed [row, col].
        Row 6 and columns 0, 6, 8, 10, 12 are read, and columns up to 75 of
        row 6 are sliced, so the array must be at least that large.

    Returns
    -------
    numpy.ndarray of 7 floats, each in [0, 1]:
    ``[steering, speed, gyro, abs1, abs2, abs3, abs4]``.
    ``steering`` and ``gyro`` are 0.5 when the left and right halves of
    their indicator are equally lit, moving towards 1.0 as the right half
    dominates and towards 0.0 as the left half dominates.
    """
    # Normalise to [0, 1] *before* combining. The signed indicators add a
    # +1.0 offset that only makes sense on the unit scale; applying it to
    # raw 0..255 means (as was done previously) collapsed the centre value
    # to ~0.002 instead of 0.5.
    bar = np.asarray(bottom_black_bar_bw, dtype=float) / 255.0
    indicator_row = bar[6]

    # (right - left + 1) / 2 maps the signed difference [-1, 1] onto [0, 1].
    steering = (indicator_row[36:46].mean() - indicator_row[26:36].mean() + 1.0) / 2.
    gyro = (indicator_row[60:76].mean() - indicator_row[46:60].mean() + 1.0) / 2.

    # Vertical gauges: fraction of lit pixels in one column, ignoring the
    # last two rows of the bar.
    speed = bar[:-2, 0].mean()
    # NOTE(review): named "abs" upstream; presumably the four per-wheel ABS
    # sensor gauges of the CarRacing dashboard -- confirm.
    abs_levels = [bar[:-2, col].mean() for col in (6, 8, 10, 12)]

    return np.array([steering, speed, gyro] + abs_levels)


def qval_to_action(argmax_qval):
    """
    Decode a flat action index into a [steer, gas, brake] command.

    The index is laid out as 21 (steer) x (5 (gas) + 5 (brake)); gas and
    brake are never applied at the same time.
    e.g. index 0..9   -> steer=-1.0, sub-index 0..4 is gas, 5..9 is brake
         index 10..19 -> steer=-0.9
         index 100..109 -> steer=0.0
    Gas and brake each take the values 0, 0.2, 0.4, 0.6, 0.8.
    """
    steer_slot, pedal_slot = divmod(argmax_qval, 10)

    steer = (steer_slot - 10) / 10.
    # The boolean factor zeroes whichever pedal is not selected.
    gas = (pedal_slot < 5) * (pedal_slot / 5.)
    brake = (pedal_slot >= 5) * ((pedal_slot - 5) / 5.)

    return [steer, gas, brake]

def create_model():
    """Build and compile the two-branch Q-value network.

    Inputs (in this order):
      * image branch:  shape (10, 10, 1)
      * scalar branch: shape (1, 1, 7)

    Output: shape (210, 1) per sample -- one Q-value estimate per discrete
    action, laid out as steer x (gas | brake) = 21 x (5 + 5) = 210
    (gas and brake cannot be used simultaneously).

    Returns the compiled Keras ``Model`` (MSE loss, SGD optimizer).
    """
    inputImg = Input(shape=(10, 10, 1))
    inputScalar = Input(shape=(1, 1, 7))  # steering, speed, gyro, abs1..abs4

    # CNN - img. With Keras defaults (valid padding on the first conv,
    # 2x2 pooling) the spatial size shrinks 10 -> 8 -> 4 -> 2 -> 1, leaving a
    # (1, 1, 32) tensor that lines up with the (1, 1, *) scalar branch.
    x = Conv2D(16, 3, activation='relu')(inputImg)
    x = MaxPool2D()(x)
    x = Conv2D(24, 3, activation='relu', padding='same')(x)
    x = MaxPool2D()(x)
    x = Conv2D(32, 3, activation='relu', padding='same')(x)
    x = MaxPool2D()(x)
    x = Dense(32)(x)
    x = Model(inputs=inputImg, outputs=x)

    # FCN - scalar dashboard readings
    y = Dense(12, activation='relu')(inputScalar)
    y = Dense(24, activation='relu')(y)
    y = Model(inputs=inputScalar, outputs=y)

    # Concatenate outputs along the last (feature) axis
    combined = concatenate([x.output, y.output])  # img, vect
    z = Dense(96, activation='relu')(combined)
    # Linear head: Q-values are unbounded regression targets
    # (reward + gamma * max Q). A softmax here would force the outputs into
    # [0, 1] summing to 1, which the MSE loss can never fit.
    z = Dense(210, activation='linear')(z)
    z = Reshape((-1, 1))(z)
    model = Model(inputs=[x.input, y.input], outputs=z)
    # `metrics` is passed as a list, the form documented by Keras.
    model.compile(loss='mse', optimizer=SGD(), metrics=['accuracy'])

    return model

class DeepQModel:
    """Thin wrapper around the Keras network built by ``create_model``.

    A state ``s`` is always a pair ``(image_batch, scalar_batch)`` that is
    forwarded to the network as its two inputs.
    """

    def __init__(self, env):
        self.model = create_model()
        self.env = env

    def predict(self, s):
        """Return the network output for the first sample of state ``s``."""
        image_batch, scalar_batch = s
        batch_output = self.model.predict([image_batch, scalar_batch], verbose=0)
        return batch_output[0]

    def update(self, s, G):
        """Fit the network for one epoch on state ``s`` with target ``G``."""
        image_batch, scalar_batch = s
        self.model.fit([image_batch, scalar_batch], G, epochs=1, verbose=0)

    def sample_action(self, s, eps):
        """Epsilon-greedy choice; returns ``(action_index, q_values)``.

        With probability ``eps`` a uniformly random index in [0, 210) is
        returned, otherwise the argmax of the predicted values. The
        prediction is returned in both cases.
        """
        q_values = self.predict(s)
        # NOTE (original author): the network setup is known to be wrong
        # and does not work well with this selection scheme.
        explore = np.random.rand() < eps
        chosen = np.random.randint(0, 210) if explore else np.argmax(q_values)
        return chosen, q_values

def observation_to_state(observation):
    """Convert a raw frame into the ``(image_batch, scalar_batch)`` state pair.

    The track image from ``transform`` gets a leading batch axis and a
    trailing channel axis; the dashboard vector from
    ``compute_steering_speed_gyro_abs`` gets three leading singleton axes.
    The car-region features returned by ``transform`` are not used.
    """
    dashboard, track_img, _car_unused = transform(observation)
    scalars = compute_steering_speed_gyro_abs(dashboard)

    image_batch = track_img[np.newaxis, :, :, np.newaxis]
    scalar_batch = scalars[np.newaxis, np.newaxis, np.newaxis, :]
    return (image_batch, scalar_batch)

def run(env, model, eps, gamma):
    """Play one episode, updating ``model`` online after every step.

    Parameters
    ----------
    env : environment exposing ``reset()`` -> observation and
        ``step(action)`` -> ``(observation, reward, done, info)``.
    model : object with ``sample_action(state, eps)``, ``predict(state)`` and
        ``update(state, target)`` (see ``DeepQModel``).
    eps : exploration probability handed to ``model.sample_action``.
    gamma : discount factor applied to the bootstrapped next-state value.

    Returns
    -------
    ``(total_reward, iters)``: the undiscounted reward sum and the number of
    environment steps taken. The episode is aborted (with a printed notice)
    once more than 1500 steps have been taken.
    """
    state = observation_to_state(env.reset())
    done = False
    total_reward = 0
    iters = 0
    while not done:
        argmax_qval, qval = model.sample_action(state, eps)  # inference
        action = qval_to_action(argmax_qval)
        observation, reward, done, info = env.step(action)
        next_state = observation_to_state(observation)

        # Update, uses Q-Learning TD(0).
        # A genuinely terminal transition has no future value, so the target
        # is the reward alone. An episode cut short by Gym's TimeLimit
        # wrapper (flagged via info['TimeLimit.truncated']) is not terminal
        # for the task, so it still bootstraps from the next state.
        truncated = bool(info and info.get('TimeLimit.truncated', False))
        if done and not truncated:
            G = reward
        else:
            G = reward + gamma * np.max(model.predict(next_state))

        # Copy the prediction: slicing a NumPy array yields a view, and the
        # target must not overwrite the array returned by the model.
        y = np.array(qval)
        y[argmax_qval] = G
        model.update(state, y[np.newaxis, :])
        total_reward += reward
        iters += 1

        # Reuse the already-converted state for the next step.
        state = next_state

        if iters > 1500:
            print("Episode stuck")
            break

    return total_reward, iters

In [3]:
# Train the agent for N episodes, saving the network after each one.
env = gym.make('CarRacing-v0')
gamma = 0.99

N = 10
totalrewards = np.empty(N)
costs = np.empty(N)  # allocated but never filled in this cell
try:
    model = DeepQModel(env)
    for n in range(N):
        # Slowly decaying exploration rate; starts at 0.5 / sqrt(901) ~= 0.0167.
        eps = 0.5 / np.sqrt(n+1 + 900)
        totalrewards[n], iters = run(env, model, eps, gamma)
        # Window is n-99 .. n inclusive, i.e. at most the last 100 episodes.
        print("episode:", n, "iters", iters, "total reward:", totalrewards[n], "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-99):(n+1)].mean())
        model.model.save('race-car.h5')
finally:
    # Always release the environment (and its render window), even when
    # training is interrupted or fails.
    env.close()
print("Training Finished")

Track generation: 1278..1601 -> 323-tiles track
episode: 0 iters 1000 total reward: -75.15527950310559 eps: 0.01665741511631924 avg reward (last 100): -75.15527950310559
Track generation: 1152..1454 -> 302-tiles track
episode: 1 iters 1000 total reward: -80.06644518272394 eps: 0.01664817895530067 avg reward (last 100): -77.61086234291477
Track generation: 1076..1349 -> 273-tiles track
