## Note: I can't get the video to render in Colab, but if you copy and paste the following into a Jupyter Notebook, it works fine.

# Version 1: DQN From Github
https://github.com/andywu0913/OpenAI-GYM-CarRacing-DQN/

## Helper Functions

In [1]:
# import cv2
import numpy as np

def process_state_image(state, env_name='CarRacing-v0'):
    """Preprocess a raw environment observation before it is stacked.

    Args:
        state: Raw observation returned by the environment.
        env_name: Name of the environment the observation came from.

    Returns:
        For 'CarRacing-v0', the frame converted to a single-channel float
        array scaled by 1/255. For any other environment the observation is
        returned unchanged.
    """
    if env_name != 'CarRacing-v0':
        return state

    # Imported lazily: the module-level cv2 import is commented out, and only
    # the CarRacing path needs OpenCV, so other environments run without it.
    import cv2

    state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
    state = state.astype(float)
    state /= 255.0
    return state

def generate_state_frame_stack_from_queue(deque, env_name = 'CarRacing-v0'):
    """Turn a queue of frames into a single array for the network.

    Args:
        deque: Iterable of per-step frames (the frame-stack queue).
        env_name: Name of the environment the frames came from.

    Returns:
        For 'CarRacing-v0', the stacked frames with the stack axis moved last:
        (stack_len, x, y) -> (x, y, stack_len). For any other environment the
        stacked array is returned as-is.
    """
    stacked = np.array(deque)
    if env_name != 'CarRacing-v0':
        return stacked
    # Stack dimension becomes the channel dimension.
    return stacked.transpose((1, 2, 0))

## DQN Agent

In [2]:
import random
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Conv1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

class CarRacingDQNAgent:
    """DQN agent with an online network and a separate target network.

    Supports two environments, selected by ``env_name``:

    * 'CarRacing-v0': convolutional network over (96, 96, frame_stack_num)
      inputs, one output per entry of ``action_space``.
    * 'CartPole-v0': small dense network over 4 inputs with 2 outputs; the
      frame stack is forced to 1 and the action space to ``[0, 1]``.
    """

    def __init__(
        self,
        action_space    = None,
        frame_stack_num = 3,
        memory_size     = 5000,
        gamma           = 0.95,  # discount rate
        epsilon         = 1.0,   # exploration rate
        epsilon_min     = 0.1,
        epsilon_decay   = 0.9999,
        learning_rate   = 0.001,
        env_name        = 'CarRacing-v0'
    ):
        """Create the agent and build both networks.

        Args:
            action_space: Sequence of discrete actions. ``None`` selects the
                default 12-action CarRacing set below.
            frame_stack_num: Number of stacked frames fed to the CarRacing net.
            memory_size: Maximum number of transitions kept for replay.
            gamma: Discount rate used in the Q-learning target.
            epsilon: Initial probability of taking a random action.
            epsilon_min: Epsilon is no longer decayed once it is <= this value.
            epsilon_decay: Factor applied to epsilon after each replay.
            learning_rate: Learning rate passed to the Adam optimizer.
            env_name: 'CarRacing-v0' or 'CartPole-v0'.

        Raises:
            ValueError: If ``env_name`` is not one of the supported names.
        """
        # A None default avoids sharing one mutable list between instances.
        if action_space is None:
            action_space = [
                (-1, 1, 0.2), (0, 1, 0.2), (1, 1, 0.2), #           Action Space Structure
                (-1, 1,   0), (0, 1,   0), (1, 1,   0), #        (Steering Wheel, Gas, Brake)
                (-1, 0, 0.2), (0, 0, 0.2), (1, 0, 0.2), # Range        -1~1       0~1   0~1
                (-1, 0,   0), (0, 0,   0), (1, 0,   0)
            ]
        self.env_name        = env_name
        self.action_space    = list(action_space)
        self.frame_stack_num = frame_stack_num
        self.memory          = deque(maxlen=memory_size)
        self.gamma           = gamma
        self.epsilon         = epsilon
        self.epsilon_min     = epsilon_min
        self.epsilon_decay   = epsilon_decay
        self.learning_rate   = learning_rate
        if self.env_name == 'CartPole-v0':
            self.frame_stack_num = 1
            self.action_space = [0, 1]

        self.model           = self.build_model()
        self.target_model    = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Build and compile the Q-network for ``self.env_name``.

        Returns:
            A compiled Keras ``Sequential`` model.

        Raises:
            ValueError: If ``self.env_name`` is not supported. (Previously this
                returned None, which only failed later with an unrelated
                AttributeError.)
        """
        if self.env_name == 'CarRacing-v0':
            # Neural Net for Deep-Q learning Model
            model = Sequential()
            model.add(Conv2D(filters=6, kernel_size=(7, 7), strides=3, activation='relu', input_shape=(96, 96, self.frame_stack_num)))
            model.add(MaxPooling2D(pool_size=(2, 2)))
            model.add(Conv2D(filters=12, kernel_size=(4, 4), activation='relu'))
            model.add(MaxPooling2D(pool_size=(2, 2)))
            model.add(Flatten())
            model.add(Dense(216, activation='relu'))
            model.add(Dense(len(self.action_space), activation=None))
            # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
            model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=self.learning_rate, epsilon=1e-7))
            return model
        elif self.env_name == 'CartPole-v0':
            model = Sequential()
            model.add(Dense(24, input_shape=(4,), activation="relu"))
            model.add(Dense(24, activation='relu'))
            model.add(Dense(2, activation='linear'))
            model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate, epsilon=1e-7))
            return model
        raise ValueError('Unsupported env_name: {!r}'.format(self.env_name))

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def memorize(self, state, action, reward, next_state, done):
        """Append one transition to replay memory.

        The action is stored as its index in ``self.action_space``; a
        ``ValueError`` propagates if ``action`` is not in the action space.
        """
        self.memory.append((state, self.action_space.index(action), reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability ``self.epsilon`` a uniformly random action is
        returned; otherwise the action with the largest predicted Q-value.

        Args:
            state: For CarRacing a single un-batched state (a batch axis is
                added here); for other environments an already-batched state
                is passed straight to the model.

        Returns:
            The chosen element of ``self.action_space``.
        """
        if np.random.rand() > self.epsilon:
            if self.env_name == 'CarRacing-v0':
                act_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)[0]
            else:
                act_values = self.model.predict(state, verbose=0)[0]
            action_index = np.argmax(act_values)
        else:
            action_index = random.randrange(len(self.action_space))
        return self.action_space[action_index]

    def _batch_states(self, states):
        """Stack per-transition states into one model-ready batch."""
        batch = np.array(states)
        if self.env_name == 'CarRacing-v0':
            # Stored states are already image-shaped; stacking adds the batch axis.
            return batch
        # Non-image states are stored with a leading axis of 1 (see `act`);
        # collapse everything after the batch axis into the feature axis.
        return batch.reshape((len(states), -1))

    def replay(self, batch_size):
        """Train the online network on a random minibatch from memory.

        For each sampled transition the target for the taken action is
        ``reward`` when the episode ended, else
        ``reward + gamma * max(target_model(next_state))``; all other action
        targets keep the online network's current prediction. Epsilon is
        decayed once per call while it is above ``epsilon_min``.

        Args:
            batch_size: Maximum number of transitions to sample; fewer are
                used when memory holds fewer. Does nothing on empty memory.
        """
        sample_size = min(batch_size, len(self.memory))
        if sample_size == 0:
            return
        minibatch = random.sample(self.memory, sample_size)

        # Two batched forward passes instead of two predict calls per sample.
        states = self._batch_states([transition[0] for transition in minibatch])
        next_states = self._batch_states([transition[3] for transition in minibatch])
        targets = np.array(self.model.predict(states, verbose=0))
        next_q = np.array(self.target_model.predict(next_states, verbose=0))

        for row, (_, action_index, reward, _, done) in enumerate(minibatch):
            if done:
                targets[row, action_index] = reward
            else:
                targets[row, action_index] = reward + self.gamma * np.amax(next_q[row])

        # The batch shape follows the actual sample, not a fixed (64, 4).
        self.model.fit(states, targets, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load weights from ``name`` into the online network and sync the target."""
        self.model.load_weights(name)
        self.update_target_model()

    def save(self, name):
        """Write the target network's weights to ``name``.

        Note that the target network is saved, not the online one; it only
        matches the online network right after ``update_target_model``.
        """
        self.target_model.save_weights(name)

KeyboardInterrupt: 

## Training the DQN

In [12]:
import argparse
import gym
from collections import deque

# Train the DQN agent: play episodes, store transitions, replay minibatches,
# and periodically sync the target network, save weights and plot scores.
import os

ENV_NAME                      = 'CartPole-v0' # 'CarRacing-v0'
STARTING_EPISODE              = 1
ENDING_EPISODE                = 1000
SKIP_FRAMES                   = 2
TRAINING_BATCH_SIZE           = 64
SAVE_TRAINING_FREQUENCY       = 25
UPDATE_TARGET_MODEL_FREQUENCY = 5
EPSILON                       = 1.0
EPSILON_MIN                   = 0.1
PLOT_FREQUENCY                = 25
LEARNING_RATE                 = 0.1
if ENV_NAME == 'CartPole-v0':
    SKIP_FRAMES = 0
    EPSILON_MIN = 0

env = gym.make(ENV_NAME)
agent = CarRacingDQNAgent(epsilon=EPSILON, env_name=ENV_NAME, epsilon_min=EPSILON_MIN, learning_rate=LEARNING_RATE)

# Weights are written under ./save; create it up front so the first save
# does not fail on a missing directory.
os.makedirs('./save', exist_ok=True)

scores_buffer = []   # time frames survived per episode
rewards_buffer = []  # adjusted total reward per episode

for e in range(STARTING_EPISODE, ENDING_EPISODE+1):
    init_state = env.reset()
    init_state = process_state_image(init_state, ENV_NAME)

    total_reward = 0
    negative_reward_counter = 0
    # Start the frame stack filled with copies of the initial frame.
    state_frame_stack_queue = deque([init_state]*agent.frame_stack_num, maxlen=agent.frame_stack_num)
    time_frame_counter = 1
    done = False

    while True:

        current_state_frame_stack = generate_state_frame_stack_from_queue(state_frame_stack_queue, env_name=ENV_NAME)
        action = agent.act(current_state_frame_stack)

        # Repeat the chosen action for SKIP_FRAMES+1 steps, summing the reward.
        # SKIP_FRAMES is 0 for CartPole, so that case takes exactly one step.
        reward = 0
        for _ in range(SKIP_FRAMES+1):
            next_state, r, done, info = env.step(action)
            reward += r
            if done:
                break

        if ENV_NAME == 'CarRacing-v0':
            # Count consecutive negative rewards after the first 100 time frames;
            # the episode is cut short below once the count reaches 25.
            negative_reward_counter = negative_reward_counter + 1 if time_frame_counter > 100 and reward < 0 else 0

            # Extra bonus for the model if it uses full gas
            if action[1] == 1 and action[2] == 0:
                reward *= 1.5

        total_reward += reward

        next_state = process_state_image(next_state, ENV_NAME)
        state_frame_stack_queue.append(next_state)
        next_state_frame_stack = generate_state_frame_stack_from_queue(state_frame_stack_queue, env_name=ENV_NAME)

        agent.memorize(current_state_frame_stack, action, reward, next_state_frame_stack, done)

        if (done or negative_reward_counter >= 25 or total_reward < 0):
            print('Episode: {}/{}, Scores(Time Frames): {}, Total Rewards(adjusted): {:.2}, Epsilon: {:.2}'.format(e, ENDING_EPISODE, time_frame_counter, float(total_reward), float(agent.epsilon)))
            # Record this episode's score (previously the constant
            # ENDING_EPISODE was appended, which made the plot a flat line).
            scores_buffer.append(time_frame_counter)
            rewards_buffer.append(total_reward)
            break
        if len(agent.memory) > TRAINING_BATCH_SIZE:
            agent.replay(TRAINING_BATCH_SIZE)
        time_frame_counter += 1

    if e % UPDATE_TARGET_MODEL_FREQUENCY == 0:
        agent.update_target_model()

    if e % SAVE_TRAINING_FREQUENCY == 0:
        agent.save('./save/trial_{}.h5'.format(e))

    if not (e-1) % PLOT_FREQUENCY:
        # Index by the number of recorded scores so x and y always match,
        # even when STARTING_EPISODE is not 1.
        plt.plot(np.arange(len(scores_buffer)), np.array(scores_buffer))

env.close()

Episode: 1/1000, Scores(Time Frames): 13, Total Rewards(adjusted): 1.3e+01, Epsilon: 1.0
Episode: 2/1000, Scores(Time Frames): 14, Total Rewards(adjusted): 1.4e+01, Epsilon: 1.0
Episode: 3/1000, Scores(Time Frames): 11, Total Rewards(adjusted): 1.1e+01, Epsilon: 1.0
Episode: 4/1000, Scores(Time Frames): 11, Total Rewards(adjusted): 1.1e+01, Epsilon: 1.0


# REINFORCE

In [1]:
import random
import numpy as np
from collections import deque
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Conv1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

class REINFORCE:
    """Policy-gradient (REINFORCE) agent with a softmax policy network.

    Supports two environments, selected by ``env_name``:

    * 'CarRacing-v0': convolutional network over (96, 96, frame_stack_num)
      inputs, one output probability per entry of ``action_space``.
    * 'CartPole-v0': small dense network over 4 inputs with 2 output
      probabilities; the action space is forced to ``[0, 1]``.
    """

    def __init__(
        self,
        action_space    = None,
        gamma           = 0.95,  # discount rate
        learning_rate   = 0.001,
        env_name        = 'CarRacing-v0',
        frame_stack_num = 3,
    ):
        """Create the agent and build its policy network.

        Args:
            action_space: Sequence of discrete actions. ``None`` selects the
                default 12-action CarRacing set below.
            gamma: Discount rate (stored on the instance; not used by any
                method of this class).
            learning_rate: Learning rate passed to the Adam optimizer.
            env_name: 'CarRacing-v0' or 'CartPole-v0'.
            frame_stack_num: Channel count of the CarRacing network input.
                Previously this attribute was only set for CartPole, so
                building the CarRacing model raised AttributeError.

        Raises:
            ValueError: If ``env_name`` is not one of the supported names.
        """
        # A None default avoids sharing one mutable list between instances.
        if action_space is None:
            action_space = [
                (-1, 1, 0.2), (0, 1, 0.2), (1, 1, 0.2), #           Action Space Structure
                (-1, 1,   0), (0, 1,   0), (1, 1,   0), #        (Steering Wheel, Gas, Brake)
                (-1, 0, 0.2), (0, 0, 0.2), (1, 0, 0.2), # Range        -1~1       0~1   0~1
                (-1, 0,   0), (0, 0,   0), (1, 0,   0)
            ]
        self.env_name        = env_name
        self.action_space    = list(action_space)
        self.gamma           = gamma
        self.learning_rate   = learning_rate
        self.frame_stack_num = frame_stack_num
        if self.env_name == 'CartPole-v0':
            self.frame_stack_num = 1
            self.action_space = [0, 1]
        self.action_space_size = len(self.action_space)
        self.model           = self.build_model()

    def build_model(self):
        """Build and compile the policy network for ``self.env_name``.

        Both variants end in a softmax layer trained with categorical
        cross-entropy, because ``act`` samples from the output as a
        probability vector.

        Returns:
            A compiled Keras ``Sequential`` model.

        Raises:
            ValueError: If ``self.env_name`` is not supported.
        """
        if self.env_name == 'CarRacing-v0':
            # Neural Net for the policy (action probabilities)
            model = Sequential()
            model.add(Conv2D(filters=6, kernel_size=(7, 7), strides=3, activation='relu', input_shape=(96, 96, self.frame_stack_num)))
            model.add(MaxPooling2D(pool_size=(2, 2)))
            model.add(Conv2D(filters=12, kernel_size=(4, 4), activation='relu'))
            model.add(MaxPooling2D(pool_size=(2, 2)))
            model.add(Flatten())
            model.add(Dense(216, activation='relu'))
            # Softmax (not a linear head) so the output is a valid distribution.
            model.add(Dense(len(self.action_space), activation='softmax'))
            # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
            model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=self.learning_rate, epsilon=1e-7))
            return model
        elif self.env_name == 'CartPole-v0':
            model = Sequential()
            model.add(Dense(24, input_shape=(4,), activation="relu"))
            model.add(Dense(12, activation='relu'))
            model.add(Dense(2, activation='softmax'))
            model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=self.learning_rate, epsilon=1e-7))
            return model
        raise ValueError('Unsupported env_name: {!r}'.format(self.env_name))

    def act(self, state):
        """Sample an action from the policy's output distribution.

        Args:
            state: For CarRacing a single un-batched state (a batch axis is
                added here); for other environments an already-batched state
                is passed straight to the model.

        Returns:
            Tuple ``(action, act_values)``: the sampled element of
            ``self.action_space`` and the probability vector it was drawn from.
        """
        if self.env_name == 'CarRacing-v0':
            act_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)[0]
        else:
            act_values = self.model.predict(state, verbose=0)[0]
        # Sample an *index*; sampling from action_space itself only worked when
        # the actions happened to be 0..n-1 and fails for tuple actions.
        action_index = np.random.choice(len(self.action_space), p=act_values)
        return self.action_space[action_index], act_values

    def save(self, name):
        """Write the policy network's weights to ``name``."""
        self.model.save_weights(name)

    def action_one_hot(self, actions):
        """One-hot encode a 1-D array of integer action indices.

        Args:
            actions: NumPy integer array of shape (n,), values in
                ``[0, action_space_size)``.

        Returns:
            Float array of shape (n, action_space_size) with a single 1 per row.
        """
        a = np.zeros((actions.shape[0], self.action_space_size))
        a[np.arange(actions.shape[0]),actions] = 1
        return a

    def train_on_batch(self, states, gradients):
        """Run one gradient step on the policy network.

        Args:
            states: Batch of states fed to the model.
            gradients: Per-state target vectors for the cross-entropy loss.

        Returns:
            Whatever the model's ``train_on_batch`` returns (previously this
            result was discarded and None was returned).
        """
        return self.model.train_on_batch(states, gradients)

In [12]:
import argparse
import gym
from collections import deque

# Train the REINFORCE agent: play one full episode, compute discounted
# returns, then take one policy update step per episode.
import os

ENV_NAME                      = 'CartPole-v0' # 'CarRacing-v0'
STARTING_EPISODE              = 1
ENDING_EPISODE                = 800
SAVE_TRAINING_FREQUENCY       = 50
LEARNING_RATE                 = 0.01
GAMMA                         = 0.99
ALPHA                         = 1e-4
EPISODE_LENGTH                = 500
RANDOM_SEED                   = 1

env = gym.make(ENV_NAME)
# Seed NumPy's generator (np.random.rand(RANDOM_SEED) only drew random numbers
# and left the generator unseeded, so action sampling was not reproducible).
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
env.seed(RANDOM_SEED)
agent = REINFORCE(env_name=ENV_NAME, learning_rate=LEARNING_RATE)

# Weights are written under ./save; create it up front so the first save
# does not fail on a missing directory.
os.makedirs('./save', exist_ok=True)

avg_reward_buffer = np.zeros(ENDING_EPISODE+1-STARTING_EPISODE)  # reward sum per episode
reward_sum_arr = []
counter = 0
# Discount table [GAMMA^0, GAMMA^1, ...]. It is indexed by time step within an
# episode, so it is sized by EPISODE_LENGTH rather than by the episode count.
Gamma = GAMMA ** np.arange(EPISODE_LENGTH)

for e in range(STARTING_EPISODE, ENDING_EPISODE+1):
    state = env.reset()
    states, rewards, actions,  actions_prob = [], [], [], []
    reward_sum = 0
    done = False

    # Play an Episode
    for t in range(EPISODE_LENGTH):
        action, action_prob = agent.act(state[np.newaxis,:])
        next_state, reward, done, info = env.step(action)
        reward_sum += reward

        states.append(state)
        actions_prob.append(action_prob)
        actions.append(action)
        rewards.append(reward)

        state = next_state

        if done: break
    reward_sum_arr.append(reward_sum)
    if not e % 50:
        print("Episode:", e, " | Reward Sum Avg Last 50:", np.average(np.array(reward_sum_arr[-50:])))
        reward_sum_arr = []

    # Calculate Discounted Rewards
    # Gamma[:T] * rewards          -> [GAMMA^0*r_0, GAMMA^1*r_1, ..., GAMMA^(T-1)*r_(T-1)]
    # reverse, cumsum, reverse     -> entry t is sum_{k>=t} GAMMA^k * r_k
    # That is GAMMA^t * G_t, where G_t = sum_{k>=t} GAMMA^(k-t) * r_k is the
    # return from step t; i.e. each return carries an extra GAMMA^t factor
    # measured from the start of the episode. No mean/std normalization is applied.
    discounted_rewards = np.cumsum((Gamma[:len(rewards)] * np.array(rewards))[::-1])[::-1]

    # Policy Gradient Update
    actions_prob = np.array(actions_prob)
    actions, states = np.array(actions), np.array(states)
    # Training targets for the cross-entropy loss: the current probabilities
    # nudged toward the taken action in proportion to the discounted reward.
    targets = ALPHA*(agent.action_one_hot(actions) - actions_prob)*discounted_rewards[:,np.newaxis] + actions_prob
    agent.train_on_batch(states, targets)

    avg_reward_buffer[counter] = reward_sum
    counter += 1
    if e % SAVE_TRAINING_FREQUENCY == 0:
        agent.save('./save/reinforce_trial_{}.h5'.format(e))

env.close()
plt.plot(np.arange(ENDING_EPISODE+1-STARTING_EPISODE), avg_reward_buffer)
plt.show()

Episode: 50  | Reward Sum Avg Last 50: 30.06
Episode: 100  | Reward Sum Avg Last 50: 34.14
Episode: 150  | Reward Sum Avg Last 50: 37.36
Episode: 200  | Reward Sum Avg Last 50: 36.62


KeyboardInterrupt: 