In [9]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

import numpy as np
import gym
import random
from collections import deque
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import pickle
import time


In [10]:
class ReplayMemory(object):
    def __init__(self, list, max_size):
        self.max_size = max_size
        self.cntr = 0
        self.mem = deque(list, maxlen = self.max_size)

    def store(self, curr_state, action, reward, next_state, done):
        self.mem.append((curr_state, action, reward, next_state, done))
        self.cntr += 1

    def sample(self, batch_size):
        batch_size = min(batch_size, self.cntr)
        return batch_size, random.sample(self.mem, batch_size)


In [11]:
#set base_path to current directory (path of folder where this nootebook is saved)
base_dir = "/home/kavitawagh/KAVITA/CP314/Project/"
project_path = base_dir + "DQN/CartPole/"

In [19]:

class DQNAgent(object):
    def __init__(self):
        self.input_shape = (4, ) #position of cart, velocity of cart, angle of pole, rotation rate of pole
        self.action_count = 2
        self.epsilon_end = 0.01
        self.epsilon_step = 0.996
        self.no_of_episodes = 500
        self.replay_mem_max_size = 10000 #replay memory capacity
        self.batch_size = 16 
        self.skip_frames = 0
        self.gamma = 0.99 #discount factor
        self.C = 5000 
        self.l_rate = 0.0005 
        self.discrete_action_ids = [i for i in range(self.action_count)]
        self.epi_score = 0
        self.total_score = []
        self.skip_cntr = 0 #No. of frames to skip
        self.data_path = project_path + "data/"
        
        #These file are saved every 10 episodes to recover training from system crash 
        self.log_file_path = self.data_path + "dqn_log.txt"
        self.log_file = open(self.log_file_path, "a+")
        self.replay_mem_file = self.data_path + "replay_mem_data.file"
        self.epsilon_file = self.data_path + "epsilon.file"
        self.trained_model_file = self.data_path + "trained_model.h5"
        
        #Load replay memory and epsilon value if already exist in file 
        try:
            self.replay_mem = pickle.load(open(self.replay_mem_file, "rb"))
            if len(self.replay_mem.mem) > self.replay_mem_max_size:
                self.replay_mem = ReplayMemory(self.replay_mem.mem, self.replay_mem_max_size)
            print("Replay Memory loaded. Length: ", len(self.replay_mem.mem))
        except Exception as e:
            self.replay_mem = ReplayMemory([], self.replay_mem_max_size)
            print("Error in loading Replay Memory. ", e)

        try:
            self.epsilon = pickle.load(open(self.epsilon_file, "rb"))
            print("Epsilon loaded. Value: ", self.epsilon)
        except Exception as e:
            self.epsilon = 1
            print("Error in loading Epsilon. ", e)

        #Load trained model in exists otherwise create new model
        try:
            self.model = tfk.models.load_model(self.trained_model_file)
            print("Trained model loaded.")
        except Exception as e:
            print("Error in loading model. ", e)
            self.model = tfk.models.Sequential([
            tfkl.Dense(16, activation = 'relu', input_shape=self.input_shape), 
            tfkl.Dense(8, activation = 'relu'),
            tfkl.Dense(4, activation='relu'),
            tfkl.Dense(self.action_count) 
            ])

            #self.model.compile(optimizer=tfk.optimizers.RMSprop(learning_rate=self.l_rate, momentum=0.95), loss='mse')
            self.model.compile(optimizer=tfk.optimizers.Adam(learning_rate=self.l_rate), loss='mse')
        self.new_weights = self.old_weights = self.model.get_weights()

    def take_skip_action(self, curr_state):
        if self.skip_cntr % self.skip_frames != 0:
            next_x, reward, done, info = env.step(self.skip_action)
            self.epi_score += reward
            self.skip_cntr += 1
            skipped = True
        else:
            next_x = None
            reward = 0
            done = False
            info = None
            self.skip_cntr = 0
            skipped = False
        return next_x, reward, done, info, skipped

    def take_e_greedy_action(self, curr_state):
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.choice(self.discrete_action_ids)
        else:
            self.model.set_weights(self.new_weights)
            actions = self.model.predict(curr_state[np.newaxis, : ])
            action = np.argmax(actions)
        self.skip_action = action
        next_x, reward, done, info = env.step(action)
        self.epi_score += reward
        return next_x, action, reward, done, info
    
    def take_predicted_action(self, curr_state):
        actions = self.model.predict(current_state[np.newaxis, : ])
        action = np.argmax(actions)
        next_x, reward, done, info = env.step(self.action)
        self.epi_score += reward
        return next_x, action, reward, done, info

    def update_epsilon(self):
        if self.epsilon > self.epsilon_end:
            self.epsilon = self.epsilon * self.epsilon_step 
        else:
            self.epsilon = self.epsilon_end        

    def store_transition(self, curr_state, action, reward, next_state, done):
        self.replay_mem.store(curr_state, action, reward, next_state, float(done))

    def sample_transition(self):
        batch_size, samples = self.replay_mem.sample(self.batch_size)
        curr_states, actions, rewards, next_states, dones = zip(*samples)
        curr_states = np.stack(curr_states)
        actions = np.stack(actions)
        rewards = np.stack(rewards)
        next_states = np.stack(next_states)
        dones = np.stack(dones)
        return batch_size, curr_states, actions, rewards, next_states, dones

    def optimize_loss(self, batch_size, curr_states, actions, rewards, next_states, dones):
        self.model.set_weights(self.old_weights)
        q_vals = self.model.predict(next_states)
        q_vals = np.max(q_vals, axis=1)
        batch_list = np.arange(batch_size, dtype=np.int32)
        self.model.set_weights(self.new_weights)
        target_q_vals = self.model.predict(curr_states)
        target_q_vals[batch_list, actions] = rewards + (1 - dones) * self.gamma * q_vals
        self.old_weights = self.new_weights        
        hist = self.model.fit(curr_states, target_q_vals, verbose=0)
        self.new_weights = self.model.get_weights()        

    def reset_target_params(self, epochs):
        if epochs % self.C == 0:
            self.old_weights = self.new_weights

    def end_episode(self):
        self.total_score.append(self.epi_score)
        self.epi_score = 0

    def print_status(self, episode, elapsed_time):
        str = "\nepisode: {}  score: {:.5f}  epsilon: {:.3f}  {:.2f}s {}"
        str = str.format(episode, self.epi_score, self.epsilon, \
                         elapsed_time, time.strftime("%Y-%m-%d %H:%M:%S"))
        self.log_file.write(str)
        print(str)

    def close(self):
        self.log_file.close()

    def save_partial_status(self, episode):
        if episode % 10 == 0 or episode == self.no_of_episodes:
            agent.model.save(self.trained_model_file)
            self.log_file.close()
            self.log_file = open(self.log_file_path, "a+")
            with open(self.epsilon_file, "wb") as f:
                pickle.dump(self.epsilon, f, pickle.HIGHEST_PROTOCOL)
                
           #If size of replay memory is too large then following code might crash. Hence it is commented. 
           # with open(self.replay_mem_file, "wb") as f:
           #     pickle.dump(self.replay_mem, f, pickle.HIGHEST_PROTOCOL)        
           #print("Mem length: ", len(self.replay_mem.mem))
    
    def plot(self):
        if len(self.total_score) > 1:
            N = np.arange(0, self.no_of_episodes)
            plt.figure()
            plt.plot(N, self.total_score, label = "reward")
            plt.title("")
            plt.xlabel("Episode #")
            plt.ylabel("Reward")
            plt.legend()
            plt.savefig(self.data_path + 'train_RewardPlot.png')
            plt.close()

In [21]:
#MODEL TRAINING

agent = DQNAgent()

epi_start = 1
epochs = 0
env = gym.make ('CartPole-v0') 
#env = wrap_env(gym.make('CarRacing-v0'))
print(env.observation_space, env.action_space)

for episode in range(epi_start, agent.no_of_episodes + epi_start):
    start_time = time.time()
    done = False
    curr_state = env.reset()

    while not done:
        epochs += 1

        if agent.skip_frames > 0 :
            curr_state, reward, done, info, skipped = agent.take_skip_action()
            if skipped:
                continue

        next_state, action, reward, done, info = agent.take_e_greedy_action(curr_state)
        agent.store_transition(curr_state, action, reward, next_state, done)
        curr_state = next_state
        batch_size, curr_states, actions, rewards, next_states, dones = agent.sample_transition()

        agent.optimize_loss(batch_size, curr_states, actions, rewards, next_states, dones)

        agent.reset_target_params(epochs)

    agent.update_epsilon()
    agent.print_status(episode, time.time() - start_time)
    agent.save_partial_status(episode)
    agent.end_episode()

env.close()
agent.close()
agent.plot()


Error in loading Replay Memory.  [Errno 2] No such file or directory: '/home/kavitawagh/KAVITA/CP314/Project/DQN/CartPole/data/replay_mem_data.file'
Epsilon loaded. Value:  0.1347935812106403
Trained model loaded.
Box(4,) Discrete(2)




KeyboardInterrupt: 

In [59]:
#If you want to save replay memory then run this code
with open(agent.replay_mem_file, "wb") as f:
    pickle.dump(agent.replay_mem, f, pickle.HIGHEST_PROTOCOL)

In [22]:

with open(agent.data_path + "train_score.file", "wb") as f:
    pickle.dump(agent.total_score, f, pickle.HIGHEST_PROTOCOL)

In [7]:
#TESTING 

env = gym.make ('CartPole-v0') 

model = tfk.models.load_model(project_path + "data/trained_model.h5")
no_of_episodes = 10
score = 0
score_list = []
for epi in range(1, no_of_episodes+1):
    start_time = time.time()
    state = env.reset()
    env.render()
    done = False
    while not done:
        #actions = model.predict(state[np.newaxis, :])
        #action = np.argmax(actions)
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        score += reward 
        env.render()
    print("Episode: {} Score: {:.3f} Time: {}s".format(epi, score, time.time() - start_time))
    score_list.append(score)
    score = 0
    
with open(project_path + "data/test_score.file", "wb") as f:
    pickle.dump(score_list, f, pickle.HIGHEST_PROTOCOL)
env.close()

if epi > 1:
    N = np.arange(0, no_of_episodes)
    plt.figure()
    plt.plot(N, score_list, label = "reward")
    plt.title("Testing rewards")
    plt.xlabel("Episode #")
    plt.ylabel("Reward")
    plt.legend()
    plt.savefig(project_path + 'data/test_RewardPlot.png')
    plt.close()

Episode: 1 Score: 40.000 Time: 0.5703773498535156s
Episode: 2 Score: 14.000 Time: 0.24984049797058105s
Episode: 3 Score: 32.000 Time: 0.5498428344726562s
Episode: 4 Score: 35.000 Time: 0.5999195575714111s
Episode: 5 Score: 11.000 Time: 0.19954442977905273s
Episode: 6 Score: 21.000 Time: 0.36696863174438477s
Episode: 7 Score: 14.000 Time: 0.24939298629760742s
Episode: 8 Score: 52.000 Time: 0.8832278251647949s
Episode: 9 Score: 21.000 Time: 0.36673593521118164s
Episode: 10 Score: 26.000 Time: 0.449840784072876s
