### Defining the agent parameters

In [1]:
import gym
class DQNAgent:
    """Bundle of DQN hyper-parameters plus the Gym environment they are used with.

    Every constructor argument is optional; ``DQNAgent()`` yields exactly the
    configuration that used to be hard-coded.

    Args:
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon: initial exploration rate (probability of a random action).
        epsilon_min: epsilon stops being decayed once it is no longer above this.
        epsilon_decay: multiplicative factor applied to epsilon on each decay.
        alpha: optimizer learning rate.
        alpha_decay: learning-rate decay handed to the optimizer.
        start_train: number of stored transitions required before epsilon
            decay and replay training kick in.
    """

    def __init__(self, gamma=1.0, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.999, alpha=0.01, alpha_decay=0.01,
                 start_train=800):
        # RL parameters
        self.gamma = gamma                  # Discount factor
        self.epsilon = epsilon              # Exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.alpha = alpha                  # Learning rate
        self.alpha_decay = alpha_decay
        self.env = gym.make('CartPole-v1')
        self.start_train = start_train      # Buffer size that triggers epsilon decay / training

### Building the Network & Functions

In [4]:
import random
import numpy as np
import math
import tensorflow as tf
from collections import deque
import gym
from threading import Thread
# Training parameters
n_episodes = 220    # upper bound on the number of training episodes
n_win_ticks = 195   # mean survival time (in ticks) that counts as solved; an episode has 500 ticks in total
batch_size = 64     # number of transitions sampled per replay step
quiet = False       # when True, Run() skips its periodic and final status messages

# Experience-replay buffer; once 100000 transitions are stored the oldest are discarded
memory = deque(maxlen=100000)

# Shared agent object holding the RL hyper-parameters and the environment
agent = DQNAgent()

# Building the neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Model Definition
# Q-network: takes a 4-value observation and outputs one Q-value per action (2 actions).
model = Sequential()
model.add(Dense(96, input_dim=4, activation='relu'))
model.add(Dense(48, activation='relu'))
model.add(Dense(24, activation='relu'))
# The output layer must be linear: Q-values are unbounded regression targets, and a
# ReLU here clips them at 0 and can leave an output with zero gradient.
model.add(Dense(2, activation='linear'))

# `lr=` / `decay=` are legacy Adam arguments that newer Keras releases reject.
# InverseTimeDecay with decay_steps=1 gives the same schedule the legacy `decay`
# argument produced: alpha / (1 + alpha_decay * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=agent.alpha,
    decay_steps=1,
    decay_rate=agent.alpha_decay,
)
model.compile(loss='mse', optimizer=Adam(learning_rate=lr_schedule))

# Agent Functions
def Remember(state,action,reward,next_state,done):
    """Store one transition in the replay buffer and anneal exploration.

    The transition is appended to ``memory`` as a 5-tuple. Afterwards
    ``agent.epsilon`` is multiplied by ``agent.epsilon_decay``, but only when the
    buffer holds more than ``agent.start_train`` transitions and epsilon is still
    above ``agent.epsilon_min``.
    """
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

    # No decay while the buffer is still warming up
    if len(memory) <= agent.start_train:
        return

    if agent.epsilon > agent.epsilon_min:
        agent.epsilon = agent.epsilon * agent.epsilon_decay
           

def choose_Action(state,epsilon):
    """Pick an action epsilon-greedily.

    A uniform random number is drawn; if it is <= ``epsilon`` a random action is
    sampled from the environment's action space, otherwise the index of the
    largest value in ``model.predict(state)`` is returned.
    """
    if np.random.rand() <= epsilon:
        # Explore
        return agent.env.action_space.sample()

    # Exploit
    q_values = model.predict(state)
    return np.argmax(q_values)
# At the start actions are chosen at random; as epsilon decays, the agent increasingly picks the action with the highest predicted value.

def preprocess_State(state):
    """Return ``state`` reshaped into a single-row batch of shape (1, 4)."""
    single_row_batch = (1, 4)
    return np.reshape(state, single_row_batch)

def Replay(batch_size):
    """Run one experience-replay training step on the Q-network.

    Does nothing until ``memory`` holds at least ``agent.start_train``
    transitions. Otherwise samples up to ``batch_size`` transitions, builds the
    standard DQN targets

        Q(s, a) <- r                                   if the transition ended the episode
        Q(s, a) <- r + agent.gamma * max_a' Q(s', a')  otherwise

    (both Q terms come from the same ``model``) and fits the model on them.

    Args:
        batch_size: maximum number of transitions to train on; fewer are used
            when the buffer is smaller than this.
    """
    if len(memory) < agent.start_train:
        return

    minibatch = random.sample(memory, min(len(memory), batch_size))
    # Size everything from the actual sample: it can be shorter than batch_size
    n_samples = len(minibatch)
    if n_samples == 0:
        return

    # The observation size lives in the space's `.shape`; the space itself is not subscriptable
    state_dim = agent.env.observation_space.shape[0]
    state = np.zeros((n_samples, state_dim))
    next_state = np.zeros((n_samples, state_dim))
    action, reward, done = [], [], []

    for i, (s, a, r, s_next, finished) in enumerate(minibatch):
        state[i] = s
        action.append(a)
        reward.append(r)
        next_state[i] = s_next
        done.append(finished)

    actions = np.asarray(action)
    rewards = np.asarray(reward, dtype=float)
    dones = np.asarray(done, dtype=bool)

    # Batch prediction
    target = model.predict(state)
    target_next = model.predict(next_state)

    # Standard DQN: only the Q-value of the action actually taken is corrected;
    # terminal transitions get the bare reward, the rest bootstrap from max_a' Q(s', a')
    best_next = np.amax(target_next, axis=1)
    corrected = np.where(dones, rewards, rewards + agent.gamma * best_next)
    target[np.arange(n_samples), actions] = corrected

    model.fit(state, target, batch_size=n_samples, verbose=0)

# Define Run program function
def Run(render=True):
    """Train the agent on ``agent.env`` for at most ``n_episodes`` episodes.

    Every step picks an epsilon-greedy action, advances the environment, stores
    the transition with ``Remember`` and, during even-numbered episodes, runs
    ``Replay(batch_size)``. Training stops early, saving the model to
    ``'cpModel'``, once the mean survival time over the last (up to 100)
    episodes reaches ``n_win_ticks``.

    Args:
        render: draw the environment after every step. Defaults to True (the
            original behaviour); pass False for headless or faster training.

    Returns:
        ``episode - 100`` (zero-based episode index minus 100) when the
        mean-score criterion is met; otherwise the zero-based index of the last
        episode run, or -1 when ``n_episodes`` is 0.
    """
    maxTicks = 0
    scores = deque(maxlen=100)
    episode = -1  # keeps the final report and return value defined when n_episodes == 0
    for episode in range(n_episodes):
        # Newer Gym releases return (observation, info) from reset(); older ones
        # return the observation alone
        reset_result = agent.env.reset()
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            reset_result = reset_result[0]
        state = preprocess_State(reset_result)
        done = False
        num_ticks = 0
        while not done:
            action = choose_Action(state, agent.epsilon)
            # Newer Gym releases return (obs, reward, terminated, truncated, info);
            # older ones return (obs, reward, done, info)
            step_result = agent.env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = bool(terminated or truncated)
            else:
                next_state, reward, done, _ = step_result
            if render:
                agent.env.render()
            next_state = preprocess_State(next_state)
            Remember(state, action, reward, next_state, done)
            state = next_state
            num_ticks += 1
            if episode % 2 == 0:
                # Replay after every step, but only during even-numbered episodes
                Replay(batch_size)

        scores.append(num_ticks)
        mean_score = np.mean(scores)

        if num_ticks > maxTicks:
            maxTicks = num_ticks
            print("max ticks this episode: {}".format(maxTicks))

        if mean_score >= n_win_ticks:   # Stop once the running mean reaches the number of ticks needed to win
            model.save('cpModel')
            # `episode` is zero-based, so episode + 1 episodes have been run
            if not quiet: print("Ran {} episodes, And Mean score Solved.".format(episode + 1))
            # NOTE(review): negative when the criterion is met before episode 100 - confirm intended meaning
            return episode - 100

        if num_ticks >= n_win_ticks:
            print("Solved in the {} episode, num of ticks:{}".format(episode, num_ticks))

        if episode % 20 == 0 and not quiet:
            print("[Episode {}]--> Mean survival time over last episodes was {} ticks, total ticks {} eps: {}".format(episode, mean_score, num_ticks, agent.epsilon))

    if not quiet: print('Did not solve after {} episodes'.format(episode + 1))
    return episode

In [5]:
# Train the agent; the value shown below the log is Run()'s return value
# (episode index minus 100 when the mean-score criterion is met, else the last episode index).
Run()

max ticks this episode: 23
[Episode 0]--> Mean survival time over last episodes was 23.0 ticks, total ticks 23 eps: 1.0
max ticks this episode: 29
max ticks this episode: 38
[Episode 20]--> Mean survival time over last episodes was 19.142857142857142 ticks, total ticks 15 eps: 1.0
max ticks this episode: 58
[Episode 40]--> Mean survival time over last episodes was 21.975609756097562 ticks, total ticks 21 eps: 0.9038873549665959
[Episode 60]--> Mean survival time over last episodes was 20.42622950819672 ticks, total ticks 13 eps: 0.6400409317729626
max ticks this episode: 61
max ticks this episode: 73
max ticks this episode: 93
max ticks this episode: 181
[Episode 80]--> Mean survival time over last episodes was 27.765432098765434 ticks, total ticks 67 eps: 0.2346348076972527
max ticks this episode: 278
Solved in the 85 episode, num of ticks:278
Solved in the 87 episode, num of ticks:259
max ticks this episode: 286
Solved in the 97 episode, num of ticks:286
Solved in the 99 episode, num

40

In [6]:
# Show the layers, output shapes and parameter counts of the Q-network.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 96)                480       
_________________________________________________________________
dense_1 (Dense)              (None, 48)                4656      
_________________________________________________________________
dense_2 (Dense)              (None, 24)                1176      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 6,362
Trainable params: 6,362
Non-trainable params: 0
_________________________________________________________________
