In [60]:
import gym
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import TensorBoard

# Number of episodes the training loop runs (it iterates over range(EPISODES)).
EPISODES = 300

In [61]:
class Agent:
    """Deep Q-learning agent with an online network and a separate target network.

    The online network (``self.model``) selects actions and is the one being
    fitted. The target network (``self.targetModel``) supplies the bootstrap
    value for the temporal-difference targets and only changes when
    ``update_target_model`` is called, so callers are expected to invoke that
    method periodically; otherwise the bootstrap network stays at the weights
    it had at construction time.
    """

    def __init__(self, maxlen, state_size, action_size, epsilon, epsilonMin, epsilonDecay, learningRate, discountFactor, batchSize, trainStart):
        """Store the hyperparameters and build both networks.

        Args:
            maxlen: Capacity of the replay memory (oldest entries are dropped).
            state_size: Number of features in one observation.
            action_size: Number of discrete actions (output units of the network).
            epsilon: Initial exploration probability.
            epsilonMin: Lower bound for the exploration probability.
            epsilonDecay: Multiplicative factor applied by ``decayEpsilon``.
            learningRate: Learning rate handed to the Adam optimizer.
            discountFactor: Discount applied to the bootstrapped next-state value.
            batchSize: Number of transitions sampled per training step.
            trainStart: Minimum number of stored transitions before training begins.
        """
        self.state_size = state_size
        self.action_space = action_size
        self.epsilon = epsilon
        self.epsilonMin = epsilonMin
        self.epsilonDecay = epsilonDecay
        self.learningrate = learningRate
        self.batchSize = batchSize
        self.trainStart = trainStart
        self.discountFactor = discountFactor
        self.memory = deque(maxlen=maxlen)
        self.model = self.buildModel()
        self.targetModel = self.buildModel()
        # Both networks are initialised independently; start them from the
        # same weights so the first bootstrap values are consistent.
        self.update_target_model()

    def buildModel(self):
        """Build and compile a two-hidden-layer MLP mapping a state to one Q-value per action.

        Prints the Keras model summary as a side effect.
        """
        model = Sequential([
            Dense(24, input_dim=self.state_size, activation='relu',
                  kernel_initializer='he_uniform'),
            Dense(24, activation='relu',
                  kernel_initializer='he_uniform'),
            Dense(self.action_space, activation="linear", kernel_initializer='he_uniform')
        ])
        model.summary()
        model.compile(loss='mse', optimizer=Adam(self.learningrate))
        return model

    def update_target_model(self):
        """Copy the online network's current weights into the target network."""
        # get_weights must be *called*; passing the bound method itself hands
        # set_weights a non-sequence and the copy never happens.
        self.targetModel.set_weights(self.model.get_weights())

    def takeAction(self, state, env):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation shaped ``(1, state_size)`` as fed to ``predict``.
            env: Environment whose ``action_space.sample()`` provides random actions.

        Returns:
            A random action with probability ``epsilon``, otherwise the index of
            the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return env.action_space.sample()
        # verbose=0 keeps Keras from printing a progress bar on every step.
        return int(np.argmax(self.model.predict(state, verbose=0)))

    def decayEpsilon(self):
        """Multiply epsilon by the decay factor, never going below ``epsilonMin``."""
        self.epsilon = max(self.epsilonMin, self.epsilon * self.epsilonDecay)

    def remember(self, state, action, reward, sprime, done):
        """Append one transition ``(state, action, reward, sprime, done)`` to the replay memory."""
        self.memory.append((state, action, reward, sprime, done))

    def train(self, tb):
        """Run one gradient step on a random minibatch from the replay memory.

        Does nothing until the memory holds at least ``trainStart`` transitions
        (and at least ``batchSize``, which ``random.sample`` requires).

        Args:
            tb: A Keras callback, a list/tuple of callbacks, or ``None``.
        """
        if len(self.memory) < max(self.trainStart, self.batchSize):
            return

        batch = random.sample(self.memory, self.batchSize)
        # Each stored state is a row vector; stack them into (batchSize, state_size).
        states = np.vstack([np.reshape(t[0], (1, self.state_size)) for t in batch])
        nextStates = np.vstack([np.reshape(t[3], (1, self.state_size)) for t in batch])
        actions = np.array([t[1] for t in batch], dtype=int)
        rewards = np.array([t[2] for t in batch], dtype=float)
        dones = np.array([t[4] for t in batch], dtype=float)

        # Copy so the in-place target assignment below never hits a read-only buffer.
        targets = np.array(self.model.predict(states, verbose=0))
        # Bootstrap from the target network, not the network being fitted.
        nextQ = np.asarray(self.targetModel.predict(nextStates, verbose=0))

        # Only the Q-value of the action actually taken is moved toward the
        # temporal-difference target; terminal transitions get no bootstrap term.
        tdTargets = rewards + self.discountFactor * (1.0 - dones) * nextQ.max(axis=1)
        targets[np.arange(self.batchSize), actions] = tdTargets

        # Keras documents `callbacks` as a list; accept a bare callback too.
        if tb is None:
            callbacks = None
        elif isinstance(tb, (list, tuple)):
            callbacks = list(tb)
        else:
            callbacks = [tb]
        self.model.fit(states, targets, batch_size=self.batchSize, verbose=0, epochs=1, callbacks=callbacks)


In [62]:
if __name__ == "__main__":
    # Training driver: runs EPISODES episodes of CartPole-v1, storing every
    # transition in the agent's replay memory and training after each step.
    TARGET_SYNC_EVERY = 5  # episodes between target-network refreshes

    tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True)
    env = gym.make('CartPole-v1')
    stateSize = env.observation_space.shape[0]
    actionSpace = env.action_space.n
    print(stateSize, actionSpace)
    agent = Agent(
        maxlen=2000,
        state_size=stateSize,
        action_size=actionSpace,
        epsilon=1,
        epsilonMin=0.1,
        epsilonDecay=0.95,
        learningRate=0.01,
        discountFactor=0.9,
        batchSize=32,
        trainStart=1000,
    )
    results = []
    try:
        for i in range(EPISODES):
            if i % TARGET_SYNC_EVERY == 0:
                # Copy the online weights into the target network. A bare
                # `agent.update_target_model` (no parentheses) is only an
                # attribute lookup and performs no copy; the Keras weight API
                # is used on the two networks directly here.
                agent.targetModel.set_weights(agent.model.get_weights())
            state, info = env.reset()
            state = np.reshape(state, (1, stateSize))
            done = False
            score = 0
            while not done:
                action = agent.takeAction(state, env)
                sprime, reward, fell, limit, info = env.step(action)
                # Reshape the *new* observation; reshaping `state` here would
                # store s' == s and freeze the agent's view of the environment.
                sprime = np.reshape(sprime, (1, stateSize))
                # Episode ends on termination (pole fell) or truncation (step limit).
                done = fell or limit
                agent.remember(state, action, reward, sprime, done)
                # Keras documents `callbacks` as a list of callbacks.
                agent.train([tensorboard])
                state = sprime
                score += 1
            results.append(score)
            agent.decayEpsilon()
            print(f'''episode: {i}, epsilon: {agent.epsilon:.2f},
memoryLen: {len(agent.memory)}, score: {score}''')
    finally:
        # Release the environment even if the run is interrupted.
        env.close()

4 2
Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_113 (Dense)           (None, 24)                120       
                                                                 
 dense_114 (Dense)           (None, 24)                600       
                                                                 
 dense_115 (Dense)           (None, 2)                 50        
                                                                 
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_116 (Dense)           (None, 24)                120       
                                                                 
 dense_117 (Dense)          

KeyboardInterrupt: 