This repository has been archived by the owner on Apr 25, 2023. It is now read-only.

Update ddqn.py #30

Open
wants to merge 1 commit into base: master
58 changes: 34 additions & 24 deletions ddqn.py
@@ -17,17 +17,16 @@ def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
-        self.gamma = 0.95  # discount rate
+        self.gamma = 0.98  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
-        self.epsilon_decay = 0.99
+        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    """Huber loss for Q Learning

    References: https://en.wikipedia.org/wiki/Huber_loss
                https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss
    """
@@ -47,8 +46,7 @@ def _build_model(self):
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
-        model.compile(loss=self._huber_loss,
-                      optimizer=Adam(lr=self.learning_rate))
+        model.compile(loss=self._huber_loss, optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
@@ -66,16 +64,24 @@ def act(self, state):

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
+        stateBatch = np.zeros((batch_size,self.state_size))
+        targetBatch = np.zeros((batch_size,self.action_size))
Owner: could you use snake case to match with the rest of the code?
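For illustration, the requested rename could look roughly like this (a snake_case sketch only, not part of the commit; it also swaps the manual x counter for enumerate, which is an assumption about the intended cleanup):

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        # snake_case buffers (previously stateBatch / targetBatch)
        state_batch = np.zeros((batch_size, self.state_size))
        target_batch = np.zeros((batch_size, self.action_size))
        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            state_batch[i] = state[0]
            target_batch[i] = self.model.predict(state)[0]
            if done:
                target_batch[i][action] = reward
            else:
                # TD target taken from the target network, as in this diff
                t = self.target_model.predict(next_state)[0]
                target_batch[i][action] = reward + self.gamma * np.amax(t)
        # one fit per minibatch rather than one fit per transition
        self.model.fit(state_batch, target_batch, epochs=1, verbose=0)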

+        x = 0

        for state, action, reward, next_state, done in minibatch:
-            target = self.model.predict(state)
+            stateBatch[x] = state[0]

+            targetBatch[x] = self.model.predict(state)[0]
            if done:
-                target[0][action] = reward
+                targetBatch[x][action] = reward
            else:
-                # a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
-                target[0][action] = reward + self.gamma * np.amax(t)
-                # target[0][action] = reward + self.gamma * t[np.argmax(a)]
-            self.model.fit(state, target, epochs=1, verbose=0)
+                targetBatch[x][action] = reward + self.gamma * np.amax(t)

+            x+=1


+        self.model.fit(stateBatch, targetBatch, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

@@ -92,31 +98,35 @@ def save(self, name):
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-ddqn.h5")
-    done = False
-    batch_size = 32


+    batch_size = 128
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
-        for time in range(500):
+        done = False
+        cumReward = 0
+        while not done:
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
-            #reward = reward if not done else -10
+            env.render()

+            cumReward+=reward
+            reward = reward if not done else -10
            x,x_dot,theta,theta_dot = next_state
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
-            reward = r1 + r2
+            reward = reward + r1 + r2



            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
-            if done:
-                agent.update_target_model()
-                print("episode: {}/{}, score: {}, e: {:.2}"
-                      .format(e, EPISODES, time, agent.epsilon))
-                break
-            if len(agent.memory) > batch_size:
-                agent.replay(batch_size)
+            agent.replay(min(batch_size,len(agent.memory)))

+        agent.update_target_model()
+        print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, cumReward, agent.epsilon))
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-ddqn.h5")