In [1]:
import gym
import tensorflow as tf
import random
import numpy as np
import cv2
from collections import deque
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from tensorflow.keras.layers import *

from memory import frame_stack, replay_buffer

In [2]:
MAX_EPSILON = 1
MIN_EPSILON = 0.1
EPSILON_DECAY = 0.99
DELAY_TRAINING = 500
GAMMA = 0.99
BATCH_SIZE = 32
TAU = 0.08

env = gym.make("Breakout-v0")

In [3]:
def preprocess_frame(frame):
    resized = frame[10:210,0:160]
    gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)/255.
    return np.expand_dims(gray, axis=2).astype(np.float32)

In [4]:
def Learner(obs_dim,action_dim):
    
    input = Input(shape=(obs_dim))
    c1 = Conv2D(4, (5, 5), activation='elu', kernel_initializer='he_normal', padding='same') (input)
    c1 = Dropout(0.1) (c1)
    c1 = Conv2D(4, (5, 5), activation='elu', kernel_initializer='he_normal', padding='same') (c1)
    p1 = MaxPooling2D((2, 2)) (c1)

    c2 = Conv2D(4, (5, 5), activation='elu', kernel_initializer='he_normal', padding='same') (p1)
    c2 = Dropout(0.1) (c2)
    c2 = Conv2D(4, (3, 3), activation='elu', kernel_initializer='he_normal', padding='same') (c2)
    p2 = MaxPooling2D((2, 2)) (c2)
    
    c3 = Conv2D(2, (5, 5), activation='elu', kernel_initializer='he_normal', padding='same') (p2)
    c3 = Dropout(0.1) (c3)
    c3 = Conv2D(2, (3, 3), activation='elu', kernel_initializer='he_normal', padding='same') (c3)
    p3 = MaxPooling2D((2, 2)) (c3)
    
    c4 = Conv2D(2, (5, 5), activation='elu', kernel_initializer='he_normal', padding='same') (p3)
    c4 = Dropout(0.1) (c4)
    c4 = Conv2D(2, (3, 3), activation='elu', kernel_initializer='he_normal', padding='same') (c4)
    p4 = MaxPooling2D((2, 2)) (c4)

    c5 = Conv2D(2, (5, 5), activation='elu', kernel_initializer='he_normal', padding='same') (p4)
    c5 = Dropout(0.2) (c5)
    c5 = Conv2D(1, (5, 5), activation='elu', kernel_initializer='he_normal', padding='same') (c5)

    y = Flatten()(c5)
    x = Dense(32)(y)
    x = Dense(32)(x)

    advantage = Dense(32, activation="relu")(y)
    advantage = Dense(action_dim)(advantage)
    
    advantage_norm = Lambda(lambda x: x - tf.reduce_mean(x))(advantage)
    
    value = Dense(32, activation="relu")(y)
    value = Dense(1)(value)
    
    out = Add()([value, advantage_norm])

    model = Model(inputs=input, outputs=out)
    return model

In [5]:
class DQNAgent:
    def __init__(self, env):
        
        #Environment
        self.state_size = (200,160,1)
        self.num_actions = 4
        
        #Initiate networks
        self.primary_network = Learner(self.state_size, self.num_actions)
        self.target_network = Learner(self.state_size, self.num_actions)
        
        self.optimizer = tf.keras.optimizers.Adam()
        self.MSE = tf.keras.losses.MeanSquaredError()
        # make target_network = primary_network
        for t, e in zip(self.target_network.trainable_variables, self.primary_network.trainable_variables):
            t.assign(e)
        
        #Initiate memory
        self.replay_buffer = replay_buffer(100000)
        self.frame_stack = frame_stack(4, (200,160))
        

    def update_network(self):
        for t, e in zip(self.target_network.trainable_variables, self.primary_network.trainable_variables):
            t.assign(t * (1 - TAU) + e * TAU)# update target network parameters slowly from primary network
    
    def choose_action(self, state, eps):
        if random.random() < eps:
            return env.action_space.sample()
        else:
            return np.argmax(self.primary_network.predict(np.expand_dims(state, axis=0))[0])
        
    def train(self):
        batch = self.replay_buffer.sample(BATCH_SIZE)
        states = np.array([val[0] for val in batch], dtype=np.float32)
        actions = np.array([val[1] for val in batch])
        rewards = np.array([val[2] for val in batch], dtype=np.float32)
        next_states = np.array([val[3] for val in batch], dtype=np.float32)

        with tf.GradientTape() as tape:
            prim_qt = self.primary_network(states)
            prim_qtp1 = self.primary_network(next_states)
            target_q = prim_qt.numpy()
            updates = rewards
            batch_idxs = np.arange(BATCH_SIZE)
            prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
            q_from_target = self.target_network(next_states)
            updates += GAMMA * q_from_target.numpy()[batch_idxs, prim_action_tp1]
            target_q[batch_idxs, actions] = updates
            loss = self.MSE(prim_qt, target_q)
            
        gradients = tape.gradient(loss, self.primary_network.trainable_variables)   
        self.optimizer.apply_gradients(zip(gradients, self.primary_network.trainable_variables))
        self.update_network()
        

In [6]:
Agent = DQNAgent(env)

In [7]:
num_episodes = 10000
eps = MAX_EPSILON
steps = 0
rewards = []
avg_rewards = []

print(eps)

for episode in range(num_episodes):
    episode_reward = 0
    state = env.reset()
    DQNAgent.frame_stack.query()
    while True:
        env.render()
        action = Agent.choose_action(preprocess_frame(state), eps)
        next_state, reward, done, info = env.step(action)
        episode_reward += reward

        # store in memory
        #plt.imshow(next_state)
        experience = preprocess_frame(state), action, reward, preprocess_frame(next_state), done
        Agent.replay_buffer.store(experience)

        # linearly decay the eps value
        if steps > DEL AY_TRAINING:
            Agent.train()
            
        steps += 1
        if done:
            if steps > DELAY_TRAINING:
                print("episode: {}, eps: {}, reward: {}, average reward: {}".format(episode, eps, np.round(episode_reward, decimals=2), np.mean(rewards[-10:])))
            else:
                print("episode: {}, eps: {}, pretraining...".format(episode, eps, decimals=2))
            if eps > MIN_EPSILON:
                eps = eps * EPSILON_DECAY
            break

        state = next_state
        
    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

env.close()
plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()        

1
episode: 0, eps: 1, pretraining...
episode: 1, eps: 0.99, pretraining...
episode: 2, eps: 0.9801, reward: 0.0, average reward: 0.5
episode: 3, eps: 0.9702989999999999, reward: 1.0, average reward: 0.3333333333333333
episode: 4, eps: 0.96059601, reward: 1.0, average reward: 0.5
episode: 5, eps: 0.9509900498999999, reward: 2.0, average reward: 0.6
episode: 6, eps: 0.9414801494009999, reward: 1.0, average reward: 0.8333333333333334


KeyboardInterrupt: 

In [26]:
batch = DQNAgent.memory.sample(BATCH_SIZE)
states = np.array([val[0] for val in batch], dtype=np.float32)
actions = np.array([val[1] for val in batch])
rewards = np.array([val[2] for val in batch], dtype=np.float32)
next_states = np.array([val[3] for val in batch], dtype=np.float32)

In [41]:
prim_qt = DQNAgent.primary_network(states)
prim_qtp1 = DQNAgent.primary_network(next_states)
target_q = prim_qt.numpy()
updates = rewards
#valid_idxs = np.squeeze(np.array(next_states).sum(axis=1) != 0)
#batch_idxs = np.arange(BATCH_SIZE)
prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
q_from_target = DQNAgent.target_network(next_states)

In [53]:
updates += GAMMA * q_from_target.numpy()[batch_idxs, prim_action_tp1]

In [None]:
env.close()

In [24]:
print(steps)

52338


In [29]:
np.argmax(DQNAgent.primary_network.predict(np.float32(np.expand_dims(preprocess_frame(state), axis=0)))[0])

1

In [None]:
env.close()

In [10]:
     batch = self.memory.sample(BATCH_SIZE)
        states = np.array([val[0] for val in batch], dtype=np.float32)
        actions = np.array([val[1] for val in batch])
        rewards = np.array([val[2] for val in batch], dtype=np.float32)
        next_states = np.array([(np.zeros(self.state_size) if val[3] is None else val[3]) for val in batch], dtype=np.float32)

IndentationError: unexpected indent (<ipython-input-10-9f183f3a8ace>, line 2)