In [7]:
%load_ext autoreload
%autoreload 2
import numpy as np
import gym
from keras.layers import Dense, LSTM, ConvLSTM2D, Activation, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential
from preprocessImages import preprocess
import matplotlib.pyplot as plt
from collections import deque
from tqdm import tqdm_notebook as tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Using TensorFlow backend.


AttributeError: module 'tensorflow' has no attribute 'name_scope'

# Load Environment

In [None]:
# Create the Gym environment registered under the id "Pong-v0"; the cells
# below drive it through `environment.reset()` and `environment.step(...)`.
environment = gym.make("Pong-v0")

# Hyper Params

In [None]:
# Training schedule.
total_episodes = 20000
learning_rate = 0.01  # NOTE(review): not referenced by any visible cell; the models are compiled with the "adam" string.
discount_rate = 0.99
batch_size = 5

# Exploitation vs exploration: the probability starts at its maximum and is
# recomputed by `next_action` from the bounds and decay rate below.
max_exploration_probability = 1
min_exploration_probability = 0.01
exploration_probability_decay_rate = 0.005
exploration_probability = max_exploration_probability

# Game actions: environment action ids, and the column each one occupies in
# the network's output (used when writing Q-targets).
move_up, move_down = 2, 3
possible_actions = [move_up, move_down]
action_space = {action_id: column for column, action_id in enumerate(possible_actions)}

# Image params
image_size = (80, 80, 1)

# Capacity of the replay memory (number of stored transitions).
memory_size = 11

# DeepQ Convolutional Model

In [None]:
def deep_q(input_shape, num_actions, loss, optimizer):
    """Build and compile the convolutional Q-network.

    The network is two Conv2D(32 filters, kernel size 6) -> MaxPooling2D ->
    ReLU stages, followed by Flatten, Dense(200) + ReLU and a final
    Dense(num_actions) layer with no activation.

    Args:
        input_shape: shape passed as `input_shape` to the first Conv2D layer.
        num_actions: number of units in the output layer.
        loss: loss forwarded to `model.compile`.
        optimizer: optimizer forwarded to `model.compile`.

    Returns:
        The compiled Keras `Sequential` model.
    """
    layers = [
        # First convolutional stage.
        Conv2D(32, 6, input_shape=input_shape),
        MaxPooling2D(),
        Activation('relu'),
        # Second convolutional stage.
        Conv2D(32, 6),
        MaxPooling2D(),
        Activation('relu'),
        # Fully connected head.
        Flatten(),
        Dense(200),
        Activation('relu'),
        # Output layer is left linear (no softmax is applied).
        Dense(num_actions),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(loss=loss, optimizer=optimizer)
    return network

# Replay Memory

In [None]:
class ReplayMemory():
    """Bounded buffer of transitions used for replay training.

    Transitions are kept in a `deque` with a fixed `maxlen`, so appending to a
    full buffer discards the oldest entry.
    """

    def __init__(self, max_replay_memory_size):
        """Create an empty buffer holding at most `max_replay_memory_size` transitions."""
        self.memory = deque(maxlen=max_replay_memory_size)

    def add_memory(self, current_state, reward, action, next_state, done):
        """Append one transition, stored as [current_state, reward, action, next_state, done]."""
        transition = [current_state, reward, action, next_state, done]
        self.memory.append(transition)

    def sample_memory(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of numpy arrays
            (current_states, rewards, actions, next_states, dones), each with
            one entry per sampled transition.

        Raises:
            ValueError: if `batch_size` exceeds the number of stored
                transitions (sampling is without replacement).
        """
        indices = np.random.choice(np.arange(len(self.memory)), size=batch_size, replace=False)
        picked = [self.memory[index] for index in indices]

        # Transpose the list of transitions into one column per field.
        current_states, rewards, actions, next_states, dones = (
            np.array(column) for column in zip(*picked)
        )
        return current_states, rewards, actions, next_states, dones

In [4]:
def next_action(model, state, episode, possible_actions, exploration_probability, max_exploration_probability, min_exploration_probability, exploration_probability_decay_rate):
    """Choose the next action epsilon-greedily and return the decayed epsilon.

    Args:
        model: object with a `predict` method; called on `state` with a
            leading batch axis added, its output is arg-maxed to index
            `possible_actions`.
        state: current state array (without batch axis).
        episode: episode index used to decay the exploration probability.
        possible_actions: sequence of action ids; position i corresponds to
            output i of the model.
        exploration_probability: probability of taking a random action on
            this call.
        max_exploration_probability: exploration probability at episode 0.
        min_exploration_probability: asymptotic lower bound of the decay.
        exploration_probability_decay_rate: exponential decay rate per episode.

    Returns:
        (action, exploration_probability): the chosen action id and the
        exploration probability recomputed for `episode`.
    """
    exploration_exploitation_value = np.random.uniform(0,1)

    # Explore when the draw falls *below* the exploration probability, so that
    # a probability of 1 always explores and a probability of 0 never does.
    # (The comparison was previously inverted, which made the agent purely
    # greedy at the start of training.)
    if exploration_exploitation_value < exploration_probability:
        action = np.random.choice(possible_actions)
    else:
        q_values = model.predict(np.expand_dims(state, axis=0))
        action = possible_actions[np.argmax(q_values)]

    # Exponential decay from the maximum towards the minimum, driven by the
    # episode index.
    exploration_probability = min_exploration_probability + \
    (max_exploration_probability - min_exploration_probability) * \
    np.exp(-exploration_probability_decay_rate * episode)

    return action, exploration_probability

# Train Model

In [5]:
# Replay buffer, online Q-network and target Q-network. Shapes and the number
# of outputs come from the hyper-parameter cell (`image_size`,
# `possible_actions`) rather than repeated literals, so they cannot drift apart.
memory = ReplayMemory(memory_size)
model = deep_q(image_size, len(possible_actions), "mse", "adam")
target_model = deep_q(image_size, len(possible_actions), "mse", "adam")

# Start the target network from the same weights as the online network; two
# separately built models would otherwise begin with unrelated random weights.
target_model.set_weights(model.get_weights())

model.summary()

NameError: name 'ReplayMemory' is not defined

In [None]:
# Training loop: play one episode per iteration, store every transition in the
# replay memory, and run one replay-training step when the episode ends.
for episode in tqdm(range(total_episodes)):
    # A state is the difference between two consecutive preprocessed frames.
    # Frames are tracked separately from states so that each difference is
    # always frame(t) - frame(t-1), never frame(t) - previous difference.
    # NOTE(review): assumes `preprocess` returns a signed/float array so the
    # subtraction cannot wrap around - confirm.
    previous_frame = preprocess(environment.reset())
    current_frame, _, _, _ = environment.step(np.random.choice(possible_actions))
    current_frame = preprocess(current_frame)
    current_state = current_frame - previous_frame

    step = 0
    episode_rewards = 0
    done = False

    while not done:
        action, exploration_probability = next_action(model, current_state, episode,
                                        possible_actions, exploration_probability, max_exploration_probability,
                                        min_exploration_probability, exploration_probability_decay_rate)
        previous_state = current_state
        previous_frame = current_frame
        observation, reward, done, _ = environment.step(action)
        current_frame = preprocess(observation)
        current_state = current_frame - previous_frame

        step += 1
        episode_rewards += reward

        # Add state to memory to be used for replay training.
        memory.add_memory(previous_state, reward, action, current_state, done)

        # Replay learning at the end of the episode, provided the memory holds
        # enough transitions to sample a full batch without replacement.
        if done and len(memory.memory) >= batch_size:
            current_states, rewards, actions, next_states, dones = memory.sample_memory(batch_size)

            Q_next_state = target_model.predict(next_states)
            target_Qs = model.predict(current_states)

            for i in range(len(rewards)):
                # Update the Q-value of the action that was actually taken in
                # sample i (not the last action of the episode).
                action_index = action_space[int(actions[i])]
                if dones[i]:
                    # Terminal transition: no bootstrapped future value.
                    target_Qs[i][action_index] = rewards[i]
                else:
                    target_Qs[i][action_index] = rewards[i] + discount_rate * np.max(Q_next_state[i])

            model.fit(current_states, target_Qs)

            # Refresh the target network; without this it would keep its
            # initial weights for the whole run.
            target_model.set_weights(model.get_weights())
        
        

In [None]:
# x_data = np.reshape(np.arange(1000), (1000,1))
# y_data = np.reshape(np.arange(1000), (1000,1))

# m = Sequential()
# m.add(Dense(5, input_shape = (1,)))
# m.add(Activation("relu"))

# m.add(Dense(1))
# m.add(Activation("relu"))
# m.compile(loss="mse", optimizer="adam")

# m.fit(x_data, y_data, epochs=1000)