In [None]:
# Deep Q Networks - Getting a Deep Q network to play Ms Pacman

# For image preprocessing used: https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5

# For NN structure used: https://github.com/yxu1168/Reinforcement-Learning-DQN-for-ATARI-s-Pong-Game---TensorFlow-2.0-Keras/



In [None]:
### SANITY TEST ##############################################################

# ENVIRONMENT SUMMARY: -> Pong-v0 
# -> Discrete action space
# -> The easiest atari game with discrete action space

# Observation -> 1 item
# An array of shape (210, 160, 3) corresponding to the screen image

# Action -> 6 items
# ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
# NOOP = FIRE, RIGHT = RIGHTFIRE, LEFT = LEFTFIRE
# so really only 3 actual actions

# Reward -> 1 item
# One point is subtracted for conceding and one point is gained for winning
# For a total of 21 games

### MAIN #######################################################################

# ENVIRONMENT SUMMARY: -> MsPacman-v0
# -> Discrete action space

# Observation -> 17 items
# 

# Action -> 6 items
#

# Reward -> 1 item
#

#################################################################################

import gym 
import tensorflow as tf
import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Conv2D, Flatten
from tensorflow.keras.losses import mean_squared_error
from collections import deque

# VARIABLES: ####################################################################

# Number of episodes the main loop iterates over.
TOTAL_EPISODES = 10
# Upper bound on environment steps taken within one episode of the main loop.
NB_FRAMES = 1000
# NOTE(review): not referenced anywhere in the visible code; presumably the
# size of the environment's discrete action set -- confirm before relying on it.
ACTION_SPACE = 6
# Environment id handed to gym.make().
ENV_NAME = 'Pong-v0'

### FUNCTIONS ###################################################################

# from Karpathy pong from pixels
# 210x160x3 uint8 frame into an 80x80 2D float array (not flattened, so it can feed a Conv layer)
def PreProcessPong(I):
    """Turn a raw Pong frame into a binary, downsampled 2-D float image.

    Adapted from Karpathy's "Pong from Pixels" preprocessing, minus the final
    ravel so the result keeps its 2-D layout.

    Args:
        I: Array indexable as ``I[row, col, channel]``. For the standard
            210x160x3 frame the result is 80x80.

    Returns:
        A new ``float64`` array holding 0.0 where channel 0 equalled 0, 144 or
        109 (the two background values) and 1.0 everywhere else. The input
        array is left untouched.
    """
    # Crop rows 35..194 and keep every second row/column of channel 0.
    # The explicit copy matters: plain slicing returns a view, so the masking
    # assignments below would otherwise write through into the caller's frame.
    frame = I[35:195:2, ::2, 0].copy()

    # erase background (background type 1)
    frame[frame == 144] = 0

    # erase background (background type 2)
    frame[frame == 109] = 0

    # everything else (paddles, ball) just set to 1
    frame[frame != 0] = 1

    # np.float (an alias of the builtin float) was removed from NumPy;
    # np.float64 is the dtype it resolved to.
    return frame.astype(np.float64)

    
# the DQN
class DeepQNetwork:
    """Epsilon-greedy agent that owns a Q-network and a bounded replay buffer.

    Attributes set by the constructor:
        num_states / num_actions: forwarded to ``Q_Model`` to size the network.
        batch_size: number of transitions returned by ``sample_experiences``.
        gamma: stored for later use; not read by any method in this class.
        optimizer: ``Adam`` instance built from ``lr``.
        model: ``Q_Model`` wrapper; the Keras network lives in ``model.Q_Net``.
        experience: ``deque`` capped at ``replay_length`` (20000) entries.
    """

    def __init__(self, num_states, num_actions, gamma, batch_size, lr):
        self.num_actions = num_actions
        self.num_states = num_states
        self.batch_size = batch_size
        self.optimizer = Adam(lr)
        self.gamma = gamma
        self.model = Q_Model(num_states, num_actions, self.optimizer)
        self.replay_length = 20000
        self.experience = deque(maxlen=self.replay_length)

    def act(self, state, epsilon):
        """Pick an action index with an epsilon-greedy rule.

        Args:
            state: Array for a single state; a leading batch axis is added
                before it is passed to the network.
            epsilon: Probability of returning a uniformly random action.

        Returns:
            An integer in ``[0, num_actions)``.
        """
        # take a random action
        if np.random.rand() < epsilon:
            return np.random.randint(self.num_actions)

        # otherwise select the action that maximises the Q value; the Keras
        # network is stored on the wrapper as ``Q_Net``
        Q_values = self.model.Q_Net.predict(state[np.newaxis])
        return np.argmax(Q_values[0])

    def sample_experiences(self):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Each buffer entry is read as a 5-item sequence whose fields are
        unpacked, in order, as state, action, reward, next state and done.
        NOTE(review): nothing in the visible code appends to ``experience``;
        confirm that writers use this field order.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays, each with ``batch_size`` as its first dimension.

        Raises:
            ValueError: if the replay buffer is empty.
        """
        stored = len(self.experience)
        if stored == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        # Sample over what is actually stored, not the buffer capacity:
        # indices beyond len(experience) do not exist until the deque is full.
        indices = np.random.randint(stored, size=self.batch_size)

        # extract the batch
        batch = [self.experience[index] for index in indices]

        # regroup field by field: one array per transition component
        states, actions, rewards, next_states, dones = [
            np.array([experience[field_index] for experience in batch])
            for field_index in range(5)]

        return states, actions, rewards, next_states, dones
        
    
# Takes a state and returns Q values for each possible action
class Q_Model:
    """Convolutional Q-network: maps a state tensor to one Q-value per action.

    The compiled Keras model is exposed as ``self.Q_Net``.

    Args:
        num_states: Shape of a single input state, passed to ``Input`` as-is.
        num_actions: Width of the output layer; also stored as
            ``self.action_space``.
        optimizer: Optimizer instance the network is compiled with.
    """

    def __init__(self, num_states, num_actions, optimizer):

        self.action_space = num_actions
        X_input = Input(shape=num_states)

        # Convolutional Layers
        X = Conv2D(16, kernel_size=8, strides=4, activation="relu")(X_input)
        X = Conv2D(32, kernel_size=4, strides=2, activation="relu")(X)
        X = Conv2D(32, kernel_size=3, strides=1, activation="relu")(X)
        X = Flatten()(X)

        # State -> Q values; the final layer has no activation, so the
        # outputs are unbounded
        X = Dense(64, activation="relu")(X)
        X = Dense(64, activation="relu")(X)
        output = Dense(self.action_space)(X)

        self.Q_Net = Model(inputs = X_input, outputs = output)
        self.Q_Net.compile(loss=mean_squared_error, optimizer=optimizer)

        # summary() prints the layer table itself and returns None, so
        # wrapping it in print() only added a stray "None" line.
        self.Q_Net.summary()
        
        
        
#################################################################################

# Sanity run: step the environment with random actions for a fixed number of
# episodes, preprocessing every frame along the way.
env = gym.make(ENV_NAME)

try:
    # NOTE(review): the agent is constructed but never consulted below --
    # actions come from env.action_space.sample(). num_actions is 2 while the
    # header notes list 6 environment actions; confirm the intended mapping
    # before wiring agent.act() into the loop.
    agent = DeepQNetwork(num_states = (80, 80, 1),
                         num_actions = 2,
                         gamma = 0.99,
                         batch_size = 32,
                         lr = 1e-03)

    # Placeholder for a previous preprocessed frame; not used yet.
    prev_x = None

    for e in range(1, TOTAL_EPISODES + 1):

        print('Starting Episode {}'.format(e))

        # Start every episode from a fresh game. Resetting only when `done`
        # fires meant an episode cut short by the NB_FRAMES cap carried on
        # into the next "episode" mid-game.
        obs = env.reset()

        for f in range(NB_FRAMES):

            # preprocess the current frame (no frame differencing yet)
            # NOTE(review): the result is 2-D, while the agent was declared
            # with a (80, 80, 1) input -- add a channel axis before feeding it.
            s = PreProcessPong(obs)

            # sample a random action
            a = env.action_space.sample()

            # perform action
            obs, r, done, _ = env.step(a)

            # the game finished before the frame cap
            if done:
                break

finally:
    # release the environment even if the loop above raises
    env.close()

