<a href="https://colab.research.google.com/github/hesller/Bean-factory-and-Application-Context/blob/master/1_DQN_SpaceInvaders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries
!pip install tensorflow-gpu==2.0.0-alpha0

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 20 14:23:46 2019

@author: Hesller Huller
"""

"""
    Step 1 - Import libraries
"""

import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from skimage import transform 
from skimage.color import rgb2gray 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv2D, Flatten
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import sys
import os
import random
import warnings
import gym

warnings.filterwarnings('ignore')

In [None]:
# Report which TensorFlow release the kernel actually imported.
print("tensorflow version", tf.__version__, sep="\n")

In [None]:
# TensorFlow 2.x executes eagerly by default and no longer exposes
# tf.enable_eager_execution at the top level, so only switch modes when the
# runtime is still in graph mode (TensorFlow 1.x).
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()

In [None]:
# Report whether TensorFlow can see a GPU; the label and the answer are
# printed on separate lines.
print("Is there a GPU available: ", tf.test.is_gpu_available(), sep="\n")

Is there a GPU available: 
True


In [None]:
"""
    Step 2 - Create the environment
"""
# Build the Atari environment and keep its action count at hand.
env = gym.make("SpaceInvaders-v0")
a = env.action_space.n
print("The size of the environment: ", env.observation_space)
print("Number of actions: ", a)
# One-hot encode the actions: row i of the identity matrix stands for action i
possible_actions = np.identity(a, dtype=int)

The size of the environment:  Box(210, 160, 3)
Number of actions:  6


In [None]:
"""
    Step 3 - Preprocess frames
"""
def preprocess_frame(frame):
    '''
        Turn one raw environment frame into the array fed to the network.

        1 - Grayscale it (skipped when the frame is already 2-D)
        2 - Crop the borders
        3 - Scale the intensities to [0, 1] when they are not already
        4 - Resize it to 110x84

        Args:
            frame - an input frame, either a colour image (rows, cols, channels)
                    or an already-grayscale image (rows, cols)

        Returns:
            preprocessed_frame - gray, cropped, scaled frame resized to 110x84
    '''
    frame = np.asarray(frame)

    # Grayscale frame. Recent scikit-image releases reject 2-D input to
    # rgb2gray, and the training loop feeds 2-D placeholder frames, so frames
    # that are already single-channel are passed through unchanged.
    if frame.ndim == 3:
        gray = rgb2gray(frame)
    else:
        gray = frame.astype(float)

    # Crop the unused part of the frame
    # [UP: DOWN, LEFT: RIGHT]
    cropped_frame = gray[8:-12, 4:-12]

    # Normalize pixel values. rgb2gray already rescales integer images to
    # [0, 1], so dividing by 255 unconditionally would squash every frame into
    # [0, 1/255]; only rescale when the values are still on the 0-255 scale.
    if cropped_frame.size and cropped_frame.max() > 1.0:
        normalized_frame = cropped_frame / 255.0
    else:
        normalized_frame = cropped_frame

    # Resize to the network's input resolution (110 rows x 84 columns)
    preprocessed_frame = transform.resize(normalized_frame, [110,84])

    return preprocessed_frame

In [None]:
#
#   Stacking frames
#
stack_size = 4
# Initialize the deque with one all-zero 110x84 image per stacked frame.
# The builtin int replaces the np.int alias, which was removed from NumPy.
stacked_frames = deque([np.zeros((110,84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    '''
        Preprocess a raw frame and combine it with the most recent frames.

        Args:
            stacked_frames - deque holding the previously preprocessed frames
            state - raw frame, passed through preprocess_frame
            is_new_episode - when True the history is discarded and the new
                             frame is repeated stack_size times

        Returns:
            stacked_state - the frames stacked along a new last axis
            stacked_frames - the deque to pass to the next call (a new deque
                             when is_new_episode is True, otherwise the same
                             deque, updated in place)
    '''
    # Preprocess frame
    frame = preprocess_frame(state)

    if is_new_episode:
        # A new episode has no history yet: fill the whole window with the
        # first frame. The window length follows stack_size instead of a
        # hard-coded 4 so the two can never disagree.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # Append frame to deque, automatically removes the oldest frame
        stacked_frames.append(frame)

    # Build the stacked state (the last axis indexes the different frames)
    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

In [None]:
"""
    Step 4: Set up our hyperparameters
"""

### MODEL HYPERPARAMETERS
state_size = [110, 84, 4]      # Stack of 4 preprocessed frames, each resized to 110x84: (height, width, channels)
action_size = env.action_space.n # Number of discrete actions exposed by the environment
learning_rate =  0.00025      # Alpha (aka learning rate)
                               # NOTE(review): DQNetwork compiles Adam with a hard-coded lr=0.001 and does not read this value

### TRAINING HYPERPARAMETERS
total_episodes = 50            # Total episodes for training
                               # NOTE(review): the training loop iterates over range(5000) and does not read this value
max_steps = 50000              # Max possible steps in an episode
batch_size = 64                # Batch size
                               # NOTE(review): DQNetwork sets its own batch_size of 20 and replay is called with 32

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability
decay_rate = 0.00001           # exponential decay rate for exploration prob

# Q learning hyperparameters
gamma = 0.9                    # Discounting rate

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 1000000          # Number of experiences the Memory can keep
                               # NOTE(review): every experience stores two 110x84x4 arrays - confirm the host has memory for this many

### PREPROCESSING HYPERPARAMETERS
stack_size = 4                 # Number of frames stacked

### SET THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
# NOTE(review): no cell visible in this notebook reads this flag - confirm it is still needed
training = False

## TURN THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
episode_render = False

# Size of the network: 'bigger' (three conv layers) or 'smaller' (two conv layers)
NET = 'bigger' # smaller

In [None]:
# Define the network object
class DQNetwork():
    """
    Deep Q-Network agent: a Keras model, a replay buffer and an epsilon-greedy
    policy.

    Args:
        scope - label stored on the instance
        VALID_ACTIONS - list of action indices; its length sets the number of
                        Q-value outputs of the model
        NET - 'bigger' (three conv layers) or 'smaller' (two conv layers)
    """

    def __init__(self, scope='QNet', VALID_ACTIONS=[0,1,2,3,4,5], NET=NET):
        self.NET = NET
        self.scope = scope
        # Copy so the shared default list can never be mutated through the instance
        self.VALID_ACTIONS = list(VALID_ACTIONS)
        self._build_model()
        # Exploration schedule, kept on the instance so replay() does not
        # depend on module-level globals
        self.epsilon_start = explore_start
        self.epsilon = explore_start
        self.epsilon_min = explore_stop
        self.decay_rate = decay_rate
        self.gamma = gamma
        # Batch size handed to model.fit()
        self.batch_size = 20
        # define memory size
        self.buffer = deque(maxlen = memory_size)
        # initialize the checkpoint. The keyword arguments belong to
        # ModelCheckpoint, not to str.format (which silently ignores unused
        # keywords); the monitored loss has to be minimised, not maximised.
        self.checkpoint = ModelCheckpoint(
                          'models/{}.model'.format(
                          'DQN-SpaceInvaders-{epoch:02d}-Loss-{loss:.6f}'),
                          monitor='loss',
                          verbose=1,
                          save_best_only=True,
                          mode='min')

    def _build_model(self):
        """
        Build and compile self.model according to self.NET.

        Raises:
            ValueError - if self.NET is neither 'bigger' nor 'smaller'
        """
        n_outputs = len(self.VALID_ACTIONS)
        self.model = Sequential()
        if (self.NET == 'bigger'):
            # this is the 3 convnetwork
            self.model.add(Conv2D(input_shape=(*state_size,), filters=32, kernel_size=8, strides=(4,4), padding='VALID', activation='relu', name='conv1'))
            self.model.add(Conv2D(filters=64, kernel_size=4, strides=(2,2), padding='VALID', activation='relu', name='conv2'))
            self.model.add(Conv2D(filters=64, kernel_size=3, strides=(1,1), padding='VALID', activation='relu'))
            # fully connected layers
            self.model.add(Flatten())
            self.model.add(Dense(512, activation='relu'))
            self.model.add(Dense(n_outputs))

        elif(self.NET == 'smaller'):
            # The first layer needs the input shape, and the second kernel must
            # fit inside the 26x20 feature map produced by the first layer
            # (a 32x32 kernel with VALID padding cannot).
            self.model.add(Conv2D(input_shape=(*state_size,), filters=16, kernel_size=8, strides=(4,4), padding='VALID', activation='relu', name='conv1'))
            self.model.add(Conv2D(filters=32, kernel_size=4, strides=(2,2), padding='VALID', activation='relu', name='conv2'))
            # fully connected layers
            self.model.add(Flatten())
            self.model.add(Dense(256, activation='relu'))
            self.model.add(Dense(n_outputs))

        else:
            # An unknown size used to leave an empty, uncompiled model behind
            raise ValueError("NET must be 'bigger' or 'smaller', got {!r}".format(self.NET))

        self.model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=0.001))

    @staticmethod
    def _with_batch_axis(frames):
        """Return frames with a leading batch axis, adding one only if it is missing."""
        frames = np.asarray(frames)
        if frames.ndim == len(state_size):
            frames = frames[np.newaxis, ...]
        return frames

    def act(self, state):
        """
        Pick an action with the epsilon-greedy policy.

        Args:
            state - a single stacked state without batch axis

        Returns:
            index of the chosen action
        """
        if np.random.rand() <= self.epsilon:
            # Explore: draw uniformly from this network's own action set
            return random.randrange(len(self.VALID_ACTIONS))
        state = np.reshape(state, [1, *state.shape])
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def remember(self, state, action, reward, next_state, done):
        """
        Store one transition in the replay buffer.

        state and next_state are stored with a leading batch axis; the axis is
        added here unless the caller already supplied it.
        """
        state_m = self._with_batch_axis(state)
        next_state_m = self._with_batch_axis(next_state)
        self.buffer.append((state_m, action, reward, next_state_m, done))

    def replay(self, batch_size, decay_step, episode):
        """
        Fit the model on a random minibatch of stored transitions and update
        the exploration rate.

        Args:
            batch_size - number of transitions to sample from the buffer
            decay_step - step counter driving the exponential epsilon decay
            episode - current episode index; the checkpoint callback is
                      attached when it is a multiple of 10
        """
        # Guard on the number of samples actually requested; random.sample
        # raises ValueError when asked for more items than the buffer holds.
        if len(self.buffer) < batch_size:
            return

        # saving a checkpoint every 10 episodes
        callbacks = [self.checkpoint] if episode % 10 == 0 else None

        minibatch = random.sample(self.buffer, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done: # if the game is not over
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # Only the Q-value of the action taken is moved toward the target
            target_f = self.model.predict(state)
            target_f[0][action] = target

            # max_queue_size is omitted: it only applies to generator input
            self.model.fit(state, target_f, verbose=0,
                           batch_size=self.batch_size,
                           callbacks=callbacks)

        # updating the exploration rate
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon_min + (self.epsilon_start - self.epsilon_min) * np.exp(-self.decay_rate * decay_step)
            print('the self.epsilon: ', self.epsilon )

In [None]:
# Create the network object using DQNetwork's default arguments
# (scope, VALID_ACTIONS and the module-level NET setting)
network = DQNetwork()

In [None]:
# Create the directory the checkpoints are written to. exist_ok keeps the
# cell safe to re-run, and os.makedirs works without a POSIX shell.
os.makedirs('models', exist_ok=True)

In [None]:

"""
    Pre-populating the memory
    
"""



# Fill the replay buffer with pretrain_length transitions from random actions.
for i in range(pretrain_length):
    # if it is the first step
    if i == 0:
        state = env.reset()

        state, stacked_frames = stack_frames(stacked_frames, state, True)

    # Get the next frame, reward, and done flag by taking one random step
    choice = random.randrange(len(possible_actions))
    next_frame, reward, done, _ = env.step(choice)

    # Rendering needs a display, so it is tied to the notebook's render flag
    if episode_render:
        env.render()

    if done:
        # The episode is over, so there is no next state: store an all-zero
        # placeholder (replay() ignores next_state for terminal transitions)
        next_state = np.zeros(state.shape)

        # add the experience to memory; remember() adds the batch axis itself
        # and takes the five fields as separate arguments
        network.remember(state, choice, reward, next_state, done)

        # Start a new episode
        state = env.reset()

        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    else:
        # Stack the new frame into next_state; state keeps the stack the
        # action was chosen from
        next_state, stacked_frames = stack_frames(stacked_frames, next_frame, False)

        # add the experience to the memory
        network.remember(state, choice, reward, next_state, done)

        # the state is now the new_state
        state = next_state

In [None]:
"""
    Train the environment
    
"""

# Counts every environment step across all episodes; drives the epsilon decay
decay_step = 0
# NOTE(review): the episode count is hard-coded; the total_episodes
# hyperparameter is not read here - confirm which value is intended.
for e in range(5000):

    # Initialize the rewards of the episode
    episode_rewards = []

    # reset the environment
    state = env.reset()
    state, stacked_frames = stack_frames(stacked_frames, state, True)

    for s in range(max_steps):

        decay_step += 1

        # rendering the environment (needs a display, so it follows the flag)
        if episode_render:
            env.render()

        # choose an action
        action = network.act(state)

        # Perform the action and get the next frame, the reward and done
        next_frame, reward, done, _  = env.step(action)

        # Add the reward to total pool
        episode_rewards.append(reward)

        # if done finish the game
        if done:
            # the end of the episode, so there is no next_state: store an
            # all-zero placeholder with the same shape as a stacked state
            # (replay() ignores next_state for terminal transitions)
            next_state = np.zeros(state.shape)

            print('game finished')

            # Get the total reward of the episode
            total_reward = np.sum(episode_rewards)
            print('Episode: {}'.format(e),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(network.epsilon))

            # store the transition
            network.remember(state, action, reward, next_state, done)

            break
        else:
            # still playing: stack the new frame to form the next state
            next_state, stacked_frames = stack_frames(stacked_frames, next_frame, False)

            # store the experience
            network.remember(state, action, reward, next_state, done)

            state = next_state

    # train the agent once per episode on a minibatch sampled from the memory
    network.replay(32, decay_step, e)

In [None]:
# tensorflow is already imported in the notebook's import cell; presumably
# repeated here so this cell can be run on its own
import tensorflow as tf
# Print the TensorFlow version of the running kernel
print(tf.__version__)

1.13.1


In [None]:
class MyCallback(tf.keras.callbacks.Callback):
  """Keras callback that stops training once the training accuracy exceeds 0.6."""

  def on_epoch_end(self, epoch, log=None):
    """
    Check the accuracy reported for the finished epoch.

    Args:
        epoch - index of the epoch that just ended (unused)
        log - dict of metric values for the epoch; may be None
    """
    # A None default avoids sharing one mutable dict between calls
    log = log or {}
    # The accuracy metric is reported as 'acc' by older tf.keras releases and
    # as 'accuracy' by newer ones; accept either and skip the check when
    # neither is present instead of comparing None with a float.
    accuracy = log.get('acc', log.get('accuracy'))
    if accuracy is not None and accuracy > 0.6:
      print("\nReached the end of the trainin with 60% or above accuracy")
      self.model.stop_training = True
      
# Load the Fashion-MNIST train/test split bundled with tf.keras
mnist = tf.keras.datasets.fashion_mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Rescale the images by 1/255 (assumes 8-bit pixel values - TODO confirm)
x_train = x_train / 255.0
x_test = x_test / 255.0

# Early-stopping callback defined above
callbacks = MyCallback()

# Flatten -> 512-unit ReLU layer -> 10-way softmax classifier
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28,28)))
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

model.compile(
    optimizer=tf.keras.optimizers.Adam(lr=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Left as the cell's last expression so the returned History object is displayed
model.fit(x_train, y_train, epochs=10, verbose=1, callbacks=[callbacks])


Epoch 1/10
Reached the end of the trainin with 60% or above accuracy


<tensorflow.python.keras.callbacks.History at 0x7ff810730048>