### This notebook is my attempt at implementing the DQN algorithm.
An implementation of DQN for the gym environment CartPole-v0. Performance is quite variable; I'm sure some more hyperparameter tuning would help, but that's a problem for another day.

![here](images/dqn-cartpole.png)

In [None]:
import tensorflow as tf 
import numpy as np
import gym
import matplotlib.pyplot as plot

# Fix the TensorFlow (TF 1.x API) and NumPy global seeds. NumPy's global RNG
# drives the epsilon-greedy draws and replay sampling further down.
tf.random.set_random_seed(0)
np.random.seed(0)

### Replay buffer 
This is a buffer holding at most n experiences, from which we can draw random samples.
Each experience is added in the form
[s, a, r, s', not done]

In [None]:
class MemoryBuffer(object):
    """
    Bounded replay buffer of experiences.

    Experiences are kept as five parallel 2-D arrays in ``self.experience``
    (state, action, reward, next_state, not_done), one row per experience,
    with the most recent experience in row 0. Once more than ``length``
    experiences have been added the oldest rows are discarded.
    """

    def __init__(self, length):
        """
        length: maximum number of experiences to retain.
        """
        # Stored as capacity + 1: maintain_length trims as soon as the row
        # count reaches this value, leaving `length` rows behind.
        self.length = length + 1
        self.first_run = True
        # Populated on the first call to add_to_buffer.
        self.experience = None

    def add_to_buffer(self, e):
        """
        Add one experience to the front of the buffer.

        The experience (e) added is in form:
        [state, action, reward, next_state, int(not done)]
        The reason for int(not done) is so that a terminal state is stored
        as 0, which lets the target value be computed by multiplication
        without the need for any if statements.

        Raises ValueError if e does not have exactly five fields.
        """
        if len(e) != 5:
            raise ValueError(
                "experience must have 5 fields, got %i" % len(e))
        # Promote every field to a single 2-D row. np.vstack does this
        # implicitly for later inserts; doing it for the first insert too
        # means the buffer can be sampled even when it holds one experience,
        # and a 1-D first state is counted as one row rather than several.
        rows = [np.atleast_2d(field) for field in e]
        if self.first_run:
            self.experience = rows
            self.first_run = False
        else:
            for num, row in enumerate(rows):
                self.experience[num] = np.vstack([row, self.experience[num]])
        self.maintain_length()

    def get_batch(self, number):
        """
        Draw a random batch (sampled with replacement) of stored experiences.

        number: requested batch size; capped at the number of stored rows.
        Returns a list of five arrays, each with one row per sampled
        experience, in the order state, action, reward, next_state, not_done.

        Raises ValueError if nothing has been added yet.
        """
        if self.experience is None:
            raise ValueError("cannot sample from an empty MemoryBuffer")
        stored = self.experience[0].shape[0]
        number = min(stored, number)
        items_to_get = np.random.randint(0, stored, number)
        return [field[items_to_get, :] for field in self.experience]

    def maintain_length(self):
        """Discard the oldest rows once the buffer exceeds its capacity."""
        if self.experience[0].shape[0] >= self.length:
            keep = self.length - 1
            for i in range(len(self.experience)):
                # Newest rows are at the top, so keeping the head drops the
                # oldest experiences.
                self.experience[i] = self.experience[i][:keep]
                

    

### DQN Agent 
This is the DQN agent and its architecture.
Don't forget: we need a target network as well as the online network.

In [None]:

class DeepQ(object):
    """
    DQN agent holding an online Q-network (``self.model``) and a target
    network (``self.target_model``) that is only updated by explicit calls
    to ``replace_model``.
    """

    def __init__(self, state_size, action_size, lr, y):
        """
        Initialise the online model, which is defined using keras, and the
        target model.

        state_size:  input shape tuple passed to the first Dense layer.
        action_size: number of outputs (one Q-value per action).
        lr:          learning rate for the optimiser.
        y:           discount factor.

        We use the RMSPropOptimizer as that is what is specified in the
        paper. No metrics are tracked: the network regresses Q-values, so a
        classification accuracy would carry no meaning.
        """
        self.lr = lr
        self.y = y  # our discount factor
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(24, activation=tf.keras.activations.relu,
                                  input_shape=state_size,
                                  kernel_initializer=tf.keras.initializers.RandomUniform(0.0, 0.1)),
            tf.keras.layers.Dense(24, activation=tf.keras.activations.relu,
                                  kernel_initializer=tf.keras.initializers.RandomUniform(0.0, 0.1)),
            tf.keras.layers.Dense(action_size, activation=tf.keras.activations.linear,
                                  kernel_initializer=tf.keras.initializers.RandomUniform(0.0, 0.1))
        ])
        self.model.compile(optimizer=tf.train.RMSPropOptimizer(self.lr),
                           loss=tf.losses.huber_loss)
        # clone_model copies the architecture only, so copy the weights
        # across explicitly; otherwise the first targets would come from an
        # unrelated, randomly initialised network.
        self.target_model = tf.keras.models.clone_model(self.model)
        self.replace_model()

    def train(self, experience):
        """
        Run one fitting step on a batch of experiences.

        experience: list of five arrays, one row per experience, in the
        order state, action, reward, next_state, not_done.

        Observed = r_t + y * not_done * max_a(target(next_state))
        The online network's own predictions are used as the regression
        target for every action except the one that was taken, whose entry
        is replaced by the observed value. fit then minimises the Huber loss
        between the online network's output and that target.

        Returns the target array that was fitted against.
        """
        states, actions, rewards, next_states, not_done = experience
        batch_size = states.shape[0]
        # Flatten the per-experience columns so all terms are 1-D of length
        # batch_size; not_done is 0 for terminal transitions, which removes
        # the bootstrap term.
        observed = (rewards.reshape(-1)
                    + self.y * not_done.reshape(-1) * self.value_next_state(next_states))
        update = self.model.predict(states)
        taken = actions.reshape(-1).astype(int)
        update[np.arange(batch_size), taken] = observed
        self.model.fit(states, update, verbose=0, batch_size=batch_size)
        return update

    def replace_model(self):
        """
        Copy the online network's weights into the target network.

        The target network aids stability. It provides the online network
        with a value to move towards which isn't affected by its own
        training.
        """
        self.target_model.set_weights(self.model.get_weights())

    def value_next_state(self, next_state):
        """
        Return, for each row of next_state, the largest Q-value predicted
        by this agent's target network (1-D, one entry per row).
        """
        return np.max(self.target_model.predict(next_state), axis=1)
                                              
        
        
        

### Average data
This makes it easier to see if the mean value of the data is improving. 

In [None]:
def running_average(data, window=10):
    """
    Smooth a sequence with a sliding mean.

    For each index i the mean is taken over data[i - window : i + window],
    clipped at the start of the sequence, i.e. up to `window` preceding
    values plus the current value and up to `window - 1` following values.

    data:   sliceable sequence of numbers (e.g. a list of episode rewards).
    window: half-width of the averaging window; must be at least 1.

    Returns a list with one averaged value per element of data (empty if
    data is empty). Raises ValueError if window is less than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    return [np.average(data[max(0, i - window):i + window])
            for i in range(len(data))]
        

### Run the environment 
For this example, a constant epsilon is used to specify the exploration. 

In [None]:
# Hyperparameters and experiment state shared by the cells below.
# Learning rate handed to DeepQ.
lr = 0.002 
# Discount factor handed to DeepQ.
y = 0.8
# The target network is refreshed every repl_int episodes.
repl_int = 20
# Agent with state_size (4,) and action_size 2.
deepQ = DeepQ((4,), 2 , lr, y)
# Replay buffer constructed with length 1000.
ExpR = MemoryBuffer(1000)
env = gym.make('CartPole-v0')
env.seed(0)
# Episode counter and count of environment steps taken across all episodes.
episodes, total_iterations = 0,0 
# Total reward collected in each finished episode.
episode_reward = []
# Constant exploration rate: probability of taking a random action.
epsilon = 0.2

In [None]:
# Main training loop: run 2000 episodes of epsilon-greedy interaction,
# training the online network from replayed experience after every step.
while episodes < 2000:
    # Wrap the initial observation in a leading batch axis so it can be fed
    # straight to predict().
    state = np.array([env.reset()])
    # Disabled alternative: decay epsilon with the episode count instead of
    # keeping it constant.
    #epsilon = np.max([0.01, 1 - ((episodes/1950))])
    i, reward, done = 0, 0.0, False
    # Step until the environment reports done, capped at 200 steps.
    while i < 200 and done != True:
        # Epsilon-greedy: greedy action from the online network with
        # probability 1 - epsilon, otherwise a random action.
        if np.random.rand() > epsilon:   
            action = np.argmax(deepQ.model.predict(state)[0])
        else:
            action = env.action_space.sample()
        next_state, r, done, _ = env.step(action)
        reward += r # add reward to monitor episode reward
        # float(not done) is 0.0 on the terminal step, 1.0 otherwise.
        ExpR.add_to_buffer([state,action,r,next_state, float(not done)])
        # Only start training once more than 200 steps have been taken in
        # total; each update uses a batch of 20 sampled experiences.
        if total_iterations > 200:
            deepQ.train(ExpR.get_batch(20))
        state = np.array([next_state])
        i += 1
        total_iterations += 1     
    # Sync the target network every repl_int episodes (skipping episode 0).
    if episodes % repl_int == 0 and episodes != 0:
        deepQ.replace_model()
    print('Episodes %i: %i, Epsilon: %f '%(episodes, reward,epsilon))
    episode_reward.append(reward)
    episodes += 1
print("There were %i total iterations"%total_iterations)




In [None]:
# Plot the raw per-episode reward with its running average overlaid. Both
# series share the axes, so each is labelled and the y-axis is named for the
# quantity they have in common.
plot.plot(episode_reward, label='Episode reward')
plot.plot(running_average(episode_reward), label='Running average')
plot.ylabel('Reward')
plot.xlabel('Episodes')
plot.legend()
plot.show()