In [7]:
import gym
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import deque
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, load_model, Model


## Build Model
<!-- <h3>Base Model</h3> -->
- Epsilon = 1
- Epsilon_min = 0.01
- Epsilon_decay = 0.99
- Learning_rate = 0.01
- Discount_rate = 0.8
- Train_start = 1000
- Batch_size = 64

In [2]:
EPISODES = 300

In [34]:
class DQNAgent:
    def __init__(self, state_size, epsilon_decay=0.99, learning_rate=0.01, discount_rate=0.8, n_discrete_actions=10, weights=None):
        
        self.state_size = state_size
        self.render = False
        self.epsilon = 1
        self.epsilon_min = 0.01 
        
        # Hyperparameters
        self.MinMemory = 1000
        self.EpsilonDecay = epsilon_decay 
        self.LearningRate = learning_rate 
        self.DiscountRate = discount_rate 
        self.BatchSize = 32
        self.Optimiser = Adam(lr=self.LearningRate)
        self.NActions = n_discrete_actions

        self.Memory = deque(maxlen=3000)
        # Build Main Model & Target Model
        self.model = self.Build_Model()
        if weights != None:
            self.model.load_weights(weights)
        self.t_model = self.Build_Model()
        self.update_t_weights()

    def Build_Model(self):

        # inputs = Input(shape = (self.state_size), name = 'Input')
        # x = Dense(24, activation = 'relu', name = '1stHiddenLayer')(inputs)
        # x = Dense(24, activation = 'relu', name = '2ndHiddenLayer')(x)
        # outputs = Dense(self.NActions, activation = 'linear', name = 'Output')(x)
        
        # NN = Model(inputs, outputs)
        # NN.compile(loss = 'mse', optimizer = Adam(lr = self.LearningRate))
        # NN.summary()



        model = Sequential()
        model.add(Input(shape=(1,3), name="Input_Layer"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.NActions, activation="linear"))
        model.summary()
        model.compile(loss="mse", optimizer=Adam(lr=self.LearningRate))
        return model 
            
    def UpdateMemory(self, state, action, reward, next_state, done):
        # Replay Buffer
        self.Memory.append((state, action, reward, next_state, done))

    def update_t_weights(self):
        # Update Target Model
        self.t_model.set_weights(self.model.get_weights())
    
    def PendulumActionConverter(self, A):
        ActualTorque = (A / self.NActions - 0.5) * 4
        return ActualTorque

    def PendulumInverseActionConverter(self, A):
        ActualA = round((A + 2) * (self.NActions - 1) / 4)
        return  (ActualA)

    def Get_Action(self, state, env):
        if np.random.rand() < self.epsilon:
            action = env.action_space.sample()
            a = self.PendulumInverseActionConverter(action[0])
            return action, a
        else:
            q_values = self.model(state).numpy() 
            step = np.argmax(q_values[0])
            a = self.PendulumActionConverter(a)
            action = np.array([a])
            a = self.PendulumInverseActionConverter(step)
            return action, a
        
    def Train(self):
        # Only train if replay memory has enough data. #
        if len(self.Memory) < self.MinMemory:
            # print(f'DID NOT TRAIN..., replay memory = {len(self.Memory)}')
            return
        
        # Get batch of data for training. #
        TrainingData = random.sample(self.Memory, self.BatchSize)
        
        # Get states from training data, then get corresponding Q values. #
        ListOfS = np.array([element[0] for element in TrainingData])
        ListOfQ = np.array([element[0] for element in self.model(ListOfS)])
    
        print(ListOfQ)
        # ListOfQ = ListOfQ[0]
        # print(ListOfQ)

        # Get future states from training data, then get corresponding Q values. #
        ListOfSNext = np.array([element[3] for element in TrainingData])
        ListOfQNext = self.t_model(ListOfSNext)
        
        # Build actual training data for neural network. #
        X = []
        Y = []
        for index, (S, A, R, SNext, Done) in enumerate(TrainingData):
            if not Done:
                MaxQNext = np.max(ListOfQNext[index])
                QNext = R + self.epsilon * MaxQNext
            else:
                QNext = R
            Q = ListOfQ[index]
            Q[A] = QNext
            X.append(S)
            Y.append(Q)
        
        # Train model using tf.GradientTape(), defined below.
        self.GTfit(X, Y)
    
    @tf.function
    def GTfit(self, X, Y):
        # Train the neural network with this batch of data. #
        with tf.GradientTape() as tape:
            Predictions = self.model(tf.convert_to_tensor(X), training = True)
            Loss = tf.math.reduce_mean(tf.math.square(tf.convert_to_tensor(Y) - Predictions))
        Grad = tape.gradient(Loss, self.model.trainable_variables)
        self.Optimiser.apply_gradients(zip(Grad, self.model.trainable_variables))

def main(epsilon_decay=0.99, learning_rate=0.01, discount_rate=0.8, bins=10,filename="base_reward.h5", weights=None):
    
    env = gym.make("Pendulum-v0")
    state_size = env.observation_space.shape[0]
    agent = DQNAgent(state_size, epsilon_decay=epsilon_decay, learning_rate=learning_rate, discount_rate=discount_rate, n_discrete_actions=bins, weights=weights)
    
    scores = []
    avg_score = []
    best_score = -np.inf

    for e in range(4):

        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:

            
            action, a = agent.Get_Action(state, env)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.UpdateMemory(state, a, reward, next_state, done)
            agent.Train()

            score += reward
            state = next_state
            
            if done:
                scores.append(score) # Save the score
                avg_score.append(np.mean(scores[-100:])) # Moving Average Score

                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.Memory), "  epsilon:", agent.epsilon)

    env.close() 
    return avg_score, scores

In [35]:
main()



Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_80 (Dense)            (None, 1, 24)             96        
                                                                 
 dense_81 (Dense)            (None, 1, 24)             600       
                                                                 
 dense_82 (Dense)            (None, 1, 10)             250       
                                                                 
Total params: 946 (3.70 KB)
Trainable params: 946 (3.70 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_83 (Dense)            (None, 1, 24)             96        
                                                                 
 dense_84 (Dense)            (None, 1, 24)             600       
                                                                 
 dense_85 (Dense)            (None, 1, 10)             250       
                                                                 
Total params: 946 (3.70 KB)
Trainable params: 946 (3.70 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




episode: 0   score: -1173.5639201812385   memory length: 200   epsilon: 1
episode: 1   score: -1340.446681800306   memory length: 400   epsilon: 1
episode: 2   score: -1314.0007528072442   memory length: 600   epsilon: 1
episode: 3   score: -1476.6832877709892   memory length: 800   epsilon: 1


([-1173.5639201812385,
  -1257.0053009907722,
  -1276.0037849295961,
  -1326.1736606399445],
 [-1173.5639201812385,
  -1340.446681800306,
  -1314.0007528072442,
  -1476.6832877709892])

In [4]:
model = Sequential()
model.add(Input(shape=(1,3), name="Input_Layer"))
model.add(Dense(24, activation="relu"))
model.add(Dense(24, activation="relu"))
model.add(Dense(24, activation="linear"))
model.summary()
model.compile(loss="mse", optimizer=Adam(lr=0.2))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                96        
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 24)                600       
                                                                 
Total params: 1296 (5.06 KB)
Trainable params: 1296 (5.06 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




In [6]:
from keras.models import Model

input_layer = Input(shape=(3,))
hidden_layer_1 = Dense(24, activation="relu")(input_layer)
hidden_layer_2 = Dense(24, activation="relu")(hidden_layer_1)
output_layer = Dense(40, activation="linear")(hidden_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()

Model: "model"


_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3)]               0         
                                                                 
 dense_5 (Dense)             (None, 24)                96        
                                                                 
 dense_6 (Dense)             (None, 24)                600       
                                                                 
 dense_7 (Dense)             (None, 40)                1000      
                                                                 
Total params: 1696 (6.62 KB)
Trainable params: 1696 (6.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
