<a href="https://colab.research.google.com/github/isaactl/Deep_reinforcement_learning_Course/blob/master/dqn_cart_pole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install  wheel setuptools --upgrade



In [3]:
%pip install swig

Collecting swig
  Using cached swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.6 kB)
Using cached swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
Installing collected packages: swig
Successfully installed swig-4.2.1


In [4]:
%pip install gymnasium[all]==0.29.1 tensorflow==2.15.0 tf-agents==0.19.0

Collecting gymnasium==0.29.1 (from gymnasium[all]==0.29.1)
  Using cached gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting tensorflow==2.15.0
  Using cached tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting tf-agents==0.19.0
  Using cached tf_agents-0.19.0-py3-none-any.whl.metadata (12 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium==0.29.1->gymnasium[all]==0.29.1)
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Using cached ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Using cached wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Using cached tensorboard-2.15.2-py3-none-any.wh

In [5]:
import numpy as np
import random
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow import gather_nd
from tensorflow.keras.losses import mean_squared_error
from collections import deque

In [6]:
class DeepQLearning:
    """
    Deep Q-Network (DQN) agent for the Cart Pole environment.

    The agent keeps two Keras networks with the same architecture:
    - mainNetwork   - the online network, fitted after every environment step
    - targetNetwork - used to compute bootstrap targets; its weights are overwritten
                      with the online weights every updateTargetNetworkPeriod fits
    Transitions are stored in a bounded replay buffer (a deque) and training batches
    are sampled uniformly at random from it.
    """

    def __init__(self, env, gamma, epsilon, numberEpisodes) -> None:
        """
        INPUTS:
        env - Cart Pole environment; reset() must return (state, info) and step() must
              return (state, reward, terminated, truncated, info)
        gamma - discount factor
        epsilon - parameter for epsilon greedy approach
        numberEpisodes - number of episodes to train the model
        """
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.numberEpisodes = numberEpisodes

        # state dimension
        self.stateDimension=4
        # action dimension
        self.actionDimension=2
        # this is the maximum size of the replay buffer
        self.replayBufferSize=300
        # this is the size of the training batch that is randomly sampled from the replay buffer
        self.batchReplayBufferSize=100

        # number of calls to fit() between two updates of the target network parameters
        self.updateTargetNetworkPeriod=100

        # this is the counter for updating the target network
        # if this counter exceeds (updateTargetNetworkPeriod-1) we update the network
        # parameters and reset the counter to zero, this process is repeated until the end
        # of the training process
        self.counterUndateTargetNetwork=0

        # this list stores the sum of rewards obtained during each training episode
        self.sumRewardsEpisode=[]

        # replay buffer - the oldest transitions are dropped once maxlen is reached
        self.replayBuffer=deque(maxlen=self.replayBufferSize)

        # this is the online network
        self.mainNetwork=self.createNetwork()

        # this is the target network
        self.targetNetwork=self.createNetwork()

        # copy the initial weights to the target network
        self.targetNetwork.set_weights(self.mainNetwork.get_weights())

        # this list is used in the cost function to select certain entries of the
        # predicted and true sample matrices in order to form the loss
        self.actionsAppend=[]

    def my_loss_fn(self,y_true, y_pred):
        """
        function for defining the loss(cost) function
        INPUTS:
        y_true - matrix of dimension (batch size, self.actionDimension) - this is the target
        y_pred - matrix of dimension (batch size, self.actionDimension) - this is predicted by the network

        - for every row i, this function selects the entry in column self.actionsAppend[i]
        from both y_true and y_pred; all other entries are ignored
        - this function is used in createNetwork(self) to create the network
        - it reads the Python list self.actionsAppend on every call, so the model has to be
        compiled with run_eagerly=True (see createNetwork); in graph mode the list would be
        frozen into the traced graph the first time the loss is called

        OUTPUT:
        loss - mean of the squared errors between the selected entries of y_true and y_pred
            (gather_nd returns one value per row and mean_squared_error averages over them)
        """
        batchSize = y_true.shape[0]

        # every row of this matrix is a (row index, action index) pair, that is, the
        # position of the entry that we want to extract from y_true and y_pred
        # it always has two columns, independently of the number of actions
        indices = np.zeros(shape=(batchSize,2), dtype=int)
        indices[:,0]=np.arange(batchSize)
        indices[:,1]=self.actionsAppend

        # gather_nd and mean_squared_error are tensorflow functions
        loss = mean_squared_error(gather_nd(y_true,indices=indices), gather_nd(y_pred,indices=indices))
        return loss

    def createNetwork(self):
        """
        creates and compiles a fully connected network that maps a state
        (self.stateDimension inputs) to one Q-value per action (self.actionDimension outputs)
        """
        model=Sequential()
        model.add(Dense(128, input_dim=self.stateDimension, activation='relu'))
        model.add(Dense(56, activation='relu'))
        model.add(Dense(self.actionDimension, activation='linear'))

        # run_eagerly=True is required for correctness: my_loss_fn depends on
        # self.actionsAppend, a Python list that changes before every call to fit()
        # a traced (graph) train step would keep using the action indices of the first batch
        # NOTE(review): 'accuracy' is not a meaningful metric for Q-value regression
        model.compile(optimizer=RMSprop(), loss=self.my_loss_fn, metrics=['accuracy'], run_eagerly=True)
        return model

    def trainingEpisodes(self):
        """
        runs self.numberEpisodes training episodes; after every environment step the
        transition is stored in the replay buffer and trainNetwork() is called
        the sum of rewards of every episode is appended to self.sumRewardsEpisode
        """
        # here we loop through the episodes
        for indexEpisode in range(self.numberEpisodes):
            # list that stores rewards per episode - this is necessary for keeping track of convergence
            rewardsEpisode=[]
            print("Simulating episode {}".format(indexEpisode))

            # reset the environment at the beginning of every episode
            (currentState,_)=self.env.reset()

            # here we step from one state to another
            # the episode is over when the environment reports termination (terminal state)
            # OR truncation (for example, a time limit)
            episodeFinished=False
            while not episodeFinished:
                # select an action on the basis of the current state, denoted by currentState
                action = self.selectAction(currentState, indexEpisode)

                # here we step and return the state, reward, and the two end-of-episode flags
                (nextState, reward, terminated, truncated, _) = self.env.step(action)
                rewardsEpisode.append(reward)

                # add current state, action, reward, next state, and terminal flag to the replay buffer
                # only "terminated" is stored: a truncated state is not a terminal state,
                # so its target should still include the discounted future reward
                self.replayBuffer.append((currentState, action, reward, nextState, terminated))

                # train network
                self.trainNetwork()

                # set the current state for the next step
                currentState=nextState
                episodeFinished = terminated or truncated

            sumRewards=np.sum(rewardsEpisode)
            print("Sum of rewards {}".format(sumRewards))
            self.sumRewardsEpisode.append(sumRewards)

    def selectAction(self,state,index):
        """
        this function selects an action on the basis of the current state
        INPUTS:
        state - state for which to compute the action
        index - index of the current episode
        OUTPUT:
        index of the selected action, a number from 0, 1,..., self.actionDimension-1
        """

        # during the first episode we select completely random actions to have enough exploration
        if index<1:
            return np.random.choice(self.actionDimension)

        # returns a random real number in the half-open interval [0.0, 1.0)
        # this number is used for the epsilon greedy approach
        randomNumber = np.random.random()

        # after episode 200, we slowly start to decrease the epsilon parameter
        # note that the decay is applied on every call, that is, on every step
        if index>200:
            self.epsilon=0.999*self.epsilon

        # if this condition is satisfied, we are exploring, that is, we select random actions
        if randomNumber < self.epsilon:
            # return a random action selected from: 0, 1,..., actionNumber-1
            return np.random.choice(self.actionDimension)

        # otherwise we are selecting greedy actions, that is, the action with the
        # largest predicted Q-value for this state
        # verbose=0 keeps Keras from printing a progress bar on every step
        Qvalues = self.mainNetwork.predict(state.reshape(1,self.stateDimension), verbose=0)
        # it can happen that there are several identical maximal entries, for example
        # a=np.array([0,1,1,0])
        # np.where(a==np.max(a))[0]
        # this will return [1,2], but we only need a single index
        # that is why we pick one of them at random with np.random.choice
        # note that [0] has to be added here since np.where() returns a tuple
        return np.random.choice(np.where(Qvalues[0,:]==np.max(Qvalues[0,:]))[0])

    def trainNetwork(self):
        """
        samples a batch from the replay buffer, forms the Q-learning targets with the
        target network, and fits the online network on them
        nothing is done until the replay buffer holds more than batchReplayBufferSize transitions
        """
        # wait until the number of stored transitions exceeds batchReplayBufferSize
        if len(self.replayBuffer) <= self.batchReplayBufferSize:
            return

        # sample a batch from the replay buffer
        randomSampleBatch=random.sample(self.replayBuffer, self.batchReplayBufferSize)

        # here we form current state batch and next state batch
        # they are used as inputs for prediction
        currentStateBatch=np.zeros(shape=(self.batchReplayBufferSize, self.stateDimension))
        nextStateBatch=np.zeros(shape=(self.batchReplayBufferSize, self.stateDimension))

        for index, tupleS in enumerate(randomSampleBatch):
            # first entry of the tuple is the current state
            currentStateBatch[index,:]=tupleS[0]
            # fourth entry of the tuple is the next state
            nextStateBatch[index,:]=tupleS[3]

        # here, use the target network to predict Q-values
        QnextStateTargetNetwork=self.targetNetwork.predict(nextStateBatch, verbose=0)
        # here, use the main network to predict Q-values
        QcurrentStateMainNetwork=self.mainNetwork.predict(currentStateBatch, verbose=0)

        # now we form batches for training
        inputNetwork = currentStateBatch
        outputNetwork=np.zeros(shape=(self.batchReplayBufferSize, self.actionDimension))

        # this list will contain the actions that are selected from the batch
        # this list is used in my_loss_fn to define the loss function
        self.actionsAppend=[]
        for index, (currentState, action, reward, nextState, terminated) in enumerate(randomSampleBatch):
            if terminated:
                # if the next state is terminal, then the target is equal to the reward
                y=reward
            else:
                # otherwise the target is equal to the reward plus the discounted
                # maximal Q-value of the next state, predicted by the target network
                y=reward + self.gamma*np.max(QnextStateTargetNetwork[index])

            # append the action to the list
            self.actionsAppend.append(action)

            # this actually does not matter since we do not use all the entries in the cost function
            outputNetwork[index]=QcurrentStateMainNetwork[index]
            # this is what matters
            outputNetwork[index,action]=y

        # here we train the network
        self.mainNetwork.fit(inputNetwork, outputNetwork, batch_size=self.batchReplayBufferSize, epochs=100, verbose=0)

        # after updateTargetNetworkPeriod training sessions, update the target network
        # increase the counter for training the target network
        self.counterUndateTargetNetwork+=1
        if self.counterUndateTargetNetwork > self.updateTargetNetworkPeriod - 1:
            # copy the weights from the main network to the target network
            self.targetNetwork.set_weights(self.mainNetwork.get_weights())
            print("Target network updated!")
            print("Counter value {}".format(self.counterUndateTargetNetwork))

            # reset the counter
            self.counterUndateTargetNetwork=0

In [None]:
import gymnasium as gym
import tensorflow as tf

# seed Python's random module, NumPy and TensorFlow in one call so that a
# "Restart & Run All" explores and initializes the networks the same way
SEED=42
tf.keras.utils.set_random_seed(SEED)

# create env
env=gym.make('CartPole-v1')
# seed the environment's random number generator once; the agent calls reset()
# without a seed afterwards
env.reset(seed=SEED)

# hyperparameters
# NOTE(review): gamma=1 means that future rewards are not discounted at all - confirm this is intended
gamma=1
epsilon=0.1
numberEpisodes=100

# create agent
agent = DeepQLearning(env,gamma,epsilon,numberEpisodes)
try:
    # run the learning process
    agent.trainingEpisodes()
finally:
    # release the environment even if the training is interrupted
    env.close()

# summarize the model
agent.mainNetwork.summary()
# save the model
agent.mainNetwork.save('model.h5')

# the rewards obtained in every episode - this has to be the last expression of the
# cell, otherwise the notebook does not display it
agent.sumRewardsEpisode

Simulating episode 0
Sum of rewards 17.0
Simulating episode 1
Sum of rewards 28.0
Simulating episode 2
Sum of rewards 26.0
Simulating episode 3
Sum of rewards 20.0
Simulating episode 4
Sum of rewards 22.0
Simulating episode 5
Sum of rewards 36.0
Simulating episode 6
Sum of rewards 43.0
Simulating episode 7
Target network updated!
Counter value 100
Sum of rewards 20.0
Simulating episode 8
Sum of rewards 20.0
Simulating episode 9
Sum of rewards 22.0
Simulating episode 10
Sum of rewards 22.0
Simulating episode 11
Target network updated!
Counter value 100
Sum of rewards 57.0
Simulating episode 12
Sum of rewards 21.0
Simulating episode 13
Sum of rewards 22.0
Simulating episode 14
Sum of rewards 22.0
Simulating episode 15
Target network updated!
Counter value 100
Sum of rewards 67.0
Simulating episode 16
Sum of rewards 20.0
Simulating episode 17
Target network updated!
Counter value 100
Sum of rewards 18.0
Simulating episode 18
Sum of rewards 23.0
Simulating episode 19
