In [1]:
import numpy as np
from numpy import random
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.collections import LineCollection
from matplotlib.ticker import MaxNLocator
import math
import pickle

# Shared pseudo-random generator for the notebook (seeds used across runs: 42, 11, 37, 59, 17).
rng = random.default_rng(42)


#Environment settings

In [2]:
# Re-create the shared generator so that re-running this cell restarts the random stream.
rng = random.default_rng(seed=42)


class Environment:
    """Square grid world that contains rechargeable areas.

    Expected keys of ``settings``:
        discreteMap        -- side length of the square grid (number of cells).
        sizeRechargeAreas  -- extent of every recharge area; an area anchored at
                              [y0, x0] covers y0..y0+size and x0..x0+size (inclusive).
        rechargeAreas      -- list of [y0, x0] anchors, one per area.
        rechargeValue      -- energy returned by each area, in the same order.

    Constructing the environment also writes ``RechargeAreaPositions.txt``.
    """

    def __init__(self, settings):
        self.discreteMap = settings['discreteMap']
        self.sizeRechargeAreas = settings['sizeRechargeAreas']
        self.rechargeValue = settings['rechargeValue']
        self.rechargeAreas = settings['rechargeAreas']

        self._saveRechargeAreaPositions('RechargeAreaPositions.txt')

    def _saveRechargeAreaPositions(self, fileName):
        """Write a grid marking the four corners of every recharge area.

        Corners of area ``k`` (0-based) hold the value ``100 * (k + 1)`` so that the
        areas stand out when the file is plotted; all other cells are 0.
        The grid is sized from ``discreteMap`` (it used to be fixed at 20x20).
        """
        size = self.sizeRechargeAreas
        positions = np.zeros((self.discreteMap, self.discreteMap), dtype=int)
        for index, rA in enumerate(self.rechargeAreas):
            marker = 100 * (index + 1)
            for y in (rA[0], rA[0] + size):
                for x in (rA[1], rA[1] + size):
                    positions[y][x] = marker

        with open(fileName, 'wb') as f:
            np.savetxt(f, positions, fmt='%s', delimiter=",")

    def ifInRechargeArea(self, agentY, agentX, numVisitsRechargeArea, visitsRechargeAreaEpisode, time):
        """Return the recharge value of the area containing (agentY, agentX), or 0.

        When the position lies inside an area, the visit is recorded in place:
        ``numVisitsRechargeArea[index]`` is incremented and ``[time, index]`` is
        appended to ``visitsRechargeAreaEpisode``. The first matching area wins.
        """
        size = self.sizeRechargeAreas
        for index, rA in enumerate(self.rechargeAreas):
            insideY = rA[0] <= agentY <= rA[0] + size
            insideX = rA[1] <= agentX <= rA[1] + size
            if insideY and insideX:
                numVisitsRechargeArea[index] += 1
                visitsRechargeAreaEpisode.append([time, index])
                return self.rechargeValue[index]
        return 0


#Robot settings

In [3]:
# Re-create the shared generator so that re-running this cell restarts the random stream.
rng = random.default_rng(seed=42)


class Agent:
    """Battery-driven robot that moves on the grid of an ``Environment``.

    The state handed to the learner is the list
    ``[driveSurvive, minDist, up, down, left, right, reICanSee, agentY, agentX]``
    where ``reICanSee`` is an array with one flag per recharge area.

    Parameters:
        env             -- object exposing ``discreteMap``, ``sizeRechargeAreas``,
                           ``rechargeAreas`` and ``ifInRechargeArea(...)``.
        actions         -- mapping of action id to label; only its length is used here.
        survive         -- dict with keys ``FOV``, ``homeostasisSurvive``, ``maxEnergy``,
                           ``minEnergy`` and ``discountEnergy``.
        distanceMeasure -- ``'Euclidean'`` or ``'Manhattan'``.

    Random draws come from the module-level ``rng``.
    """

    def __init__(self, env, actions, survive, distanceMeasure):
        self.agentX = 0
        self.agentY = 0
        self.env = env
        self.actions = actions
        self.totalActions = len(actions)
        # FEATURES: 6 scalars (drive, minDist, up, down, left, right), one visibility flag
        # per recharge area, and the 2 coordinates. Equals 12 for four areas.
        self.totalFeatures = 6 + len(env.rechargeAreas) + 2

        self.coordinates = []  # [agentX, agentY] after every step of the current episode
        self.numVisitsMap = self._emptyVisitsMap()
        self.numVisitsMap_accumulated = self._emptyVisitsMap()

        self.initialPositions = []  # [agentY, agentX] at the start of every episode

        self.timeVisitsRechargeArea = []
        self.time = 0
        self.visitsRechargeAreaEpisode = []
        self.numVisitsRechargeArea = [0] * len(env.rechargeAreas)

        self.distanceMeasure = distanceMeasure

        # BATTERY
        self.FOV = survive['FOV']
        self.homeostasisSurvive = survive['homeostasisSurvive']
        self.maxEnergy = survive['maxEnergy']
        self.minEnergy = survive['minEnergy']
        self.discountEnergy = survive['discountEnergy']
        self.energy = rng.integers(self.minEnergy, self.maxEnergy + 1)

        self.surviveDriveAll = []
        self.energyAll = []

        # Fixed start positions used by reset(test=True), consumed in order via self.cxy.
        self.allY_test = [0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 11, 12, 12, 13, 13, 13, 14, 14, 15, 16, 16, 16, 17, 17, 18, 19, 19, 19, 19, 19]
        self.allX_test = [0, 4, 6, 19, 8, 12, 15, 3, 10, 1, 6, 12, 16, 18, 2, 5, 7, 4, 10, 13, 0, 15, 19, 7, 14, 5, 12, 16, 2, 10, 19, 1, 13, 5, 8, 16, 0, 3, 10, 1, 4, 17, 9, 13, 7, 0, 3, 11, 16, 19]
        self.cxy = 0

        self.firstEpisode = True

    def _emptyVisitsMap(self):
        """Return a discreteMap x discreteMap nested list of zeros."""
        side = self.env.discreteMap
        return [[0] * side for _ in range(side)]

    def reset(self, test):
        """Start a new episode and return its initial state.

        In test mode the energy starts at the homeostatic set point and the position is
        the next predefined test position (wrapping around once the list is exhausted).
        Otherwise energy and position are drawn from ``rng`` (energy, then X, then Y).

        The recharge visits of the previous episode are appended to
        ``timeVisitsRechargeArea`` on every reset except the very first one.
        """
        self.numVisitsRechargeArea = [0] * len(self.env.rechargeAreas)

        # Per-episode traces restart here in both modes; previously they were cleared
        # only in test mode and therefore grew without bound during training.
        self.energyAll = []
        self.surviveDriveAll = []

        if test:
            index = self.cxy % len(self.allX_test)
            self.energy = self.homeostasisSurvive
            self.agentX = self.allX_test[index]
            self.agentY = self.allY_test[index]
            self.cxy += 1
        else:
            self.energy = rng.integers(self.minEnergy, self.maxEnergy + 1)
            self.agentX = rng.integers(0, self.env.discreteMap)
            self.agentY = rng.integers(0, self.env.discreteMap)

        self.energyAll.append(self.energy)
        currentDriveSurvive = self.computeDriveSurvive(self.energy)

        self.initialPositions.append([self.agentY, self.agentX])
        self.coordinates = []
        self.numVisitsMap = self._emptyVisitsMap()

        if self.firstEpisode:
            self.firstEpisode = False
        else:
            self.timeVisitsRechargeArea.append(self.visitsRechargeAreaEpisode)

        self.time = 0
        self.visitsRechargeAreaEpisode = []

        minDist, up, down, left, right, station = self.closestRechargeArea(self.agentY, self.agentX)
        reICanSee = self.RechargeArea_ICanSee(self.agentY, self.agentX)

        return [currentDriveSurvive, minDist, up, down, left, right, reICanSee, self.agentY, self.agentX]  # FEATURES

    # DRIVE SURVIVE
    def computeDriveSurvive(self, sensor):
        """Return ``sensor - homeostasisSurvive`` (negative below the set point)."""
        return -(self.homeostasisSurvive - sensor)

    # REWARD FUNCTIONS
    def rewardFunction(self, currentDriveSurvive):
        """Map the survive drive to a reward.

        Returns 1 when ``0 <= drive < 1``, the drive itself when it is negative, and
        ``-0.5 * drive`` when it is 1 or more.
        """
        if 0 <= currentDriveSurvive < 1:
            return 1
        if currentDriveSurvive < 0:
            return currentDriveSurvive
        return -(currentDriveSurvive * 0.5)

    def computeDriveSurviveReward(self, currentDriveSurvive):
        """Return the reward associated with the given survive drive."""
        return self.rewardFunction(currentDriveSurvive)

    def step(self, action):
        """Apply ``action`` and return ``(new_state, totalReward, done)``.

        Actions: 1 = Up (Y + 1), 2 = Down (Y - 1), 3 = Left (X - 1), 4 = Right (X + 1);
        any other value leaves the position unchanged. Moves are clamped to the grid.
        """
        # MOVE
        lastCell = self.env.discreteMap - 1
        if action == 1:  # Up
            self.agentY = min(self.agentY + 1, lastCell)
        elif action == 2:  # Down
            self.agentY = max(self.agentY - 1, 0)
        elif action == 3:  # Left
            self.agentX = max(self.agentX - 1, 0)
        elif action == 4:  # Right
            self.agentX = min(self.agentX + 1, lastCell)

        # Note the [X, Y] order here, whereas initialPositions stores [Y, X].
        self.coordinates.append([self.agentX, self.agentY])
        self.numVisitsMap[self.agentY][self.agentX] += 1
        self.numVisitsMap_accumulated[self.agentY][self.agentX] += 1

        # UPDATE BATTERY: pay the step cost, then recharge (capped at maxEnergy).
        self.energy -= self.discountEnergy
        recharge = self.checkIfRecharge()
        self.energy = min(self.maxEnergy, self.energy + recharge)
        self.energyAll.append(self.energy)

        # Compute drives and reward
        currentDriveSurvive = self.computeDriveSurvive(self.energy)
        totalReward = self.computeDriveSurviveReward(currentDriveSurvive)
        self.surviveDriveAll.append(currentDriveSurvive)

        done = self.death()

        minDist, up, down, left, right, station = self.closestRechargeArea(self.agentY, self.agentX)
        reICanSee = self.RechargeArea_ICanSee(self.agentY, self.agentX)
        new_state = [currentDriveSurvive, minDist, up, down, left, right, reICanSee, self.agentY, self.agentX]

        self.time += 1

        return new_state, totalReward, done

    def death(self):
        """Return 1 when the energy has dropped to ``minEnergy`` or below, else 0."""
        return 1 if self.energy <= self.minEnergy else 0

    def checkIfRecharge(self):
        """Ask the environment for the recharge at the current position (records the visit)."""
        return self.env.ifInRechargeArea(self.agentY, self.agentX, self.numVisitsRechargeArea, self.visitsRechargeAreaEpisode, self.time)

    def manhattanDistance(self, agentY, agentX, middleY_RA, middleX_RA):
        """Return the Manhattan distance between the agent and a recharge-area centre."""
        return abs(agentX - middleX_RA) + abs(agentY - middleY_RA)

    def euclideanDistance(self, agentY, agentX, middleY_RA, middleX_RA):
        """Return the Euclidean distance between the agent and a recharge-area centre."""
        return math.sqrt(((agentX - middleX_RA) ** 2) + ((agentY - middleY_RA) ** 2))

    def _distanceToArea(self, agentY, agentX, rA):
        """Return the distance from the agent to recharge area ``rA`` ([y0, x0]).

        Distances are measured to the centre of the area. A distance within
        ``sizeRechargeAreas`` (``<=`` for Manhattan, ``<`` for Euclidean) means the agent
        is already on the area, just not at its centre, so 0 is returned instead.
        Any ``distanceMeasure`` other than 'Euclidean' uses the Manhattan distance.
        """
        size = self.env.sizeRechargeAreas
        middleX_RA = rA[1] + size / 2
        middleY_RA = rA[0] + size / 2
        if self.distanceMeasure == 'Euclidean':
            dist = self.euclideanDistance(agentY, agentX, middleY_RA, middleX_RA)
            onArea = dist < size
        else:
            dist = self.manhattanDistance(agentY, agentX, middleY_RA, middleX_RA)
            onArea = self.distanceMeasure == 'Manhattan' and dist <= size
        return 0 if onArea else dist

    def closestRechargeArea(self, agentY, agentX):
        """Return ``(minDist, up, down, left, right, station)`` for the closest area.

        ``station`` is the index of the closest area (first one wins on ties) and the
        four flags tell in which direction the agent lies outside that area's extent:
        ``up`` when Y must increase, ``down`` when Y must decrease, ``right`` when X must
        increase, ``left`` when X must decrease. With no recharge areas the result is
        ``(99999, 0, 0, 0, 0, 0)``.
        """
        minDist = 99999
        up = down = left = right = 0
        station = 0
        size = self.env.sizeRechargeAreas

        for index, rA in enumerate(self.env.rechargeAreas):
            dist = self._distanceToArea(agentY, agentX, rA)
            if dist < minDist:
                minDist = dist
                station = index
                # Compare against the area's extent [anchor, anchor + size]. For size 1
                # this matches the previous "!= size/2" test; for larger areas it no
                # longer raises a flag while the agent is inside the area.
                right = 1 if agentX < rA[1] else 0
                left = 1 if agentX > rA[1] + size else 0
                up = 1 if agentY < rA[0] else 0
                down = 1 if agentY > rA[0] + size else 0

        return minDist, up, down, left, right, station

    def RechargeArea_ICanSee(self, agentY, agentX):
        """Return an array with 1 for every recharge area whose distance is within ``FOV``."""
        reICanSee = np.zeros(len(self.env.rechargeAreas))
        for index, rA in enumerate(self.env.rechargeAreas):
            if self._distanceToArea(agentY, agentX, rA) <= self.FOV:
                reICanSee[index] = 1
        return reICanSee

    def getRobot_Data(self):
        """Return ``(agentX, agentY, minDist, station, energy, discountEnergy, minEnergy, homeostasisSurvive)``."""
        minDist, up, down, left, right, station = self.closestRechargeArea(self.agentY, self.agentX)
        return self.agentX, self.agentY, minDist, station, self.energy, self.discountEnergy, self.minEnergy, self.homeostasisSurvive

    def save(self, sensor, fileName):
        """Write ``sensor`` to ``fileName`` as comma-separated text.

        A flat sequence is written as a single row; a nested one as one row per entry.
        """
        with open(fileName, 'wb') as f:
            np.savetxt(f, np.atleast_2d(np.asarray(sensor)), fmt='%s', delimiter=",")

    def saveSensorsData(self, expIDX):
        """Save the traces of the current episode, suffixing file names with ``expIDX``."""
        suffix = str(expIDX) + '.txt'
        self.save(self.energyAll, 'Energy' + suffix)
        self.save(self.surviveDriveAll, 'SurviveDrive' + suffix)
        self.save(self.numVisitsRechargeArea, 'NumVisitsRechargeArea' + suffix)
        self.save(self.numVisitsMap, 'NumVisitsMap_Test' + suffix)
        self.save(self.coordinates, 'Coordinates' + suffix)

    def dataTrain_Visits(self):
        """Save the visit counts accumulated over all episodes."""
        self.save(self.numVisitsMap_accumulated, 'NumVisitsMap_TrainAccumulated0.txt')

    def saveTrain_InitialPosition(self):
        """Save the [agentY, agentX] start position of every episode so far."""
        with open('Train_InitialPositions.txt', 'wb') as f:
            np.savetxt(f, self.initialPositions, fmt='%s', delimiter=",")

    def saveRechargeAreaVisits(self, fileName):
        """Pickle ``timeVisitsRechargeArea`` into ``fileName + '.txt'`` (binary pickle, not text)."""
        with open(fileName + '.txt', 'wb') as fp:
            pickle.dump(self.timeVisitsRechargeArea, fp)

#Q-Learning Function Approximation

In [4]:
# Re-create the shared generator so that re-running this cell restarts the random stream.
rng = random.default_rng(seed=42)


class ApproximateQAgent:
    """Q-learning with a linear function approximator (one weight vector per action).

    The feature vector is built from the robot state as: the scalar features, one
    visibility flag per recharge area, and a one-hot block of ``discreteMap`` entries
    for each of the remaining state entries (agentY and agentX).

    Parameters:
        robot                  -- the ``Agent`` being controlled.
        learning_parameters    -- dict with ``alpha`` (learning rate) and ``gamma`` (discount).
        exploration_parameters -- dict with ``epsilon``, ``epsilon_min``, ``epsilon_decay``.
        glie                   -- epsilon schedule: 'linear', 'exponential' or 'constant'.

    Random draws come from the module-level ``rng``. ``featuresPerAction`` is held as a
    NumPy array of shape (actions, features) once initialised or recovered.
    """

    def __init__(self, robot, learning_parameters, exploration_parameters, glie):
        self.robot = robot

        # learning parameters (dict)
        self.alpha = learning_parameters['alpha']  # learning rate
        self.gamma = learning_parameters['gamma']  # discount factor

        # exploration parameters
        self.epsilon = exploration_parameters['epsilon']
        self.epsilon_min = exploration_parameters['epsilon_min']
        self.epsilon_decay = exploration_parameters['epsilon_decay']
        self.glie = glie

        self.featuresS = 0   # features of the current state S
        self.featuresSL = 0  # features of the next state S'

        self.featuresPerAction = 0

        self.stationFeatures = len(self.robot.env.rechargeAreas)
        # All features except the per-area visibility flags and the two coordinates.
        self.independFeatures = self.robot.totalFeatures - self.stationFeatures - 2

    def _numFeatures(self):
        """Return the length of the feature vector (scalars + flags + one-hot blocks)."""
        oneHotEntries = self.robot.totalFeatures - self.independFeatures - self.stationFeatures
        return self.independFeatures + self.stationFeatures + oneHotEntries * self.robot.env.discreteMap

################################## FILES #################################################

    def recoverWeights(self):
        """Load the weights from ``FeaturesPerAction_weightsLast.txt`` (one row per action)."""
        with open('FeaturesPerAction_weightsLast.txt', 'r') as f:
            self.featuresPerAction = np.array([[float(num) for num in line.split(',')] for line in f])

    def saveWeights(self, fileName, data):
        """Write ``data`` to ``fileName + '.txt'`` as comma-separated rows."""
        with open(fileName + '.txt', 'wb') as f:
            np.savetxt(f, np.atleast_2d(np.asarray(data)), fmt='%s', delimiter=",")

    def saveDataTraining(self, episode_rewards, episode_steps, filename):
        """Write one ``"<reward> <steps>"`` line per episode to ``filename``."""
        with open(filename, 'w') as f:
            for pair in zip(episode_rewards, episode_steps):
                f.write(' '.join(str(x) for x in pair) + '\n')

    def saveBestReward(self, bestEpisode, episode_rewards):
        """Print and save the index of the best episode together with its total reward."""
        best = [bestEpisode, episode_rewards[bestEpisode]]
        print(best)
        with open('BestEpisodeReward.txt', 'wb') as f:
            np.savetxt(f, best, fmt='%s', delimiter=",")

    def saveInitialVars(self, minDist, up, down, left, right, station, agentX, agentY, energy, expIDX):
        """Save the initial conditions of test run ``expIDX``, one value per line."""
        with open('InitialVars_Test' + str(expIDX) + '.txt', 'wb') as f:
            np.savetxt(f, [minDist, up, down, left, right, station, agentX, agentY, energy], fmt='%s', delimiter=",")

################################## Q-learning #################################################

    def init_featuresWeight(self):
        """Initialise every weight with a random value in [0, 0.01)."""
        numFeatures = self._numFeatures()
        # Scalar draws in row-major order keep the random stream identical to a nested list build.
        self.featuresPerAction = np.array(
            [[rng.random() * 0.01 for _ in range(numFeatures)] for _ in range(self.robot.totalActions)]
        )

    def setFeatures_binario(self, stateFeatures_, state_S_or_SL):
        """Encode a robot state as a feature vector and store it.

        ``stateFeatures_`` is ``[scalars..., visibilityFlags, agentY, agentX]``.
        The result is stored as the features of S when ``state_S_or_SL == 0`` and as the
        features of S' otherwise.
        """
        size = self.robot.env.discreteMap
        numScalars = self.independFeatures
        numStations = self.stationFeatures
        stateFeatures = np.zeros(self._numFeatures())

        # currentDriveSurvive, minDist, up, down, left, right
        stateFeatures[:numScalars] = stateFeatures_[:numScalars]

        # can see Recharge Area A, B, C, D
        stateFeatures[numScalars:numScalars + numStations] = stateFeatures_[numScalars][:numStations]

        # One-hot encode each remaining entry (agentY, then agentX) in its own block.
        offset = numScalars + numStations
        for blockIndex, coordinate in enumerate(stateFeatures_[numScalars + 1:]):
            block = stateFeatures[offset + blockIndex * size: offset + (blockIndex + 1) * size]
            block[coordinate] = 1  # `block` is a view, so this writes into stateFeatures

        if state_S_or_SL == 0:
            self.featuresS = stateFeatures
        else:
            self.featuresSL = stateFeatures

    def getFeatures(self, state_S_or_SL):
        """Return the stored features of S (0) or S' (anything else)."""
        return self.featuresS if state_S_or_SL == 0 else self.featuresSL

    def getQvalue(self, actionIndex, state_S_or_SL):
        """Return Q(state, action) as the dot product of features and action weights.

        Uses ``np.dot``; the result may differ from a sequential Python sum in the last
        floating-point bits.
        """
        numFeatures = self._numFeatures()
        features = self.getFeatures(state_S_or_SL)
        return np.dot(features[:numFeatures], self.featuresPerAction[actionIndex][:numFeatures])

    def getMaxQValue(self):
        """Return the largest Q-value over all actions in S'."""
        return max(self.getQvalue(action, 1) for action in range(self.robot.totalActions))

    def update(self, actionIdid, reward):
        """Apply one Q-learning update for the action taken in S, then advance S to S'.

        NOTE(review): the target always bootstraps from S', including when S' is the
        terminal (death) state; standard Q-learning would use ``reward`` alone there.
        Left unchanged because it affects the learned results.
        """
        numFeatures = self._numFeatures()
        Q_sa = self.getQvalue(actionIdid, 0)
        TD_target = reward + self.gamma * self.getMaxQValue()
        weights = self.featuresPerAction[actionIdid]
        weights[:numFeatures] += self.alpha * ((TD_target - Q_sa) * self.featuresS[:numFeatures])

        # A real copy: slicing a NumPy array with [:] would only alias featuresSL.
        self.featuresS = self.featuresSL.copy()

    def updateEpsilon(self, totalEpisodes):
        """Decay epsilon according to ``glie``; unknown schedules leave it unchanged."""
        if self.glie == 'linear':
            self.epsilon = max(self.epsilon_min, self.epsilon - (1 / totalEpisodes))
        elif self.glie == 'exponential':
            self.epsilon = max(self.epsilon_min, self.epsilon * (1 - self.epsilon_decay))
        # 'constant': epsilon stays as it is

    def getAction(self):
        """Return an epsilon-greedy action for S (lowest index wins ties when greedy)."""
        if rng.random() < self.epsilon:
            # exploration, random choice
            return rng.integers(0, self.robot.totalActions)
        # exploitation, max value for given state
        return max(range(self.robot.totalActions), key=lambda action: self.getQvalue(action, 0))

################################## TRAIN #################################################

    def learn(self, max_steps = 5000, total_episodes = 25000):
        """Train from freshly initialised weights and write the results to disk.

        The weights are saved as 'FeaturesPerAction_weightsBEST' whenever an episode's
        mean reward per step is at least that of the best episode so far, progress files
        are refreshed every 1000 episodes, and everything is saved once more at the end
        (including 'FeaturesPerAction_weightsLast').
        """
        self.init_featuresWeight()
        self.episode_rewards = np.zeros(total_episodes)
        self.episode_steps = np.zeros(total_episodes)
        self.discountedReward = np.zeros(total_episodes)
        self.robotData_init = []
        bestEpisode = 0

        for episode in range(total_episodes):
            state = self.robot.reset(0)
            # (agentX, agentY, minDist, station, energy, discountEnergy, minEnergy, homeostasisSurvive)
            robotData = self.robot.getRobot_Data()
            self.setFeatures_binario(state[:], 0)

            # Defined up front so an episode with max_steps == 0 does not hit unbound names.
            done = 0
            stepsTaken = 0
            for step in range(max_steps):
                action = self.getAction()
                new_state, reward, done = self.robot.step(action)
                self.setFeatures_binario(new_state[:], 1)  # setting features of state S'
                self.update(action, reward)
                self.episode_rewards[episode] += reward
                self.discountedReward[episode] += reward * self.gamma ** step
                stepsTaken = step + 1
                if done:
                    print("died episode ", episode ,  " " , np.sum(self.robot.numVisitsRechargeArea), "reward:", self.episode_rewards[episode])
                    break
            if (done == 0):
                print("episode ", episode ,  " " ,  np.sum(self.robot.numVisitsRechargeArea), "reward:", self.episode_rewards[episode])

            self.robotData_init.append([*robotData, done])
            self.episode_steps[episode] = stepsTaken
            self.updateEpsilon(total_episodes)

            meanReward = self.episode_rewards[episode] / self.episode_steps[episode]
            bestMeanReward = self.episode_rewards[bestEpisode] / self.episode_steps[bestEpisode]
            if meanReward >= bestMeanReward:
                self.saveWeights('FeaturesPerAction_weightsBEST', self.featuresPerAction)
                bestEpisode = episode

            if (episode % 1000 == 0):
                self.saveDataTraining(self.episode_rewards, self.episode_steps, 'RewardsLearning.txt')
                self.saveDataTraining(self.discountedReward, self.episode_steps, 'DiscountedRewardsLearning.txt')
                self.saveBestReward(bestEpisode, self.episode_rewards)
                self.robot.saveRechargeAreaVisits('TimeVisitsRechargeArea_Train')
                self.robot.dataTrain_Visits()

        # This extra reset is only here so that the recharge visits of the last episode
        # are appended to the robot's history before it is saved.
        state = self.robot.reset(0)
        self.saveWeights('FeaturesPerAction_weightsLast', self.featuresPerAction)
        self.saveDataTraining(self.episode_rewards, self.episode_steps, 'RewardsLearning.txt')
        self.saveDataTraining(self.discountedReward, self.episode_steps, 'DiscountedRewardsLearning.txt')
        self.robot.saveRechargeAreaVisits('TimeVisitsRechargeArea_Train')
        self.saveBestReward(bestEpisode, self.episode_rewards)
        self.robot.dataTrain_Visits()
        self.robot.saveTrain_InitialPosition()
        np.savetxt('robotData_init.txt', self.robotData_init, fmt='%.2f')

################################## TEST #################################################

    def evaluate(self, max_steps, expIDX):
        """Run one greedy test episode with the last saved weights and save its traces.

        Sets ``epsilon`` to 0 (it is not restored afterwards) and reloads the weights
        from disk on every call.
        """
        self.epsilon = 0
        self.recoverWeights()
        state = self.robot.reset(1)
        self.reward_want = []
        self.reward_like = []
        minDist, up, down, left, right, station = self.robot.closestRechargeArea(self.robot.agentY, self.robot.agentX)
        self.saveInitialVars(minDist, up, down, left, right, station, self.robot.agentX, self.robot.agentY, self.robot.energy, expIDX)
        self.setFeatures_binario(state[:], 0)
        for step in range(max_steps):
            action = self.getAction()
            new_state, reward, done = self.robot.step(action)
            self.setFeatures_binario(new_state[:], 0)
            if done:
                print("died EXPLORING " , step, " steps")
                print(np.sum(self.robot.numVisitsRechargeArea))
                break
        self.robot.saveSensorsData(expIDX)
        self.robot.saveRechargeAreaVisits('TimeVisitsRechargeArea_Test')

################################## ONLY ANALYZE ACTION CHOSEN #################################################

    def chooseActionPerPosition(self, energy):
        """Save the greedy action for every grid cell at the given energy level.

        The result is written to ``'ActionsPerPosition_Energy' + str(energy) + '.txt'``,
        indexed as [agentY][agentX]. Sets ``epsilon`` to 0 and reloads the saved weights.
        """
        self.epsilon = 0
        self.recoverWeights()

        side = self.robot.env.discreteMap
        actions = [[-1] * side for _ in range(side)]
        energyDrive = self.robot.computeDriveSurvive(energy)

        for i in range(side):
            for j in range(side):
                minDist, up, down, left, right, station = self.robot.closestRechargeArea(i, j)
                reICanSee = self.robot.RechargeArea_ICanSee(i, j)
                state = [energyDrive, minDist, up, down, left, right, reICanSee, i, j]
                self.setFeatures_binario(state[:], 0)
                actions[i][j] = self.getAction()
        self.saveWeights('ActionsPerPosition_Energy' + str(energy), actions)

#Experiments

##Start the Environment

In [6]:
# Grid-world configuration handed to Environment.
settings = dict(
    discreteMap=20,
    sizeRechargeAreas=1,
    labelsRechargeArea=['A', 'B', 'C', 'D'],
    rechargeAreas=[[2, 4], [4, 13], [16, 2], [14, 15]],  # [initial Y, initial X] of each area
    rechargeValue=[1, 4, 3, 2],  # the original comment also lists [3, 3, 3, 3] as an alternative
)

env = Environment(settings)

In [7]:
#Need: Energy, Drive: Survive
# Need: Energy, Drive: Survive
survive = dict(
    homeostasisSurvive=30,
    maxEnergy=50,
    minEnergy=0,
    discountEnergy=0.1,
    FOV=6,
)

# Action ids 0..4 mapped to their labels.
actions = dict(enumerate(['Stop', 'Up', 'Down', 'Left', 'Right']))

distanceMeasure = ['Euclidean', 'Manhattan']
robot = Agent(env, actions, survive, distanceMeasure[1])  # index 1 selects 'Manhattan'

In [8]:
# q-learning parameters
# q-learning parameters
learning_parameters = dict(
    alpha=0.0001,
    gamma=0.9,
)
# exploration-exploitation parameters
exploration_parameters = dict(
    epsilon=1.0,           # exploration probability at start
    epsilon_min=0.01,      # minimum exploration probability
    epsilon_decay=0.0003,  # exponential decay rate for exploration prob
)

glie = ['linear', 'exponential', 'constant']
qApp_Agent_1 = ApproximateQAgent(robot, learning_parameters, exploration_parameters, glie[0])  # index 0 selects 'linear'

In [None]:
# Training budget: step limit per episode and number of episodes.
max_steps, total_episodes = 5000, 25000
qApp_Agent_1.learn(max_steps=max_steps, total_episodes=total_episodes)

In [None]:
# Evaluation budget: step limit per test episode and number of test episodes.
max_steps, numExps = 8000, 50
for i in range(numExps):
    qApp_Agent_1.evaluate(max_steps=max_steps, expIDX=i)
    print('End of the experiment ', i)