In [561]:
import random
import time

In [562]:
# Record the wall-clock start time (seconds since the epoch). Later cells
# subtract it from time.time() to report elapsed time.
start = time.time()
print(start)

1638853219.302423


In [563]:
# Read the puzzle description into `lines`, one entry per line of the file
# (trailing newlines are kept, exactly as iterating the file yields them).
# The context manager closes the handle even if reading fails part-way.
with open("./benchmarks/sokoban01.txt") as myfile:
    lines = list(myfile)

In [564]:
# Input line 0: its first two whitespace-separated tokens are the board
# dimensions, parsed as integers in file order.
sizeH, sizeV = (int(lines[0].split()[position]) for position in (0, 1))

In [565]:
# Loading data into a set
# Arrays are in (y,x), with top row as 1, and leftmost column as 1
def createCoordinatesSet(inputArray):
    """Pair up a flat sequence of numeric tokens into a set of (int, int) tuples.

    Tokens at even positions become the first element of each pair and tokens
    at odd positions the second. A trailing unpaired token is ignored, and
    duplicate pairs collapse into one entry.
    """
    firstElements = inputArray[0::2]
    secondElements = inputArray[1::2]
    return {(int(first), int(second)) for first, second in zip(firstElements, secondElements)}

In [566]:
# Input line 1: a declared wall-square count followed by the wall coordinates
# as a flat list of integer pairs.
line1Array = lines[1].split()
nWallSquares = int(line1Array[0])
wallCoordinates = createCoordinatesSet(line1Array[1:])

# Sanity check only: a mismatch between the declared count and the number of
# distinct pairs parsed is reported on stdout but does not stop the notebook.
if len(wallCoordinates)!=nWallSquares:
    print("Wall square does not match, check input")

In [567]:
# Input line 2: a declared box count followed by the initial box coordinates
# as a flat list of integer pairs.
line2Array = lines[2].split()
nBoxes = int(line2Array[0])
boxCoordinates = createCoordinatesSet(line2Array[1:])

# Sanity check only: a mismatch between the declared count and the number of
# distinct pairs parsed is reported on stdout but does not stop the notebook.
if len(boxCoordinates)!=nBoxes:
    print("Boxes does not match, check input")

In [568]:
# Input line 3: a declared storage-location count followed by the storage
# coordinates as a flat list of integer pairs.
line3Array = lines[3].split()
nStorageLocations = int(line3Array[0])
storageCoordinates = createCoordinatesSet(line3Array[1:])

# Sanity check only: a mismatch is reported on stdout but does not stop the
# notebook. The message names storage locations (it previously repeated the
# wall-square wording from the line-1 check, which pointed at the wrong input).
if len(storageCoordinates)!=nStorageLocations:
    print("Storage locations do not match, check input")

In [569]:
# Input line 4: the agent's starting square, stored as a 2-tuple of ints built
# from the first two tokens of the line.
line4Array = lines[4].split()
initialLocation = (int(line4Array[0]), int(line4Array[1]))

In [570]:
# Constants
# The four move labels handled by SokobanState.getTargetLocations.
actions = ['U', 'D', 'L', 'R']
# Factor applied to the next state's best Q-value in Sokoban.takeAction.
discount = 1.0
# Step size of the Q-value update in Sokoban.takeAction.
learningRate = 0.7

In [571]:
class SokobanState:
    """Value object for one Sokoban position: the box squares plus the agent square.

    Instances hash and compare by value, so they can be used as dictionary
    keys. Coordinates are 2-tuples; 'U'/'D' change the first element and
    'L'/'R' change the second.

    Several methods read the module-level ``storageCoordinates``,
    ``wallCoordinates`` and ``actions`` globals.
    """

    # Offset applied to (first, second) coordinate for one step of each action.
    _MOVE_OFFSETS = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    def __init__(self, boxCoordinates, location):
        """Store a frozen copy of ``boxCoordinates`` and the agent ``location``.

        ``boxCoordinates`` may be any iterable of coordinate tuples; frozenset()
        copies it, so the caller's collection is never aliased.
        """
        self.BoxCoordinates = frozenset(boxCoordinates)
        self.Location = location

    def __hash__(self):
        return hash((self.BoxCoordinates, self.Location))

    def __eq__(self, other):
        # Returning NotImplemented lets Python fall back to its default
        # comparison for foreign objects instead of raising AttributeError.
        if not isinstance(other, SokobanState):
            return NotImplemented
        return (self.BoxCoordinates, self.Location) == (other.BoxCoordinates, other.Location)

    def __repr__(self):
        return str(self.BoxCoordinates) + " " + str(self.Location)

    @staticmethod
    def minStepsBetweenCoordinates(coordinates1, coordinates2):
        """Return the Manhattan distance between two coordinate tuples."""
        return abs(coordinates1[0]-coordinates2[0]) + abs(coordinates1[1]-coordinates2[1])

    def totalBoxToClosestStorageSteps(self):
        """Sum, over boxes not on storage, of the distance to the nearest free storage.

        A storage square already holding a box is not a candidate. If an
        unplaced box has no free storage to go to, its contribution (and thus
        the total) is ``float("inf")``. Returns 0 when every box is placed.
        """
        # Loop-invariant: the free storage squares do not depend on the box.
        freeStorages = storageCoordinates.difference(self.BoxCoordinates)
        return sum(
            min(
                (self.minStepsBetweenCoordinates(box, storage) for storage in freeStorages),
                default=float("inf"),
            )
            for box in self.BoxCoordinates.difference(storageCoordinates)
        )

    def agentToClosestBoxSteps(self):
        """Return the distance from the agent to the nearest box not on storage.

        Returns ``float("inf")`` when no such box exists.
        """
        return min(
            (
                self.minStepsBetweenCoordinates(self.Location, box)
                for box in self.BoxCoordinates.difference(storageCoordinates)
            ),
            default=float("inf"),
        )

    def remainingBoxes(self):
        """Return how many boxes are not currently on a storage square."""
        return len(self.BoxCoordinates.difference(storageCoordinates))

    def isTerminal(self):
        """Return True when every box sits on a storage square."""
        return all(box in storageCoordinates for box in self.BoxCoordinates)

    def boxStuckAtCorner(self, box):
        """Return True if ``box`` is off storage and wedged into a wall corner.

        A corner means a wall directly above or below the box together with a
        wall directly left or right of it. Boxes on storage are never "stuck".
        """
        if box in storageCoordinates:
            return False
        for firstDelta, secondDelta in ((-1, -1), (-1, +1), (+1, -1), (+1, +1)):
            verticalNeighbour = (box[0] + firstDelta, box[1])
            horizontalNeighbour = (box[0], box[1] + secondDelta)
            if verticalNeighbour in wallCoordinates and horizontalNeighbour in wallCoordinates:
                return True
        return False

    def noSolution(self):
        """Return True if at least one box is stuck in a wall corner."""
        return any(self.boxStuckAtCorner(box) for box in self.BoxCoordinates)

    def getTargetLocations(self, action):
        """Return ``(targetLocation, targetNextLocation)`` for ``action``.

        ``targetLocation`` is the square the agent would step onto and
        ``targetNextLocation`` the square one further in the same direction
        (where a pushed box would land). For an unrecognised action the agent's
        current location is returned together with the square at first
        coordinate + 1, matching the historical fallback.
        """
        first, second = self.Location[0], self.Location[1]
        offset = self._MOVE_OFFSETS.get(action)
        if offset is None:
            return self.Location, (first + 1, second)
        firstDelta, secondDelta = offset
        targetLocation = (first + firstDelta, second + secondDelta)
        targetNextLocation = (first + 2 * firstDelta, second + 2 * secondDelta)
        return targetLocation, targetNextLocation

    def isInvalidAction(self, action):
        """Return True if ``action`` would walk into a wall or push a blocked box.

        A box is blocked when the square behind it holds another box or a wall.
        """
        targetLocation, targetNextLocation = self.getTargetLocations(action)
        if targetLocation in wallCoordinates:
            return True
        if targetLocation not in self.BoxCoordinates:
            return False
        return targetNextLocation in self.BoxCoordinates or targetNextLocation in wallCoordinates

    # Avoid agent stuck at local maximum by staying at the same location, only allow movable actions
    def getPossibleActions(self):
        """Return the actions (in ``actions`` order) that actually move the agent."""
        return [action for action in actions if not self.isInvalidAction(action)]

In [572]:
class Sokoban:
    """Tabular Q-learning driver that walks one agent through a Sokoban puzzle.

    The Q-table is a class attribute, so all instances share it and values
    learned through one instance are visible to every other instance until
    ``resetQFunction`` is called.

    ``takeAction`` reads the module-level ``learningRate`` and ``discount``
    globals; ``createStateForAction`` builds ``SokobanState`` objects.
    """

    # Shared by all instances: maps (state, action) -> learned Q-value.
    qFunction = {}

    # Fixed Q-values reported for solved states and for states flagged as
    # unsolvable, regardless of what the table holds.
    TERMINAL_Q_VALUE = 10000000000
    DEAD_END_Q_VALUE = -10000000000

    def __init__(self, sizeH, sizeV, boxCoordinates, location):
        self.SizeH = sizeH
        self.SizeV = sizeV
        self.currentState = SokobanState(boxCoordinates.copy(), location)

    def resetQFunction(self):
        """Empty the shared Q-table in place."""
        self.__class__.qFunction.clear()

    def _fixedQValue(self, state):
        """Return the sentinel Q-value for a solved/unsolvable state, else None."""
        if state.isTerminal():
            return self.TERMINAL_Q_VALUE
        if state.noSolution():
            return self.DEAD_END_Q_VALUE
        return None

    def getQValue(self, state, action):
        """Return Q(state, action); unseen pairs default to 0.

        Solved states always report ``TERMINAL_Q_VALUE`` and states with a
        cornered box always report ``DEAD_END_Q_VALUE``.
        """
        fixedValue = self._fixedQValue(state)
        if fixedValue is not None:
            return fixedValue
        return self.__class__.qFunction.get((state, action), 0)

    def setQValue(self, state, action, newValue):
        """Store ``newValue`` as Q(state, action) in the shared table."""
        self.__class__.qFunction[(state, action)] = newValue

    def getCurrentStateActions(self):
        """Return the legal actions of the current state."""
        return self.currentState.getPossibleActions()

    def getMaxQValue(self, state, possibleActions):
        """Return the largest Q(state, a) over ``possibleActions``.

        When ``possibleActions`` is empty this returns the state's sentinel
        value if it has one and 0 otherwise, instead of raising ValueError.
        """
        # The solved/unsolvable check depends only on the state, so do it once
        # rather than once per action.
        fixedValue = self._fixedQValue(state)
        if fixedValue is not None:
            return fixedValue
        table = self.__class__.qFunction
        return max((table.get((state, action), 0) for action in possibleActions), default=0)

    def getBestAction(self, state):
        """Return the first legal action with the highest Q-value.

        Ties are broken by the order of ``state.getPossibleActions()``.
        Returns None when the state has no legal action.
        """
        possibleActions = state.getPossibleActions()
        # max() keeps the first maximal element, which preserves the tie-break.
        return max(
            possibleActions,
            key=lambda action: self.getQValue(state, action),
            default=None,
        )

    # create S'
    @staticmethod
    def createStateForAction(state, action):
        """Return the state reached by applying ``action`` to ``state``.

        The agent moves to the target square; a box on that square is pushed
        one square further. Legality of the move is not checked here.
        """
        targetLocation, targetNextLocation = state.getTargetLocations(action)
        # map frozenset back to a mutable set
        newBoxCoordinates = set(state.BoxCoordinates)
        if targetLocation in newBoxCoordinates:
            newBoxCoordinates.remove(targetLocation)
            newBoxCoordinates.add(targetNextLocation)
        return SokobanState(newBoxCoordinates, targetLocation)

    @staticmethod
    def _stepReward(oldState, newState):
        """Return the shaped reward R for the transition oldState -> newState."""
        # Every step costs 1.
        reward = -1

        # Reward for moving box closer
        # NOTE(review): an unchanged total is penalised (-3) while an increased
        # total is not; confirm this asymmetry is intended.
        stepDifference = newState.totalBoxToClosestStorageSteps() - oldState.totalBoxToClosestStorageSteps()
        if stepDifference < 0:
            reward += 3
        elif stepDifference == 0:
            reward += -3

        # Reward for agent getting closer to a box
        distanceToClosestBoxDifference = newState.agentToClosestBoxSteps() - oldState.agentToClosestBoxSteps()
        if distanceToClosestBoxDifference < 0:
            reward += 1
        elif distanceToClosestBoxDifference > 0:
            reward += -1

        # Reward for moving box to Storage
        remainingBoxesDifference = newState.remainingBoxes() - oldState.remainingBoxes()
        if remainingBoxesDifference < 0:
            reward += 15
        elif remainingBoxesDifference > 0:
            reward += -10

        return reward

    # Take action A, observe R and S'
    # Update Q value
    # Update S to S'
    def takeAction(self, action):
        """Apply ``action``, update Q(currentState, action), and advance the state."""
        previousState = self.currentState
        newState = self.createStateForAction(previousState, action)
        R = self._stepReward(previousState, newState)

        currentQValue = self.getQValue(previousState, action)
        bestNextQValue = self.getMaxQValue(newState, newState.getPossibleActions())
        newQValue = currentQValue + learningRate*(R+discount*bestNextQValue-currentQValue)
        self.setQValue(previousState, action, newQValue)
        self.currentState = newState

In [573]:
# Start from an empty Q-table. qFunction is a class-level dict on Sokoban, so
# clearing it through this instance empties the table every instance shares.
sokoban = Sokoban(sizeH, sizeV, boxCoordinates, initialLocation)
sokoban.resetQFunction()

In [574]:
# Fixed seed so exploration (and any random start squares) repeat across runs.
random.seed(0)
def createRandomStartLocation():
    """Return a square that is not a wall, an initial box, or a storage location.

    (1,1) is tried first; after that, squares are drawn uniformly at random.
    Coordinates follow the file's (y, x) convention, so the first element is
    drawn from 1..sizeV and the second from 1..sizeH. Drawing them the other
    way round yields out-of-board squares on non-square boards.
    NOTE(review): this assumes sizeH is the number of columns and sizeV the
    number of rows; confirm against the benchmark file format.

    The loop does not terminate if every square in range is occupied, and the
    result is not checked for reachability.
    """
    randomLocation = (1,1)
    while randomLocation in wallCoordinates or randomLocation in boxCoordinates or randomLocation in storageCoordinates:
        randomLocation = (random.randint(1, sizeV), random.randint(1, sizeH))
    return randomLocation

In [575]:
def createAndRunEpisode(startLocation, maxSteps, epsilon, stopOnDeadlock=False):
    """Run one epsilon-greedy episode and report whether it solved the puzzle.

    A fresh ``Sokoban`` is created from the module-level board (``sizeH``,
    ``sizeV``, ``boxCoordinates``) with the agent at ``startLocation``. Q-values
    are updated in the shared table as a side effect of every step.

    Args:
        startLocation: agent's starting coordinate tuple.
        maxSteps: maximum number of actions to take.
        epsilon: probability of picking a random legal action instead of the
            current best one.
        stopOnDeadlock: when True, end the episode as soon as the state
            reports ``noSolution()``. Defaults to False, which keeps stepping
            until ``maxSteps`` is exhausted.

    Returns:
        True if a terminal state was reached (the step count and path are
        printed), False otherwise, including when the agent has no legal
        action left.
    """
    game = Sokoban(sizeH, sizeV, boxCoordinates, startLocation)
    path = []
    for step in range(maxSteps):
        possibleActions = game.currentState.getPossibleActions()
        if not possibleActions:
            # The agent is boxed in: nothing to choose from, so the episode
            # ends unsolved instead of crashing in the action selection.
            return False
        if random.random() < epsilon:
            action = random.choice(possibleActions)
        else:
            action = game.getBestAction(game.currentState)
        path.append(action)
        game.takeAction(action)
        if game.currentState.isTerminal():
            print("Steps used to reach terminal: ", step+1)
            print("Path is %%%%%%% ", "".join(path))
            return True
        if stopOnDeadlock and game.currentState.noSolution():
            return False
    return False

In [576]:
# Upper bound on the number of training episodes, proportional to
# sizeH*sizeV*nBoxes.
episodes = sizeH*sizeV*nBoxes*4*100
# Maximum number of actions per episode.
maxSteps = sizeH*sizeV*nBoxes
terminalCount = 0
# Probability of taking a random action instead of the best known one.
epsilon = 0.3
print('Total episodes: ', episodes)
# Q-learning
for i in range(episodes):
    # Every episode restarts the agent at the puzzle's initial location.
    episodeReachedTerminal = createAndRunEpisode(initialLocation, maxSteps, epsilon)
    if episodeReachedTerminal:
        # Training stops at the first episode that reaches a terminal state.
        terminalCount += 1
        break
    if i%1000 == 0:
        # Progress report and timeout check, once every 1000 episodes.
        print("Total Completed Episodes: ", i)
        print("Qtable size: ", len(Sokoban.qFunction))
        # print("Qtable vals: ", Sokoban.qFunction)
        timeElapsed = time.time() - start
        print('Time elapsed: ', timeElapsed)
        if timeElapsed > 3600:
            # Give up once more than 3600 seconds have passed since `start`.
            print('Timeout')
            break
        # NOTE(review): this reset looks like a no-op, since terminalCount is
        # only ever incremented immediately before the break above.
        terminalCount = 0

Total episodes:  501600
Total Completed Episodes:  0
Qtable size:  69
Time elapsed:  0.11621618270874023
Total Completed Episodes:  1000
Qtable size:  44058
Time elapsed:  56.713900089263916
Total Completed Episodes:  2000
Qtable size:  86576
Time elapsed:  110.28786993026733
Total Completed Episodes:  3000
Qtable size:  132130
Time elapsed:  162.21441197395325
Total Completed Episodes:  4000
Qtable size:  173044
Time elapsed:  214.9871120452881
Total Completed Episodes:  5000
Qtable size:  209196
Time elapsed:  267.8944671154022
Total Completed Episodes:  6000
Qtable size:  246881
Time elapsed:  322.68310499191284
Total Completed Episodes:  7000
Qtable size:  280978
Time elapsed:  378.3130829334259
Total Completed Episodes:  8000
Qtable size:  314137
Time elapsed:  432.6521008014679
Total Completed Episodes:  9000
Qtable size:  346048
Time elapsed:  485.9786169528961
Total Completed Episodes:  10000
Qtable size:  376508
Time elapsed:  538.8496608734131
Total Completed Episodes:  11000

KeyboardInterrupt: 

In [None]:
# Report the total wall-clock time, in seconds, since `start` was recorded.
end = time.time()
print('Execution time: ', end - start)