In [1]:
import numpy as np

# --- Grid-world configuration -------------------------------------------
# Board dimensions (rows x columns).
BOARD_ROWS, BOARD_COLS = 3, 4

# Terminal cells as (row, col): one winning cell and the losing cells.
WIN_STATE = (0, 3)
LOSE_STATES = {(1, 3), (1, 0)}

# Cell a new episode starts from.
START = (2, 0)

# When False, an intended move is replaced by a randomly sampled one
# (see State._chooseActionProb).
DETERMINISTIC = False

class State:
    """Agent position on a small grid world that contains one wall cell.

    Attributes:
        board: BOARD_ROWS x BOARD_COLS numpy array; -1 marks the wall at
            (1, 1), 0 an empty cell, and 1 the agent (written by showBoard).
        state: current (row, col) position.
        isEnd: True once isEndFunc() has found `state` to be terminal.
        determine: when True, moves are applied exactly as requested; when
            False, the requested move may be replaced by a perpendicular one.
    """

    # (row, col) offset applied by each action.
    _MOVES = {
        "up": (-1, 0),
        "down": (1, 0),
        "left": (0, -1),
        "right": (0, 1),
    }

    # For each intended action, the two perpendicular actions it may slip to.
    _SLIPS = {
        "up": ("left", "right"),
        "down": ("left", "right"),
        "left": ("up", "down"),
        "right": ("up", "down"),
    }

    def __init__(self, state=None):
        """Create a state at `state`, or at START when `state` is None.

        The default is resolved at call time, so rebinding START in a later
        cell is honoured without redefining the class.
        """
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[1, 1] = -1
        self.state = START if state is None else state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        """Return 1 on the win cell, -1 on a lose cell, 0 anywhere else."""
        if self.state == WIN_STATE:
            return 1
        if self.state in LOSE_STATES:
            return -1
        return 0

    def isEndFunc(self):
        """Set `isEnd` to True when the current position is terminal."""
        if (self.state == WIN_STATE) or (self.state in LOSE_STATES):
            self.isEnd = True

    def _chooseActionProb(self, action):
        """Sample the action actually executed for an intended `action`.

        The intended action is kept with probability 0.8; each of the two
        perpendicular actions is chosen with probability 0.1.

        Raises:
            ValueError: if `action` is not up/down/left/right.
        """
        if action not in self._SLIPS:
            raise ValueError("unknown action: {!r}".format(action))
        slip_a, slip_b = self._SLIPS[action]
        return np.random.choice([action, slip_a, slip_b], p=[0.8, 0.1, 0.1])

    def nxtPosition(self, action):
        """Return the position reached by taking `action` from `state`.

        Actions are "up", "down", "left" and "right". In non-deterministic
        mode the executed action is first sampled via _chooseActionProb.
        A move that would leave the board or enter the wall keeps the agent
        in place. `self.determine` is only read, never modified, so repeated
        calls on the same State keep the same dynamics.

        Raises:
            ValueError: if `action` is not one of the four known actions.
        """
        if not self.determine:
            action = self._chooseActionProb(action)
        elif action not in self._MOVES:
            raise ValueError("unknown action: {!r}".format(action))

        d_row, d_col = self._MOVES[action]
        row, col = self.state[0] + d_row, self.state[1] + d_col

        # Legality is derived from the board itself rather than from
        # hard-coded limits: inside the array and not a wall (-1) cell.
        n_rows, n_cols = self.board.shape
        if 0 <= row < n_rows and 0 <= col < n_cols and self.board[row, col] != -1:
            return (row, col)
        return self.state

    def showBoard(self):
        """Print the board: '*' for the agent, 'z' for the wall, '0' for empty."""
        self.board[self.state] = 1
        tokens = {1: '*', -1: 'z', 0: '0'}
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                # '?' guards against an unexpected cell value instead of
                # failing on an unbound variable.
                out += tokens.get(self.board[i, j], '?') + ' | '
            print(out)
        print('-----------------')

    
class Agent:
    """Epsilon-greedy tabular agent that learns Q-values on the grid world.

    Attributes:
        states: list of [position, action] pairs visited in the current episode.
        actions: the four available action names.
        State: the current environment State.
        isEnd: mirror of `State.isEnd`.
        lr: learning rate used in the Q-value update.
        exp_rate: probability of picking a random action.
        decay_gamma: discount applied to the reward propagated backwards.
        Q_values: dict mapping (row, col) -> {action: value}.
    """

    def __init__(self):
        self.states = []  # tracks [position, action] pairs reached
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.isEnd = self.State.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # Every (position, action) pair starts with a Q-value of 0.
        self.Q_values = {
            (i, j): {a: 0 for a in self.actions}
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
        }

    def chooseAction(self):
        """Pick an action for the current position (epsilon-greedy).

        With probability `exp_rate` a random action is returned; otherwise
        the action with the highest Q-value at the current position. Ties
        go to the action listed last in `self.actions`.
        """
        if np.random.uniform(0, 1) <= self.exp_rate:  # explore
            return np.random.choice(self.actions)

        # Exploit: the comparison must run for every action, not just once
        # after the loop, otherwise only the last action is ever considered.
        q_here = self.Q_values[self.State.state]
        best_action = ""
        best_q = float("-inf")
        for a in self.actions:
            if q_here[a] >= best_q:
                best_action = a
                best_q = q_here[a]
        return best_action

    def takeAction(self, action):
        """Apply `action` and return a new State at the resulting position."""
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        """Start a new episode: clear the history and return to the start."""
        self.states = []
        self.State = State()
        # Keep the mirrored flag in sync with the fresh State.
        self.isEnd = self.State.isEnd

    def play(self, rounds=10):
        """Run `rounds` episodes, updating Q-values at the end of each one.

        When a terminal state is reached, all of its Q-values are set to the
        terminal reward. The episode's [position, action] pairs are then
        walked in reverse, each updated as
        q + lr * (decay_gamma * reward - q); the un-rounded result becomes
        the reward used for the preceding pair.
        """
        i = 0
        while i < rounds:
            if self.State.isEnd:
                reward = self.State.giveReward()
                for a in self.actions:
                    # the reward is the same for every action at a terminal state
                    self.Q_values[self.State.state][a] = reward
                print("Game end reward", reward)
                for position, taken in reversed(self.states):  # backward propagation
                    current_q_value = self.Q_values[position][taken]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[position][taken] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                self.states.append([(self.State.state), action])
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")
                self.isEnd = self.State.isEnd
                
        
if __name__ == "__main__":
    # Train a fresh agent for 200 episodes, printing the Q-table before
    # and after so the learned values can be compared.
    ag = Agent()
    print("initial Q-values ... \n", ag.Q_values, sep="\n")

    ag.play(rounds=200)
    print("latest Q-values ... \n", ag.Q_values, sep="\n")

initial Q-values ... 

{(0, 0): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (0, 1): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (0, 2): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (0, 3): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 0): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 1): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 2): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (1, 3): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 0): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 1): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 2): {'up': 0, 'down': 0, 'left': 0, 'right': 0}, (2, 3): {'up': 0, 'down': 0, 'left': 0, 'right': 0}}
current position (2, 0) action up
nxt state (1, 0)
---------------------
Game end reward -1
current position (2, 0) action right
nxt state (2, 1)
---------------------
current position (2, 1) action right
nxt state (2, 2)
---------------------
current position (2, 2) action right
nxt state (2, 3)
---------------------
current position (2,

In [None]:
# Scratch check that a (row, col) tuple can be looked up in a set of tuples.
# NOTE(review): this rebinds the module-level LOSE_STATES that State reads
# at call time, so running this cell changes which cells count as losing
# for any later play() call - confirm that is intended.
L = (1, 3)
LOSE_STATES = {L, (0, 0)}
L in LOSE_STATES