In [1]:
# Board configuration for a 3 x 4 grid.
# NOTE(review): a later cell assigns these same names again (4 x 4 board,
# START = (2, 1)); whichever cell ran last decides the values in the kernel.
BOARD_ROWS, BOARD_COLS = 3, 4
# The two special cells, followed by the cell the agent starts from.
WIN_STATE, LOSE_STATE, START = (0, 3), (1, 3), (2, 0)
# Deterministic-movement switch.
DETERMINISTIC = True

In [33]:
import numpy as np

# Grid-world configuration; every cell is addressed as a (row, col) tuple.
BOARD_ROWS, BOARD_COLS = 4, 4  # number of rows and columns in the grid
# Terminal cells: a game ends on either one. WIN_STATE pays the positive
# reward and LOSE_STATE pays the negative one.
WIN_STATE, LOSE_STATE = (0, 3), (1, 3)
START = (2, 1)  # starting cell, where the toy is located
DETERMINISTIC = True  # copied onto each State as its `determine` flag


class State:
    """One position on the grid board together with the movement rules.

    Positions are ``(row, col)`` tuples. Row 0 is the top row and column 0 is
    the left-most column, so "N" decreases the row and "E" increases the
    column.
    """

    # The one cell the agent is never allowed to move into; it is stored as -1
    # on the board and drawn as 'z' by showBoard().
    # NOTE(review): START is also (2, 1) in this notebook, so the agent begins
    # on the blocked cell and can never step back onto it - confirm intended.
    BLOCKED_STATE = (2, 1)

    def __init__(self, state=START):
        """Create a board of zeros with the blocked cell marked.

        Parameters
        ----------
        state : tuple
            ``(row, col)`` position represented by this object. Defaults to
            the module-level ``START``.
        """
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[self.BLOCKED_STATE] = -1
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        """Return 5 on ``WIN_STATE``, -5 on ``LOSE_STATE`` and 0 elsewhere."""
        if self.state == WIN_STATE:
            return 5
        elif self.state == LOSE_STATE:
            return -5
        else:
            return 0

    def isEndFunc(self):
        """Set ``isEnd`` to True when the position is a terminal cell.

        The flag is only ever raised here; it is never reset to False.
        """
        if (self.state == WIN_STATE) or (self.state == LOSE_STATE):
            self.isEnd = True

    def nxtPosition(self, action):
        """Return the position reached by taking ``action`` from this state.

        Parameters
        ----------
        action : str
            "N" (row - 1), "S" (row + 1) or "W" (col - 1). Any other value,
            including "E", moves one column to the right.

        Returns
        -------
        tuple
            The new ``(row, col)`` position, or the current position when the
            move would leave the board or enter ``BLOCKED_STATE``.

        Raises
        ------
        NotImplementedError
            If ``self.determine`` is False. No stochastic transition model is
            defined, and silently returning ``None`` (the previous behaviour)
            only deferred the failure to the caller.
        """
        if not self.determine:
            raise NotImplementedError(
                "State.nxtPosition only supports deterministic moves "
                "(determine=True)"
            )

        row, col = self.state
        if action == "N":
            nxtState = (row - 1, col)
        elif action == "S":
            nxtState = (row + 1, col)
        elif action == "W":
            nxtState = (row, col - 1)
        else:
            # Everything else is treated as a move to the east.
            nxtState = (row, col + 1)

        # The move is legal only if it stays on the board and avoids the
        # blocked cell; otherwise the agent stays where it is.
        on_board = (0 <= nxtState[0] <= BOARD_ROWS - 1) and (0 <= nxtState[1] <= BOARD_COLS - 1)
        if on_board and nxtState != self.BLOCKED_STATE:
            return nxtState
        return self.state

    def showBoard(self):
        """Print the board: '*' for this position, 'z' for -1 cells, '0' otherwise.

        The board array is left untouched, so the -1 marker of the blocked
        cell survives even when the current position coincides with it.
        """
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if (i, j) == self.state:
                    token = '*'
                elif self.board[i, j] == -1:
                    token = 'z'
                else:
                    token = '0'
                out += token + ' | '
            print(out)
        print('-----------------')


# Agent: the player that moves around the board and learns a value for each state

class Agent:
    """Player that explores the board and learns a value for every cell.

    Attributes are set in ``__init__``: ``states`` is the trace of positions
    reached in the current game, ``State`` is the current ``State`` object,
    ``lr`` is the learning rate, ``exp_rate`` is the probability of picking a
    random action, and ``state_values`` maps ``(row, col)`` to its value.
    """

    def __init__(self):
        """Start at the default ``State`` with every cell value set to 0."""
        self.states = []
        self.actions = ["N", "S", "W", "E"]
        self.State = State()
        self.lr = 0.2
        self.exp_rate = 0.3

        # initial state reward: every cell of the board starts at 0
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0
        print(self.state_values)
        print('Hurry got it')

    def chooseAction(self):
        """Pick the next action: random with probability ``exp_rate``, else greedy.

        The greedy choice is the action whose next position has the highest
        entry in ``state_values``; on ties the action listed last in
        ``self.actions`` wins.

        Returns
        -------
        str
            One of ``self.actions``.
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)

        # Start below every possible value. Starting at 0 meant that when all
        # reachable cells had negative values no action was selected at all
        # and an empty string was returned.
        mx_nxt_reward = float("-inf")
        action = ""
        for a in self.actions:
            nxt_reward = self.state_values[self.State.nxtPosition(a)]
            if nxt_reward >= mx_nxt_reward:
                action = a
                mx_nxt_reward = nxt_reward
        return action

    def takeAction(self, action):
        """Return a new ``State`` at the position reached by ``action``."""
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        """Clear the trace and go back to the default starting ``State``."""
        self.states = []
        self.State = State()

    def play(self, rounds, verbose=True):
        """Play ``rounds`` complete games and update ``state_values``.

        While the game is running, an action is chosen, the agent moves and
        the reached position is appended to the trace. When a terminal cell
        is reached, its value is set to the reward and the trace is walked in
        reverse: each visited cell ``s`` is moved towards the running target
        by ``V(s) += lr * (target - V(s))`` and stored rounded to 3 decimals;
        the unrounded result becomes the target for the cell before it.

        Parameters
        ----------
        rounds : int
            Number of games to finish before returning.
        verbose : bool, optional
            When True (the default) every move and every game end is printed.
            Pass False to train without flooding the cell output.
        """
        i = 0
        while i < rounds:
            if self.State.isEnd:
                # game over: propagate the reward back along the trace
                reward = self.State.giveReward()
                # explicitly assign end state to reward values (optional)
                self.state_values[self.State.state] = reward
                if verbose:
                    print("Game End Reward", reward)
                    print(self.state_values[self.State.state])
                for s in reversed(self.states):
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                i += 1
                if verbose:
                    print('Ok ' + str(i))
            else:
                action = self.chooseAction()
                if verbose:
                    print("current position {} action {}".format(self.State.state, action))
                # by taking the action, the agent reaches the next state
                self.State = self.takeAction(action)
                # record the state actually reached instead of asking the
                # old state for the next position a second time
                self.states.append(self.State.state)
                # mark is end
                self.State.isEndFunc()
                if verbose:
                    print("nxt state", self.State.state)
                    print("---------------------")

    def showValues(self):
        """Print ``state_values`` as a table with one row per board row."""
        for i in range(0, BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.state_values[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------')


if __name__ == "__main__":
    # Seed NumPy's global RNG so the exploration moves, and therefore the
    # learned values, are the same on every run.
    np.random.seed(42)
    ag = Agent()
    ag.play(200)
    # showValues() prints the table itself and returns None, so wrapping it
    # in print() would add a stray "None" line after the table.
    ag.showValues()

{(0, 0): 0, (0, 1): 0, (0, 2): 0, (0, 3): 0, (1, 0): 0, (1, 1): 0, (1, 2): 0, (1, 3): 0, (2, 0): 0, (2, 1): 0, (2, 2): 0, (2, 3): 0, (3, 0): 0, (3, 1): 0, (3, 2): 0, (3, 3): 0}
Hurry got it
current position (2, 1) action E
nxt state (2, 2)
---------------------
current position (2, 2) action E
nxt state (2, 3)
---------------------
current position (2, 3) action E
nxt state (2, 3)
---------------------
current position (2, 3) action E
nxt state (2, 3)
---------------------
current position (2, 3) action E
nxt state (2, 3)
---------------------
current position (2, 3) action S
nxt state (3, 3)
---------------------
current position (3, 3) action N
nxt state (2, 3)
---------------------
current position (2, 3) action E
nxt state (2, 3)
---------------------
current position (2, 3) action E
nxt state (2, 3)
---------------------
current position (2, 3) action N
nxt state (1, 3)
---------------------
Game End Reward -5
-5
Ok 1
current position (2, 1) action W
nxt state (2, 0)
-------------

Game End Reward 5
5
Ok 119
current position (2, 1) action N
nxt state (1, 1)
---------------------
current position (1, 1) action N
nxt state (0, 1)
---------------------
current position (0, 1) action E
nxt state (0, 2)
---------------------
current position (0, 2) action E
nxt state (0, 3)
---------------------
Game End Reward 5
5
Ok 120
current position (2, 1) action N
nxt state (1, 1)
---------------------
current position (1, 1) action N
nxt state (0, 1)
---------------------
current position (0, 1) action E
nxt state (0, 2)
---------------------
current position (0, 2) action E
nxt state (0, 3)
---------------------
Game End Reward 5
5
Ok 121
current position (2, 1) action N
nxt state (1, 1)
---------------------
current position (1, 1) action N
nxt state (0, 1)
---------------------
current position (0, 1) action E
nxt state (0, 2)
---------------------
current position (0, 2) action E
nxt state (0, 3)
---------------------
Game End Reward 5
5
Ok 122
current position (2, 1) acti

In [12]:
----------------------------------
| 4.843  | 4.983  | 4.995  | 5.0    | 
----------------------------------
| 4.831  | 4.966  | 4.91   | -5.0   | 
----------------------------------
| 4.469  | 0      | 3.021  | 1.163  | 
----------------------------------
| 0.72   | 2.263  | 2.617  | 0.964  | 
----------------------------------

0.13899374374673223