## 4x4 grid example

In [1]:
# Load packages
from NashQLearn import Player, Grid, NashQLearning
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including any raised inside NashQLearn during
# training; consider narrowing the filter.
warnings.filterwarnings('ignore')

This notebook applies the Nash Q-learning algorithm to the following multi-agent problem: two robots placed on a grid must each reach the reward tile. At every step a robot may move up, down, left, or right, or stay at its current position. The two robots may not occupy the same tile unless it is the reward tile.

### Prepare the game environment

In [2]:
# Create the two players, each from a two-element coordinate list
# (presumably its starting tile; confirm against Player).
player1, player2 = Player([3, 0]), Player([2, 2])

In [3]:
# Build the 4 x 4 grid world shared by both players: seven obstacle tiles,
# one reward tile worth 20, and a penalty of -1 for collisions.
# NOTE(review): coordinates look like [x, y] with "up" increasing y, judging
# by the wall list and the policy printed by later cells; confirm against Grid.
grid = Grid(
    length=4,
    width=4,
    players=[player1, player2],
    obstacle_coordinates=[
        [0, 0],
        [1, 0],
        [1, 2],
        [1, 3],
        [0, 3],
        [2, 3],
        [3, 1],
    ],
    reward_coordinates=[0, 2],
    reward_value=20,
    collision_penalty=-1,
)

In [4]:
# List every joint state (one position per player) that the grid allows.
# Sanity check for this configuration: 16 tiles - 7 obstacles = 9 free tiles,
# so 9 * 8 = 72 ordered pairs on distinct tiles, plus the single state where
# both players share the reward tile, which is consistent with a count of 73.
joint_states = grid.joint_states()
print('Available joint states : %s' % (len(joint_states),))
print(joint_states)

Available joint states : 73
[[[0, 1], [0, 2]], [[0, 1], [1, 1]], [[0, 1], [2, 0]], [[0, 1], [2, 1]], [[0, 1], [2, 2]], [[0, 1], [3, 0]], [[0, 1], [3, 2]], [[0, 1], [3, 3]], [[0, 2], [0, 1]], [[0, 2], [1, 1]], [[0, 2], [2, 0]], [[0, 2], [2, 1]], [[0, 2], [2, 2]], [[0, 2], [3, 0]], [[0, 2], [3, 2]], [[0, 2], [3, 3]], [[1, 1], [0, 1]], [[1, 1], [0, 2]], [[1, 1], [2, 0]], [[1, 1], [2, 1]], [[1, 1], [2, 2]], [[1, 1], [3, 0]], [[1, 1], [3, 2]], [[1, 1], [3, 3]], [[2, 0], [0, 1]], [[2, 0], [0, 2]], [[2, 0], [1, 1]], [[2, 0], [2, 1]], [[2, 0], [2, 2]], [[2, 0], [3, 0]], [[2, 0], [3, 2]], [[2, 0], [3, 3]], [[2, 1], [0, 1]], [[2, 1], [0, 2]], [[2, 1], [1, 1]], [[2, 1], [2, 0]], [[2, 1], [2, 2]], [[2, 1], [3, 0]], [[2, 1], [3, 2]], [[2, 1], [3, 3]], [[2, 2], [0, 1]], [[2, 2], [0, 2]], [[2, 2], [1, 1]], [[2, 2], [2, 0]], [[2, 2], [2, 1]], [[2, 2], [3, 0]], [[2, 2], [3, 2]], [[2, 2], [3, 3]], [[3, 0], [0, 1]], [[3, 0], [0, 2]], [[3, 0], [1, 1]], [[3, 0], [2, 0]], [[3, 0], [2, 1]], [[3, 0], [2, 2]],

In [5]:
# Collect the blocked moves of the grid and display them.
# Judging by the displayed result, each entry is a [direction, position] pair
# for a move that would run into an obstacle or leave the grid (TODO confirm
# against Grid.identify_walls).
walls = grid.identify_walls()
walls

[['left', [0, 1]],
 ['down', [0, 1]],
 ['left', [0, 2]],
 ['right', [0, 2]],
 ['up', [0, 2]],
 ['up', [1, 1]],
 ['down', [1, 1]],
 ['left', [2, 0]],
 ['down', [2, 0]],
 ['right', [2, 1]],
 ['left', [2, 2]],
 ['up', [2, 2]],
 ['right', [3, 0]],
 ['up', [3, 0]],
 ['down', [3, 0]],
 ['right', [3, 2]],
 ['down', [3, 2]],
 ['left', [3, 3]],
 ['right', [3, 3]],
 ['up', [3, 3]]]

### Run the Nash Q-learning algorithm

In [6]:
# Configure the learner on the grid defined above. The hyperparameters are
# passed straight through to NashQLearning; see that class for their exact
# semantics.
nashQ = NashQLearning(
    grid,
    max_iter=2000,
    discount_factor=0.9,
    learning_rate=0.7,
    epsilon=0.4,
    decision_strategy='epsilon-greedy',
)

In [7]:
# Fit the algorithm and keep the two updated Q matrices it returns
# (presumably one per player), without the training history.
# The recorded run took about nine minutes for the 2000 iterations.
Q0, Q1 = nashQ.fit(return_history=False)

100%|████████████████████████████████████████████████████████████████████████████████| 73/73 [00:00<00:00, 2437.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [09:04<00:00,  3.68it/s]


In [8]:
# Extract the best path for each player from the values in the Q tables.
# In the recorded run this call also prints the joint state at each step.
p0, p1 = nashQ.get_best_policy(Q0, Q1)

[[3, 0], [2, 2]]
[[2, 0], [2, 1]]
[[3, 0], [1, 1]]
[[2, 0], [0, 1]]
[[2, 1], [0, 2]]
[[1, 1], [0, 2]]
[[0, 1], [0, 2]]
[[0, 2], [0, 2]]


In [9]:
# Report each player's action sequence (joined with '-') and its length.
print(
    'Player 0 follows the  policy : %s of length %s'
    % ('-'.join(p0), len(p0))
)
print(
    'Player 1 follows the  policy : %s of length %s'
    % ('-'.join(p1), len(p1))
)

Player 0 follows the  policy : left-right-left-up-left-left-up of length 7
Player 1 follows the  policy : down-left-left-up-stay-stay-stay of length 7
