In [2]:
import pandas as pd
import numpy as np
import random

In [3]:
# Game board for the agent, written as one string of cell symbols per row.
# Legend: S = start state, G = goal state, W = wall, . = any other cell

grid = np.array([
    list(rowSymbols)
    for rowSymbols in ("S..W",
                       ".W.W",
                       "....",
                       "W.WG")
])

In [4]:
# Moves available to the agent: U(p), D(own), L(eft), R(ight).
# The position of a letter in this list is its column in the Q-table.

actions = list('UDLR')
numActions = len(actions)
numActions

4

### Initialize Q table 

The Q-table is the central data structure of the Q-learning algorithm. It stores one learned value for every state-action pair. Each Q-value (short for "quality value") is the expected cumulative discounted reward the agent can achieve by taking a particular action in a specific state and then following an optimal policy thereafter.

In [5]:
# Q-table starts at all zeros: one row per grid cell, one column per action

numStates = grid.size
qTable = np.zeros(shape=(numStates, numActions), dtype=np.float64)
qTable

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [6]:
# Map each cell symbol to an index into the flattened (row-major) grid.
# NOTE: the keys are symbols, not positions. Symbols that occur in several
# cells ('.' and 'W') therefore keep only the index of their LAST occurrence,
# so the mapping has one entry per distinct symbol rather than one per cell.

stateIndices = dict(zip(grid.flatten(), range(grid.size)))
stateIndices

{'S': 0, '.': 13, 'W': 14, 'G': 15}

In [7]:
# Hyperparameters for tabular Q-learning

learningRate = 0.1      # alpha: step size applied to every Q-value update
discountFactor = 0.9    # gamma: weight given to the best next-state Q-value
explorationProb = 0.3   # epsilon: probability of picking a random action
numEpisodes = 1000      # number of training episodes to run

# Training draws from the stdlib `random` module for exploration; seeding it
# here makes a fresh "Restart Kernel -> Run All" produce the same Q-table.
randomSeed = 42
random.seed(randomSeed)

In [8]:
# Function to take action and return the new state
def takeAction(state, action, board=None):
    """Apply one move on the board and return the resulting state.

    Parameters
    ----------
    state : str or (row, col) pair
        Either a cell symbol or an explicit cell position.
        * Symbol: the move starts from the FIRST cell (row-major order) whose
          value equals ``state`` and the symbol of the destination cell is
          returned. Symbols that appear in several cells cannot be told
          apart in this mode.
        * Position: the move starts from exactly that cell and the
          destination ``(row, col)`` is returned, so every cell is a
          distinct state.
    action : str
        One of 'U', 'D', 'L', 'R' (row - 1, row + 1, col - 1, col + 1).
    board : numpy.ndarray, optional
        2-D array of cell symbols. Defaults to the notebook-level ``grid``.

    Returns
    -------
    The destination in the same representation as ``state``. A move that
    would leave the board returns ``state`` unchanged. Wall cells are not
    treated specially here; the caller decides how to handle them.

    Raises
    ------
    ValueError
        If ``action`` is not one of the four known moves.
    IndexError
        If ``state`` is a symbol that does not occur on the board.
    """
    board = grid if board is None else board

    moveOffsets = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
    if action not in moveOffsets:
        # Previously an unknown action fell through the if/elif chain and
        # surfaced as an UnboundLocalError on the bounds check.
        raise ValueError(
            f"Unknown action {action!r}; expected one of {sorted(moveOffsets)}"
        )

    # np.str_ (what indexing the board yields) is a str subclass, so symbols
    # returned by an earlier call are recognised as symbols here too.
    stateIsSymbol = isinstance(state, str)
    if stateIsSymbol:
        matchRows, matchCols = np.where(board == state)
        currentRow, currentCol = matchRows[0], matchCols[0]
    else:
        currentRow, currentCol = state

    rowOffset, colOffset = moveOffsets[action]
    newRow, newCol = currentRow + rowOffset, currentCol + colOffset

    insideBoard = 0 <= newRow < board.shape[0] and 0 <= newCol < board.shape[1]
    if not insideBoard:
        return state
    if stateIsSymbol:
        return board[newRow, newCol]
    return (int(newRow), int(newCol))

In [9]:
# Main Q-learning loop
#
# The agent's state is its (row, col) cell, and the Q-table row for a cell is
# its row-major flat index (row * numCols + col). Movement is resolved here by
# position because a state identified only by its symbol cannot tell apart
# cells that share a symbol ('.' and 'W' occur several times), which made the
# goal unreachable.
moveOffsets = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
numRows, numCols = grid.shape
startState = tuple(int(v) for v in np.argwhere(grid == 'S')[0])

for episode in range(numEpisodes):
    currentState = startState
    done = False

    while not done:
        stateIndex = currentState[0] * numCols + currentState[1]

        # Epsilon-greedy action selection
        if random.random() < explorationProb:
            action = random.choice(actions)  # Exploration
        else:
            action = actions[np.argmax(qTable[stateIndex])]  # Exploitation
        actionIndex = actions.index(action)

        # Apply the move; stepping off the board leaves the agent in place.
        rowOffset, colOffset = moveOffsets[action]
        newRow, newCol = currentState[0] + rowOffset, currentState[1] + colOffset
        if 0 <= newRow < numRows and 0 <= newCol < numCols:
            newState = (newRow, newCol)
        else:
            newState = currentState
        newSymbol = grid[newState]
        newStateIndex = newState[0] * numCols + newState[1]

        # Every step costs -1; landing on a wall cell costs -5 instead.
        # Wall cells can still be entered, as the reward rule implies.
        reward = -1 if newSymbol != 'W' else -5  # Define rewards

        # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        # The discount applies only to the next-state value, not to the
        # current estimate being subtracted.
        tdTarget = reward + discountFactor * np.max(qTable[newStateIndex])
        qTable[stateIndex, actionIndex] += learningRate * (
            tdTarget - qTable[stateIndex, actionIndex]
        )

        if newSymbol == 'G':
            done = True
        else:
            currentState = newState  # advance the agent (was assigned to a misspelled name)