<a href="https://colab.research.google.com/github/jyotidabass/Reinforcement-Learning-from-scratch/blob/main/Reinforcement_Learning_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np

# Define the grid world
ROWS, COLS = 4, 4
grid = np.zeros((ROWS, COLS))

# Define the actions (up, down, left, right) and their displacements
ACTIONS = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
MAX_ACTION = len(ACTIONS)

# Define the transition model and the reward function
def transition(state, action):
    new_state = (state[0] + action[0], state[1] + action[1])
    if 0 <= new_state[0] < ROWS and 0 <= new_state[1] < COLS:
        return new_state
    else:
        return state

def reward_fn(state):
    if state == (ROWS-1, COLS-1):
        return 1
    else:
        return -0.01

# Set the hyperparameters
ALPHA = 0.1  # learning rate
GAMMA = 0.9  # discount factor
EPSILON = 0.1  # exploration rate

# Set the initial Q-values to small random values
Q = np.random.rand(ROWS, COLS, MAX_ACTION)

# Run the Q-learning algorithm for a fixed number of episodes
NUM_EPISODES = 10000

for episode in range(NUM_EPISODES):
    # Reset the state at the beginning of each episode
    state = (0, 0)

    done = False

    while not done:
        # Choose an action using an ε-greedy policy
        if np.random.rand() < EPSILON:  # exploration
            action = np.random.choice(list(ACTIONS.keys())) # Select a random action key
            # Map the chosen action to its index for Q-value update
            action_index = list(ACTIONS.keys()).index(action) # Define action_index here as well
        else:  # exploitation
            action_index = np.argmax(Q[state]) # Get the index of the action with the highest Q-value
            action = list(ACTIONS.keys())[action_index] # Get the corresponding action key

        # Take a step in the environment, update the Q-table, and check if the episode is done
        next_state = transition(state, ACTIONS[action])
        reward = reward_fn(next_state)
        max_next_Q = np.max(Q[next_state])
        Q[state][action_index] += ALPHA * (reward + GAMMA * max_next_Q - Q[state][action_index]) # Update Q-value for the selected action
        state = next_state
        done = (state == (ROWS-1, COLS-1))

print("Final Q-table:")
print(Q)

Final Q-table:
[[[0.81769134 0.91965704 0.81769134 0.91965704]
  [0.75731229 0.86089146 0.57570987 1.03295227]
  [0.971803   1.15883586 0.86791497 0.79171929]
  [0.17158449 0.52931121 1.01813399 0.44636646]]

 [[0.81769134 1.03295227 0.91965704 1.03295227]
  [0.91965704 0.98472168 0.91965704 1.15883586]
  [1.03295227 1.29870651 1.03295227 1.03295227]
  [0.66347087 1.11488262 1.15883586 0.90819475]]

 [[0.83009008 1.15883586 0.91332261 0.86333079]
  [0.90073689 0.89456577 1.03295227 1.22493588]
  [1.15883586 1.45411834 0.95260333 1.45411834]
  [1.03295227 1.62679816 1.29870651 1.45411834]]

 [[0.95896109 0.98554742 1.06253414 1.29870651]
  [0.88989251 1.12379537 0.89561404 1.45411834]
  [1.13362384 1.24422233 1.22865785 1.62679816]
  [0.6964424  0.24584916 0.48057785 0.12080963]]]
