## Rules of the game

- if agent tries to go off the field, punish with -10 in reward
- if agent makes a (legal) move, punsh with -1 in reward, as we do not want to encourage endless walking around in the 10x10 grid of the environment
- if agent tries to pick up the item, but it is not there or it has it already, punish with 10 in reward
- if the agent picks up the item correct place, reward with 20
- if the agent tries to drop-off item in wrong place or does not have the item, punish with 10 in reward
- if the agent drops-off item in correct place, reward with 20


In [2]:
class Field:
    def __init__(self, size, item_pickup, item_dropoff, start_position):
        self.size = size
        self.item_pickup = item_pickup
        self.item_dropoff = item_dropoff
        self.position = start_position
        self.item_in_car = False

    def number_of_states(self):
        return self.size * self.size * self.size * self.size * 2

    def get_state(self):
        state = self.position[0] * self.size * self.size * self.size * 2
        state = state + self.position[1] * self.size * self.size * 2
        state = state + self.item_pickup[0] * self.size * 2
        state = state + self.item_pickup[1] * 2
        if self.item_in_car:
            state = state + 1
        return state

    def make_action(self, action):
        (x, y) = self.position
        if action == 0:  # down
            if y == self.size - 1:
                return -10, False
            else:
                self.position = (x, y + 1)
                return -1, False
        elif action == 1:  # up
            if y == 0:
                return -10, False
            else:
                self.position = (x, y - 1)
                return -1, False
        elif action == 2:  # left
            if x == 0:
                return -10, False
            else:
                self.position = (x - 1, y)
                return -1, False
        elif action == 3:  # right
            if x == self.size - 1:
                return -10, False
            else:
                self.position = (x + 1, y)
                return -1, False
        elif action == 4:  # pickup
            if self.item_in_car:
                return -10, False
            elif self.item_pickup != (x, y):
                return -10, False
            else:
                self.item_in_car = True
                return 20, False
        elif action == 5:  # dropoff
            if not self.item_in_car:
                return -10, False
            elif self.item_dropoff != (x, y):
                self.item_pickup = (x, y)
                self.item_in_car = False
                return -10, False
            else:
                self.item_in_car = False
                return 20, True  # Mark done as True when dropoff is successful
        return -10, False  # Default return value to ensure a tuple is always returned

In [3]:
size = 10
item_pickup = (0, 0)
item_dropoff = (9, 9)
start_position = (9, 0)

field = Field(size, item_pickup, item_dropoff, start_position)

In [4]:
for _ in range(9):
    field.make_action(2)

In [5]:
field.position

(0, 0)

In [6]:
field.make_action(4)

for _ in range(9):
    field.make_action(0)

for _ in range(9):
    field.make_action(3)

In [None]:
field.position

In [None]:
field.make_action(5)

In [None]:
field.item_in_car

# RANDOM SOLUTION

In [7]:
import random


def random_solution():
    size = 10
    item_pickup = (0, 0)
    item_dropoff = (9, 9)
    start_position = (9, 0)

    field = Field(size, item_pickup, item_dropoff, start_position)

    done = False
    steps = 0

    while not done:
      action = random.randint(0,5)
      reward, done = field.make_action(action)
      steps += 1
    return steps

In [8]:
random_solution()

41389

In [9]:
run = [random_solution() for _ in range(100)]

In [10]:
sum(run)/len(run)

142927.29

## Q-Learning

- Initialize Q-table with zeros
- Iterate

  - Agent is in state State
  - With probability epsilon choose to explore, else exploit
    - if explore, then choose random action
    - if exploit, then choose action based on the current Q-table
  - Update the Q-table from the new reward to the previos state

  - Q[state,action] = (1-alpha)_Q [state,action] + alpha_(reward + gamma \* max(Q[next_state])-Q[state,action])


# Q-learning Algo

In [11]:
import numpy as np
import random

size = 10
item_pickup = (0, 0)
item_dropoff = (9, 9)
start_position = (9, 0)


field = Field(size, item_pickup, item_dropoff, start_position)

number_of_states = field.number_of_states()
number_of_actions = 6

q_table = np.zeros((number_of_states,number_of_actions))

epsilon = 0.1
alpha = 0.1
gamma = 0.6

for _ in range(10000):
    field = Field(size, item_pickup, item_dropoff, start_position)
    done = False

    while not done:
        state = field.get_state()
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, 5)  # Explore
        else:
            action = np.argmax(q_table[state])  # Exploit

        reward, done = field.make_action(action)
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])

        q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * (
            reward + gamma * new_state_max - q_table[state, action]
        )

In [12]:
q_table

array([[ 0.23071429, -2.06428571, -2.06428571,  0.23071429,  9.78571429,
        -2.06428571],
       [-0.71428571, -5.21428571, -5.21428571, -0.71428571, -5.21428571,
        -2.06428571],
       [ 1.77131848, -1.        , -1.        , -0.1       , -1.        ,
        -1.        ],
       ...,
       [-1.176964  , -0.02309023, -0.21261948, -1.26446428, -0.46442451,
        10.53735325],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

## REINFORCE LEARNING

In [24]:
def reinforce_learning():
    epsilon = 0.1
    alpha = 0.1
    gamma = 0.6

    field = Field(size, item_pickup, item_dropoff, start_position)
    done = False
    steps = 0

    while not done:
        state = field.get_state()
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, 5)  # Explore
        else:
            action = np.argmax(q_table[state])  # Exploit

        reward, done = field.make_action(action)
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])

        q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * (
            reward + gamma * new_state_max - q_table[state, action]
        )
        steps += 1
    return steps

In [14]:
random.uniform(0,1)

0.9733999996681615

# EPSILION

In [17]:
ep = 1
explore = 0
exploit = 0

for idx, i in enumerate(range(0, 100)):
    if random.uniform(0, 1) < ep:
        explore += 1
    else:
        exploit += 1

    ep = ep - 0.01
    print(idx, "-->Explore", explore, "Exploit", exploit)

0 -->Explore 1 Exploit 0
1 -->Explore 2 Exploit 0
2 -->Explore 3 Exploit 0
3 -->Explore 4 Exploit 0
4 -->Explore 5 Exploit 0
5 -->Explore 6 Exploit 0
6 -->Explore 7 Exploit 0
7 -->Explore 8 Exploit 0
8 -->Explore 9 Exploit 0
9 -->Explore 10 Exploit 0
10 -->Explore 11 Exploit 0
11 -->Explore 12 Exploit 0
12 -->Explore 13 Exploit 0
13 -->Explore 14 Exploit 0
14 -->Explore 15 Exploit 0
15 -->Explore 16 Exploit 0
16 -->Explore 17 Exploit 0
17 -->Explore 18 Exploit 0
18 -->Explore 19 Exploit 0
19 -->Explore 20 Exploit 0
20 -->Explore 20 Exploit 1
21 -->Explore 21 Exploit 1
22 -->Explore 22 Exploit 1
23 -->Explore 23 Exploit 1
24 -->Explore 24 Exploit 1
25 -->Explore 25 Exploit 1
26 -->Explore 26 Exploit 1
27 -->Explore 27 Exploit 1
28 -->Explore 28 Exploit 1
29 -->Explore 29 Exploit 1
30 -->Explore 30 Exploit 1
31 -->Explore 31 Exploit 1
32 -->Explore 32 Exploit 1
33 -->Explore 32 Exploit 2
34 -->Explore 32 Exploit 3
35 -->Explore 33 Exploit 3
36 -->Explore 34 Exploit 3
37 -->Explore 35 Exp

# GAMMA
- Gamma is a discount factor
- It is used to discount future rewards
- range of gamma just like epsilon is between 0 and 1
- Gamma Represents the importance of the first few steps more than the later steps

# ALPHA
- Alpha is the learning rate
- It is used to update the Q-table
- range of alpha is between 0 and 1
  

# CONTINUING THE CODE

In [25]:
reinforce_learning()

37

In [10]:
run = [reinforce_learning() for _ in range(100)]

In [14]:
sum(run)/len(run)

52.51