In [3]:
# Section 1: Imports
import numpy as np

In [4]:

#Section 2: initializing environment
class Environment:
    """Kitchen grid world: collect an egg beater, then reach the goal cell.

    The observable state is the pair ``(agent_location, has_egg_beater)``,
    where ``agent_location`` is a ``(row, column)`` tuple indexing
    ``self.grid``. ``reset()`` starts a new episode and ``oneStep(action)``
    advances it by one timestep.
    """

    # (row, column) offset applied by each action. 'teleport' has a zero
    # offset because it is resolved through ``teleportation_locations``
    # instead of by adding an offset.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
        'teleport': (0, 0),
    }

    def __init__(self):
        # Grid layout. Only its shape is read by the code (for bounds
        # checking). The letters match the coordinates defined below:
        # 'E' egg beater cells, 'G' teleport cells, 'O' the goal cell and
        # 'A' the agent's start cell. 'P' is not referenced anywhere in this
        # class.
        self.grid = np.array([
            ['.', '.', '.', '.', '.', '.', '.', '.', '.'],
            ['.', '.', '.', 'G', '.', '.', '.', '.', '.'],
            ['E', '.', '.', 'A', '.', '.', '.', 'E', 'G'],
            ['P', '.', '.', '.', '.', '.', '.', 'O', '.']
        ])

        # Walls between adjacent cells, stored in both directions as
        # (from_cell, to_cell) pairs.
        self.walls = [
            # Horizontal walls (block moves between rows)
            ((0, 0), (1, 0)), ((1, 0), (0, 0)),
            ((0, 1), (1, 1)), ((1, 1), (0, 1)),
            ((0, 2), (1, 2)), ((1, 2), (0, 2)),
            ((2, 0), (3, 0)), ((3, 0), (2, 0)),
            ((1, 1), (2, 1)), ((2, 1), (1, 1)),
            ((1, 2), (2, 2)), ((2, 2), (1, 2)),
            ((0, 7), (1, 7)), ((1, 7), (0, 7)),
            ((2, 7), (3, 7)), ((3, 7), (2, 7)),
            ((2, 8), (3, 8)), ((3, 8), (2, 8)),
            # Vertical walls (block moves between columns)
            ((0, 3), (0, 4)), ((0, 4), (0, 3)),
            ((0, 4), (0, 5)), ((0, 5), (0, 4)),
            ((1, 3), (1, 4)), ((1, 4), (1, 3)),
            ((2, 3), (2, 4)), ((2, 4), (2, 3)),
            ((1, 4), (1, 5)), ((1, 5), (1, 4)),
            ((2, 4), (2, 5)), ((2, 5), (2, 4)),
            ((1, 6), (1, 7)), ((1, 7), (1, 6)),
            ((2, 6), (2, 7)), ((2, 7), (2, 6)),
            # The reverse entry used to read ((3, 4), (3, 7)), which is not
            # a pair of adjacent cells and therefore could never match.
            ((3, 3), (3, 4)), ((3, 4), (3, 3)),
        ]

        # Locations of the elements on the grid.
        self.egg_beater_positions = [(2, 0), (2, 7)]
        self.goal = (3, 7)
        self.teleportation_locations = {(1, 3): (2, 8), (2, 8): (1, 3)}

        # Put the environment into its start state right away so that
        # oneStep() and checkIfValidMove() work even if the caller has not
        # called reset() yet.
        self.reset()

    def reset(self):
        """Start a new episode and return the initial compound state.

        Returns:
            ``(agent_location, has_egg_beater)`` with the agent on its start
            cell ``(2, 3)`` and no egg beater.
        """
        self.agent_location = (2, 3)
        self.has_egg_beater = False
        return (self.agent_location, self.has_egg_beater)

    def checkIfValidMove(self, move):
        """Return True if ``move`` keeps the agent on the grid and crosses no wall.

        Args:
            move: ``(row_offset, column_offset)`` added to the current
                agent location.
        """
        here = self.agent_location
        target = (here[0] + move[0], here[1] + move[1])

        n_rows, n_cols = self.grid.shape
        if not (0 <= target[0] < n_rows and 0 <= target[1] < n_cols):
            return False

        # Both orientations are checked, so a wall blocks movement in both
        # directions even if only one orientation is listed.
        if (here, target) in self.walls or (target, here) in self.walls:
            return False
        return True

    def oneStep(self, action):
        """Apply ``action`` for one timestep.

        Args:
            action: one of 'up', 'down', 'left', 'right', 'teleport'. Any
                other value leaves the agent in place and still costs the
                normal step penalty.

        Returns:
            ``((agent_location, has_egg_beater), reward, done)``.

            * reward is -1 per step, which pushes the agent towards short
              paths; a move into a wall or off the grid leaves the agent in
              place at the same cost.
            * reward is -10 for teleporting from a cell that is not a
              teleport cell, or for standing on the goal without the egg
              beater.
            * reward is 100 and ``done`` is True when the agent is on the
              goal with the egg beater.
        """
        reward = -1
        done = False

        if action == 'teleport':
            destination = self.teleportation_locations.get(self.agent_location)
            if destination is None:
                reward = -10  # Penalize an invalid teleport attempt
            else:
                self.agent_location = destination
        elif action in self._MOVES:
            move = self._MOVES[action]
            # Only update the position when the move is not blocked.
            if self.checkIfValidMove(move):
                self.agent_location = (
                    self.agent_location[0] + move[0],
                    self.agent_location[1] + move[1],
                )

        # Egg beater pick-up: once picked up it is kept until reset().
        if self.agent_location in self.egg_beater_positions:
            self.has_egg_beater = True

        # Goal check: success only with the egg beater, otherwise a penalty.
        if self.agent_location == self.goal:
            if self.has_egg_beater:
                reward = 100
                done = True
            else:
                reward = -10

        return (self.agent_location, self.has_egg_beater), reward, done









At the beginning I initialize the environment and define a 2D array to represent the kitchen. Because the written-work PDF counts rows from the bottom, the first row of my array corresponds to the bottom row of the grid shown in the PDF.
Then I define the walls as bidirectional pairs of cells; the checkIfValidMove() function uses them to block movement from both sides. After that I define the locations of the important elements on the grid.
This section also contains the reset() and oneStep() methods. The reset() function reinitializes the environment to its starting state for a new episode. The oneStep() function advances the environment by one timestep in response to an action taken by the agent. It updates the agent's position based on the action, checks for interactions (like picking up an item or hitting a wall), calculates the reward or penalty, and determines whether the episode has ended (e.g., reaching a goal).

In [5]:
# section 3. defining agent. initializing and adding action method

class Agent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    The Q-table is a dict keyed by ``((state, has_egg_beater), action)``
    and holds one float per key, all starting at 0.0.
    """

    def __init__(self, states, actions, epsilon=1.0, alpha=0.1, gamma=0.99, epsilon_decay=0.995):
        """Store the hyper-parameters and build a zero-initialised Q-table.

        Args:
            states: iterable of grid positions.
            actions: sequence of action names.
            epsilon: probability threshold for taking a random action.
            alpha: learning rate used in ``qValuesUpdate``.
            gamma: discount factor used in ``qValuesUpdate``.
            epsilon_decay: stored on the instance; no method of this class
                reads it (the same holds for ``epsilon_min``).
        """
        self.actions = actions
        self.alpha, self.gamma = alpha, gamma
        self.epsilon, self.epsilon_decay, self.epsilon_min = epsilon, epsilon_decay, 0.01

        # One entry per (position, egg-beater flag, action) combination.
        table = {}
        for position in states:
            for carrying in (False, True):
                for name in actions:
                    table[((position, carrying), name)] = 0.0
        self.q_table = table

    def action(self, compound_state):
        """Pick an action for ``compound_state`` using epsilon-greedy selection.

        A uniform random draw at or below ``epsilon`` selects a random
        action; otherwise one of the actions with the highest Q-value is
        chosen, with ties broken at random.
        """
        explore = np.random.rand() <= self.epsilon
        if explore:
            return np.random.choice(self.actions)

        scored = [(self.q_table[(compound_state, name)], name) for name in self.actions]
        best_score = max(score for score, _ in scored)
        candidates = [name for score, name in scored if score == best_score]
        return np.random.choice(candidates)

    def qValuesUpdate(self, compound_state, action, reward, next_compound_state):
        """Apply one Q-learning update to the entry for ``(compound_state, action)``.

        The entry moves towards ``reward + gamma * max_a Q(next_compound_state, a)``
        by a fraction ``alpha`` of the difference.
        """
        key = (compound_state, action)
        best_next = max(self.q_table[(next_compound_state, name)] for name in self.actions)
        current = self.q_table[key]
        self.q_table[key] = current + self.alpha * (reward + self.gamma * best_next - current)


In this section, the `Agent` class is defined, encapsulating the initialization, action-selection, and Q-value update mechanisms of a reinforcement learning agent.

The constructor initializes the agent with parameters for exploration (`epsilon`), learning rate (`alpha`), discount factor (`gamma`), and exploration decay (`epsilon_decay`).

It also initializes a Q-table for storing the value of state-action pairs, accounting for the presence of an egg beater in the state.

The action() method decides the next action: either exploring randomly (with probability `epsilon`) or exploiting the best-known action from the Q-table, breaking ties between equally good actions at random.

The qValuesUpdate() method updates the Q-table using the reward received and the highest Q-value of the next state (the Q-learning update rule).

In [6]:
# section 4. simulation  and main execution.
def simulation(total_episodes, steps_for_each_episode, agent, environment, verbose=True):
    """Train ``agent`` on ``environment`` and return the reward of every episode.

    Args:
        total_episodes: number of episodes to run.
        steps_for_each_episode: maximum number of steps per episode; an
            episode ends earlier when the environment reports ``done``.
        agent: object providing ``action(compound_state)`` and
            ``qValuesUpdate(state, action, reward, next_state)``.
        environment: object providing ``reset()`` and ``oneStep(action)``.
        verbose: when True (the default) print a line for every episode
            start, every step and every episode total. Pass False for long
            runs, where the per-step log grows to thousands of lines.

    Returns:
        A list with the summed reward of each episode, in episode order.
    """
    episode_rewards = []

    for episode in range(total_episodes):
        state, has_egg_beater = environment.reset()
        if verbose:
            print(f"Episode {episode + 1}: Starting Position = {state}, Has Egg Beater = {has_egg_beater}")
        total_rewards = 0

        for step in range(steps_for_each_episode):
            compound_state = (state, has_egg_beater)
            action = agent.action(compound_state)
            next_compound_state, reward, done = environment.oneStep(action)
            next_state, next_has_egg_beater = next_compound_state

            if verbose:
                print(f"Step {step + 1}: Position = {next_state}, Has Egg Beater = {next_has_egg_beater}, Action = {action}, Reward = {reward}")
            agent.qValuesUpdate(compound_state, action, reward, next_compound_state)

            state, has_egg_beater = next_state, next_has_egg_beater
            total_rewards += reward

            if done:
                if verbose:
                    print(f"Reached goal in {step + 1} steps with Egg Beater: {has_egg_beater}")
                break

        if verbose:
            print(f"Episode {episode + 1}: Total Rewards = {total_rewards}\n")
        episode_rewards.append(total_rewards)

    return episode_rewards




#  Main Execution
if __name__ == "__main__":
    environment = Environment()

    # Build the state space from the grid itself so it always matches the
    # layout, instead of repeating the grid's row and column counts here.
    n_rows, n_cols = environment.grid.shape
    states = [(row, col) for row in range(n_rows) for col in range(n_cols)]
    actions = ['up', 'down', 'left', 'right', 'teleport']

    # Exploration rate, learning rate and discount factor for this run.
    agent = Agent(states, actions, epsilon=0.1, alpha=0.1, gamma=0.9)

    total_episodes = 500
    steps_for_each_episode = 100
    simulation(total_episodes, steps_for_each_episode, agent, environment)











[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 7: Position = (0, 7), Has Egg Beater = True, Action = left, Reward = -1
Step 8: Position = (0, 7), Has Egg Beater = True, Action = down, Reward = -1
Step 9: Position = (0, 6), Has Egg Beater = True, Action = left, Reward = -1
Step 10: Position = (1, 6), Has Egg Beater = True, Action = down, Reward = -1
Step 11: Position = (1, 5), Has Egg Beater = True, Action = left, Reward = -1
Step 12: Position = (1, 6), Has Egg Beater = True, Action = right, Reward = -1
Step 13: Position = (2, 6), Has Egg Beater = True, Action = down, Reward = -1
Step 14: Position = (3, 6), Has Egg Beater = True, Action = down, Reward = -1
Step 15: Position = (3, 7), Has Egg Beater = True, Action = right, Reward = 100
Reached goal in 15 steps with Egg Beater: True
Episode 224: Total Rewards = 86

Episode 225: Starting Position = (2, 3), Has Egg Beater = False
Step 1: Position = (1, 3), Has Egg Beater = False, Action = up, Reward = -1
Step 2: Posit

In this section, the simulation() function runs a specified number of episodes, each capped at a set number of steps, to train a reinforcement learning agent within an environment. For each episode, the environment is reset, and the agent's starting state is printed.

The agent then takes actions, receives feedback (rewards), and updates its knowledge base (Q-table) accordingly. This process repeats for each step until the episode ends (either by reaching a goal or hitting the maximum number of steps).

The main execution part sets up the environment, agent, and simulation parameters before running the simulation, aiming to learn an optimal policy for the agent to navigate the environment effectively.


After running the simulation multiple times, my algorithm found that the minimum number of steps for the agent to get from its starting position (which is hard-coded in my case, but could be randomized) to the goal position (the oven, since we need to make pudding) is 12. One example of an episode that reaches the goal in 12 steps is:

Episode 439: Starting Position = (2, 3), Has Egg Beater = False

Step 1: Position = (1, 3), Has Egg Beater = False, Action = up, Reward = -1

Step 2: Position = (2, 8), Has Egg Beater = False, Action = teleport, Reward = -1

Step 3: Position = (2, 7), Has Egg Beater = True, Action = left, Reward = -1

Step 4: Position = (2, 8), Has Egg Beater = True, Action = right, Reward = -1

Step 5: Position = (1, 8), Has Egg Beater = True, Action = up, Reward = -1

Step 6: Position = (0, 8), Has Egg Beater = True, Action = up, Reward = -1

Step 7: Position = (0, 7), Has Egg Beater = True, Action = left, Reward = -1

Step 8: Position = (0, 6), Has Egg Beater = True, Action = left, Reward = -1

Step 9: Position = (1, 6), Has Egg Beater = True, Action = down, Reward = -1

Step 10: Position = (2, 6), Has Egg Beater = True, Action = down, Reward = -1

Step 11: Position = (3, 6), Has Egg Beater = True, Action = down, Reward = -1

Step 12: Position = (3, 7), Has Egg Beater = True, Action = right, Reward = 100
Reached goal in 12 steps with Egg Beater: True
Episode 439: Total Rewards = 89