In [1]:
import random
import numpy as np

In [7]:
class GridWorld:
    def __init__(self, size=10, start=(0, 0), goal=(9, 9), energy=50):
        self.size = size
        self.start = start
        self.goal = goal
        self.energy = energy
        self.robot_pos = start
        self.object_collected = False

        # Membuat Grid
        self.grid = np.zeros((size, size))
        # Menempatkan Rintangan (X)
        self.grid[1:4, 1] = -1
        self.grid[1, 3] = -1
        self.grid[3:6, 3] = -1
        self.grid[6:, 3] = -1
        self.grid[1:7, 7] = -1

        # Menempatkan Stasiun Pengisian Daya (C)
        self.grid[0, 3] = 2
        self.grid[6, 7] = 2

        # Menempatkan Tujuan (G)
        self.grid[goal] = 3

    def reset(self):
        self.robot_pos = self.start
        self.object_collected = False
        self.energy = 50
        return (self.robot_pos, self.energy, self.object_collected)

    def step(self, action):
        # 0: Atas, 1: Bawah, 2: Kiri, 3: Kanan
        if action == 0 and self.robot_pos[0] > 0:
            self.robot_pos = (self.robot_pos[0] - 1, self.robot_pos[1])
        elif action == 1 and self.robot_pos[0] < self.size - 1:
            self.robot_pos = (self.robot_pos[0] + 1, self.robot_pos[1])
        elif action == 2 and self.robot_pos[1] > 0:
            self.robot_pos = (self.robot_pos[0], self.robot_pos[1] - 1)
        elif action == 3 and self.robot_pos[1] < self.size - 1:
            self.robot_pos = (self.robot_pos[0], self.robot_pos[1] + 1)

        self.energy -= 1  # Mengurangi energi untuk setiap langkah
        reward = -1  # Biaya langkah

        # Memeriksa rintangan
        if self.grid[self.robot_pos] == -1:
            reward = -50
            self.energy = 0 # Mengakhiri episode

        # Memeriksa stasiun pengisian daya
        if self.grid[self.robot_pos] == 2:
            if self.energy < 30: # Mengisi daya hanya jika energi rendah
                self.energy = 50
                reward = +20

        # Memeriksa tujuan
        if self.robot_pos == self.goal:
            if not self.object_collected:
                self.object_collected = True
                reward = +100

        done = self.energy <= 0 or self.robot_pos == self.goal

        return (self.robot_pos, self.energy, self.object_collected), reward, done

    def render(self):
        display = ""
        for i in range(self.size):
            for j in range(self.size):
                if (i, j) == self.robot_pos:
                    display += "R "  # Robot
                elif self.grid[i, j] == -1:
                    display += "X "  # Rintangan
                elif self.grid[i, j] == 2:
                    display += "C "  # Stasiun Pengisian Daya
                elif self.grid[i, j] == 3:
                    display += "G " # Tujuan
                else:
                    display += "- "
            display += "\n"
        print(display)

In [8]:
def initialize_q_table(size, num_actions=4):
    # Setiap status adalah (posisi, energi, objek_dikumpulkan)
    # Kita sederhanakan dengan energi yang didiskretisasi (misalnya, 0-10, 11-20, ..., 41-50)
    q_table = {}
    for row in range(size):
        for col in range(size):
            for energy_level in range(6):  # Diubah menjadi 6 tingkat energi
                for obj_collected in [False, True]:
                    q_table[((row, col), energy_level * 10, obj_collected)] = [0] * num_actions
    return q_table

def epsilon_greedy_policy(q_table, state, epsilon=0.1):
    if random.uniform(0, 1) < epsilon:
        return random.choice([0, 1, 2, 3])  # Eksplorasi
    else:
        return np.argmax(q_table[state])  # Eksploitasi

In [9]:
# Parameter
epsilon = 0.1
alpha = 0.5
gamma = 0.9
num_episodes = 1000

# Membuat lingkungan dan Q-table
env = GridWorld()
q_table = initialize_q_table(env.size)

In [10]:
# Pelatihan Q-Learning
for episode in range(num_episodes):
    state = env.reset()
    # Mendiskretisasi tingkat energi
    discretized_state = (state[0], (state[1] // 10) * 10, state[2])
    done = False

    while not done:
        action = epsilon_greedy_policy(q_table, discretized_state, epsilon)
        next_state, reward, done = env.step(action)
        discretized_next_state = (next_state[0], (next_state[1] // 10) * 10, next_state[2])

        # Pembaruan Q-table
        old_value = q_table[discretized_state][action]
        next_max = np.max(q_table[discretized_next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[discretized_state][action] = new_value

        discretized_state = discretized_next_state

    if episode % 100 == 0:
        print(f"Episode: {episode}, Energi: {state[1]}, Posisi: {state[0]}, Objek Dikumpulkan: {state[2]}")

Episode: 0, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 100, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 200, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 300, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 400, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 500, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 600, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 700, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 800, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False
Episode: 900, Energi: 50, Posisi: (0, 0), Objek Dikumpulkan: False


In [11]:
# Menguji agen terlatih
state = env.reset()
discretized_state = (state[0], (state[1] // 10) * 10, state[2])
done = False
total_reward = 0
env.render()

while not done:
    action = np.argmax(q_table[discretized_state])
    next_state, reward, done = env.step(action)
    discretized_next_state = (next_state[0], (next_state[1] // 10) * 10, next_state[2])
    total_reward += reward
    discretized_state = discretized_next_state
    env.render()

print(f"Total reward: {total_reward}")

R - - C - - - - - - 
- X - X - - - X - - 
- X - - - - - X - - 
- X - X - - - X - - 
- - - X - - - X - - 
- - - X - - - X - - 
- - - X - - - C - - 
- - - X - - - - - - 
- - - X - - - - - - 
- - - X - - - - - G 

- R - C - - - - - - 
- X - X - - - X - - 
- X - - - - - X - - 
- X - X - - - X - - 
- - - X - - - X - - 
- - - X - - - X - - 
- - - X - - - C - - 
- - - X - - - - - - 
- - - X - - - - - - 
- - - X - - - - - G 

- - R C - - - - - - 
- X - X - - - X - - 
- X - - - - - X - - 
- X - X - - - X - - 
- - - X - - - X - - 
- - - X - - - X - - 
- - - X - - - C - - 
- - - X - - - - - - 
- - - X - - - - - - 
- - - X - - - - - G 

- - - R - - - - - - 
- X - X - - - X - - 
- X - - - - - X - - 
- X - X - - - X - - 
- - - X - - - X - - 
- - - X - - - X - - 
- - - X - - - C - - 
- - - X - - - - - - 
- - - X - - - - - - 
- - - X - - - - - G 

- - - C R - - - - - 
- X - X - - - X - - 
- X - - - - - X - - 
- X - X - - - X - - 
- - - X - - - X - - 
- - - X - - - X - - 
- - - X - - - C - - 
- - - X -