# 倉庫揀貨流程模擬系統(Simulation)

In [1]:
# 匯入numpy套件，用於數值計算與陣列處理
import numpy as np

# 將倉庫中每個位置對應到唯一的整數編號(state)，方便在強化學習中使用數值代表狀態
# 例如：位置'A'對應到狀態0，位置'B'對應到狀態1，依此類推
location_to_state = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 
                     'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11}

# 行動空間(actions)：代表agent可以選擇移動到的目標位置編號，範圍從0到11，對應倉庫中12個不同位置
actions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [2]:
# 行動限制矩陣R：表示倉庫中位置間的可移動關係
# 矩陣的行與列分別對應位置A到L(0到11)
# 1表示可從該行位置移動到該列位置，0表示不可移動
# 位置'G'(索引6)有特殊高獎勵1000，可能代表目標位置

R = np.array([
    # A  B  C  D  E  F   G    H  I  J  K  L
    [ 0, 1, 0, 0, 0, 0,  0,   0, 0, 0, 0, 0],  # A(0)
    [ 1, 0, 1, 0, 0, 1,  0,   0, 0, 0, 0, 0],  # B(1)
    [ 0, 1, 0, 0, 0, 0,  1,   0, 0, 0, 0, 0],  # C(2)
    [ 0, 0, 0, 0, 0, 0,  0,   1, 0, 0, 0, 0],  # D(3)
    [ 0, 0, 0, 0, 0, 0,  0,   0, 1, 0, 0, 0],  # E(4)
    [ 0, 1, 0, 0, 0, 0,  0,   0, 0, 1, 0, 0],  # F(5)
    [ 0, 0, 1, 0, 0, 0, 1000, 1, 0, 0, 0, 0],  # G(6)，高獎勵位置
    [ 0, 0, 0, 1, 0, 0,  1,   0, 0, 0, 0, 1],  # H(7)
    [ 0, 0, 0, 0, 1, 0,  0,   0, 0, 1, 0, 0],  # I(8)
    [ 0, 0, 0, 0, 0, 1,  0,   0, 1, 0, 1, 0],  # J(9)
    [ 0, 0, 0, 0, 0, 0,  0,   0, 0, 1, 0, 1],  # K(10)
    [ 0, 0, 0, 0, 0, 0,  0,   1, 0, 0, 1, 0]   # L(11)
])

In [3]:
gamma = 0.75  # 折扣因子，影響未來獎勵的權重
alpha = 0.9   # 學習率，控制Q值更新的速度

# 行動值函數初始值為0
Q = np.array(np.zeros([12, 12]))

# 進行指定數量的回合(1000回合)
for i in range(1000):
    # 隨機選擇初始狀態(0到11)
    current_state = np.random.randint(0, 12)

    # 初始化空清單，用來存放可行的行動(位置索引)
    playable_actions = []

    # 針對所有位置索引0到11做迴圈檢查，若在行動限制矩陣R中current_state到j的位置可移動(值>0)，就把j加入可行動清單
    for j in range(12):
        if R[current_state, j] > 0:
            playable_actions.append(j)
    
    # 從所有可行動的狀態中，隨機選擇下一個狀態(即執行的行動)
    next_state = np.random.choice(playable_actions)

    # 計算Temporal Difference(TD)誤差
    # TD = 當前獎勵 + 折扣後的未來最大行動值 - 現在的行動值
    TD = R[current_state, next_state] + gamma * Q[next_state, np.argmax(Q[next_state, :])] - Q[current_state, next_state]

# 利用學習率(alpha)乘以TD誤差，更新行動值函數Q
Q[current_state, next_state] += alpha * TD

In [4]:
# 匯入pandas套件，用於資料操作與分析
import pandas as pd

# 將Q值陣列轉成DataFrame，欄位名稱使用location_to_state字典中的位置名稱(A, B, C...)
# 使用sorted()方法確保位置名稱依字母順序排列
q_values = pd.DataFrame(Q, columns = sorted(location_to_state.keys()))

# 對Q值四捨五入並用背景顏色漸層呈現，顏色映射使用'GnBu'色系(綠藍漸層)
s = q_values.round().style.background_gradient(cmap='GnBu')

# 顯示格式化後的Q值表格
s

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# 行動限制矩陣R：表示倉庫中各位置之間的可移動關係
# R[i][j] = 1表示可以從位置i移動到位置j，0表示不可移動
# 每一列對應的位置分別為A(0), B(1), C(2), ..., L(11)

R = np.array([
   # A  B  C  D  E  F  G  H  I  J  K  L
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # A可到B
    [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],  # B可到A, C, F
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # C可到B, G
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # D可到H
    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],  # E可到I
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],  # F可到B, J
    [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],  # G可到C, G(自身), H
    [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],  # H可到D, G, L
    [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],  # I可到E, J
    [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],  # J可到F, I, K
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],  # K可到J, L
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],  # L可到H, K
])

In [6]:
# location_to_state：位置名稱，如A對應的數值編號0
# state_to_location：反轉後的字典，用於將狀態數值轉換回位置名稱
# 例如：state_to_location[0]會傳回'A'

# 將位置名稱與編號對調，建立數字到位置的字典
state_to_location = {state: location for location, state in location_to_state.items()}

# 輸出狀態代碼對應的位置名稱字典
state_to_location

{0: 'A',
 1: 'B',
 2: 'C',
 3: 'D',
 4: 'E',
 5: 'F',
 6: 'G',
 7: 'H',
 8: 'I',
 9: 'J',
 10: 'K',
 11: 'L'}

In [7]:
# 建立route函式，傳入起點與終點位置
def route(starting_location, ending_location):
    # 將終點位置轉換成對應的狀態編號
    ending_state = location_to_state[ending_location]  
    
    # 複製獎勵矩陣R，避免修改原始矩陣
    R_new = np.copy(R)  
    
    # 將終點狀態的獎勵設為1000，表示抵達終點的高獎勵
    R_new[ending_state, ending_state] = 1000  
    
    # 初始化Q矩陣，12x12大小，初始值全為0
    Q = np.array(np.zeros([12, 12]))  
    
    # 在每個回合中持續與環境互動，進行1000次學習迭代
    for i in range(1000):
        # 隨機選擇當前狀態(狀態編號0到11)
        current_state = np.random.randint(0, 12)
        
        # 初始化可行動作列表，用於存放當前狀態可移動到的下一狀態
        playable_actions = []

        # 檢查所有可能的下一個狀態(0到11)
        for j in range(12):
            # 若從當前狀態移動到j狀態有獎勵，代表可行動
            if R_new[current_state, j] > 0:
                # 將該行動加入可執行行動清單
                playable_actions.append(j)
        
        # 從可行動作中隨機選擇下一狀態，模擬探索環境
        next_state = np.random.choice(playable_actions)
        
        # 計算TD誤差：即時獎勵 + 折扣後未來最高Q值 - 現有Q值
        TD = R_new[current_state, next_state] + gamma * Q[next_state, np.argmax(Q[next_state,])] - Q[current_state, next_state]
        
        # 更新Q值，使用學習率alpha調整
        Q[current_state, next_state] = Q[current_state, next_state] + alpha * TD  
    
    # 初始化路徑列表，將起點加入路徑中
    route = [starting_location]

    # 將下一位置暫時設為起點，準備開始尋找最佳路徑
    next_location = starting_location
    
    # 持續找到下一步直到抵達終點
    while next_location != ending_location:
        # 將當前位置轉換成狀態編號
        starting_state = location_to_state[starting_location]
        
        # 從Q矩陣中找出該狀態Q值最高的行動(下一狀態)
        next_state = np.argmax(Q[starting_state,])
        
        # 將狀態編號轉換為對應的實際位置名稱
        next_location = state_to_location[next_state]
        
        # 將下一位置加入路徑
        route.append(next_location)
        
        # 將起點更新為新找到的下一個位置
        starting_location = next_location

    # 傳回最終找到的最佳路徑
    return route

In [8]:
# 測試從節點E到節點G的最佳路徑
route('E', 'G')

['E', 'I', 'J', 'K', 'L', 'H', 'G']

In [9]:
# 測試從節點A到節點K的最佳路徑
route('A', 'K')

['A', 'B', 'F', 'J', 'K']

In [10]:
# 建立best_route函式，傳入起點、中繼點與終點位置，並傳回最佳路徑
def best_route(starting_location, intermediary_location, ending_location):
    # 3個點的路由 = 起點到中繼點的最佳路由 + 中繼點到終點的最佳路由(去掉重複的中繼點)
    # 例如：best_route('A', 'G', 'I') = route('A', 'G') + route('G', 'I')[1:]
    return route(starting_location, intermediary_location) + route(intermediary_location, ending_location)[1:]

In [11]:
# 測試從E經過K到G的最佳路徑
best_route('E', 'K', 'G')

['E', 'I', 'J', 'K', 'L', 'H', 'G']

In [12]:
# 測試從A經過G到K的最佳路由
initial = "A"       # 起點位置設定為A
intermediary = "G"  # 中繼點位置設定為G
final = "K"         # 終點位置設定為K

# 呼叫best_route函式，計算從起點經中繼點到終點的最佳路徑，並將結果存入best
best = best_route(initial, intermediary, final)

# 輸出標題文字提示
print('最佳路由: ')

# 使用print的展開運算子 * 將路徑列表展開，並用逗號和空格分隔，輸出完整路徑
print(*best, sep=', ')

最佳路由: 
A, B, C, G, H, L, K
