In [1]:
import os
import torch
from rl.agent import DQNAgent
from gymjsp.jsspenv import HeuristicJsspEnv
from ortools_scheduler import ORtools_scheduler
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from copy import deepcopy

In [2]:
# Candidate benchmark instances, kept for reference (currently disabled):
# instances = ["ft06", "la01", "la06", "la11", "la21", "la31", "la36", "orb01", "swv01", "swv06", "swv11", "yn1"]
# instances = ["swv06"]
# Hyperparameters. num_episodes, memory_size, target_update and noisy are
# interpolated into the saved-policy file name; memory_size, batch_size,
# target_update and noisy are also passed to DQNAgent when an agent is built.
num_episodes = 1000
memory_size = 100000
batch_size = 64
target_update = 100
noisy = False
# NOTE(review): "inteval" looks like a typo for "interval"; the name is not
# read by any cell visible in this notebook, so it is left unchanged here.
plotting_inteval = 10

#### 随机环境

In [3]:
# NOTE(review): none of these three settings is read by the cells visible in
# this notebook; the section title suggests they configure a randomized
# environment — confirm before relying on them.
random_rate = 0.5
cv = 0.2  # presumably a coefficient of variation — TODO confirm
n = 10 

In [19]:
# Benchmark instance to evaluate on.
instance = "ft06"
# The checkpoint path encodes the hyperparameters defined in the config cell.
policy_file = f"policies/dqn_mlp/{instance}_num_episodes={num_episodes}_memory_size={memory_size}_target_update={target_update}_noisy={noisy}.pth"
# Build the environment and an agent bound to it, then load the saved policy file.
env = HeuristicJsspEnv(instance)
agent = DQNAgent(env, memory_size, batch_size, target_update, noisy=noisy)
agent.load_dqn(policy_file)

In [20]:
# Deterministic rollout of the loaded agent on `env`: record every chosen
# action, accumulate the reward and report the final makespan.
agent_actions, score = [], 0
state, done = env.reset(), False
while True:
    action = agent.select_action(state, determine=True)
    agent_actions.append(int(action))
    next_state, reward, done, info = env.step(action)
    score += reward
    state = next_state
    if done:
        break
makespan = info["makespan"]
print(f"Agent action, makespan = {makespan}, score = {score}")
print(agent_actions)

Agent action, makespan = 65, score = -2.100097125097125
[4, 4, 4, 3, 0, 0, 4, 0, 0]


In [16]:
# Baseline rollout: feed the fixed action 6 to `env` at every step and
# report makespan and accumulated reward.
agent_actions, score = [], 0
state, done = env.reset(), False
action = 6
while True:
    agent_actions.append(int(action))
    next_state, reward, done, info = env.step(action)
    score += reward
    state = next_state
    if done:
        break
makespan = info["makespan"]
print(f"Agent action = 6, makespan = {makespan}, score = {score}")

Agent action = 6, makespan = 59, score = -2.4800505050505044


In [18]:
# Rebuild the ft06 environment with schedule_cycle=1, then roll out the
# currently loaded agent deterministically on it.
env = HeuristicJsspEnv("ft06", schedule_cycle=1)
agent_actions, score = [], 0
state, done = env.reset(), False
while True:
    action = agent.select_action(state, determine=True)
    agent_actions.append(int(action))
    next_state, reward, done, info = env.step(action)
    score += reward
    state = next_state
    if done:
        break
makespan = info["makespan"]
print(f"Agent action, makespan = {makespan}, score = {score}")
print(agent_actions)

Agent action, makespan = 65, score = -26.916666666666654
[4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 7, 5, 5, 0, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 2, 2, 2, 7, 7, 7, 7, 2, 2, 2, 2, 2, 7, 5, 0]


In [5]:
# Benchmark instance to evaluate on.
instance = "ft06"
# Checkpoint with the "_cycle=1" suffix. The episode count in the file name is
# the literal 500 here, not the num_episodes value from the config cell.
policy_file = f"policies/dqn_mlp/{instance}_num_episodes=500_memory_size={memory_size}_target_update={target_update}_noisy={noisy}_cycle=1.pth"
# Build the environment with schedule_cycle=1 and an agent bound to it, then
# load the saved policy file.
env = HeuristicJsspEnv(instance, schedule_cycle=1)
agent = DQNAgent(env, memory_size, batch_size, target_update, noisy=noisy)
agent.load_dqn(policy_file)

In [7]:
# Deterministic rollout of the loaded agent on `env`, reporting the final
# makespan and the accumulated reward.
agent_actions, score = [], 0
state, done = env.reset(), False
while True:
    action = agent.select_action(state, determine=True)
    agent_actions.append(int(action))
    next_state, reward, done, info = env.step(action)
    score += reward
    state = next_state
    if done:
        break
makespan = info["makespan"]
print(f"Agent action, makespan = {makespan}, score = {score}")

Agent action, makespan = 59, score = -19.833333333333332


#### 对比随机动作

In [22]:
# Deterministic rollout of the agent on `env`; keeps the action trace in
# `agent_actions` and prints only the final makespan.
agent_actions = []
state, done = env.reset(), False
while True:
    action = agent.select_action(state, determine=True)
    agent_actions.append(int(action))
    next_state, reward, done, info = env.step(action)
    state = next_state
    if done:
        break
makespan = info["makespan"]
print(f"Agent action, makespan = {makespan}")

Agent action, makespan = 2077


In [91]:
# Random-action baseline: mean makespan over 10 episodes on `env`.
# The actions come from a locally seeded generator so that re-running the
# cell (or the whole notebook) reproduces the same baseline number instead of
# depending on the state of NumPy's global RNG.
rng = np.random.default_rng(0)
makespans = []
for _ in range(10):
    state = env.reset()
    done = False
    while not done:
        # Uniform over the action ids 0..7 (upper bound is exclusive).
        action = int(rng.integers(0, 8))
        next_state, reward, done, info = env.step(action)
        state = next_state

    makespan = info["makespan"]
    makespans.append(makespan)
print(f"Random action, makespan = {np.mean(makespans):.2f}")

Random action, makespan = 2375.10


#### 对比 heuristic rule

In [10]:
# Heuristic baseline: feed the fixed action 6 to `env` at every step and
# print the resulting makespan.
agent_actions = []
state, done = env.reset(), False
action = 6
while True:
    agent_actions.append(int(action))
    next_state, reward, done, info = env.step(action)
    state = next_state
    if done:
        break
makespan = info["makespan"]
print(f"Agent action=6, makespan = {makespan}")

Agent action=6, makespan = 2287


#### Agent trained on swv06, tested on swv07-swv09

agent

In [6]:
# Run the loaded agent deterministically on each of swv07, swv08 and swv09
# and print the makespan reached on every instance.
test_instances = [f"swv0{x}" for x in range(7,10)]
for instance in test_instances:
    test_env = HeuristicJsspEnv(instance)
    state, done = test_env.reset(), False
    while True:
        action = agent.select_action(state, determine=True)
        next_state, reward, done, info = test_env.step(action)
        state = next_state
        if done:
            break
    makespan = info["makespan"]
    print(f"On instance {instance}, makespan = {makespan}")

On instance swv07, makespan = 1914
On instance swv08, makespan = 2366
On instance swv09, makespan = 2074


random

In [7]:
# Random-action baseline on swv07, swv08 and swv09 (one episode per instance).
# A locally seeded generator keeps the printed makespans reproducible across
# re-runs instead of depending on the state of NumPy's global RNG.
rng = np.random.default_rng(0)
test_instances = [f"swv0{x}" for x in range(7,10)]
for instance in test_instances:
    test_env = HeuristicJsspEnv(instance)
    state = test_env.reset()
    done = False
    while not done:
        # Uniform over the action ids 0..7 (upper bound is exclusive).
        action = int(rng.integers(0, 8))
        next_state, reward, done, info = test_env.step(action)
        state = next_state
    makespan = info["makespan"]
    print(f"On instance {instance}, makespan = {makespan}")

On instance swv07, makespan = 2173
On instance swv08, makespan = 2518
On instance swv09, makespan = 2472


#### 哪些动作做的多

In [25]:
from collections import Counter

# Tally the recorded actions once; the original rebuilt the Counter on every
# loop iteration. Iteration order (first occurrence of each action) is unchanged.
action_counts = Counter(agent_actions)
for i, times in action_counts.items():
    print(f"Agent choose action {i} for {times} times")

Agent choose action 3 for 49 times
Agent choose action 6 for 154 times
Agent choose action 7 for 38 times
Agent choose action 5 for 19 times


| action | rule |
| --- | --- |
| 3 | shortest processing time |
| 6 | most operations remaining |
| 7 | least operations remaining |
| 5 | shortest remaining processing time |

#### 哪些state重要

In [63]:
# Object returned by the agent's private _get_dqn() accessor; the following
# cells call it on a state tensor to obtain Q (presumably the Q-network — confirm).
model = agent._get_dqn()

In [46]:
# Take the initial state of a fresh episode, move it to the agent's device as
# a float tensor and enable gradient tracking on it, so the model output can
# be differentiated with respect to every input feature.
state = env.reset()
state = torch.FloatTensor(state).to(agent.device)
state.requires_grad_(True)
print(state)
# Switch the model to evaluation mode before the forward pass.
model.eval()
# Model output for this single state; used as the backward() root afterwards.
Q = model(state)

tensor([ 0.0000,  0.0000, -1.0000,  ...,  0.0000,  0.0000,  0.7333],
       device='cuda:0', requires_grad=True)


In [47]:
# Seed backward() with a tensor of ones so that state.grad receives the
# gradient of the sum of all entries of Q with respect to the input state.
# ones_like matches Q's shape, dtype and device, which avoids the hard-coded
# length 8 and the float64 NumPy round-trip of the original.
external_grad = torch.ones_like(Q)
Q.backward(gradient=external_grad)

In [50]:
# Gradient stored on the input state by the backward pass.
state.grad

tensor([-0.0558,  0.0433, -0.0338,  ..., -0.6092, -1.2825, -0.1516],
       device='cuda:0')

In [51]:
# Detach the gradient, copy it to the CPU and convert it to a NumPy array.
state_grad = state.grad.detach().cpu().numpy()

In [54]:
# Largest signed gradient entry (negative entries are not considered here).
state_grad.max()

15.880271

In [61]:
# Indices of the state entries whose (signed) gradient exceeds the threshold.
np.where(state_grad>3)              # threshold of 3

(array([   7,   62,   97,  247,  307,  347,  397,  427,  477,  527,  537,
         587,  677,  687,  737,  867,  977, 1297, 1447, 1507, 1517, 1577,
        1597, 1607, 1907, 1917, 2012, 2057, 2077, 2097, 2187, 2357, 2376,
        2467, 2537, 2547, 2677, 2707, 2817, 2846, 2847, 2976], dtype=int64),)

显然agent考虑的主要是 7 结尾的特征，即工件的剩余加工时间

In [76]:
# Indices of the state entries whose (signed) gradient exceeds 1.
a = np.where(state_grad>1)[0]           # threshold of 1
# Reduce every index modulo 10.
# NOTE(review): this assumes the flat state is laid out in groups of 10
# features, so the remainder identifies the feature type — confirm.
a = [x%10 for x in a]
# Tally once; the original rebuilt the Counter on every loop iteration.
feature_counts = Counter(a)
for i, times in feature_counts.items():
    print(f"Agent consider state {i} for {times} times")

Agent consider state 7 for 100 times
Agent consider state 2 for 78 times
Agent consider state 8 for 43 times
Agent consider state 6 for 64 times


2 - type  
6 - waiting_time  
7 - remain_time  
8 - doable

In [60]:
# Values of the gradient entries that exceed the threshold.
state_grad[np.where(state_grad>10)]   # threshold of 10

array([12.657236, 15.880271, 10.319552, 14.00865 , 11.86575 ],
      dtype=float32)

#### 第一个动作，有多重要

In [62]:
# For each of the 8 possible first actions: force that action on a freshly
# reset `env`, let the agent finish the episode deterministically, and print
# the makespan reached.
for first_action in range(8):
    state = env.reset()
    state, reward, done, info = next_state, *_ = env.step(first_action)
    while not done:
        action = agent.select_action(state, determine=True)
        state, reward, done, info = next_state, *_ = env.step(action)
    makespan = info["makespan"]
    print(f"Agent first action is {first_action}, makespan = {makespan}")

Agent first action is 0, makespan = 2049
Agent first action is 1, makespan = 2052
Agent first action is 2, makespan = 2085
Agent first action is 3, makespan = 2077
Agent first action is 4, makespan = 1951
Agent first action is 5, makespan = 2241
Agent first action is 6, makespan = 2049
Agent first action is 7, makespan = 2049


实际agent选择action=3，可以看出不是最好。agent应该学到了根据这些工件的剩余加工时间来调度。

#### enhanced policy by greedy search
The greedy search can be replaced by MCTS.

In [87]:
def get_greedy_action(env_start, n_actions=8, policy=None):
    """One-step look-ahead: find the first action whose rollout has the lowest makespan.

    For every candidate first action in ``range(n_actions)``, a deep copy of
    ``env_start`` is stepped with that action and then played to the end with
    ``policy.select_action(state, determine=True)``. ``env_start`` itself is
    never stepped.

    Args:
        env_start: Environment in the state to search from. It must be
            deep-copyable and expose ``step(action)`` returning
            ``(next_state, reward, done, info)``; ``info["makespan"]`` is read
            from the last step of each rollout.
        n_actions: Number of candidate first actions to try. Defaults to 8,
            the value that used to be hard-coded.
        policy: Object providing ``select_action(state, determine=True)``,
            used to finish each rollout. Defaults to the notebook-level
            ``agent``.

    Returns:
        A tuple ``(best_action, makespans)``: ``best_action`` is the result of
        ``np.argmin(makespans)`` (the first minimum on ties) and ``makespans``
        is the list of makespans, one per candidate first action.

    Raises:
        ValueError: If ``n_actions`` is smaller than 1.
    """
    if n_actions < 1:
        raise ValueError("n_actions must be at least 1")
    if policy is None:
        # Backward-compatible default: the agent defined at notebook level.
        policy = agent

    makespans = []
    for first_action in range(n_actions):
        # Simulate on a copy so the caller's environment keeps its state.
        env = deepcopy(env_start)
        state, _, done, info = env.step(first_action)
        while not done:
            action = policy.select_action(state, determine=True)
            state, _, done, info = env.step(action)
        makespans.append(info["makespan"])
    return np.argmin(makespans), makespans

In [89]:
# Number of plain agent steps taken between two greedy look-aheads.
step_size = 30

done = False
state = env.reset()
while not done:
    # Look-ahead from the current env state: get_greedy_action runs one full
    # rollout per candidate first action on a deep copy of env; commit to the
    # best first action found.
    a, makespans = get_greedy_action(env)
    next_state, reward, done, info = env.step(a)
    state = next_state
    print(makespans)
    # Then follow the agent for at most step_size steps. The `not done` guard
    # matters: if the greedy step above already finished the episode, the
    # environment must not be stepped again.
    steps = 0
    while not done and steps < step_size:
        action = agent.select_action(state, determine=True)
        next_state, reward, done, info = env.step(action)
        state = next_state
        steps += 1

[2049, 2052, 2085, 2077, 1951, 2241, 2049, 2049]
[1981, 1984, 1984, 1951, 1981, 1984, 1951, 1981]
[2009, 1951, 2009, 1951, 1951, 2009, 1951, 2125]
[1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951]
[1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951]
[1993, 1996, 1951, 1993, 1951, 2013, 1951, 1996]
[1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951]
[1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951]


只有 first action 能优化makespan

In [90]:
# Number of plain agent steps taken between two greedy look-aheads.
step_size = 10

done = False
state = env.reset()
while not done:
    # Look-ahead from the current env state: get_greedy_action runs one full
    # rollout per candidate first action on a deep copy of env, so a smaller
    # step_size means more look-aheads per episode. Commit to the best first
    # action found.
    a, makespans = get_greedy_action(env)
    next_state, reward, done, info = env.step(a)
    state = next_state
    print(makespans)
    # Then follow the agent for at most step_size steps. The `not done` guard
    # matters: if the greedy step above already finished the episode, the
    # environment must not be stepped again.
    steps = 0
    while not done and steps < step_size:
        action = agent.select_action(state, determine=True)
        next_state, reward, done, info = env.step(action)
        state = next_state
        steps += 1

[2049, 2052, 2085, 2077, 1951, 2241, 2049, 2049]
[2082, 2142, 2109, 1951, 2058, 2045, 2000, 2082]
[1951, 2026, 2026, 1951, 1951, 2026, 1951, 1951]
[1951, 2025, 2025, 1951, 2025, 1951, 2025, 1951]
[1951, 2045, 2045, 1951, 2045, 1981, 2045, 1981]


KeyboardInterrupt: 