In [1]:
import gym
from gym.spaces import Discrete, Box

import numpy as np

In [2]:
def next_state(items, state, action):
    """Apply ``action`` to ``state`` in place and return that same list.

    The action is decoded as a pair: ``action // 2`` picks the slot in
    ``state`` and ``action % 2`` picks the direction (1 adds one unit,
    0 removes one unit). When the decoded slot is not below ``len(items)``
    the state is returned unchanged.

    Args:
        items: Sequence of items; only its length is used here.
        state: Mutable list of per-item counts. It is modified in place.
        action: Integer action code.

    Returns:
        The ``state`` object that was passed in.
    """
    slot, direction = divmod(action, 2)

    # Slots at or beyond len(items) change nothing.
    if slot >= len(items):
        return state

    delta = 1 if direction == 1 else -1
    state[slot] += delta
    return state

def calc_value(items, state, max_weight, burst_value):
    """Return ``(reward, weight)`` for the item counts held in ``state``.

    ``items[i][0]`` is multiplied by ``state[i]`` and summed to give the
    reward; ``items[i][1]`` is multiplied by ``state[i]`` and summed to give
    the weight. If the weight is greater than ``max_weight``, or any count in
    ``state`` is negative, the reward is replaced by ``burst_value``. The
    weight is returned unchanged in every case.

    Args:
        items: Sequence indexed as ``items[i][0]`` and ``items[i][1]``.
        state: Sequence of per-item counts; it must not be longer than
            ``items``.
        max_weight: Largest total weight that is still accepted.
        burst_value: Reward returned for an overweight or negative state.

    Returns:
        Tuple ``(reward, weight)``.
    """
    slots = range(len(state))
    total_value = sum(items[i][0] * state[i] for i in slots)
    total_weight = sum(items[i][1] * state[i] for i in slots)

    # Overweight is tested first; min(state) is only evaluated when it is not.
    if total_weight > max_weight or min(state) < 0:
        return burst_value, total_weight

    return total_value, total_weight

# Env

In [3]:
class Knapsack(gym.Env):
    """Knapsack problem as a Gym environment with add/remove-one-unit actions.

    The internal state is a list holding one integer count per item. Each
    step applies ``next_state`` to that list and scores the result with
    ``calc_value``.

    Expected ``config`` keys:
        items: Sequence of items, passed through to ``next_state`` and
            ``calc_value``.
        max_weight: Weight limit passed to ``calc_value``.
        episode_steps: Number of steps after which an episode is done.
        burst_reward: Reward ``calc_value`` returns for an invalid state.
        bonus_rules: Iterable of ``(threshold, bonus)`` pairs; ``bonus`` is
            added to the step reward when the base reward is strictly
            greater than ``threshold``.
    """

    def __init__(self, config):
        self.items = config["items"]
        self.max_weight = config["max_weight"]
        self.episode_steps = config["episode_steps"]
        self.burst_reward = config["burst_reward"]
        self.bonus_rules = config["bonus_rules"]

        n = self.episode_steps
        n_items = len(self.items)

        # Two actions per item (remove one / add one) plus one extra action
        # that next_state treats as a no-op.
        self.action_space = Discrete(n_items * 2 + 1)
        # Each count moves by at most one per step, so it stays within [-n, n].
        self.observation_space = Box(
            low=-n, high=n, shape=(n_items,), dtype=np.float32
        )

        self.reset()

    def _observation(self):
        """Return a fresh float32 array snapshot of the internal state list."""
        # A copy is returned because next_state mutates self.state in place;
        # handing out the list itself would let earlier observations change.
        return np.array(self.state, dtype=np.float32)

    def reset(self):
        """Start a new episode with all counts at zero; return the observation."""
        self.current_steps = 0
        self.state = [0 for _ in self.items]

        return self._observation()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        The reward is the value from ``calc_value`` plus every bonus whose
        threshold that value strictly exceeds. ``done`` becomes true once
        ``episode_steps`` steps have been taken since the last reset.
        """
        self.state = next_state(self.items, self.state, action)

        r, _ = calc_value(self.items, self.state, self.max_weight, self.burst_reward)
        reward = r

        # Thresholds are compared against the base value r, not the running
        # total, so bonuses do not trigger one another.
        for (v, b) in self.bonus_rules:
            if r > v:
                reward += b

        self.current_steps += 1
        done = self.current_steps >= self.episode_steps

        return self._observation(), reward, done, {}

In [4]:
# Candidate items as [value, weight] pairs: calc_value reads index 0 as the
# per-unit reward and index 1 as the per-unit weight.
items = [[105, 10], [74, 7], [164, 15], [32, 3], [235, 22]]


In [5]:
# Trainer configuration. "env_config" is the dict handed to Knapsack.__init__.
config = {
    "env": Knapsack,
    "vf_clip_param": 60,
    "env_config": {
        "items": items,
        "episode_steps": 10,
        "max_weight": 35,
        "burst_reward": -100,
        # (threshold, bonus) pairs: bonus is added when the step value
        # strictly exceeds threshold.
        "bonus_rules": [(375, 200)],
    },
}


In [6]:
import ray

# ignore_reinit_error=True lets this cell be re-run in a live kernel: a second
# plain ray.init() call raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)


2021-10-01 00:07:41,101	INFO services.py:1265 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.0.196',
 'raylet_ip_address': '192.168.0.196',
 'redis_address': '192.168.0.196:6379',
 'object_store_address': 'tcp://127.0.0.1:59717',
 'raylet_socket_name': 'tcp://127.0.0.1:57331',
 'webui_url': '127.0.0.1:8265',
 'session_dir': 'C:\\Users\\Takumi\\AppData\\Local\\Temp\\ray\\session_2021-10-01_00-07-40_104029_5276',
 'metrics_export_port': 62596,
 'node_id': '4cb013c27e987b4844903d0375265fed25d8109544c3dd23fd7159f6'}

# PPO

In [7]:
from ray.rllib.agents.ppo import PPOTrainer

# Build the PPO trainer from the notebook's config dict, which names the
# Knapsack class under "env" and carries its settings under "env_config".
trainer = PPOTrainer(config = config)

2021-10-01 00:07:53,645	INFO trainer.py:714 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2021-10-01 00:07:53,646	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2021-10-01 00:07:53,646	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


# Train

In [8]:
# Reward statistics collected during training: three independent lists, one
# entry appended to each per trainer.train() call.
r_max, r_min, r_mean = [], [], []


In [12]:
from ray.tune.logger import pretty_print

# Run two training iterations, printing each result and recording its
# max / min / mean episode reward in the matching history list.
for _ in range(2):
    r = trainer.train()
    print(pretty_print(r))

    for series, metric in (
        (r_max, "episode_reward_max"),
        (r_min, "episode_reward_min"),
        (r_mean, "episode_reward_mean"),
    ):
        series.append(r[metric])


StopIteration: 

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.plot(r_max, label = "reward_max", color = "red")
plt.plot(r_min, label = "reward_min", color = "green")
plt.plot(r_mean, label = "reward_mean", color = "blue")

plt.legend(loc = "upper left")
plt.ylabel("reward")

plt.show()

# Evaluate

In [None]:
# Roll out one episode with the trained policy, printing after every step:
# action, item counts, value and weight.
env_cfg = config["env_config"]
s = [0] * len(items)

for _ in range(env_cfg["episode_steps"]):
    a = trainer.compute_action(s)
    s = next_state(items, s, a)
    r, w = calc_value(items, s, env_cfg["max_weight"], env_cfg["burst_reward"])

    print(f"{a}, {s}, {r}, {w}")
    

In [None]:
import collections

# Roll out 1000 episodes and record, for each one, the highest value reached
# at any step. The running best starts at the burst reward.
env_cfg = config["env_config"]
rs = []

for _ in range(1000):
    s = [0] * len(items)
    r_tmp = env_cfg["burst_reward"]

    for _ in range(env_cfg["episode_steps"]):
        a = trainer.compute_action(s)
        s = next_state(items, s, a)
        r, w = calc_value(items, s, env_cfg["max_weight"], env_cfg["burst_reward"])
        r_tmp = max(r, r_tmp)

    rs.append(r_tmp)

# Frequency of each best-per-episode value.
collections.Counter(rs)

# Save

In [None]:
# Save the trainer's current state. The value returned by save() is kept so
# the Load cell can pass it to trainer.restore(); the bare name displays it.
checkpoint = trainer.save()
checkpoint

# Load

In [None]:
# Reload the trainer state from the checkpoint returned by trainer.save().
trainer.restore(checkpoint)
