In [1]:
import random
import math
import numpy as np
import cv2

import gym
from gym import Env, spaces

In [2]:
#### COLORS (BGR)
BLUE =  [255, 0,   0]
GREEN = [0,   255, 0]
RED =   [0,   0,   255]
WHITE = [255, 255, 255]
GREY =  [240, 240, 240]
BLACK = [0,   0,   0]

#### BUILDINGS
WASTELAND = 0
OFFICE    = 1
HOUSE     = 2

#### REWARDS
WASTING_TIME_REWARD = -100
MOVE_REWARD         =  -1
OFFICE_REWARD       =   5

In [8]:
# https://blog.paperspace.com/creating-custom-environments-openai-gym/
# https://towardsdatascience.com/creating-a-custom-openai-gym-environment-for-stock-trading-be532be3910e

class City(Env):
    def __init__(self, observation_shape = (20, 4)):
        super(City, self).__init__()
        
        self.observation_shape = observation_shape
        self.observation_space = spaces.Box(low = 0, high = 2,
            shape = (observation_shape[0] * observation_shape[1],), dtype = np.uint8)
        
        self.canvas_shape = 700, 700, 3 # width, height, color (BGR)
        self.canvas = np.ones(self.canvas_shape, dtype = np.uint8) * 0
        
        #self.action_space = spaces.Tuple((
        #     spaces.Discrete(self.observation_shape[0] * self.observation_shape[1]),
        #     spaces.Discrete(self.observation_shape[0] * self.observation_shape[1])))
        #self.action_space = spaces.Discrete(self.observation_shape[0] * self.observation_shape[1],)
        self.action_space = spaces.Discrete(observation_shape[0] * observation_shape[1])
        
        # set the player's position in the middle of the map
        self.position = self.observation_shape[0] // 2, self.observation_shape[1] // 2
        
        # 
        self.is_placing_house = True
        
        # set the map
        self.map = np.ones(self.observation_shape, dtype = np.uint8) * WASTELAND
        self.offices = []
        self.houses = []
        pass
    
    def reset(self, random_start = True, start_shape = (4, 4)):
        
        # reset the player's position in the middle of the map
        self.position = self.observation_shape[0] // 2, self.observation_shape[1] // 2
        
        # 
        self.is_placing_house = True
        
        # reset the map with WASTELAND
        self.map = np.ones(self.observation_shape, dtype = np.uint8) * WASTELAND
        self.offices = []
        self.houses = []
        
        # (re)place random houses and offices in the middle of the map
        if random_start :            
            for y in range((self.observation_shape[1] - start_shape[1]) // 2, (self.observation_shape[1] + start_shape[1]) // 2):
                for x in range((self.observation_shape[0] - start_shape[0]) // 2, (self.observation_shape[0] + start_shape[0]) // 2):
                    self.map[y, x] = random.randrange(3)
                    if   self.map[y, x] == OFFICE : self.offices.append((x, y))
                    elif self.map[y, x] == HOUSE  : self.houses.append((x, y))
        
        return self.map.flatten()
    
    def __search_nearest_office(self, position):
        return int(min([math.dist(position, office) for office in self.offices]))
    
    def __search_nearest_house(self, position):
        return int(min([math.dist(position, house) for house in self.houses]))
    
    # test if a position is occupied
    def __is_free(self, position):
        return self.map[position] == WASTELAND
    
    def __place(self):
        
        # give a bad reward to the player if his position is occupied
        if not self.__is_free(self.position): return WASTING_TIME_REWARD
        
        if self.is_placing_house :
            # place the house
            self.houses.append(self.position)
            self.map[self.position] = HOUSE

            # calculate the reward
            reward = self.__search_nearest_office(self.position)
            reward = -reward + 5
        else :
            # place the office
            self.offices.append(self.position)
            self.map[self.position] = OFFICE
            
            reward = OFFICE_REWARD
        
        self.is_placing_house = not self.is_placing_house
        
        return reward
    
    # test if a position if out of bound
    def __is_oob(self, position):
        return not(0 <= position[0] < self.observation_shape[0]) \
            or not(0 <= position[1] < self.observation_shape[1])
    
    def step(self, action):
        reward = 0
        
        action_x, action_y = action % self.observation_shape[1], action // self.observation_shape[1]
        self.position = (action_x, action_y)
        
        #print("action", action, "\nposition", self.position, "\n")
        
        reward = self.__place()
        
        # draw all elements on the canvas
        self.draw_elements_on_canvas()
        
        #return self.canvas, reward
        return self.map.flatten(), reward, reward == WASTING_TIME_REWARD, {}
    
    def __draw_element_on_canvas(self, x, y, color):
        observation_width, observation_height = self.observation_shape
        canvas_width, canvas_height, _ = self.canvas_shape

        drawing_width = int(canvas_width / observation_width)
        drawing_height = int(canvas_height / observation_height)

        # fit element to the canvas
        for j in range(y * drawing_height, y * drawing_height + drawing_height):
            for i in range(x * drawing_width, x * drawing_width + drawing_width):
                try : self.canvas[i, j] = color
                except IndexError : pass
                
        for j in range(y * drawing_height, y * drawing_height + drawing_height):
            try : self.canvas[x * drawing_width, j] = GREY
            except IndexError : pass
            
        for i in range(x * drawing_width, x * drawing_width + drawing_width):
            try : self.canvas[i, y * drawing_height] = GREY
            except IndexError : pass
        pass

    def __draw_player_position(self, thickness = 3): # thickness must be odd 
        y, x = self.position
        thickness_range = range(- (thickness // 2), thickness // 2 + 1)
        
        observation_width, observation_height = self.observation_shape
        canvas_width, canvas_height, _ = self.canvas_shape

        drawing_width = int(canvas_width / observation_width)
        drawing_height = int(canvas_height / observation_height)
        
        for j in range(y * drawing_height, y * drawing_height + drawing_height):
            try :
                for t in thickness_range:
                    self.canvas[x * drawing_width + t, j] = BLACK
                    self.canvas[(x + 1) * drawing_width + t, j] = BLACK
            except IndexError : pass

        for i in range(x * drawing_width, x * drawing_width + drawing_width):
            try :
                for t in thickness_range:
                    self.canvas[i, y * drawing_height + t] = BLACK
                    self.canvas[i, (y + 1) * drawing_height + t] = BLACK
            except IndexError : pass
            
        pass
    
    def draw_elements_on_canvas(self):
        
        # draw each element of the map
        for y in range(len(self.map)):
            for x in range(len(self.map[0])):
                
                color = WHITE
                if   self.map[y, x] == OFFICE : color = BLUE
                elif self.map[y, x] == HOUSE  : color = RED
                
                self.__draw_element_on_canvas(x, y, color)
            pass
               
        # draw player's position
        self.__draw_player_position()
        pass
    
    def render(self, mode = "console"):
        if mode == "human" :
            cv2.imshow("", self.canvas)
            cv2.waitKey(1)
        if mode == "console" :
            print(self.position)
    
    def close(self):
        pass

In [9]:
#env = City()
#env.reset(random_start = True, start_shape = (4, 4))
#env.draw_elements_on_canvas()

#cv2.imshow("", env.canvas)
#cv2.waitKey(0)
#cv2.destroyAllWindows()

In [10]:
env = City((16, 16))
env.reset(random_start = True, start_shape = (4, 4))

for _ in range(0):
    env.step(env.action_space.sample())
    env.render("human")
    #faudrai aussi potenciellement suprimer les duplicata
    if (len(env.houses)+len(env.offices))>=(env.observation_shape[1]*env.observation_shape[1]):
        env.reset()
cv2.waitKey(0)
cv2.destroyAllWindows()

In [11]:
import stable_baselines3
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env

In [12]:
env = City((8, 8))
check_env(env) # test if the env is ok

In [14]:
env = City((8, 8))
env = make_vec_env(lambda: env, n_envs = 1)

model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100)
#model.save("test")

Using cpu device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.75     |
|    ep_rew_mean      | -93.2    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1        |
|    time_elapsed     | 7        |
|    total_timesteps  | 11       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.62     |
|    ep_rew_mean      | -89.9    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1        |
|    time_elapsed     | 20       |
|    total_timesteps  | 29       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.42     |
|    ep_rew_mean      | -90.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|  

<stable_baselines3.dqn.dqn.DQN at 0x1bc6611d640>

In [28]:
model = DQN.load("test")

In [29]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render("human")
    
    if cv2.waitKey(1) == 32 : break 
cv2.destroyAllWindows()

action 19 
position (3, 2) 

action 19 
position (3, 2) 

action 10 
position (2, 1) 

action 10 
position (2, 1) 

action 11 
position (3, 1) 

action 11 
position (3, 1) 

action 10 
position (2, 1) 

action 17 
position (1, 2) 

action 17 
position (1, 2) 

action 19 
position (3, 2) 

action 19 
position (3, 2) 

action 19 
position (3, 2) 

action 11 
position (3, 1) 

action 11 
position (3, 1) 

action 11 
position (3, 1) 

action 54 
position (6, 6) 

action 10 
position (2, 1) 

action 10 
position (2, 1) 

action 19 
position (3, 2) 

action 19 
position (3, 2) 

action 11 
position (3, 1) 

action 19 
position (3, 2) 

action 11 
position (3, 1) 

action 10 
position (2, 1) 

action 11 
position (3, 1) 

action 19 
position (3, 2) 

action 11 
position (3, 1) 

action 11 
position (3, 1) 

action 19 
position (3, 2) 

action 19 
position (3, 2) 

action 19 
position (3, 2) 

action 10 
position (2, 1) 

action 11 
position (3, 1) 

action 10 
position (2, 1) 

action 19 
pos

ValueError: min() arg is an empty sequence

_

In [None]:
env = make_vec_env("CartPole-v1", n_envs=4)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10)
model.save("ppo_cartpole")

#del model # remove to demonstrate saving and loading

#model = PPO.load("ppo_cartpole")

In [None]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import tensorflow
from tensorflow.keras.layers import Conv2D
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
env = City((10, 10))
env.observation_space.sample()

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam


states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions):
    model = tensorflow.keras.Sequential()    
    model.add(Dense(24, activation='relu', input_shape=((1,env.observation_shape[0], env.observation_shape[1]))))
    model.add(Dense(24, activation='relu'))
    model.add(Flatten())
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=10000, visualize=True, verbose=1)

cv2.destroyAllWindows()

In [None]:
dqn.test(env, nb_episodes=10, visualize=True)

cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
cv2.destroyAllWindows()