# UP/DOWN prediction

In [4]:
# import necessary libraries
import numpy as np
import gym
from gym import spaces

In [5]:
class Betting(gym.Env):

    # actions available
    UP = 0
    DOWN = 1

    def __init__(self, data):
        super(Betting, self).__init__() # gym.Env의 __init__ 호출

        # data 정의
        self.data = data
        self.size = len(data) # size of the data
        self.range = 10  # range of the data

        # randomly assign the inital location of agent
        self.observe_idx = np.random.randint(self.size - 1)
        self.agent_position = data[self.observe_idx]

        # respective actions of agents : up, down
        self.action_space = spaces.Discrete(2)

        # set the observation space to (1,) to represent agent position
        self.observation_space = spaces.Box(low=0, high=self.range, shape=(1,), dtype=np.uint8)

    def step(self, action):
        info = {}  # additional information

        reward = 0

        # UP, DOWN 맞으면 reward=1, 틀리면 맞을 때까지 반복
        if action == self.UP:
            if self.data[self.observe_idx] < self.data[self.observe_idx + 1]:
                reward += 1
                self.observe_idx += 1
            else:
                reward += 0
        elif action == self.DOWN:
            if self.data[self.observe_idx] > self.data[self.observe_idx + 1]:
                reward += 1
                self.observe_idx += 1
            else:
                reward += 0
 #       else:
 #           raise ValueError("Received invalid action={} which is not part of the action space".format(action))

        # 더 이상 데이터가 없을 경우, done
        done = bool(self.observe_idx == self.size - 1)
        
        if not done:
            self.agent_position = data[self.observe_idx]

        return np.array([self.agent_position]).astype(np.uint8), reward, done, info

    def render(self, mode='console'):
        '''
            render the state
        '''
#         if mode != 'console':
#             raise NotImplementedError()

#         for pos in range(self.size):
#             if pos == self.agent_position:
#                 print("X", end='')
#             else:
#                 print('.', end='')
#             print('')

    def reset(self):
        # -1 to ensure agent inital position will not be at the end state
        self.observe_idx = np.random.randint(self.size - 2)
        self.agent_position = data[self.observe_idx]

        return np.array([self.agent_position]).astype(np.uint8)

    def close(self):
        pass

In [6]:
# This is to test if custom enviroment created properly
# If the environment don't follow the gym interface, an error will be thrown

from stable_baselines.common.env_checker import check_env

data = [0,9,7,4,3,5]

env = Betting(data)
check_env(env, warn=True)

# try draw the grid world
obs = env.reset()
env.render()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [7]:
# import various RL algorithms
from stable_baselines import DQN, PPO2, A2C, ACKTR

# Train the agent
model = ACKTR('MlpPolicy', env, verbose=1).learn(10000)

Wrapping the env in a DummyVecEnv.




Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where












---------------------------------
| explained_variance | -0.289   |
| fps                | 43       |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| policy_loss        | 1.43     |
| total_timesteps    | 0        |
| value_loss         | 6.75     |
---------------------------------
----------------------------------
| explained_variance | 0         |
| fps                | 1390      |
| nupdates           | 100       |
| policy_entropy     | 0.0104    |
| policy_loss        | -7.49e-05 |
| total_timesteps    | 2079      |
| value_loss         | 0.000563  |
----------------------------------
---------------------------------
| explained_variance | 0.956    |
| fps                | 1638

In [8]:
# running the simulation with trained model to verify result

obs = env.reset()
n_steps = 50
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  1
obs= [4] reward= 1 done= False
Step 2
Action:  1
obs= [3] reward= 1 done= False
Step 3
Action:  0
obs= [3] reward= 1 done= True
Goal reached! reward= 1


In [12]:
# print the trained policy map
# recall UP = 0, DOWN = 1
for i in range(10):
    obs = [i]
    action, _ = model.predict(obs, deterministic=True)

    print(i, " : ", action, ", prob : ", model.action_probability(obs), end="\n")

0  :  0 , prob :  [0.990036   0.00996393]
1  :  0 , prob :  [0.986093   0.01390699]
2  :  0 , prob :  [0.93550116 0.06449886]
3  :  0 , prob :  [0.66572607 0.33427387]
4  :  1 , prob :  [0.24379683 0.7562032 ]
5  :  1 , prob :  [0.07148959 0.9285104 ]
6  :  1 , prob :  [0.02611539 0.97388464]
7  :  1 , prob :  [0.01231219 0.98768777]
8  :  1 , prob :  [0.00705826 0.99294174]
9  :  1 , prob :  [0.00465138 0.99534863]
