# UP/DOWN prediction

In [1]:
# import necessary libraries
import numpy as np
import gym
from gym import spaces

In [2]:
class Betting(gym.Env):

    # actions available
    UP = 0
    DOWN = 1

    def __init__(self, data):
        super(Betting, self).__init__() # gym.Env의 __init__ 호출

        # data 정의
        self.data = data
        self.size = len(data) # size of the data
        self.range = 1000  # range of the data

        # randomly assign the inital location of agent
        self.observe_idx = np.random.randint(self.size - 1)
        self.agent_position = data[self.observe_idx]

        # respective actions of agents : up, down
        self.action_space = spaces.Discrete(2)

        # set the observation space to (1,) to represent agent position
        self.observation_space = spaces.Box(low=0, high=self.range, shape=(1,9,), dtype=np.uint16)

    def step(self, action):
        info = {}  # additional information

        reward = 0

        # UP, DOWN 맞으면 reward=1, 틀리면 맞을 때까지 반복
        if action == self.UP:
            if 1000 < self.data[self.observe_idx + 1][1]:
                reward += 1
                self.observe_idx += 1
            else:
                reward += 0
        elif action == self.DOWN:
            if 1000 > self.data[self.observe_idx + 1][1]:
                reward += 1
                self.observe_idx += 1
            else:
                reward += 0
 #       else:
 #           raise ValueError("Received invalid action={} which is not part of the action space".format(action))

        # 더 이상 데이터가 없을 경우, done
        done = bool(self.observe_idx == self.size - 1)
        
        if not done:
            self.agent_position = data[self.observe_idx]

        return np.array([self.agent_position]).astype(np.uint8), reward, done, info

    def render(self, mode='console'):
        '''
            render the state
        '''
#         if mode != 'console':
#             raise NotImplementedError()

#         for pos in range(self.size):
#             if pos == self.agent_position:
#                 print("X", end='')
#             else:
#                 print('.', end='')
#             print('')

    def reset(self):
        # -1 to ensure agent inital position will not be at the end state
        self.observe_idx = np.random.randint(self.size - 2)
        self.agent_position = data[self.observe_idx]

        return np.array([self.agent_position]).astype(np.uint8)

    def close(self):
        pass

In [3]:
# This is to test if custom enviroment created properly
# If the environment don't follow the gym interface, an error will be thrown

from stable_baselines.common.env_checker import check_env

data = [[1000,1000,1004,986,1000,1004,973,988,1235],
   [1000,1002,1012,997,1000,1010,986,1178,1495],
   [1000,979,1000,978,1000,989,969,1008,1222],
   [1000,952,1012,951,1000,951,927,653,750],
   [1000,1002,1016,993,1000,954,923,660,665],
   [1000,1006,1006,987,1000,973,935,918,830]]

env = Betting(data)
check_env(env, warn=True)

# try draw the grid world
obs = env.reset()
env.render()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.





In [4]:
# import various RL algorithms
from stable_baselines import DQN, PPO2, A2C, ACKTR

# Train the agent
model = ACKTR('MlpPolicy', env, verbose=1).learn(100000)

Wrapping the env in a DummyVecEnv.




Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where












---------------------------------
| explained_variance | 0.0706   |
| fps                | 47       |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| policy_loss        | 1.22     |
| total_timesteps    | 20       |
| value_loss         | 3.74     |
---------------------------------
---------------------------------
| explained_variance | 0.712    |
| fps                | 1308     |
| nupdates           | 100      |
| policy_entropy     | 0.648    |
| policy_loss        | 0.153    |
| total_timesteps    | 2000     |
| value_loss         | 0.853    |
---------------------------------
---------------------------------
| explained_variance | 0.876    |
| fps                | 1540     |
| 

---------------------------------
| explained_variance | 0.252    |
| fps                | 1756     |
| nupdates           | 800      |
| policy_entropy     | 0.39     |
| policy_loss        | -0.466   |
| total_timesteps    | 16000    |
| value_loss         | 1.78     |
---------------------------------
---------------------------------
| explained_variance | 0.409    |
| fps                | 1768     |
| nupdates           | 900      |
| policy_entropy     | 0.48     |
| policy_loss        | 0.0202   |
| total_timesteps    | 18000    |
| value_loss         | 1.27     |
---------------------------------
---------------------------------
| explained_variance | 0.748    |
| fps                | 1780     |
| nupdates           | 1000     |
| policy_entropy     | 0.515    |
| policy_loss        | 0.0638   |
| total_timesteps    | 20000    |
| value_loss         | 0.261    |
---------------------------------
---------------------------------
| explained_variance | 0.671    |
| fps         

---------------------------------
| explained_variance | 0.844    |
| fps                | 1849     |
| nupdates           | 3500     |
| policy_entropy     | 0.599    |
| policy_loss        | 0.0181   |
| total_timesteps    | 70000    |
| value_loss         | 0.112    |
---------------------------------
---------------------------------
| explained_variance | 0.849    |
| fps                | 1850     |
| nupdates           | 3600     |
| policy_entropy     | 0.564    |
| policy_loss        | -0.0532  |
| total_timesteps    | 72000    |
| value_loss         | 0.282    |
---------------------------------
---------------------------------
| explained_variance | 0.825    |
| fps                | 1852     |
| nupdates           | 3700     |
| policy_entropy     | 0.606    |
| policy_loss        | -0.0343  |
| total_timesteps    | 74000    |
| value_loss         | 0.224    |
---------------------------------
---------------------------------
| explained_variance | 0.908    |
| fps         

In [5]:
# running the simulation with trained model to verify result

obs = env.reset()
n_steps = 50
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 1 done= False
Step 2
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 3
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 4
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 5
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 6
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 7
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 8
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 9
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 10
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 11
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]] reward= 0 done= False
Step 12
Action:  0
obs= [[232 234 244 229 232 242 218 154 215]]

In [6]:
# print the trained policy map
# recall UP = 0, DOWN = 1
obs = [[1000,1002,1012,997,1000,1010,986,1178,1495]]
action, _ = model.predict(obs, deterministic=True)

print("action:", action,"prob : ", model.action_probability(obs), end="\n")

action: 0 prob :  [0.8182154  0.18178457]
