In [173]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from functools import partial
from IPython.display import clear_output
from itertools import cycle
from pathlib import Path
import random
from time import time
from typing import Tuple, List, Callable, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tf_agents.typing import types
%matplotlib inline
import seaborn as sns
sns.set()
import tensorflow as tf
from tf_agents.agents import DqnAgent
from tf_agents.agents.tf_agent import LossInfo
from tf_agents.environments.py_environment import PyEnvironment
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.q_rnn_network import QRnnNetwork
from tf_agents.networks.q_network import QNetwork
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.specs import TensorSpec
from tf_agents.trajectories import trajectory
from tf_agents.trajectories.time_step import TimeStep
from tf_agents.trajectories.trajectory import Trajectory
from tf_agents.utils import common

print('Physical Devices:\n', tf.config.list_physical_devices(), '\n\n')

OUTPUTS_DIR = f'./outputs/{int(10000000 * time())}'
print('Output Directory:', OUTPUTS_DIR)

Physical Devices:
 [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')] 


Output Directory: ./outputs/16205694402578152


In [174]:
LOST = 0
DISCOUNT = 1
import abc
from tf_agents.environments import py_environment
from tf_agents.specs import BoundedArraySpec
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories.time_step import StepType
tf.compat.v1.enable_v2_behavior()

class EnglishAuction(py_environment.PyEnvironment):


    def __init__(self, valuations):
        bid_spec = BoundedArraySpec(
            (), dtype=np.int32, minimum=0, maximum=4
        )
        player_spec = BoundedArraySpec(shape=(1,),dtype=np.int32, minimum=1, maximum=1)
        self._action_spec = {
            "bid": bid_spec,
            "player": player_spec
        }
        minimum = np.zeros((2,5), dtype=np.int32)
        maximum = np.array([[1000,2,1,1,1000],[1000,2,1,1,1000]], dtype=np.int32)
        self._observation_spec = BoundedArraySpec(
            shape=(2,5), dtype=np.int32, name="observation", maximum=maximum, minimum=minimum
        )
        # highest bid, winning player, p1 active, p2 active, p1 valuation, p2 valuation
        self._state = np.zeros(shape=(2,6), dtype=np.int32)
        self._state[:,4:6] = valuations

    def _reset(self):
        self._state = np.zeros(shape=(2,6), dtype=np.int32)
         #gen valuations
        return ts.restart(np.array(self._state, dtype=np.int32))

    def __calculateWinnings(self, player):
        item_1 = (self._state[0,1] == player) * (self._state[0, 3 + player] - self._state[0,0])
        item_2 = (self._state[1,1] == player) * (self._state[1, 3 + player] - self._state[1,0])
        return item_1 + item_2

    def _step(self, action):
        if self._current_time_step.is_last():
            return self._reset()

        player = action["player"]
        bid = action["bid"]

        if np.all((self._state[0:1, 2:3] == 0)):
            return TimeStep(StepType.LAST,
                            self.__calculateWinnings(player),
                            DISCOUNT,
                            self._state
                            )

        for i in range(2):
            if bid == 0:                        # Pass both
                self._state[0,1+player] = 0
                self._state[1,1+player] = 0
            if bid == 1:                        # Increment on Item_1, stay on Item_2
                self._state[0,1+player] = 1
                self._state[1,1+player] = 0
                self._state[0, 0] += 1
                self._state[0, 1] = player
            if bid == 2:                        # Increment on Item_2, stay on Item_1
                self._state[0,1+player] = 0
                self._state[1,1+player] = 1
                self._state[1, 0] += 1
                self._state[1, 1] = player
            if bid == 3:                        # Increment on both
                self._state[0, 0] += 1
                self._state[1, 0] += 1
                self._state[0, 1] = player
                self._state[1, 1] = player
                self._state[0,1+player] = 1
                self._state[1,1+player] = 1

        if np.all((self._state[0:1, 2:3] == 0)):
            return TimeStep(StepType.MID,
                            self.__calculateWinnings(player),
                            DISCOUNT,
                            self._state
                            )
        return TimeStep(
            StepType.MID,
            0,
            DISCOUNT,
            self._state
        )
    def observation_spec(self) -> types.NestedArraySpec:
        return self._observation_spec

    def action_spec(self) -> types.NestedArraySpec:
        return self._action_spec

    def get_info(self) -> Any:
        pass

    def get_state(self) -> Any:
        return self._state

    def set_state(self, state: Any) -> None:
        self._state = state

In [175]:
class IMAgent(DqnAgent):

    def __init__(self,
                 env: TFPyEnvironment,
                 observation_spec: TensorSpec = None,
                 action_spec: TensorSpec = None,
                 reward_fn: Callable = lambda time_step: time_step.reward,
                 action_fn: Callable = lambda action: action,
                 name: str='IMAgent',
                 q_network=None,
                 # training params
                 replay_buffer_max_length: int = 1000,
                 learning_rate: float = 1e-5,
                 training_batch_size: int = 8,
                 training_parallel_calls: int = 3,
                 training_prefetch_buffer_size: int = 3,
                 training_num_steps: int = 2,
                 **dqn_kwargs):

        self._env = env
        self._reward_fn = reward_fn
        self._name = name
        self._observation_spec = observation_spec or self._env.observation_spec()
        self._action_spec = action_spec or self._env.action_spec()
        self._action_fn = action_fn

        q_network = q_network or self._build_q_net()

        env_ts_spec = self._env.time_step_spec()
        time_step_spec = TimeStep(
            step_type=env_ts_spec.step_type,
            reward=env_ts_spec.reward,
            discount=env_ts_spec.discount,
            observation=q_network.input_tensor_spec
        )

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        super().__init__(time_step_spec,
                         self._action_spec,
                         q_network,
                         optimizer,
                         name=name,
                         **dqn_kwargs)

        self._policy_state = self.policy.get_initial_state(
            batch_size=self._env.batch_size)
        self._rewards = []

        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=self.collect_data_spec,
            batch_size=self._env.batch_size,
            max_length=replay_buffer_max_length)

        self._training_batch_size = training_batch_size
        self._training_parallel_calls = training_parallel_calls
        self._training_prefetch_buffer_size = training_prefetch_buffer_size
        self._training_num_steps = training_num_steps
        self.train = common.function(self.train)

    def _build_q_net(self):
#         q_net = QRnnNetwork(input_tensor_spec=self._observation_spec,
#                             action_spec=self._action_spec,
#                             name=f'{self._name}QRNN')

        fc_layer_params = (50,)
        q_net = QNetwork(
            self._observation_spec,
            self._action_spec,
            fc_layer_params=fc_layer_params)

        q_net.create_variables()
        q_net.summary()

        return q_net

    def reset(self):
        self._policy_state = self.policy.get_initial_state(
            batch_size=self._env.batch_size
        )
        self._rewards = []

    def episode_return(self) -> float:
        return np.sum(self._rewards)

    def _observation_fn(self, observation: tf.Tensor) -> tf.Tensor:
        """
            Takes a tensor with specification self._env.observation_spec
            and extracts a tensor with specification self._observation_spec.

            For example, consider an agent within an NxN maze environment.
            The env could expose the entire NxN integer matrix as an observation
            but we would prefer the agent to only see a 3x3 window around their
            current location. To do this we can override this method.

            This allows us to have different agents acting in the same environment
            with different observations.
        """
        return observation

    def _augment_time_step(self, time_step: TimeStep) -> TimeStep:

        reward = self._reward_fn(time_step)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)
        if reward.shape != time_step.reward.shape:
            reward = tf.reshape(reward, time_step.reward.shape)

        observation = self._observation_fn(time_step.observation)

        return TimeStep(
            step_type=time_step.step_type,
            reward=reward,
            discount=time_step.discount,
            observation=observation
        )

    def _current_time_step(self) -> TimeStep:
        time_step = self._env.current_time_step()
        time_step = self._augment_time_step(time_step)
        return time_step

    def _step_environment(self, action) -> TimeStep:
        action = self._action_fn(action)
        time_step = self._env.step(action)
        time_step = self._augment_time_step(time_step)
        return time_step

    def act(self, collect=False) -> Trajectory:
        time_step = self._current_time_step()

        if collect:
            policy_step = self.collect_policy.action(
                time_step, policy_state=self._policy_state)
        else:
            policy_step = self.policy.action(
                time_step, policy_state=self._policy_state)

        self._policy_state = policy_step.state
        next_time_step = self._step_environment(policy_step.action)
        traj = trajectory.from_transition(time_step, policy_step, next_time_step)

        self._rewards.append(next_time_step.reward)

        if collect:
            self._replay_buffer.add_batch(traj)

        return traj

    def train_iteration(self) -> LossInfo:
        experience, info = self._replay_buffer.get_next(
            sample_batch_size=self._training_batch_size,
            num_steps=self._training_num_steps
        )
        return self.train(experience)


In [176]:
def ttt_action_fn(player, action):
    return {'position': action, 'value': player}
valuation = np.random.rand(2,2) * 1000
auction = EnglishAuction(valuation)
auction = tf_py_environment.TFPyEnvironment(auction)
player_1 = IMAgent(env=auction,
                   action_spec=auction.action_spec()["bid"],
                   action_fn=partial(ttt_action_fn,1),
                   name="player1"
                   )

player_2 = IMAgent(env=auction,
                   action_spec=auction.action_spec()["bid"],
                   action_fn=partial(ttt_action_fn,2),
                   name="player2"
                   )

Model: "QNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncodingNetwork (EncodingNet multiple                  550       
_________________________________________________________________
dense_55 (Dense)             multiple                  255       
Total params: 805
Trainable params: 805
Non-trainable params: 0
_________________________________________________________________
Model: "QNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncodingNetwork (EncodingNet multiple                  550       
_________________________________________________________________
dense_59 (Dense)             multiple                  255       
Total params: 805
Trainable params: 805
Non-trainable params: 0
_________________________________________________________________
