<a href="https://colab.research.google.com/github/hchaparov/Dynamic_Pricing_MARL/blob/main/Bachelor_thesis_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment

**Stationary Demand market:**



In [1]:
!pip install gymnasium
!pip install numpy
!pip install torch
!pip install stable_baselines3 #"stable-baselines3[extra]>=2.0.0a4"
!pip install huggingface_sb3

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manyli

In [5]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

class OligopolyMarketEnv(gym.Env):

   # Because of google colab, we cannot implement the GUI ('human' render mode)
    metadata = {"render_modes": ["console"]}

    def __init__(self, a, b, beta_G, beta_L, reference_price, c, a_phi, before, max_steps, render_mode="console"):
        super(OligopolyMarketEnv, self).__init__()
        self.render_mode = render_mode
        self.reference_price = reference_price
        self.a = a
        self.b = b
        self.beta_G = beta_G
        self.beta_L = beta_L
        self.c = c  # costs (Lower boundary for prices)
        self.a_phi = a_phi
        self.before = before
        self.max_steps = max_steps
        self.last_action = None
        self.last_profit = None
        self.uv_buffer = []
        self.revenue_buffer = []
        self.t = 0
        self.upper_bound = None
        action = 0



         # Determine the upper bound of the action space
        if self.reference_price <= self.a / self.b:
            self.upper_bound = min((self.a + self.beta_G * self.reference_price) / (self.b + self.beta_G), (self.a - self.b * self.reference_price + self.beta_G * self.reference_price)/(self.beta_G))
        else:
            self.upper_bound = min((self.a + self.beta_L * self.reference_price) / (self.b + self.beta_L), (self.a - self.b * self.reference_price + self.beta_L * self.reference_price)/(self.beta_L))

        # Ensure the upper bound is at least greater than the lower bound c
        if self.upper_bound < self.c:
           print("Watch out: upper_bound < costs")

        # Action space (price set by the firm) is bounded
        # self.action_space = spaces.Box(low=np.array([c], dtype=np.float32), high=np.array([upper_bound], dtype=np.float32), dtype=np.float32)
        # Normalize action [-1, 1]
        self.action_space = spaces.Box(low=np.array([-1], dtype=np.float32), high=np.array([1], dtype=np.float32), dtype=np.float32)

        # State space is the reference price
        self.observation_space = spaces.Box(low=np.array([0], dtype=np.float32), high=np.array([np.inf], dtype=np.float32), shape=(1,), dtype=np.float32)

    def step(self, action):
        # rescale the action space [c, upper_bound]
        action = action
        rescaled_action = self.c + ((action + 1.01) / 2 )* (self.upper_bound - self.c)
        price = rescaled_action
        # average_price = price  # Simplified for single agent
        b_phi = 1

        # Implementing demand function
        if self.reference_price > price:
            demand = self.a - self.b * self.reference_price + self.beta_G * (self.reference_price - price)
        else:
            demand = self.a - self.b * self.reference_price + self.beta_L * (self.reference_price - price)

        demand = np.array(demand).item()
        demand = int(np.floor(demand))

        # Ensuring float type inside the environment
        # Converting from array to scalar if needed
        if isinstance(price, np.ndarray):
          price = price.item()

        if isinstance(demand, np.ndarray):
          demand = demand.item()

        revenue = price * demand
        self.revenue_buffer.append(revenue)
        self.uv_buffer.append(np.random.randint(demand + 1, 1000))

        # immediate reward function
        if self.t == 0 or self.before >= self.t:
           reward = self.revenue_buffer[self.t] / self.uv_buffer[self.t]
        else:
           reward = (self.revenue_buffer[self.t] / self.uv_buffer[self.t]) - (self.revenue_buffer[self.t - self.before] / self.uv_buffer[self.t - self.before])


        # Store the last action and last profit for render()
        self.last_action = rescaled_action
        self.last_profit = revenue

        next_state = np.array([self.reference_price]).astype(np.float32)  # State is constant
        self.t += 1
        done = self.t >= self.max_steps

        return next_state, reward, done, False, {} #, action, price, demand, self.t, rescaled_action, self.revenue_buffer, self.uv_buffer


    def reset(self, seed=None, options=None):
        super().reset(seed=seed, options=options)
       # we convert to float32 to make it more general, because we want to use continuous actions
        return np.array([self.reference_price]).astype(np.float32), {}


    def render(self):
      if self.render_mode == "console":
          if self.last_action is not None and self.last_profit is not None:
              # Ensure last_action and last_profit are scalars for formatting
              if isinstance(self.last_action, np.ndarray):
                  if self.last_action.size == 1:
                      last_action = self.last_action.item()  # Convert single element array to scalar
                  else:
                      last_action = self.last_action
                      print("Error: last_action is not a single element array")
              else:
                  last_action = self.last_action  # if already scalar

              if isinstance(self.last_profit, np.ndarray):
                  if self.last_profit.size == 1:
                      last_profit = self.last_profit.item()
                  else:
                      last_profit = self.last_profit
                      print("Error: last_profit is not a single element array")
              else:
                  last_profit = self.last_profit

              print(f"Reference Price: {self.reference_price:.3f}")
              print(f"Last Action (Price Set by Firm): {last_action:.3f}")
              print(f"Last Profit: {last_profit:.3f}")


    def close(self):
        pass



In [6]:
from stable_baselines3.common.env_checker import check_env
env = OligopolyMarketEnv(a = 10, b = 1, beta_G = 2, beta_L = 2, reference_price = 1.0, c = 0, a_phi = 1, before = 2, max_steps = 3)
# If the environment doesn't follow the interface, an error will be thrown
check_env(env, warn=True)

TEST:

In [4]:
import random
env = OligopolyMarketEnv(a = 10, b = 1, beta_G = 2, beta_L = 2, reference_price = 1.0, c = 0, a_phi = 1, before = 2, max_steps = 3)

obs, _ = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())


# Test:
n_steps = 20
for step in range(n_steps):
    price = random.uniform(-1, 1)
    print(f"Step {step + 1}")
    obs, reward, done, false, info, action, price, demand, t, rescaled_action, revenue_buffer, uv_buffer = env.step(price)
    print( "reward=", reward, "action=", action, "done =", done, ",price=", price, ",demand=", demand,",t = ", t, ",rescaled_action=", rescaled_action, ",revenue_buffer=", revenue_buffer, ",uv_buffer=", uv_buffer)
    env.render()

Box(0.0, inf, (1,), float32)
Box(-1.0, 1.0, (1,), float32)
[-0.6324878]
Step 1
reward= 0.014012238142137486 action= 0.01873181693526038 done = False ,price= 2.057463633870521 ,demand= 6 ,t =  1 ,rescaled_action= 2.057463633870521 ,revenue_buffer= [12.344781803223125] ,uv_buffer= [881]
Reference Price: 1.000
Last Action (Price Set by Firm): 2.057
Last Profit: 12.345
Step 2
reward= 0.025653819591916344 action= -0.009501035915262523 done = False ,price= 2.000997928169475 ,demand= 6 ,t =  2 ,rescaled_action= 2.000997928169475 ,revenue_buffer= [12.344781803223125, 12.005987569016849] ,uv_buffer= [881, 468]
Reference Price: 1.000
Last Action (Price Set by Firm): 2.001
Last Profit: 12.006
Step 3
reward= 0.027971061758629248 action= 0.3437993891176556 done = True ,price= 2.7075987782353113 ,demand= 5 ,t =  3 ,rescaled_action= 2.7075987782353113 ,revenue_buffer= [12.344781803223125, 12.005987569016849, 13.537993891176557] ,uv_buffer= [881, 468, 484]
Reference Price: 1.000
Last Action (Price Set

Vectorize the environment:

In [7]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


# Use make_vec_env to create and vectorize your environment
vec_env = make_vec_env(OligopolyMarketEnv, n_envs=10, env_kwargs=dict(a = 10, b = 1, beta_G = 2, beta_L = 2, reference_price = 1.0, c = 0, a_phi = 1, before = 2, max_steps = 3))


# **PPO model**

In [8]:
model = PPO(
    policy = 'MlpPolicy',
    env = vec_env,
    seed = 0,
    n_steps = 1024,
    batch_size = 64,
    n_epochs = 4,
    gamma = 0.999,
    gae_lambda = 0.98,
    ent_coef = 0.01,
    verbose=1).learn(100000)


Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.00034  |
| time/              |          |
|    fps             | 4306     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 10240    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -0.012      |
| time/                   |             |
|    fps                  | 2012        |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 20480       |
| train/                  |             |
|    approx_kl            | 0.026157323 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.36       |
|    explained_variance   | 0           |
|    learnin

In [11]:
mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=10, deterministic=False)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=0.00 +/- 0.021421786223608897


  and should_run_async(code)


# Test the agent:

In [10]:
# Test the trained agent
# using the vecenv
obs = vec_env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic = False)
    print(f"Step {step + 1}")
    print("Action: ", action)
    obs, reward, done, info = vec_env.step(action)
    print("obs=", obs, "reward=", reward)
    vec_env.render()


Step 1
Action:  [[ 0.7311433 ]
 [ 0.3121027 ]
 [ 0.51408315]
 [ 0.37898678]
 [ 0.56312174]
 [ 0.74516964]
 [ 0.64595   ]
 [ 0.31043443]
 [-0.28906   ]
 [ 0.6230095 ]]
obs= [[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]] reward= [-3.52313966e-02  2.06570532e-02  8.63399506e-02  2.88830817e-01
  1.08813285e-04  1.01802510e-03  5.80080524e-02  8.64140689e-02
  8.12418479e-03  2.27963589e-02]
Reference Price: 1.000
Last Action (Price Set by Firm): 3.482
Last Profit: 13.929
Reference Price: 1.000
Last Action (Price Set by Firm): 2.644
Last Profit: 13.221
Reference Price: 1.000
Last Action (Price Set by Firm): 3.048
Last Profit: 12.193
Reference Price: 1.000
Last Action (Price Set by Firm): 2.778
Last Profit: 13.890
Reference Price: 1.000
Last Action (Price Set by Firm): 3.146
Last Profit: 12.585
Reference Price: 1.000
Last Action (Price Set by Firm): 3.510
Last Profit: 10.531
Reference Price: 1.000
Last Action (Price Set by Firm): 3.312
Last Profit: 13.248
Reference Price: 1.00