In [2]:
import numpy as np
from sb3_contrib import TQC
from stable_baselines3.common.evaluation import evaluate_policy

from single_state_mdp import SingleStateMDP

In [27]:
def make_env():
    # Define and return an instance of the SingleStateMDP environment
    return SingleStateMDP()

In [28]:
env = make_env()

In [35]:
# Create the TQC agent with specified parameters
policy_kwargs = {"n_quantiles": 25, "n_critics": 2, "net_arch": [50,50]}
agent = TQC("MlpPolicy", env, policy_kwargs=policy_kwargs)

In [9]:
# Train the agent for the desired number of iterations
agent.learn(total_timesteps=3000)

<sb3_contrib.tqc.tqc.TQC at 0x7f4491316b70>

In [29]:
# Evaluate the performance of the TQC agent using the evaluate_policy function
mean_rewards, std_rewards = evaluate_policy(agent, make_env(), n_eval_episodes=100)

print(f'Mean reward: {mean_rewards:.3f} +/- {std_rewards:.3f}')

Mean reward: 0.569 +/- 0.240


In [33]:
# Define a dense uniform grid of actions to evaluate the approximations on
actions = np.linspace(-1, 1, 2000)

# Initialize arrays to store the results
signed_discrepancies = np.zeros(actions.shape)

# Evaluate the performance of the TQC agent on the dense uniform grid of actions
for i, action in enumerate(actions):
    # Get the approximate Q-value for the current action
    q_value, _ = agent.predict(np.array([0]), np.array([action]))
    
    # Calculate the signed discrepancy between the approximate Q-value and the true Q-value
    signed_discrepancy = q_value - env._mean_reward(action)
    
    # Store the results
    signed_discrepancies[i] = signed_discrepancy

print(f'Mean reward: {signed_discrepancies.mean():.3f} +/- {signed_discrepancies.std():.3f}')

Mean reward: 0.092 +/- 0.470
