In [None]:
import gym
import numpy as np
import mjx
from mjx.agents import RandomAgent, ShantenAgent
from ppo_agent import PPOAgent, GymEnv
import torch

In [None]:
# Environment setup: the learning agent is seated against three ShantenAgent opponents.
opponents = [ShantenAgent(), ShantenAgent(), ShantenAgent()]
env = GymEnv(opponent_agents=opponents)

# Reset once to read the sizes the network needs:
#   obs_shape  - number of entries in the flattened observation
#   action_dim - number of entries in the action mask
obs, info = env.reset()
obs_shape = len(obs.flatten())
action_mask = info["action_mask"]
action_dim = len(action_mask)

# PPO hyperparameters, gathered in one place so they are easy to tune.
ppo_kwargs = dict(
    input_dim=obs_shape,
    hidden_dim=128,
    output_dim=action_dim,
    lr=1e-4,             # small learning rate
    entropy_coef=0.001,  # small entropy coefficient: little exploration bonus
)

# Build the PPO agent from the settings above.
agent = PPOAgent(**ppo_kwargs)



In [12]:
import matplotlib.pyplot as plt

def plot_rewards(rewards, path="logs/reward_curve.png"):
    """Plot a reward curve and save it as an image.

    Args:
        rewards: Sequence of reward values; they are drawn in order, one
            point per entry (x-axis labelled "Episode").
        path: Destination handed to ``savefig``. When it is a string or
            path-like object, its parent directory is created first so that
            saving does not fail on a fresh checkout with no ``logs/`` folder.
            Anything else (e.g. an open file object) is passed through as-is.

    Returns:
        None. The figure is written to ``path`` and then closed.
    """
    import os  # local import: keeps this cell runnable on its own

    # Make sure the target directory exists before matplotlib tries to write.
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Use an explicit figure/axes pair instead of pyplot's implicit
    # "current figure" so that exactly this figure is saved and closed.
    fig, ax = plt.subplots()
    try:
        ax.plot(rewards)
        ax.set_xlabel("Episode")
        ax.set_ylabel("Reward")
        ax.set_title("Training Reward Curve")
        fig.savefig(path)
    finally:
        # Always release the figure, even if plotting or saving raised;
        # otherwise it stays registered with pyplot and leaks memory.
        plt.close(fig)


In [None]:
import json
import os

# training parameters
num_episodes = 3000
log_interval = 100  # Log every 100 episodes
rolling_rewards = []  # one entry per log window: mean reward of that window

all_rewards = []  # total reward of every episode
all_actor_loss = []  # actor loss reported by each agent.update()
all_value_loss = []  # value loss reported by each agent.update()
best_reward = -float("inf")

# The checkpoint, JSON log and reward plot are all written under logs/.
# Create it up front so the first save (after log_interval episodes of
# training) does not abort the run with a missing-directory error.
os.makedirs("logs", exist_ok=True)

# Defined before the loop so the plot filename below is valid even when the
# loop body never runs (num_episodes == 0).
episode = 0

try:
    for episode in range(1, num_episodes + 1):
        obs, info = env.reset()
        total_reward = 0
        done = False

        # Play one full episode, handing every step reward to the agent.
        while not done:
            action = agent.act(obs, info["action_mask"])
            next_obs, reward, done, info = env.step(action)
            agent.store_reward(reward)
            obs = next_obs
            total_reward += reward

        # One agent update per finished episode.
        stats = agent.update()
        all_rewards.append(total_reward)
        all_actor_loss.append(stats['actor_loss'])
        all_value_loss.append(stats['value_loss'])

        # print statistics
        if episode % log_interval == 0:
            avg_reward = np.mean(all_rewards[-log_interval:])
            rolling_rewards.append(avg_reward)
            print(f"Episode {episode}/{num_episodes}, avg reward: {avg_reward:.3f}")
            # Update the best model if the average reward is higher
            if avg_reward > best_reward:
                best_reward = avg_reward
                torch.save(agent.model.state_dict(), "logs/ppo_shanten_opponent_model.pt")
                print(f"Best model saved with reward: {best_reward:.3f}")

            # Note: avg_reward covers the whole window, while the loss and
            # entropy values are those of the most recent update only.
            log_data = {
                'episode': episode,
                'avg_reward': avg_reward,
                'actor_loss': stats['actor_loss'],
                'value_loss': stats['value_loss'],
                'entropy': stats['entropy'],
                'total_loss': stats['total_loss']
            }
            # Append mode, one JSON object per line: re-running this cell
            # adds to the existing log file instead of replacing it.
            with open('logs/ppo_shanten_opponent_training_log.json', 'a') as f:
                json.dump(log_data, f)
                f.write('\n')
finally:
    # plot the reward curve
    # Done in `finally` so an interrupted or crashed run still leaves a plot
    # of the windows completed so far; `episode` in the filename is then the
    # episode that was in progress when training stopped.
    plot_rewards(rolling_rewards, path=f"logs/ppo_shanten_opponent_reward_curve_{episode}.png")




Episode 100/3000, avg reward: -125.550
Best model saved with reward: -125.550
Episode 200/3000, avg reward: -130.950
Episode 300/3000, avg reward: -126.900
Episode 400/3000, avg reward: -123.750
Best model saved with reward: -123.750
Episode 500/3000, avg reward: -122.850
Best model saved with reward: -122.850
Episode 600/3000, avg reward: -128.250
Episode 700/3000, avg reward: -123.300
Episode 800/3000, avg reward: -128.250
Episode 900/3000, avg reward: -122.850
Episode 1000/3000, avg reward: -121.500
Best model saved with reward: -121.500
Episode 1100/3000, avg reward: -123.300
Episode 1200/3000, avg reward: -130.950
Episode 1300/3000, avg reward: -122.850
Episode 1400/3000, avg reward: -132.300
Episode 1500/3000, avg reward: -125.550
Episode 1600/3000, avg reward: -128.250
Episode 1700/3000, avg reward: -126.900
Episode 1800/3000, avg reward: -128.250
Episode 1900/3000, avg reward: -130.950
Episode 2000/3000, avg reward: -131.850
Episode 2100/3000, avg reward: -123.300
Episode 2200/

训练时长：202m34.5s