In [None]:
import gym
import numpy as np
import mjx
from mjx.agents import RandomAgent, ShantenAgent
from ppo_agent import PPOAgent, GymEnv
import torch

In [3]:
import matplotlib.pyplot as plt

def plot_rewards(rewards, path="logs/reward_curve.png"):
    """Draw ``rewards`` as a single line chart and write it to ``path``.

    Args:
        rewards: Sequence of reward values, plotted against their index.
        path: Destination image file handed to ``savefig``.

    The figure is closed after saving, so nothing is left open or shown inline.
    """
    fig, ax = plt.subplots()
    ax.plot(rewards)
    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.set_title("Training Reward Curve")
    fig.savefig(path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [None]:
import json

def train_curriculum_agent(
    info_type="default",
    opponents=None,
    num_episodes=1000,
    log_interval=100,
    pretrained_model=None,
    stage=1,
    lr=1e-4,
    all_logs=None,
):
    """Train a PPO agent for one curriculum stage and record its progress.

    Args:
        info_type: Forwarded to ``GymEnv`` as ``info_type`` (the notebook uses
            ``"perfect"`` and ``"default"``).
        opponents: Agents forwarded to ``GymEnv`` as ``opponent_agents``.
            When ``None``, three fresh ``RandomAgent`` instances are created
            for this call.
        num_episodes: Number of episodes to run.
        log_interval: Every ``log_interval`` episodes the mean reward of the
            last ``log_interval`` episodes is printed, appended to the stage
            log file and compared against the best mean seen in this call.
        pretrained_model: Forwarded unchanged to ``PPOAgent``; presumably a
            checkpoint path or ``None`` -- confirm against ``PPOAgent``.
        stage: Stage number, used only to build the output file names.
        lr: Learning rate forwarded to ``PPOAgent``.
        all_logs: Dict with list values under ``"all_rewards"``,
            ``"all_actor_loss"`` and ``"all_value_loss"``. When given it is
            extended in place (so histories can be chained across stages);
            when ``None`` a fresh dict is created for this call.

    Returns:
        ``(agent, all_logs)``: the trained ``PPOAgent`` and the history dict.

    Side effects:
        Writes under ``logs/ppo4/``: the best checkpoint for this stage, a
        JSON-lines log (opened in append mode, so repeated runs of the same
        stage add lines to the existing file) and the stage reward curve.
    """
    import os  # local import: only needed to make sure the log directory exists

    # Build defaults per call. Mutable defaults in the signature would be
    # created once and shared by every call that relies on them.
    if opponents is None:
        opponents = [RandomAgent() for _ in range(3)]
    if all_logs is None:
        all_logs = {}
    for key in ("all_rewards", "all_actor_loss", "all_value_loss"):
        all_logs.setdefault(key, [])

    # Checkpoints and logs are written here; create it up front so the first
    # save does not fail after a full logging interval of training.
    os.makedirs("logs/ppo4", exist_ok=True)

    env = GymEnv(opponent_agents=opponents, info_type=info_type, discard_model=False)
    # One reset up front is only used to size the network.
    obs, info = env.reset()
    obs_shape = obs.flatten().shape[0]
    action_dim = len(info["action_mask"])
    agent = PPOAgent(
        input_dim=obs_shape,
        hidden_dim=128,
        output_dim=action_dim,
        pretrained_model=pretrained_model,  # forwarded as-is; None means no checkpoint was given
        lr=lr,
    )

    rolling_rewards = []
    best_reward = -np.inf

    for episode in range(1, num_episodes + 1):
        obs, info = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.act(obs, info["action_mask"])
            next_obs, reward, done, info = env.step(action)
            agent.store_reward(reward)
            obs = next_obs
            total_reward += reward

        # One agent update per finished episode.
        stats = agent.update(next_obs, done)
        all_logs["all_actor_loss"].append(stats["actor_loss"])
        all_logs["all_value_loss"].append(stats["value_loss"])
        all_logs["all_rewards"].append(total_reward)

        if episode % log_interval == 0:
            # The last `log_interval` entries always belong to this call, even
            # when `all_logs` already holds history from earlier stages.
            avg_reward = float(np.mean(all_logs["all_rewards"][-log_interval:]))
            rolling_rewards.append(avg_reward)
            print(f"Episode {episode}/{num_episodes}, avg reward: {avg_reward:.3f}")
            # Update the best model if the average reward is higher than the previous best
            if avg_reward > best_reward:
                best_reward = avg_reward
                torch.save(agent.model.state_dict(), f"logs/ppo4/best_model_ppo4_stage_{stage}.pt")
                print(f"Best model saved with reward: {best_reward:.3f}")

            log_data = {
                "episode": episode,
                "avg_reward": avg_reward,
                "actor_loss": stats["actor_loss"],
                "value_loss": stats["value_loss"],
                "entropy": stats["entropy"],
                "total_loss": stats["total_loss"],
            }

            with open(f"logs/ppo4/stage_{stage}_logs.json", "a") as f:
                f.write(json.dumps(log_data) + "\n")
            print(f"Episode {episode} logs saved.")

    plot_rewards(rolling_rewards, path=f"logs/ppo4/stage_{stage}_reward_curve.png")
    return agent, all_logs

In [5]:
print("Training Stage 1: Perfect Information with Random Agent Opponents")

# Start each run of this cell from empty histories and pass them explicitly,
# so re-running the cell never extends data left over from a previous run.
all_logs = {
    "all_rewards": [],
    "all_actor_loss": [],
    "all_value_loss": [],
}

stage_1_agent, all_logs = train_curriculum_agent(
    info_type="perfect",
    opponents=[RandomAgent() for _ in range(3)],
    num_episodes=400,
    log_interval=100,
    pretrained_model=None,
    stage=1,
    all_logs=all_logs,
)

# Each later stage loads the best checkpoint saved by the previous stage and
# keeps appending to the same `all_logs` histories.
print("Training Stage 2: Perfect Information with Shanten Agent Opponents")
stage_2_agent, all_logs = train_curriculum_agent(
    info_type="perfect",
    opponents=[ShantenAgent() for _ in range(3)],
    num_episodes=600,
    log_interval=100,
    pretrained_model="logs/ppo4/best_model_ppo4_stage_1.pt",
    stage=2,
    all_logs=all_logs,
    lr=5e-4,
)

print("Training Stage 3: Imperfect Information with Shanten Agent Opponents")
stage_3_agent, all_logs = train_curriculum_agent(
    info_type="default",
    opponents=[ShantenAgent() for _ in range(3)],
    num_episodes=900,
    log_interval=100,
    pretrained_model="logs/ppo4/best_model_ppo4_stage_2.pt",
    stage=3,
    all_logs=all_logs,
    lr=5e-4,
)


print("Training Stage 4: Self-Play")
# NOTE(review): this stage is labelled "Self-Play" but the opponents are still
# ShantenAgent instances -- confirm whether real self-play opponents were intended.
# The result is bound to `stage_4_agent`; it previously overwrote `stage_3_agent`.
stage_4_agent, all_logs = train_curriculum_agent(
    info_type="default",
    opponents=[ShantenAgent() for _ in range(3)],
    num_episodes=1100,
    log_interval=100,
    pretrained_model="logs/ppo4/best_model_ppo4_stage_3.pt",
    stage=4,
    all_logs=all_logs,
    lr=5e-4,
)

Training Stage 1: Perfect Information with Random Agent Opponents


  model.load_state_dict(torch.load(path, map_location=DEVICE)['model_state'])


Episode 100/400, avg reward: 282.610
Best model saved with reward: 282.610
Episode 100 logs saved.
Episode 200/400, avg reward: 338.910
Best model saved with reward: 338.910
Episode 200 logs saved.
Episode 300/400, avg reward: 377.290
Best model saved with reward: 377.290
Episode 300 logs saved.
Episode 400/400, avg reward: 376.370
Episode 400 logs saved.
Training Stage 2: Perfect Information with Shanten Agent Opponents
Loaded pretrained model from logs/ppo4/best_model_ppo4_stage_1.pt


  pretrained_state_dict = torch.load(pretrained_model)


Episode 100/600, avg reward: 381.670
Best model saved with reward: 381.670
Episode 100 logs saved.
Episode 200/600, avg reward: 434.180
Best model saved with reward: 434.180
Episode 200 logs saved.
Episode 300/600, avg reward: 422.700
Episode 300 logs saved.
Episode 400/600, avg reward: 515.050
Best model saved with reward: 515.050
Episode 400 logs saved.
Episode 500/600, avg reward: 534.830
Best model saved with reward: 534.830
Episode 500 logs saved.
Episode 600/600, avg reward: 506.340
Episode 600 logs saved.
Training Stage 3: Imperfect Information with Shanten Agent Opponents
Loaded pretrained model from logs/ppo4/best_model_ppo4_stage_2.pt
Episode 100/900, avg reward: 449.490
Best model saved with reward: 449.490
Episode 100 logs saved.
Episode 200/900, avg reward: 523.120
Best model saved with reward: 523.120
Episode 200 logs saved.
Episode 300/900, avg reward: 523.350
Best model saved with reward: 523.350
Episode 300 logs saved.
Episode 400/900, avg reward: 564.460
Best model sa

In [6]:
import numpy as np
import matplotlib.pyplot as plt

def plot_rewards_with_std(rewards, window=10, path="reward_curve_with_std.png"):
    """Save a rolling-mean reward curve with a shaded +/-1 standard deviation band.

    Args:
        rewards: 1-D sequence of per-episode rewards.
        window: Number of consecutive episodes in each rolling window.
        path: Destination image file handed to ``plt.savefig``.

    Returns:
        None. If ``window`` is smaller than 1 or there are fewer than
        ``window`` rewards, a warning is printed and nothing is plotted.

    The mean and the standard deviation are computed over the same windows
    (exactly ``window`` samples each), and each value is plotted at the index
    of the last episode in its window.
    """
    rewards = np.asarray(rewards, dtype=float)

    # Not enough data for even one full window (this also covers empty input).
    if window < 1 or len(rewards) < window:
        print(
            f"Warning: need at least {window} rewards for a rolling window "
            f"(got {len(rewards)}); nothing plotted."
        )
        return

    # Row j of `windows` holds rewards[j : j + window], so both statistics
    # below describe exactly the same `window` samples.
    windows = np.lib.stride_tricks.sliding_window_view(rewards, window)
    rolling_mean = windows.mean(axis=1)
    rolling_std = windows.std(axis=1)

    # Align each window with the index of its last episode.
    rolling_episodes = np.arange(window - 1, len(rewards))

    # Plot the reward curve with shaded area for std
    plt.figure(figsize=(10, 6))
    plt.plot(rolling_episodes, rolling_mean, label='Rolling Average Reward', color='blue')
    plt.fill_between(rolling_episodes,
                     rolling_mean - rolling_std,
                     rolling_mean + rolling_std,
                     color='blue', alpha=0.3, label='±1 Std Dev')

    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.title("Training Reward Curve with Standard Deviation")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    # Save the plot
    plt.savefig(path)
    plt.close()

# Plot the per-episode rewards accumulated in `all_logs`, smoothed over 100 episodes.
plot_rewards_with_std(
    all_logs["all_rewards"],
    window=100,
    path="logs/ppo4/whole_reward_curve_with_std.png",
)


In [10]:
# Rebuild the coarse reward curve from the per-stage JSON-lines logs on disk.
logs = []
for stage_idx in range(1, 5):
    stage_log_path = f"logs/ppo4/stage_{stage_idx}_logs.json"
    try:
        with open(stage_log_path, "r") as f:
            for line in f:
                # Skip blank lines so a stray trailing newline cannot break parsing.
                if line.strip():
                    logs.append(json.loads(line))
    except FileNotFoundError:
        # A stage that has not been trained yet simply contributes no points.
        print(f"Skipping missing log file: {stage_log_path}")

if logs:
    # Kept under its own name so the per-episode training history in
    # `all_logs` is not overwritten by these per-interval averages.
    logged_avg_rewards = [entry["avg_reward"] for entry in logs]
    plot_rewards(logged_avg_rewards, path="logs/ppo4/whole_reward_curve.png")
else:
    print("No logs found to process.")