In [None]:
import os

import gym
import numpy as np
import mjx
from mjx.agents import RandomAgent, ShantenAgent
import torch

from ppo_agent import PPOAgent, GymEnv

In [None]:
# Initialize the environment: the learning agent plays against three
# ShantenAgent opponents.
opponents = [ShantenAgent() for _ in range(3)]  
env = GymEnv(opponent_agents=opponents)

# Reset once up front only to read the sizes that are passed to PPOAgent below:
# the flattened observation length and the action-mask length.
obs, info = env.reset()
obs_shape = obs.flatten().shape[0]  # first dimension of the flattened observation
action_dim = len(info["action_mask"])  # number of entries in the action mask

# Initialize the PPO agent; input/output sizes come from the environment,
# the remaining values are hand-picked hyperparameters.
agent = PPOAgent(
    input_dim=obs_shape,
    hidden_dim=128,
    output_dim=action_dim,
    lr = 1e-4,              # small learning rate
    entropy_coef=0.001      # small entropy coefficient (presumably a weak exploration bonus; confirm in PPOAgent)
)


In [5]:
import matplotlib.pyplot as plt

def plot_rewards(rewards, path="logs/reward_curve.png"):
    """Plot a reward series against its index and save the figure.

    Args:
        rewards: Sequence of reward values; the x-axis is the position in
            the sequence.
        path: Destination handed to ``savefig``. When it is a filesystem
            path, any missing parent directory is created first.
    """
    # savefig does not create directories, so make sure the target folder
    # exists instead of failing with FileNotFoundError. File-like
    # destinations are passed through untouched.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    fig, ax = plt.subplots()
    ax.plot(rewards)
    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.set_title("Training Reward Curve")
    fig.savefig(path)
    # Close this figure explicitly so repeated calls do not keep figures alive.
    plt.close(fig)


In [6]:
import matplotlib.pyplot as plt
import numpy as np

def plot_returns(rewards, window=10, path="logs/reward_curve.png"):
    """Plot the rolling mean of ``rewards`` with a ±1 standard-deviation band.

    Args:
        rewards: Sequence of per-episode rewards.
        window: Number of consecutive entries in each rolling window. Values
            larger than ``len(rewards)`` are clamped to it, values below 1
            are raised to 1.
        path: Destination handed to ``savefig``. When it is a filesystem
            path, any missing parent directory is created first.

    Nothing is plotted or written when ``rewards`` is empty.
    """
    rewards = np.asarray(rewards, dtype=float)
    n_rewards = len(rewards)
    if n_rewards == 0:
        return

    # np.convolve(mode='valid') swaps its operands when the kernel is longer
    # than the data, which would yield a curve that does not line up with the
    # x-axis; clamping the window keeps all three arrays the same length.
    window = max(1, min(int(window), n_rewards))

    # Index of the last episode covered by each full window.
    rolling_episodes = np.arange(window - 1, n_rewards)

    # Rolling mean and std are both taken over exactly the same `window`
    # samples ending at each index in rolling_episodes.
    rolling_mean = np.convolve(rewards, np.ones(window) / window, mode='valid')
    rolling_std = np.array([
        rewards[end - window + 1:end + 1].std() for end in rolling_episodes
    ])

    # savefig does not create directories; file-like destinations are
    # passed through untouched.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(rolling_episodes, rolling_mean, label='Rolling Avg Reward', color='blue')
    ax.fill_between(rolling_episodes,
                    rolling_mean - rolling_std,
                    rolling_mean + rolling_std,
                    color='blue', alpha=0.3, label='±1 Std Dev')

    ax.set_xlabel("Logged Episode")
    ax.set_ylabel("Average Reward")
    ax.set_title("Training Reward Curve with Variance")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(path)
    # Close this figure explicitly so repeated calls do not keep figures alive.
    plt.close(fig)


In [None]:
import json

# Training hyperparameters
num_episodes = 3000
log_interval = 100    # average, log and checkpoint every 100 episodes
plot_interval = 1000  # redraw the reward curves every 1000 episodes

# NOTE(review): file names below carry a "10000" suffix while num_episodes is
# 3000; confirm which one is intended.
log_dir = "logs/ppo5"
# torch.save, open() and savefig do not create directories; without this the
# run would fail at the first checkpoint (episode == log_interval).
os.makedirs(log_dir, exist_ok=True)

rolling_rewards = []   # one averaged reward per log_interval episodes
all_rewards = []       # total reward of every episode
all_actor_loss = []    # stats['actor_loss'] of every update
all_value_loss = []    # stats['value_loss'] of every update
best_reward = -float("inf")

for episode in range(1, num_episodes + 1):
    print(f"Episode {episode}/{num_episodes}")
    obs, info = env.reset()
    total_reward = 0
    done = False

    # Roll out one full episode, feeding each step's reward to the agent.
    while not done:
        action = agent.act(obs, info["action_mask"])
        next_obs, reward, done, info = env.step(action)
        agent.store_reward(reward)
        obs = next_obs
        total_reward += reward

    # One agent update per finished episode; `stats` is whatever
    # agent.update() returns and is read by key below.
    stats = agent.update()
    all_rewards.append(total_reward)
    all_actor_loss.append(stats['actor_loss'])
    all_value_loss.append(stats['value_loss'])
    print(f"Episode {episode} finished with total reward: {total_reward:.3f}")

    # print logs
    if episode % log_interval == 0:
        # Convert to a built-in float: NumPy scalars other than float64 are
        # rejected by json.dump.
        avg_reward = float(np.mean(all_rewards[-log_interval:]))
        rolling_rewards.append(avg_reward)
        print(f"Episode {episode}/{num_episodes}, avg reward: {avg_reward:.3f}")
        # update the best model
        if avg_reward > best_reward:
            best_reward = avg_reward
            torch.save(agent.model.state_dict(), f"{log_dir}/ppo5_10000.pt")
            print(f"Best model saved with reward: {best_reward:.3f}")

        # The loss/entropy fields are those of the most recent update only,
        # not averages over the logging interval.
        log_data = {
            'episode': episode,
            'avg_reward': avg_reward,
            'actor_loss': stats['actor_loss'],
            'value_loss': stats['value_loss'],
            'entropy': stats['entropy'],
            'total_loss': stats['total_loss']
        }
        # Append mode, one JSON object per line; earlier runs are kept.
        with open(f'{log_dir}/training_log_ppo5_10000.json', 'a') as f:
            json.dump(log_data, f)
            f.write('\n')
    if episode % plot_interval == 0:
        # plot the reward curve
        plot_rewards(rolling_rewards, path=f"{log_dir}/sr_reward_curve_{episode}_10000.png")
        plot_returns(all_rewards, window=100, path=f"{log_dir}/sr_return_curve_{episode}_10000.png")




Episode 1/3000


  model.load_state_dict(torch.load(path, map_location=DEVICE)['model_state'])


Episode 1 finished with total reward: 164.000
Episode 2/3000
Episode 2 finished with total reward: 34.000
Episode 3/3000
Episode 3 finished with total reward: 227.000
Episode 4/3000
Episode 4 finished with total reward: -20.000
Episode 5/3000
Episode 5 finished with total reward: 101.000
Episode 6/3000
Episode 6 finished with total reward: -217.000
Episode 7/3000
Episode 7 finished with total reward: 341.000
Episode 8/3000
Episode 8 finished with total reward: 188.000
Episode 9/3000
Episode 9 finished with total reward: 208.000
Episode 10/3000
Episode 10 finished with total reward: -97.000
Episode 11/3000
Episode 11 finished with total reward: 74.000
Episode 12/3000
Episode 12 finished with total reward: -6.000
Episode 13/3000
Episode 13 finished with total reward: -49.000
Episode 14/3000
Episode 14 finished with total reward: 39.000
Episode 15/3000
Episode 15 finished with total reward: 57.000
Episode 16/3000
Episode 16 finished with total reward: 43.000
Episode 17/3000
Episode 17 fin