In [26]:
import mjx
from mjx.agents import RandomAgent, ShantenAgent

from ppo_agent import PPOAgent, GymEnv
from tqdm import tqdm
import numpy as np

import json


In [27]:
RANK_DICT = {
    90 : 1,
    45 : 2,
    0 : 3,
    -135 : 4
}

def score_func(avg_score, avg_rank, std_score, std_rank, rank_distribution, 
          w_score=0.4, w_rank=0.2, w_stability=0.2, w_distribution=0.2):
    """
    rank_distribution: list or array of length 4, e.g. [1st%, 2nd%, 3rd%, 4th%]
    """

    # Normalize avg_score: from [0, 25000] to [0, 1]
    score_norm = avg_score / 25000

    # Normalize avg_rank: from [1, 4] to [1, 0]
    rank_norm = (4 - avg_rank) / 3

    # Stability score: low std is better
    std_score_norm = 1 - (std_score / 10000)
    std_rank_norm = 1 - std_rank  # since max std(rank) is 1 in 4-rank system
    stability = 0.5 * std_score_norm + 0.5 * std_rank_norm

    # Rank distribution quality: encourage 1st, penalize 4th
    dist_score = (
        1.0 * rank_distribution[0] +   # 1st
        0.6 * rank_distribution[1] +   # 2nd
        0.2 * rank_distribution[2] +   # 3rd
        -0.5 * rank_distribution[3]    # 4th (penalty)
    )

    final_score = (
        w_score * score_norm +
        w_rank * rank_norm +
        w_stability * stability +
        w_distribution * dist_score
    ) * 100  # Optional: scale to 0–100

    return final_score

    

def eval(records):
    avg_score = sum([record["score"] for record in records]) / len(records)
    avg_rank = sum([record["rank"] for record in records]) / len(records)
    std_score = (sum([(record["score"] - avg_score) ** 2 for record in records]) / len(records)) ** 0.5
    std_rank = (sum([(record["rank"] - avg_rank) ** 2 for record in records]) / len(records)) ** 0.5
    ranks = np.array([r["rank"] for r in records])
    total_games = len(ranks)

    rank_counts = np.bincount(ranks)[1:5]  
    rank_distribution = rank_counts / total_games

    score = score_func(
        avg_score,
        avg_rank,
        std_score,
        std_rank,
        rank_distribution
    )
    
    print(f"Average score: {avg_score:.2f} ± {std_score:.2f}")
    print(f"Average rank: {avg_rank:.2f} ± {std_rank:.2f}")
    print(f"Rank distribution: {rank_distribution}")
    print(f"Score: {score:.2f}")





## Test Base Model: Base Model vs. Base Model

In [28]:
def test_base_agent(my_agent, num_games=100):
    env = mjx.MjxEnv()
    results = []
    players = [
        my_agent,
        ShantenAgent(),
        ShantenAgent(),
        ShantenAgent(),
    ]

    for _ in tqdm(range(num_games)):
        obs_dict = env.reset()
        while not env.done():
            # actions = {
            #     player_id: my_agent.act(obs_dict[player_id])
            #     for player_id in obs_dict.keys()
            # }
            actions = {
                player_id: players[i].act(obs_dict[player_id])
                for i, player_id in enumerate(obs_dict.keys())
            }

            obs_dict = env.step(actions)

        my_index = obs_dict["player_0"].who()
        score = obs_dict["player_0"].tens()[my_index]
        my_reward = env.rewards()["player_0"]
        my_rank = RANK_DICT[my_reward]

        results.append({"rank": my_rank, "score": score})
        
    return results

### Test Random Agent

In [4]:
random_agent = RandomAgent()
random_agent_results = test_base_agent(random_agent, num_games=100)
# store results in a json file
with open("logs/battle_results/random_agent_results.json", "w") as f:
    json.dump(random_agent_results, f)
eval(random_agent_results)

100%|██████████| 100/100 [00:04<00:00, 20.49it/s]

Average score: 25189.00 ± 1553.51
Average rank: 2.46 ± 1.16
Rank distribution: [0.29 0.22 0.23 0.26]
Score: 64.16





### Test Shanten Agent

In [5]:
shanten_agent = ShantenAgent()
shanten_agent_results = test_base_agent(shanten_agent, num_games=100)
# store results in a json file
with open("logs/battle_results/shanten_agent_results.json", "w") as f:
    json.dump(shanten_agent_results, f)
eval(shanten_agent_results)

100%|██████████| 100/100 [00:08<00:00, 11.24it/s]

Average score: 25572.00 ± 10378.93
Average rank: 2.44 ± 1.13
Rank distribution: [0.28 0.24 0.24 0.24]
Score: 56.63





## Test PPO Model: PPO Model vs. Base Model

In [30]:
def test_ppo_agent(pretrained_model, num_games=100, discard_model=False):
    my_agent = PPOAgent(
        input_dim=544,
        hidden_dim=128,
        output_dim=181,
        pretrained_model=pretrained_model, # If have a pretrained model, load it
    )
    env = GymEnv(opponent_agents=[RandomAgent(), RandomAgent(), RandomAgent()], info_type="default", discard_model=discard_model)

    records = []
    for _ in tqdm(range(num_games)):
        obs, info = env.reset()
        done = False
        while not done:
            action_mask = info["action_mask"]
            action = my_agent.act(obs, action_mask)

            # env.step returns obs, reward, done, info
            obs, reward, done, info = env.step(action)

        my_index = env.curr_obs_dict['player_0'].who()
        my_score = env.curr_obs_dict['player_0'].tens()[my_index]
        my_reward = env.mjx_env.rewards()['player_0']
        my_rank = RANK_DICT[my_reward]


        records.append({
            "score": my_score,
            "rank": my_rank,
        })

    return records

### Test PPO Agent 1
PPO Agent 1 learned from random agent opponents and use the final rank as the reward.


In [7]:
ppo1_model = "pretrained_models/ppo1_model.pt"
ppo_agent1_results = test_ppo_agent(ppo1_model, num_games=100)


  state_dict = torch.load(pretrained_model)


Loaded pretrained model from pretrained_models/ppo1_model.pt


  model.load_state_dict(torch.load(path, map_location=DEVICE)['model_state'])
100%|██████████| 100/100 [34:35<00:00, 20.75s/it]


In [8]:
# store results in a json file
with open("logs/battle_results/ppo_agent1_results.json", "w") as f:
    json.dump(ppo_agent1_results, f)
eval(ppo_agent1_results)

Average score: 28243.00 ± 4588.29
Average rank: 1.49 ± 0.82
Rank distribution: [0.67 0.22 0.06 0.05]
Score: 84.93


### Test PPO Agent 2 
PPO Agent 2 learned from shanten agent opponents and use the final rank as the reward

In [9]:
ppo2_model = "pretrained_models/ppo2_model.pt"
ppo_agent2_results = test_ppo_agent(ppo2_model, num_games=100)
eval(ppo_agent2_results)

  state_dict = torch.load(pretrained_model)


Loaded pretrained model from pretrained_models/ppo2_model.pt


100%|██████████| 100/100 [33:37<00:00, 20.18s/it]


Average score: 26253.00 ± 2656.10
Average rank: 1.93 ± 1.05
Rank distribution: [0.47 0.25 0.16 0.12]
Score: 74.48


In [10]:
# store results in a json file
with open("logs/battle_results/ppo_agent2_results.json", "w") as f:
    json.dump(ppo_agent2_results, f)
eval(ppo_agent2_results)

Average score: 26253.00 ± 2656.10
Average rank: 1.93 ± 1.05
Rank distribution: [0.47 0.25 0.16 0.12]
Score: 74.48


### Test PPO Agent 3
PPO Agent 3 learned from shanten agent opponents and use the custom reward

In [11]:
ppo3_model = "pretrained_models/ppo3_model.pt"
ppo_agent3_results = test_ppo_agent(ppo3_model, num_games=100)

Loaded pretrained model from pretrained_models/ppo3_model.pt


100%|██████████| 100/100 [28:28<00:00, 17.09s/it]


In [12]:
# store results in a json file
with open("logs/battle_results/ppo_agent3_results.json", "w") as f:
    json.dump(ppo_agent3_results, f)
eval(ppo_agent3_results)

Average score: 31106.00 ± 4265.51
Average rank: 1.15 ± 0.54
Rank distribution: [0.91 0.05 0.02 0.02]
Score: 97.82


In [13]:
eval(ppo_agent3_results)

Average score: 31106.00 ± 4265.51
Average rank: 1.15 ± 0.54
Rank distribution: [0.91 0.05 0.02 0.02]
Score: 97.82


### Test PPO Agent 4
PPO Agent 4 learned from shanten agent opponents, use the custom reward, with curriculum learning

In [14]:
ppo4_model = "pretrained_models/ppo4_model.pt"
ppo_agent4_results = test_ppo_agent(ppo4_model, num_games=100)


Loaded pretrained model from pretrained_models/ppo4_model.pt


100%|██████████| 100/100 [31:47<00:00, 19.08s/it]


In [15]:
# store results in a json file
with open("logs/battle_results/ppo_agent4_results.json", "w") as f:
    json.dump(ppo_agent4_results, f)
eval(ppo_agent4_results)

Average score: 29292.00 ± 3337.35
Average rank: 1.20 ± 0.57
Rank distribution: [0.87 0.07 0.05 0.01]
Score: 94.88


### Test PPO Agent 5
PPO Agent 5 learned from shanten agen opponents, use the custom reward with discard supervised learning model

In [31]:
ppo5_model = "pretrained_models/ppo5_model.pt"
ppo_agent5_results = test_ppo_agent(ppo5_model, num_games=100, discard_model=True)

Loaded pretrained model from pretrained_models/ppo5_model.pt


100%|██████████| 100/100 [33:12<00:00, 19.92s/it]


In [32]:
# store results in a json file
# with open("logs/battle_results/ppo_agent5_results.json", "w") as f:
#     json.dump(ppo_agent5_results, f)
eval(ppo_agent5_results)

Average score: 29659.00 ± 3991.99
Average rank: 1.16 ± 0.50
Rank distribution: [0.88 0.1  0.   0.02]
Score: 95.95


## Test PPO Agent: Inter-Model Performance Evaluation

In [18]:
def run_game_between_ppo_agents(pretrained_models, num_games=100):
    agents = [
        PPOAgent(
            input_dim=544,
            hidden_dim=128,
            output_dim=181,
            pretrained_model=model,
        )
        for model in pretrained_models
    ]
    env = GymEnv(opponent_agents=agents, info_type="default")
    records_1, records_2, records_3, records_4 = [], [], [], []
    for _ in tqdm(range(num_games)):
        obs, info = env.reset()
        done = False
        while not done:
            action_mask = info["action_mask"]
            action = agents[0].act(obs, action_mask)
            
            obs, reward, done, info = env.step(action)

        # Get the score and rank for each agent
        for i in range(4):
            my_index = env.curr_obs_dict[f'player_{i}'].who()
            my_score = env.curr_obs_dict[f'player_{i}'].tens()[my_index]
            my_reward = env.mjx_env.rewards()[f'player_{i}']
            my_rank = RANK_DICT[my_reward]

            if i == 0:
                records_1.append({"score": my_score, "rank": my_rank})
            elif i == 1:
                records_2.append({"score": my_score, "rank": my_rank})
            elif i == 2:
                records_3.append({"score": my_score, "rank": my_rank})
            else:
                records_4.append({"score": my_score, "rank": my_rank})

    return records_1, records_2, records_3, records_4

In [19]:
ppo2_model = "pretrained_models/ppo2_model.pt"
ppo3_model = "pretrained_models/ppo3_model.pt"
ppo4_model = "pretrained_models/ppo4_model.pt"
ppo5_model = "pretrained_models/ppo5_model.pt"

game_players = [ppo2_model, ppo3_model, ppo4_model, ppo5_model]
ppo1_result, ppo2_result, ppo3_result, ppo4_result = run_game_between_ppo_agents(game_players, num_games=100)

Loaded pretrained model from pretrained_models/ppo2_model.pt
Loaded pretrained model from pretrained_models/ppo3_model.pt
Loaded pretrained model from pretrained_models/ppo4_model.pt
Loaded pretrained model from pretrained_models/ppo5_model.pt


100%|██████████| 100/100 [39:32<00:00, 23.72s/it]


In [20]:
# store results in a json file
for i, result in enumerate([ppo1_result, ppo2_result, ppo3_result, ppo4_result]):
    with open(f"logs/battle_results/ppo{i+1}_inter_battle_results.json", "w") as f:
        json.dump(result, f)
    print(f"ppo{i+1}'s result evaluation in inter-agent battle:")
    eval(result)

ppo1's result evaluation in inter-agent battle:
Average score: 24991.00 ± 1771.73
Average rank: 2.36 ± 1.16
Rank distribution: [0.32 0.24 0.2  0.24]
Score: 65.21
ppo2's result evaluation in inter-agent battle:
Average score: 24939.00 ± 1844.93
Average rank: 2.65 ± 1.06
Rank distribution: [0.18 0.26 0.29 0.27]
Score: 61.62
ppo3's result evaluation in inter-agent battle:
Average score: 25076.00 ± 2138.56
Average rank: 2.44 ± 1.11
Rank distribution: [0.27 0.24 0.27 0.22]
Score: 64.47
ppo4's result evaluation in inter-agent battle:
Average score: 24984.00 ± 1668.34
Average rank: 2.55 ± 1.12
Rank distribution: [0.23 0.26 0.24 0.27]
Score: 62.78
