In [1]:
from envs import TicTacToeTrainingEnv
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy

from sb3_contrib.ppo_mask import MaskablePPO
from utils.terminal_colors import *
from utils.json_utils import save_opponent_stats, load_opponent_stats
from utils.models_utils import should_save_model, get_models
from utils.evaluator import evaluate_model_by_opponent
from training.advanced_training.config import *
from sb3_contrib.common.wrappers import ActionMasker
from test.action_mask_ import mask_fn



/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3


<h1 style="color:#0b9ed8">TRAINING</h1>

In [None]:
# Main training loop.
#
# For each model index in [START_MODEL_INDEX, MAX_MODELS] an agent is trained
# in segments of CHECKPOINT_INTERVAL timesteps. After every segment the agent
# is evaluated against the whole opponent pool and is written to disk only
# when `should_save_model` reports an improvement over the stored best stats.
for i in range(START_MODEL_INDEX, MAX_MODELS + 1):

    model_name = BASE_MODELS_NAME + f"_{i}.zip"
    model_path = os.path.join(MODELS_DIR, model_name)

    # Opponents: the two scripted players plus every model already on disk.
    opponent_models = get_models(MODELS_DIR)
    opponent_pool = ["random", "smart_random"] + opponent_models

    improvement = False
    best_model = None
    n_checks = TOTAL_STEPS // CHECKPOINT_INTERVAL

    # Build the masked training environment once per model. Every constructor
    # argument is constant for the whole model iteration, and the model is only
    # ever bound to an environment when it is created/loaded at check == 0, so
    # rebuilding it on each checkpoint produced instances nothing trained on.
    env_init = TicTacToeTrainingEnv(
        board_length=TRAINING_DEFAULT_BOARD_LENGTH,
        pattern_victory_length=TRAINING_DEFAULT_PATTERN_VICTORY_LENGTH,
        opponent_pool=opponent_pool,
        first_play_rate=TRAINING_DEFAULT_FIRST_PLAY_RATE,
        lost_games_path=None,
        review_ratio=TRAINING_DEFAULT_REVIEW_RATIO,
        opponent_statistics_file=STATS_PATH,
    )
    env = ActionMasker(env_init, mask_fn)

    for check in range(n_checks):
        # Fraction of TOTAL_STEPS already consumed when this segment starts.
        current_progress = (check * CHECKPOINT_INTERVAL) / TOTAL_STEPS

        # Hyperparameter schedule for this segment: batch size grows and the
        # entropy bonus decays (floored at 0.001) as training progresses.
        # `n_steps` is only consumed when a brand-new model is constructed.
        n_steps = int(2048 + (4096 - 2048) * current_progress**0.8)
        batch_size = int(512 + (2048 - 512) * current_progress**1.0)
        ent_coef = max(0.001, 0.015 * (1 - current_progress**0.6))
        learning_rate = LR_SCHEDULE(current_progress)

        # Reference statistics the freshly trained segment has to beat.
        best_stats = load_opponent_stats(opponent_pool)

        # Create the model (first segment only): start from scratch for the
        # very first model or when no earlier model exists on disk, otherwise
        # continue from the last model returned by `get_models`.
        if check == 0:
            if i == 1 or not opponent_models:
                model = MaskablePPO(
                    "MultiInputPolicy",
                    env=env,
                    verbose=1,
                    gamma=GAMMA,
                    gae_lambda=GAE_LAMBDA,
                    ent_coef=ent_coef,
                    n_steps=n_steps,
                    batch_size=batch_size,
                    learning_rate=learning_rate,
                    policy_kwargs=POLICY_KWARGS
                )
            else:
                prev_model_path = opponent_models[-1]
                model = MaskablePPO.load(prev_model_path, env=env)
                # NOTE(review): `n_steps` is deliberately not overridden on a
                # loaded model; the rollout buffer appears to be sized from it
                # when the model is set up, so changing it afterwards would
                # desynchronise the two - confirm against stable-baselines3.

        # Push this segment's schedule onto the live model. Previously these
        # values were only handed over at check == 0, so the progression
        # computed above never reached training after the first segment.
        # NOTE(review): relies on MaskablePPO reading `ent_coef`/`batch_size`
        # on every training pass and on the private `_setup_lr_schedule()`
        # hook to rebuild the learning-rate schedule - confirm against the
        # installed stable-baselines3 / sb3-contrib versions.
        model.ent_coef = ent_coef
        model.batch_size = batch_size
        model.learning_rate = learning_rate
        model._setup_lr_schedule()

        print(f"\n{YELLOW}=== Training segment {check+1}/{n_checks} ===")
        print(f"Steps: {check*CHECKPOINT_INTERVAL}-{(check+1)*CHECKPOINT_INTERVAL}")
        # Report the rollout length actually in effect on the model.
        print(f"Params: n_steps={model.n_steps}, batch={batch_size}, ent_coef={ent_coef:.4f}")
        print(f"Opponents: {opponent_pool}{RESET}\n")

        # Train the model for one checkpoint interval.
        model.learn(total_timesteps=CHECKPOINT_INTERVAL)

        # Evaluate the model against every opponent in the pool.
        results = evaluate_model_by_opponent(model, opponent_pool, n_episodes=200, stats_path=STATS_PATH)

        # Keep only the two rates used by the save criterion.
        current_stats = {k: {
            "defeat_rate": v["defeat_rate"],
            "victory_rate": v["victory_rate"]
        } for k, v in results.items()}

        # Persist the model and its stats only when it improved.
        if should_save_model(current_stats, best_stats, IMPROVEMENT_THRESHOLD):

            print(f"{GREEN}Saved new best model at checkpoint {check}{RESET}")
            print(f"{RED}Old best stats -> {best_stats}{RESET}")
            print(f"{YELLOW}New best stats -> {current_stats}{RESET}")

            improvement = True
            # Same object as `model` (not a snapshot); the saved file on disk
            # is the durable copy of the best checkpoint.
            best_model = model
            best_stats = deepcopy(current_stats)
            model.save(model_path)
            save_opponent_stats(best_stats, STATS_PATH)

    if improvement:
        print(f"{GREEN}Training completed for model {i}. Best model saved.{RESET}")
    else:
        print(f"{RED}Warning: No model met improvement criteria{RESET}")


---/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3---
---/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3---
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

[33m=== Training segment 1/10 ===
Steps: 0-10000
Params: n_steps=2048, batch=512, ent_coef=0.0150
Opponents: ['random', 'smart_random', '/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3/model_3_3_1.zip', '/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3/model_3_3_2.zip', '/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3/model_3_3_3.zip', '/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3/model_3_3_4.zip', '/home/cytech/PROGRAMMATION/AI/tic_tac_toe_rl/models/models_3_3/model_3_3_7.zip'][0m

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.6      |
|    ep_rew_mean     | -0.006   |
| time/              |          |
|    fps             | 108      |
|    iterations      | 1    