In [None]:
from envs import TicTacToeBaseEnv, TicTacToeTrainingEnv
import warnings
import os
warnings.filterwarnings("ignore")
import json
import numpy as np
from sb3_contrib.common.wrappers import ActionMasker
import wandb
from scripts.action_mask_ import mask_fn
from sb3_contrib import MaskablePPO

<h1 style="color:red">REAL TRAINING</h1>

In [None]:

# Directory where the trained models and the defeat-rate file are stored
models_dir = "MODELS_9x9"
# 📁 Path to the JSON file holding the defeat rates
DEFEAT_RATE_PATH = os.path.join(models_dir, "defeat_rates.json")

def convert_to_serializable(obj):
    """Recursively convert NumPy values inside ``obj`` into JSON-compatible types.

    Args:
        obj: Any value. NumPy arrays, NumPy scalars, dicts, lists and tuples
            are converted; everything else is returned unchanged.

    Returns:
        ``obj`` with every ``np.ndarray`` replaced by a nested list, every
        NumPy scalar replaced by the matching Python scalar, and dict values
        and list/tuple items converted recursively. Tuples come back as
        lists, which is how JSON represents them anyway.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar (bool_, int8..int64, uint*, float16..float64, ...),
        # not only the four types that used to be listed explicitly.
        return obj.item()
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    return obj


def load_defeat_rates(opponents):
    """Load the saved defeat rates and make sure every opponent has an entry.

    Reads the JSON mapping stored at ``DEFEAT_RATE_PATH`` when that file
    exists; otherwise starts from an empty mapping. Any name in ``opponents``
    that is missing from the mapping is added with a defeat rate of 1.0.

    Returns:
        The resulting mapping of opponent name to defeat rate.
    """
    saved_rates = {}
    if os.path.exists(DEFEAT_RATE_PATH):
        with open(DEFEAT_RATE_PATH, "r") as rates_file:
            saved_rates = json.load(rates_file)

    for name in opponents:
        # Opponents without a recorded rate start at the worst value.
        saved_rates.setdefault(name, 1.0)

    return saved_rates

def save_defeat_rates(rates):
    """Write ``rates`` to ``DEFEAT_RATE_PATH`` as JSON indented by 4 spaces."""
    with open(DEFEAT_RATE_PATH, mode="w") as rates_file:
        json.dump(rates, rates_file, indent=4)

def evaluate_model_by_opponent(model, opponent_pool, n_episodes=100,
                               lost_games_path="defeated_games.json"):
    """Play evaluation games against each opponent and tally the outcomes.

    For every entry of ``opponent_pool`` a fresh ``TicTacToeTrainingEnv`` is
    built in evaluation mode with that single opponent, and ``n_episodes``
    games are played using deterministic, action-masked predictions.

    Args:
        model: Object exposing ``predict(obs, deterministic=..., action_masks=...)``.
        opponent_pool: Iterable of opponent identifiers. An identifier that
            appears several times is evaluated several times, and the last
            evaluation overwrites the earlier entry in the result.
        n_episodes: Number of games played per opponent; must be positive.
        lost_games_path: File handed to the environment as ``lost_games_path``
            and into which the lost games of this evaluation are written.

    Returns:
        Dict mapping each opponent to its ``wins/losses/draws`` counts, split
        by ``play_first`` / ``play_second``, plus ``defeat_rate``
        (losses divided by ``n_episodes``).

    Raises:
        ValueError: If ``n_episodes`` is not positive.

    Side effects:
        When at least one game was lost, ``lost_games_path`` is overwritten
        with the recorded lost games (one entry per distinct final board).
    """
    if n_episodes <= 0:
        # Previously this surfaced only as a ZeroDivisionError on the defeat rate.
        raise ValueError("n_episodes must be a positive integer")

    results = {}
    defeated_games = {}
    all_boards = []

    for opponent in opponent_pool:
        env = TicTacToeTrainingEnv(board_length=9,
                                   pattern_victory_length=5,
                                   opponent_pool=[opponent],
                                   evaluation=True,
                                   first_play_rate=0.2,
                                   lost_games_path=lost_games_path,
                                   review_ratio=0.0)

        # Insertion order matches the key order of the reported result dict.
        tally = {
            "wins_play_first": 0,
            "wins_play_second": 0,
            "losses_play_first": 0,
            "losses_play_second": 0,
            "draws_play_first": 0,
            "draws_play_second": 0,
        }

        defeat_count = 0
        for _ in range(n_episodes):
            obs, _ = env.reset()
            done = False
            episode_data = {"player": env.player, "board": []}

            while not done:
                action, _ = model.predict(obs, deterministic=True, action_masks=obs["action_mask"])
                obs, reward, terminated, truncated, _ = env.step(action)
                # A truncated episode must end the loop too, otherwise it never exits.
                done = terminated or truncated
                if not done:
                    # Keeps only the latest non-terminal observation; it is converted
                    # immediately so the stored board is a plain-list snapshot.
                    episode_data["board"] = convert_to_serializable(obs["observation"])

            # The final reward of the episode decides the outcome.
            if reward == env.victory_reward:
                outcome = "wins"
            elif reward == -env.victory_reward:
                outcome = "losses"
                defeat_count += 1
                # Record each distinct losing board only once across all opponents.
                if episode_data["board"] not in all_boards:
                    all_boards.append(episode_data["board"])
                    defeated_games[f"game_lose_against_{opponent}_{defeat_count}"] = episode_data
            else:
                outcome = "draws"

            side = "play_first" if env.first_to_play else "play_second"
            tally[f"{outcome}_{side}"] += 1

        results[opponent] = {
            **tally,
            "defeat_rate": (tally["losses_play_first"] + tally["losses_play_second"]) / n_episodes,
        }

    if defeated_games:
        with open(lost_games_path, "w") as f:
            json.dump(convert_to_serializable(defeated_games), f, indent=4)
    return results

def linear_schedule(start_value):
    """Build a schedule function that scales ``start_value`` by a progress factor.

    Args:
        start_value: Value returned by the schedule when ``progress`` is 1.

    Returns:
        A one-argument callable returning ``start_value * progress``.
    """
    def scaled_by_progress(progress):
        scaled = start_value * progress
        return scaled

    return scaled_by_progress

# 🔧 Hyperparameters
GAMMA = 0.99  # passed as `gamma` to MaskablePPO
GAE_LAMBDA = 0.95  # passed as `gae_lambda` to MaskablePPO
START_ENT_COEF = 0.01  # passed as `ent_coef` to MaskablePPO

# Inclusive bounds of the model indices iterated by the training loop
start_model_index = 2
max_models = 2

def get_models(path):
    """Return the ``ppo_tictactoe_<n>.zip`` files found in ``path``, sorted by ``<n>``."""
    return sorted(
        [os.path.join(path, f) for f in os.listdir(path) if f.startswith("ppo_tictactoe_") and f.endswith(".zip")],
        key=lambda x: int(x.split("_")[-1].split(".")[0])
    )


# Train one model per index: build the opponent pool, train, evaluate, then save and log.
for i in range(start_model_index, max_models + 1):
    model_name = f"ppo_tictactoe_{i}.zip"
    model_path = os.path.join(models_dir, model_name)

    os.makedirs(models_dir, exist_ok=True)

    opponent_models = get_models(models_dir)
    if i != 1 and not opponent_models:
        # Fail with an explicit message instead of an IndexError on an empty list.
        raise FileNotFoundError(
            f"No previous ppo_tictactoe_*.zip model found in {models_dir!r} to resume model {i} from"
        )

    # Fix: the pool is extended with the saved models. It previously referenced
    # `opponent_pool` itself, which is undefined on the first pass and would keep
    # growing on later passes.
    opponent_pool = ["random"] * 4 + ["smart_random"] * 1 + opponent_models * 1

    defeat_rates = load_defeat_rates(opponent_pool)

    env_init = TicTacToeTrainingEnv(board_length=9,
                                    pattern_victory_length=5,
                                    opponent_pool=opponent_pool,
                                    first_play_rate=0.4,
                                    lost_games_path="defeated_games.json",
                                    review_ratio=0.0)

    env = ActionMasker(env_init, mask_fn)

    wandb.init(
        project=f"{models_dir}_ppo-tictactoe",
        name=f"{models_dir}-run_model_{i}",
        config={
            "model_index": i,
            "opponents": opponent_pool,
            "current_defeat_rates" : defeat_rates,
            "gamma": GAMMA,
            "gae_lambda": GAE_LAMBDA,
            "ent_coef_start": START_ENT_COEF
        },
        reinit=True
    )

    try:
        if i == 1:
            # The first model starts from scratch.
            model = MaskablePPO(
                "MultiInputPolicy",
                env,
                verbose=1,
                gamma=GAMMA,
                gae_lambda=GAE_LAMBDA,
                ent_coef=START_ENT_COEF,
            )
        else:
            # Later models resume from the highest-numbered saved model.
            prev_model_path = opponent_models[-1]
            print(f"🔄 Chargement du modèle précédent : {prev_model_path}")
            model = MaskablePPO.load(
                prev_model_path,
                env=env,
                gamma=GAMMA,
                gae_lambda=GAE_LAMBDA,
                ent_coef=START_ENT_COEF,
            )

        total_steps = 50_000
        patience = 1
        no_improve_counter = 0

        outcome_stats = (
            "wins_play_first", "wins_play_second",
            "draws_play_first", "draws_play_second",
            "losses_play_first", "losses_play_second",
        )

        while True:
            print(f"🎯 Entraînement modèle {i} ({models_dir}) contre ({opponent_pool}) pour {total_steps} steps...")
            model.learn(total_timesteps=total_steps)

            results = evaluate_model_by_opponent(model, opponent_pool)

            current_defeat_rates = {k: v["defeat_rate"] for k, v in results.items()}

            # Log all evaluation metrics before deciding whether to stop. The detailed
            # counts used to be logged after both `break`s and were skipped on exit.
            metrics = {f"{k}_defeat_rate": v for k, v in current_defeat_rates.items()}
            for k, v in results.items():
                for stat in outcome_stats:
                    metrics[f"{k}_{stat}"] = v[stat]
            wandb.log(metrics)

            improved_or_equal = all(current_defeat_rates[k] <= defeat_rates.get(k, 1.0) for k in current_defeat_rates)

            if improved_or_equal:
                print("✅ Taux de défaite réduit ou constant pour tous les adversaires. Sauvegarde du modèle.")
                model.save(model_path)
                defeat_rates.update(current_defeat_rates)
                save_defeat_rates(defeat_rates)
                break

            no_improve_counter += 1
            print(f"⏳ Pas d'amélioration sur tous les adversaires ({no_improve_counter}/{patience})")

            if no_improve_counter >= patience:
                print("🛑 Arrêt de l'entraînement : pas d'amélioration suffisante.")
                break

        # NOTE(review): the model is saved here even when training stopped without
        # improvement; only the stored defeat rates are gated on improvement.
        # Confirm this is intended.
        model.save(model_path)
        print(f"✅ Modèle {i} sauvegardé dans {model_path}")

        # The text log reports the counts of the first evaluated opponent only.
        example_result = next(iter(results.values()))
        with open(os.path.join(models_dir, "training_log.txt"), "a") as log_file:
            log_file.write(
                f"Modèle {i}: {example_result['wins_play_first']}W_1st - {example_result['losses_play_first']}L_1st - "
                f"{example_result['draws_play_first']}D_1st - {example_result['wins_play_second']}W_2nd - "
                f"{example_result['losses_play_second']}L_2nd - {example_result['draws_play_second']}D_2nd | "
                f"Max Defeat Rate: {max(current_defeat_rates.values()):.2f}\n"
            )
    finally:
        # Close the wandb run even if training or evaluation raised.
        wandb.finish()
