Реализуйте алгоритм GAIL на среде Mountain Car. Перед этим сгенерируйте экспертные данные (из детерминированной стратегии с первой практики). Хорошей идеей будет добавить в state (observation) синус и косинус от временной метки t для лучшего обучения.

In [13]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.categorical import Categorical


In [14]:
def generate_expert_data(num_episodes=200):
    env = gym.make("MountainCar-v0", render_mode=None)
    expert_states = []
    expert_actions = []
    successful_episodes = 0

    while successful_episodes < num_episodes:
        state, _ = env.reset()
        episode_states = []
        episode_actions = []
        done = False
        t = 0

        while not done:
            position, velocity = state
            if position < -0.5:
                action = 2  # Вправо
            elif position > 0.5:
                action = 0  # Влево
            else:
                action = 2 if velocity > 0 else 0

            next_state, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Добавляем временные метки
            augmented_state = np.concatenate([
                state,
                [np.sin(t / 10), np.cos(t / 10)]
            ]).astype(np.float32)

            episode_states.append(augmented_state)
            episode_actions.append(action)
            state = next_state
            t += 1

        if t < 200:
            expert_states.extend(episode_states)
            expert_actions.extend(episode_actions)
            successful_episodes += 1

    env.close()
    return np.array(expert_states), np.array(expert_actions)

In [15]:
states, actions = generate_expert_data() # Получите экспертные данные

In [16]:
# исправьте для среды mountain car и учтите добавление данных в state
obs_dim = 4  # [position, velocity, sin(t/10), cos(t/10)]
# исправьте для среды mountain car
act_dim = 3  # 3 действия (0: влево, 1: нет газа, 2: вправо)
expert_obs = np.copy(states)
expert_acts = np.copy(actions)

In [17]:
class Policy(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        # self.net = nn.Sequential(
        #     nn.Linear(obs_dim, 64), nn.ReLU(),
        #     nn.Linear(64, act_dim)
        # )
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, act_dim)
        )

    def forward(self, obs):
        logits = self.net(obs)
        return Categorical(logits=logits)

    def get_action(self, obs):
        dist = self.forward(obs)
        return dist.sample().item()

In [18]:
class Discriminator(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        # self.net = nn.Sequential(
        #     nn.Linear(obs_dim + act_dim, 64), nn.ReLU(),
        #     nn.Linear(64, 1),
        #     nn.Sigmoid()
        # )
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, obs, act):
        act_onehot = F.one_hot(act, num_classes=act_dim).float()
        x = torch.cat([obs, act_onehot], dim=1)
        return self.net(x)

In [19]:
class TrajectoryBuffer:
    def __init__(self):
        self.obs, self.acts, self.rews = [], [], []

    def store(self, o, a, r):
        self.obs.append(o)
        self.acts.append(a)
        self.rews.append(r)

    def get(self):
        return (
            torch.tensor(np.array(self.obs), dtype=torch.float32),
            torch.tensor(np.array(self.acts), dtype=torch.long),
            torch.tensor(np.array(self.rews), dtype=torch.float32)
        )

In [20]:
env = gym.make("MountainCar-v0", )
policy = Policy(obs_dim, act_dim)
discrim = Discriminator(obs_dim, act_dim)

policy_opt = optim.Adam(policy.parameters(), lr=1e-3)
discrim_opt = optim.Adam(discrim.parameters(), lr=1e-3)

In [21]:
for epoch in range(3000):
    buf = TrajectoryBuffer()
    obs, _ = env.reset()
    done = False
    total_reward = 0
    t = 0

    while not done:
        # Обработайте obs если планируете добавлять в него t
        augmented_obs = np.concatenate([
            obs,
            [np.sin(t / 10), np.cos(t / 10)]
        ]).astype(np.float32)
        obs_tensor = torch.tensor(augmented_obs, dtype=torch.float32).unsqueeze(0)
        action = policy.get_action(obs_tensor)
        next_obs, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        buf.store(augmented_obs, action, 0)
        obs = next_obs
        t += 1

    agent_obs, agent_acts, _ = buf.get()

    idxs = np.random.choice(len(expert_obs), len(agent_obs), replace=False)
    exp_obs = torch.tensor(expert_obs[idxs], dtype=torch.float32)
    exp_acts = torch.tensor(expert_acts[idxs], dtype=torch.long)

    for _ in range(2):
        discrim_opt.zero_grad()
        
        exp_preds = discrim(exp_obs, exp_acts)
        agent_preds = discrim(agent_obs, agent_acts)
        
        # Напишите loss для дискриминатора
        disc_loss = -torch.mean(torch.log(exp_preds + 1e-8) + torch.log(1 - agent_preds + 1e-8))

        disc_loss.backward()
        discrim_opt.step()

    with torch.no_grad():
        # Напишите получение новой награды от дискриминатора
        rewards = -torch.log(1 - discrim(agent_obs, agent_acts) + 1e-8).squeeze()

    policy_opt.zero_grad()
    # Напишите loss для стратегии
    dists = policy(agent_obs)
    log_probs = dists.log_prob(agent_acts)
    loss = -(log_probs * rewards).mean()
    
    loss.backward()
    policy_opt.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: GAIL Loss {loss.item():.3f}, Disc Loss {disc_loss.item():.3f}")

Epoch 0: GAIL Loss 0.780, Disc Loss 1.376
Epoch 10: GAIL Loss 0.709, Disc Loss 1.293
Epoch 20: GAIL Loss 0.575, Disc Loss 1.174
Epoch 30: GAIL Loss 0.388, Disc Loss 1.198
Epoch 40: GAIL Loss 0.217, Disc Loss 1.232
Epoch 50: GAIL Loss 0.237, Disc Loss 1.268
Epoch 60: GAIL Loss 0.150, Disc Loss 1.201
Epoch 70: GAIL Loss 0.206, Disc Loss 1.143
Epoch 80: GAIL Loss 0.233, Disc Loss 1.107
Epoch 90: GAIL Loss 0.285, Disc Loss 1.251
Epoch 100: GAIL Loss 0.203, Disc Loss 1.054
Epoch 110: GAIL Loss 0.264, Disc Loss 1.203
Epoch 120: GAIL Loss 0.200, Disc Loss 1.073
Epoch 130: GAIL Loss 0.306, Disc Loss 1.203
Epoch 140: GAIL Loss 0.208, Disc Loss 1.090
Epoch 150: GAIL Loss 0.176, Disc Loss 1.001
Epoch 160: GAIL Loss 0.208, Disc Loss 1.016
Epoch 170: GAIL Loss 0.213, Disc Loss 1.062
Epoch 180: GAIL Loss 0.191, Disc Loss 0.993
Epoch 190: GAIL Loss 0.260, Disc Loss 1.137
Epoch 200: GAIL Loss 0.154, Disc Loss 0.978
Epoch 210: GAIL Loss 0.241, Disc Loss 1.234
Epoch 220: GAIL Loss 0.206, Disc Loss 1.084

Протестируйте ваш алгоритм

In [28]:
def test_policy(env, policy):
    for _ in range(10):
        obs, _ = env.reset()
        done = False
        total_reward = 0
        t = 0
        while not done:
            # Обработайте obs если планируете добавлять в него t
            augmented_obs = np.concatenate([
                obs,
                [np.sin(t / 10), np.cos(t / 10)]
            ]).astype(np.float32)

            obs_tensor = torch.tensor(augmented_obs, dtype=torch.float32).unsqueeze(0)
            action = policy.get_action(obs_tensor)
            next_obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            obs = next_obs
            total_reward += reward
            t += 1
        print(total_reward)

In [29]:
env = gym.make("MountainCar-v0", render_mode=None)
test_policy(env, policy)
env.close()

-200.0
-156.0
-138.0
-200.0
-189.0
-141.0
-117.0
-119.0
-186.0
-143.0
