# DQN

Задаем структуру аппроксимации $Q^\theta$, начальный вектор параметров $\theta$ и вероятность исследования среды $\varepsilon = 1$.

Для каждого эпизода $k$ делаем:

Пока эпизод не закончен делаем:

- Находясь в состоянии $S_t$, совершаем действие $A_t \sim \pi(\cdot|S_t)$, где $\pi = \varepsilon\text{-greedy}(Q^\theta)$, получаем награду $R_t$ и переходим в состояние $S_{t+1}$. Сохраняем $(S_t,A_t,R_t,S_{t+1}) \rightarrow Memory$


- Берем $\{(s_i,a_i,r_i,s'_i)\}_{i=1}^{n} \leftarrow Memory$, определяем целевые значения

$$
y_i =
\left\{
\begin{array}{ll}
r_i, &\text{ если } s'_i\text{ — терминальное},\\[0.0cm]
 r_i + \gamma \max\limits_{a'} Q^\theta(s'_i,a'), &\text{ иначе}
\end{array}
\right.
$$

функцию потерь $Loss(\theta) = \frac{1}{n}\sum\limits_{i=1}^n \big(y_i - Q^\theta(s_i,a_i)\big)^2$
и обновляем вектор параметров

$$
\theta \leftarrow \theta - \alpha \nabla_\theta Loss(\theta)
$$

- Уменьшаем $\varepsilon$


In [47]:
import numpy as np
import gymnasium as gym
import random
import torch
import torch.nn as nn

class Network(nn.Module):
    """Three-layer fully connected network with ReLU between the layers.

    Maps a tensor whose last dimension is ``input_dim`` to a tensor whose
    last dimension is ``output_dim``. No activation is applied to the
    output of the last layer.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output features.
        hidder_size: Width of both hidden layers.
    """

    def __init__(self, input_dim, output_dim, hidder_size=64):
        super().__init__()

        # Attribute names and creation order are part of the state_dict
        # layout and of seeded initialisation, so they are kept as is.
        self.linear_1 = nn.Linear(in_features=input_dim, out_features=hidder_size)
        self.linear_2 = nn.Linear(in_features=hidder_size, out_features=hidder_size)
        self.linear_3 = nn.Linear(in_features=hidder_size, out_features=output_dim)
        self.activation = nn.ReLU()

    def forward(self, input):
        """Run ``input`` through the three layers and return the raw output."""
        first_hidden = self.activation(self.linear_1(input))
        second_hidden = self.activation(self.linear_2(first_hidden))
        return self.linear_3(second_hidden)

In [89]:
class DQN:
    """Deep Q-learning agent with an epsilon-greedy policy and a replay memory.

    Args:
        state_dim: Number of features in a state vector (input size of the
            Q-network).
        action_n: Number of discrete actions (output size of the Q-network).
        epsilon_decrease: Amount subtracted from ``epsilon`` after every
            training step.
        gamma: Discount factor used in the bootstrap target.
        batch_size: Number of transitions sampled from memory per training
            step.
        lr: Learning rate of the Adam optimizer.
        epsilon_min: Lower bound for ``epsilon``.
    """

    def __init__(self, state_dim, action_n, epsilon_decrease, gamma=0.99, batch_size=128, lr=1e-3, epsilon_min=1e-2):
        self.state_dim = state_dim
        self.action_n = action_n
        self.q_model = Network(self.state_dim, self.action_n)
        self.epsilon_decrease = epsilon_decrease
        self.epsilon_min = epsilon_min
        # Exploration starts fully random and is annealed inside fit().
        self.epsilon = 1
        self.gamma = gamma
        self.batch_size = batch_size
        # Replay memory of [state, action, reward, done, next_state] records.
        # NOTE(review): the list is never trimmed, so it grows with every step.
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_model.parameters(), lr=lr)

    def get_action(self, state):
        """Sample an action from the epsilon-greedy policy for ``state``.

        Every action gets probability ``epsilon / action_n``; the action with
        the largest predicted Q-value additionally gets ``1 - epsilon``.

        Args:
            state: A single state, convertible to a float32 tensor.

        Returns:
            The sampled action index.
        """
        # Action selection never needs gradients.
        with torch.no_grad():
            state_tensor = torch.as_tensor(state, dtype=torch.float32)
            q_values = self.q_model(state_tensor).numpy()
        max_action = np.argmax(q_values)
        probs = np.ones(self.action_n) * self.epsilon / self.action_n
        probs[max_action] += 1 - self.epsilon
        return np.random.choice(np.arange(self.action_n), p=probs)

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once memory is large enough, train once.

        A training step samples ``batch_size`` transitions uniformly from
        memory, regresses ``Q(s, a)`` onto ``r + gamma * max_a' Q(s', a')``
        (just ``r`` when ``done`` is set) with a mean squared error loss, and
        then decreases ``epsilon`` by ``epsilon_decrease`` down to
        ``epsilon_min``.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            done: Truthy if ``next_state`` is terminal; disables bootstrapping.
            next_state: State reached after the action.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        # Training starts only when memory holds more than one batch.
        if len(self.memory) <= self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)

        # Explicit dtypes keep the batch consistent with the float32 network
        # even when the environment hands back float64 values.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32))
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))

        # The target is a constant for the regression: without no_grad the
        # loss would also push max_a' Q(s', a') towards Q(s, a).
        with torch.no_grad():
            next_q_max = self.q_model(next_states).max(dim=1).values
            targets = rewards + (1.0 - dones) * self.gamma * next_q_max

        # Pick Q(s_i, a_i) for each sampled transition.
        row_index = torch.arange(self.batch_size)
        q_values = self.q_model(states)[row_index, actions]
        loss = torch.mean((q_values - targets) ** 2)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_min)

In [90]:
# Train a DQN agent on CartPole-v1 and print the total reward of each episode.
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_n = env.action_space.n

agent = DQN(state_dim, action_n, epsilon_decrease=1e-4)

trajectory_n = 300  # number of training episodes
trajectory_len = 500  # maximum number of steps per episode

for trajectory in range(trajectory_n):
    total_reward = 0
    state, _ = env.reset()
    for t in range(trajectory_len):
        action = agent.get_action(state)
        # env.step returns (observation, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward

        # Only a real terminal state disables bootstrapping in the target;
        # a time-limit truncation does not make next_state terminal.
        agent.fit(state, action, reward, terminated, next_state)

        state = next_state

        # The episode is over in both cases and the env must be reset.
        if terminated or truncated:
            break

    print(f'trajectory: {trajectory}, total_reward {total_reward}')

trajectory: 0, total_reward 11.0
trajectory: 1, total_reward 36.0
trajectory: 2, total_reward 38.0
trajectory: 3, total_reward 20.0
trajectory: 4, total_reward 16.0
trajectory: 5, total_reward 28.0
trajectory: 6, total_reward 29.0
trajectory: 7, total_reward 22.0
trajectory: 8, total_reward 25.0
trajectory: 9, total_reward 15.0
trajectory: 10, total_reward 20.0
trajectory: 11, total_reward 24.0
trajectory: 12, total_reward 34.0
trajectory: 13, total_reward 15.0
trajectory: 14, total_reward 25.0
trajectory: 15, total_reward 12.0
trajectory: 16, total_reward 22.0
trajectory: 17, total_reward 16.0
trajectory: 18, total_reward 13.0
trajectory: 19, total_reward 17.0
trajectory: 20, total_reward 25.0
trajectory: 21, total_reward 74.0
trajectory: 22, total_reward 11.0
trajectory: 23, total_reward 16.0
trajectory: 24, total_reward 12.0
trajectory: 25, total_reward 138.0
trajectory: 26, total_reward 29.0
trajectory: 27, total_reward 11.0
trajectory: 28, total_reward 14.0
trajectory: 29, total_r