## DQN sobre ambientes de Classic Control

https://gymnasium.farama.org/environments/classic_control/

### Imports

In [1]:
# NOTE(review): none of these installs pin a version; the recorded run resolved gymnasium 0.29.1.
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install pyvirtualdisplay -q

# Start by fetching the associated .py files (git clones them into ./TIA)
!git clone https://github.com/javiernunez1991/TIA.git

Collecting gymnasium[atari]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[atari])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting shimmy[atari]<1.0,>=0.1.0 (from gymnasium[atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl (25 kB)
Collecting ale-py~=0.8.1 (from shimmy[atari]<1.0,>=0.1.0->gymnasium[atari])
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: farama-notifications, gymnasium, ale-py, shimmy
Successfully installed ale-py-0.8.1 farama-notifications-0.0.4 gymnasium-0.29.1 shimmy-0.2.1
Collecting autorom[accept-rom-license]~=0.4.2 (from gymnasium[accept-rom-license])
  Dow

In [2]:
import torch
import numpy as np
import random
import numpy as np  # NOTE(review): duplicate of the numpy import two lines up; harmless but redundant
import gymnasium
import os
# Switch the working directory to the course folder inside the cloned repo.
# Presumably this is what lets the later `dqn_cnn_model` / `dqn_agent` imports resolve — TODO confirm.
# NOTE(review): the absolute /content/... path looks Colab-specific and will fail elsewhere.
os.chdir('/content/TIA/Obligatorio')

### Seteamos los devices

In [3]:
# Query CUDA availability once and reuse the answer for both the device
# choice and the diagnostic output.
has_cuda = torch.cuda.is_available()
if has_cuda:
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')
print(f"Running on {DEVICE}")
print("Cuda Available:", has_cuda)

Running on cuda:0
Cuda Available: True


  and should_run_async(code)


### Seteo de seeds
Siempre es buena práctica hacer el seteo de seeds para la reproducibilidad de los experimentos

In [4]:
# Single source of truth for every RNG seeded in this notebook.
SEED = 42

random.seed(SEED)        # Python's built-in RNG
np.random.seed(SEED)     # NumPy's legacy global RNG
torch.manual_seed(SEED)  # PyTorch RNG

# Ask cuDNN for deterministic kernels and keep its auto-tuner off: with
# benchmarking enabled cuDNN may pick different algorithms between runs,
# which defeats the deterministic flag.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Creamos el ambiente y probamos algunas de sus funciones.

En este caso elegimos el CartPole pero pueden cambiarlo en la variable *ENV_NAME*.
El ambiente CartPole tiene la ventaja de que las recompensas son positivas y es más fácil propagarlas hacia los estados iniciales. Mountain Car tiene como recompensa -1 por cada paso que damos y está limitado a 200 pasos.

In [5]:
# Supported environment ids; ENV_NAME selects which one the rest of the notebook uses.
ENVS = ["MountainCar-v0", "CartPole-v1"]
ENV_NAME = ENVS[1]

# Probe instance used only to inspect the spaces and the values returned by step().
env = gymnasium.make(ENV_NAME, render_mode="rgb_array")

print(f"# Actions: {env.action_space}")
print(f"Observation_Space: {env.observation_space.shape}")
# reset() is called before the first step(); its return value is discarded here.
env.reset()
next_state, reward, terminated, truncated, info = env.step(action=0)

print(f"Next_state shape: {next_state.shape}, Reward: {reward}, Terminated: {terminated}, Info: {info}")

# Actions: Discrete(2)
Observation_Space: (4,)
Next_state shape: (4,), Reward: 1.0, Terminated: False, Info: {}


### Seteamos los hiperparámetros

In [6]:
def process_state(obs, device):
    """Convert a single environment observation into a batched float32 tensor.

    Args:
        obs: Observation returned by the environment (array-like of numbers).
        device: Torch device (or device string) on which the tensor is created.

    Returns:
        A ``torch.float32`` tensor on ``device`` with a leading batch axis of
        size 1, i.e. shape ``(1, *obs.shape)``.
    """
    # Force float32: without an explicit dtype the tensor inherits the dtype of
    # `obs`, so float64 observations would yield a double tensor that does not
    # match the default float32 parameters of torch.nn layers.
    return torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)

# Training hyperparameters for the DQN agent.
# TOTAL_STEPS, EPISODES and STEPS are the positional arguments of agent.train();
# presumably a global step budget, the episode count and the per-episode step cap — TODO confirm in dqn_agent.
TOTAL_STEPS = 1_000_000
EPISODES = 1500
STEPS = 200

# Exploration schedule, passed to DQNAgent as epsilon_i / epsilon_f / epsilon_decay / epsilon_anneal_time.
EPSILON_INI = 1
EPSILON_MIN = 0.1
# Decrement that takes epsilon from EPSILON_INI to EPSILON_MIN in exactly STEPS equal reductions.
EPSILON_DECAY = (EPSILON_INI - EPSILON_MIN) / STEPS
# Passed to the agent as episode_block and reused as the averaging window of the reward plot.
EPISODE_BLOCK = 20
# NOTE(review): both EPSILON_DECAY and EPSILON_TIME are handed to the agent; which one drives the schedule is not visible here.
EPSILON_TIME = 100_000

# Forwarded positionally to DQNAgent (buffer size first, then batch size).
BATCH_SIZE = 128
BUFFER_SIZE = 10_000

# Forwarded positionally to DQNAgent (learning rate first, then gamma).
GAMMA = 0.999
LEARNING_RATE = 1e-4

### Creamos el ambiente que vamos a estar usando para el entrenamiento

In [7]:
# Environment instance handed to the agent; created without a render_mode.
env = gymnasium.make(ENV_NAME)
# Sizes are read from the spaces: length of the first observation axis, and the action_space's `n`.
# Assumes a flat vector observation and a Discrete action space (both hold in the recorded CartPole run).
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

print(f"Input dim: {input_dim}, Output dim: {output_dim}")

Input dim: 4, Output dim: 2


### Definimos nuestra red que vamos a usar como función de aproximación para el aprendizaje

In [8]:
# Alternative model module, currently disabled:
#from dqn_model import DQN_Model
# NOTE(review): the CNN variant is imported although the recorded observations are flat vectors
# (input_dim = 4), and the recorded training run aborted with a RuntimeError — confirm whether
# dqn_model.DQN_Model was the intended network here.
from dqn_cnn_model import DQN_Model
# Build the network from the env-derived sizes and move it to the selected device.
net = DQN_Model(input_dim, output_dim).to(DEVICE)

### Creamos el agente con los hiperparámetros y la red

In [9]:
from dqn_agent import DQNAgent

# Assemble the DQN agent from the environment, the network, the state
# pre-processing function and the hyperparameter constants.
agent = DQNAgent(
    env,
    net,
    process_state,
    BUFFER_SIZE,
    BATCH_SIZE,
    LEARNING_RATE,
    GAMMA,
    epsilon_i=EPSILON_INI,
    epsilon_f=EPSILON_MIN,
    epsilon_anneal_time=EPSILON_TIME,
    epsilon_decay=EPSILON_DECAY,
    episode_block=EPISODE_BLOCK,
    device=DEVICE,
)

### Entrenamos a nuestro agente!

In [10]:
# Run training; train() returns two values, unpacked here as `rewards` and `wins`.
# NOTE(review): the recorded run of this cell aborted with
# "RuntimeError: Index tensor must have the same number of dimensions as input tensor",
# so in that session `rewards` and `wins` were never assigned.
rewards, wins = agent.train(EPISODES, STEPS, TOTAL_STEPS, writer_name = ENV_NAME)

  1%|          | 8/1500 [00:00<01:13, 20.30 episodes/s]

Episode 0: Avg. Reward 20.0 over the last 20 episodes - Epsilon 0.9145 - TotalSteps 20





RuntimeError: Index tensor must have the same number of dimensions as input tensor

### Graficamos las recompensas obtenidas durante el entrenamiento

In [None]:
import matplotlib.pyplot as plt

# Average the per-episode rewards over consecutive blocks of EPISODE_BLOCK episodes.
average_range = EPISODE_BLOCK
episode_ticks = len(rewards) // average_range

# Keep only the episodes that fill whole blocks: reshape() raises ValueError
# when the number of rewards is not an exact multiple of the block size.
usable_episodes = episode_ticks * average_range
avg_rewards = np.asarray(rewards)[:usable_episodes].reshape((episode_ticks, average_range))
avg_rewards = np.mean(avg_rewards, axis=1)

plt.plot(range(len(avg_rewards)), avg_rewards)
plt.xlabel(f"Block of {average_range} episodes")
plt.ylabel("Average reward")
plt.show()

### Creamos un video para ver la performance del agente

In [None]:
import glob
from gymnasium.wrappers.record_video import RecordVideo
from IPython.display import HTML
from IPython import display as ipythondisplay
import io
import base64

def show_video():
  """
  Display the first MP4 found in ./videos inline in the notebook.

  The file's bytes are base64-encoded and embedded as a data URI inside an
  HTML <video> element, which is then rendered through IPython's display.
  Prints "Could not find video" when ./videos contains no .mp4 file.
  """
  mp4list = glob.glob('./videos/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the handle is closed
    # deterministically and no write access to the recording is requested.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")


def wrap_env(env):
  """
  Wrap the environment in a RecordVideo wrapper configured with the
  './videos' folder, and return the wrapped environment.
  """
  return RecordVideo(env, video_folder='./videos')

In [None]:
env = wrap_env(gymnasium.make(ENV_NAME, render_mode="rgb_array"))
observation, _ = env.reset()

# Roll out a single episode with the agent called in non-training mode
# (train=False), stopping as soon as the environment reports the episode
# as finished or truncated.
done = truncated = False
while not (done or truncated):
    env.render()

    action = agent.select_action(process_state(observation, DEVICE), train=False)
    observation, reward, done, truncated, info = env.step(action)

# Close the recording wrapper, then show the video.
env.close()
show_video()

del env