In [1]:
import os

import numpy as np
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.logger import configure
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, StopTrainingOnRewardThreshold


from src.drone_env import DroneEnv
from src.utils import read_config
from src.monitor import Monitor
from src.logger_callback import LoggerCallback

# Output locations used by the rest of the notebook.
log_dir = "logs/"
save_path = os.path.join('training', 'saved_models')
log_path = os.path.join('training', 'logs')

# Make sure the logger's target folder exists before configuring it.
os.makedirs(log_dir, exist_ok=True)

# Logger that reports to stdout and to TensorBoard under log_dir.
logger = configure(folder=log_dir, format_strings=["stdout", "tensorboard"])

# Project settings passed to DroneEnv and Monitor in later cells.
config = read_config("config.yaml")

def print_state(state):
    """Print the entries of ``state`` formatted to four decimal places.

    Entries that are not below zero get one trailing space, which makes up
    for the minus sign carried by negative entries.
    """
    formatted = []
    for value in state:
        text = f"{value:.4f}"
        if not value < 0:
            # Pad the slot that a minus sign would otherwise occupy.
            text += " "
        formatted.append(text)
    print("State: ", formatted)

Logging to logs/


In [None]:
# Smoke-test the environment by driving it with random actions.
env = DroneEnv(config, render_mode="human", max_episode_steps=1000)
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # reset() returns (observation, info), exactly as it is unpacked in
        # the trained-policy rollout cell at the end of this notebook.
        state, _ = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action = env.action_space.sample()  # Random action from the action space
            # step() returns five values; the episode is over when it either
            # terminates or is truncated (e.g. by the step limit).
            n_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            score += reward
        print('Episode:{} Score:{}'.format(episode, round(score, 2)))
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

In [None]:
# Release the environment currently bound to `env` by an earlier cell.
# NOTE(review): looks like a manual cleanup cell for when the rollout above is
# interrupted before its own env.close() runs — confirm, and drop if unneeded.
env.close()

In [None]:
num_envs = 16  # Number of parallel environments
env_fns = [lambda: DroneEnv(config) for _ in range(num_envs)]
env = DummyVecEnv(env_fns)

check_env(env.envs[0], warn=True)  # Checking only the first instance for compatibility

# Evaluate on a dedicated environment. Running evaluation episodes on the
# training VecEnv resets it in the middle of data collection, which leaves the
# learner holding stale observations.
eval_env = DummyVecEnv([lambda: DroneEnv(config)])

# Stop training once an evaluation reaches the reward threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=1000, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             # eval_freq is counted in callback calls and every
                             # call advances num_envs timesteps, so divide to
                             # evaluate roughly every 10,000 timesteps.
                             eval_freq=max(10000 // num_envs, 1),
                             best_model_save_path=save_path,
                             verbose=1)

monitor = Monitor(config)
monitor.update_plot()
# NOTE: this rebinds `logger` (previously the object returned by configure()).
logger = LoggerCallback(monitor=monitor)

callbacks = [eval_callback, logger]

# Build the agent only; this cell does not start training.
model = DQN('MlpPolicy', env, tensorboard_log=log_dir)

In [None]:
num_envs = 16  # Number of parallel environments
env_fns = [lambda: DroneEnv(config) for _ in range(num_envs)]
env = DummyVecEnv(env_fns)

check_env(env.envs[0], warn=True)  # Checking only the first instance for compatibility

# Evaluate on a dedicated environment. Running evaluation episodes on the
# training VecEnv resets it in the middle of data collection, which leaves the
# learner holding stale observations.
eval_env = DummyVecEnv([lambda: DroneEnv(config)])

# Stop training once an evaluation reaches the reward threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=1000, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             # eval_freq is counted in callback calls and every
                             # call advances num_envs timesteps, so divide to
                             # evaluate roughly every 10,000 timesteps.
                             eval_freq=max(10000 // num_envs, 1),
                             best_model_save_path=save_path,
                             verbose=1)

monitor = Monitor(config)
monitor.update_plot()
# NOTE: this rebinds `logger` (previously the object returned by configure()).
logger = LoggerCallback(monitor=monitor)

callbacks = [eval_callback, logger]

# Build the agent and train it with the evaluation and logging callbacks.
model = DQN('MlpPolicy', env, tensorboard_log=log_dir)
model.learn(total_timesteps=1000000, progress_bar=True, callback=callbacks)

In [None]:
# Store the trained model under save_path ('training/saved_models'), the same
# directory the loading cells read from. The previous 'Training/Saved Models'
# target is a different directory on case-sensitive filesystems.
model.save(os.path.join(save_path, 'DQN_model_1M'))

In [None]:
# Rendered environment for watching a trained agent.
env = DroneEnv(config, render_mode="human", max_episode_steps=1000)

# Load the PPO checkpoint and bind it to the environment above.
# To load the DQN checkpoint instead:
# model = DQN.load(os.path.join('training', 'saved_models', 'DQN_model_1M'), env=env)
checkpoint = os.path.join('training', 'saved_models', 'PPO_model_0.5m')
model = PPO.load(checkpoint, env=env)


In [None]:
# evaluate_policy returns (mean_reward, std_reward). Keep and report the result
# rather than discarding it; as a non-final expression it was never displayed.
try:
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, render=True)
    print('Mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))
finally:
    # Release the render window even if evaluation fails or is interrupted.
    env.close()

In [2]:
# Watch the trained PPO policy fly a few rendered episodes.
env = DroneEnv(config, render_mode="human", max_episode_steps=1000)
model = PPO.load(os.path.join('training', 'saved_models', 'PPO_model_0.5m'), env=env)
episodes = 5
try:
    for episode in range(1, episodes + 1):
        obs, _ = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action, _ = model.predict(obs)
            # The fourth value returned by step() is the truncation flag. End
            # the episode on either signal so a truncated episode is not
            # stepped further without a reset.
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            score += reward
        print('Episode:{} Score:{}'.format(episode, round(score, 2)))
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

Episode:1 Score:208.62
Episode:2 Score:120.81
Episode:3 Score:43.38
Episode:4 Score:105.27
Episode:5 Score:45.97
