In [1]:
import gymnasium as gym  # Changed from 'gym' to 'gymnasium'

# Create the environment
env = gym.make('LunarLander-v3')  # continuous: LunarLanderContinuous-v2

# required before you can step the environment
observation, info = env.reset()  # Updated: reset() now returns tuple

# sample action:
print("sample action:", env.action_space.sample())

# observation space shape:
print("observation space shape:", env.observation_space.shape)

# sample observation:
print("sample observation:", env.observation_space.sample())

env.close()

sample action: 0
observation space shape: (8,)
sample observation: [ 2.4049616   1.8248163  -8.629089    6.696082   -0.12193996 -2.0803773
  0.6859614   0.6054613 ]


In [15]:
import gymnasium as gym

env = gym.make('LunarLander-v3', render_mode='human')
obs, info = env.reset()

try:
    for step in range(200):
        env.render()
        # take random action
        obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
        done = terminated or truncated
        print(f"Step {step}: Reward={reward}, Done={done}")
        
        if done:
            obs, info = env.reset()
            
except KeyboardInterrupt:
    print("Interrupted by user")
finally:
    env.close()

Step 0: Reward=-0.40717869506275406, Done=False
Step 1: Reward=-0.19706684651795853, Done=False
Step 2: Reward=0.8533328842884498, Done=False
Step 3: Reward=-1.3179326246312246, Done=False
Step 4: Reward=-1.5470044972002472, Done=False
Step 5: Reward=-1.7876319705590789, Done=False
Step 6: Reward=-2.1083961224679784, Done=False
Step 7: Reward=0.28614311542614357, Done=False
Step 8: Reward=-1.856675606041448, Done=False
Step 9: Reward=0.3964376208267993, Done=False
Step 10: Reward=-1.1142082392185841, Done=False
Step 11: Reward=-1.1987908460930328, Done=False
Step 12: Reward=-1.2785143338226135, Done=False
Step 13: Reward=-0.45158900364006516, Done=False
Step 14: Reward=-2.2972641832171505, Done=False
Step 15: Reward=-2.47830303590777, Done=False
Step 16: Reward=-1.7009803032414936, Done=False
Step 17: Reward=-1.746115057884765, Done=False
Step 18: Reward=-2.6585736764500028, Done=False
Step 19: Reward=-1.9566455498447226, Done=False
Step 20: Reward=-1.055200515866477, Done=False
Step 2

In [4]:
import os 

logdir = "logs"

if not os.path.exists(logdir):
    os.makedirs(logdir)

In [21]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

vec_env = make_vec_env('LunarLander-v3', n_envs = 1)

model = PPO('MlpPolicy',vec_env,verbose=1)
model.learn(total_timesteps = 10000 )
model.save('PPO_lunarlander')
vec_env.close()

print("\n testing the trained agent")
env = gym.make('LunarLander-v3',render_mode = 'human')

for ep in range(10):
    obs,info = env.reset()
    total_reward = 0
    done= False
    while not done: 
        action , _states = model.predict(obs,deterministic = True)
        obs,reward,terminated,truncated,info = env.step(action)
        done = terminated or truncated
        total_reward += reward
    print(f"episode {ep +1} finished with reward : { total_reward }")  
env.close()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 86.5     |
|    ep_rew_mean     | -159     |
| time/              |          |
|    fps             | 988      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 88.5        |
|    ep_rew_mean          | -162        |
| time/                   |             |
|    fps                  | 824         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005619606 |
|    clip_fraction        | 0.0162      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | 0.00535     |
|    learning

In [22]:
import os
models_dir = "models/PPO"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)


In [24]:
import gymnasium as gym
from stable_baselines3 import PPO
import os


env = gym.make('LunarLander-v3') 
env.reset()

model = PPO('MlpPolicy', env, verbose=1)

TIMESTEPS = 10000
iters = 0
while True:
    iters += 1
    
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False)
    model.save(f"{models_dir}/{TIMESTEPS*iters}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 88.8     |
|    ep_rew_mean     | -171     |
| time/              |          |
|    fps             | 664      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 91          |
|    ep_rew_mean          | -204        |
| time/                   |             |
|    fps                  | 553         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006169917 |
|    clip_fraction        | 0.0288      |
|    clip_range           | 0.2         |
|    entropy_loss   

KeyboardInterrupt: 

In [33]:
import gymnasium as gym
from stable_baselines3 import PPO

models_dir = "models/PPO"

env = gym.make('LunarLander-v3',render_mode="human")  # continuous: LunarLanderContinuous-v2
env.reset()

model_path = f"{models_dir}/240000.zip"
model = PPO.load(model_path, env=env)

episodes = 5

for ep in range(episodes):
    total =0
    obs,info = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs,deterministic = True)
        obs, rewards, terminated,truncated, info = env.step(action)
        done = terminated or truncated 
        env.render()
        print(rewards)
env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
0.5999507216968436
0.45694568721535345
0.8363496324682
1.1914157751446635
1.3752882797050188
1.4142867012528495
1.3470088758208487
0.801802695745381
-1.174916570951325
-1.2288331171215248
-1.278831824977999
-1.9901152731917466
-0.6639282631234937
0.7205669083087229
0.6845570905518172
-2.2096114138067535
1.9622833726961175
1.15493355610526
-1.6554911550567215
-0.7343020453646603
0.006772358611709717
-0.16452983132647886
-1.5914552115836773
0.48943763716733885
-1.3366338349881335
-0.9453894565840983
1.4842812648632957
-0.700126335783466
1.4369589553483706
1.722187944605747
-1.3599340706284682
-0.3602438468694686
2.5575424632590513
0.26812313216973505
-1.3509660931795782
1.6715033494767624
4.055476924633024
-0.0341640632093447
-1.4116539593330788
2.608932077124945
0.17791385999518752
-1.5504550670582364
0.612165700490874
1.3341608001972645
1.3794278262222803
1.6806501519926769
-0.08053455286115535
3.0948531247939

In [2]:
!pip install tensorboard



In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
import os

models_dir = "models/PPO"
logdir = "logs"

if not os.path.exists(models_dir):
    os.makedirs(models_dir)

if not os.path.exists(logdir):
    os.makedirs(logdir)

env = gym.make('LunarLander-v3')  # Fixed: v2 not v3
env.reset()

model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

TIMESTEPS = 10000
for i in range(1, 31):  # Fixed: range(1, 31) so we get proper numbering
    print(f"\n=== Training iteration {i}/30 ===")
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
    model.save(f"{models_dir}/{TIMESTEPS*i}")  # Fixed: use i not the loop variable
    print(f"Model saved to {models_dir}/{TIMESTEPS*i}")

env.close()
print("\n✅ Training complete!")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

=== Training iteration 1/30 ===
Logging to logs\PPO_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.9     |
|    ep_rew_mean     | -175     |
| time/              |          |
|    fps             | 307      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 90         |
|    ep_rew_mean          | -171       |
| time/                   |            |
|    fps                  | 233        |
|    iterations           | 2          |
|    time_elapsed         | 17         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00723111 |
|    clip_fraction        | 0.0324     |
|    clip_range    

In [2]:
import gymnasium as gym
from stable_baselines3 import A2C
import os
import torch

# Check if GPU is available
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    device = "cuda"
else:
    print("Using CPU")
    device = "cpu"

models_dir = "models/A2C"
logdir = "logs"
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

env = gym.make('LunarLander-v3')  # Fixed: v2 not v3
env.reset()

# Add device parameter
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=logdir, device=device)

TIMESTEPS = 10000
for i in range(1, 31):  # Fixed: range(1, 31)
    print(f"\nIteration {i}/30")
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="A2C")
    model.save(f"{models_dir}/{TIMESTEPS*i}")

env.close()

CUDA Available: False
Using CPU
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

Iteration 1/30
Logging to logs\A2C_0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 86.4     |
|    ep_rew_mean        | -157     |
| time/                 |          |
|    fps                | 145      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.08    |
|    explained_variance | 0.00934  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -7.15    |
|    value_loss         | 103      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 104      |
|    ep_rew_mean        | -233     |
| time/                 |          |
|    fps              