In [1]:
import gym
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.dqn import DQNTrainer

import pygame
import numpy as np

import torch
from IPython.display import clear_output
from IPython import display
import random
import matplotlib.pylab as plt
import copy
import time

from typing import List, Optional
from gym.envs.toy_text.frozen_lake import generate_random_map

pygame 2.1.2 (SDL 2.0.16, Python 3.10.4)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
class WrappedFrozenLake(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.old = 0
        self.desc = self.env.desc
        self.size = len(self.desc)
        self.observation_space = gym.spaces.Box(0,1,shape=(3,self.size,self.size,),dtype="float32")
        self.action_space = self.env.action_space
        
    def oneHot(self,s):
        x = np.zeros(self.size*self.size)
        x[s] = 1
        state_ = np.array([ x.reshape(self.size,self.size),
                         np.array(self.env.desc == b"F").astype("float32"),
                         np.array(self.env.desc == b"G").astype("float32")
                          ])
        return state_.reshape(3,self.size,self.size,)

    def reset(self):
        # return self.oneHot(1)
        return self.oneHot(self.env.reset())
    
    def step(self, action):
        obs, r, done, info = self.env.step(action)
        
        if done:
            reward = 2 if r > 0 else -1
        elif obs == self.old:
            reward = -1
        else:
            reward = 0
        self.old = obs
        return self.oneHot(obs), reward, done, info

from ray.tune.registry import register_env

def env_creator(env_config): 
    size = env_config['size']
    numHoles = env_config['numHoles']
    p = 1-numHoles/(size**2)
    desc = generate_random_map(size=size, p=p)
    return WrappedFrozenLake(gym.make('FrozenLake-v1', desc = desc, is_slippery=False))  # return an env instance

register_env("myenv", env_creator)

In [3]:
env = env_creator(env_config = {'size':6, 'numHoles': 6})
env.reset()
done = False

while not done:
    # plt.cla()
    display.clear_output(wait=True)

    obs, reward, done, info = env.step(env.action_space.sample())
    env.render()
    time.sleep(0.1)
    # display.clear_output(wait=True)
    # display.display(plt.gcf())
    # plt.gcf()

  (Down)
SFFFFF
FFFFFF
HFFFFF
F[41mH[0mHFFF
FFFFFH
HFFFFG


In [4]:
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.framework import try_import_tf, try_import_torch

torch, nn = try_import_torch()

class MyTorchModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        """Build a simple [16, 16]-MLP (+ value branch)."""
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        self.device = torch.device("cpu")#"cuda"
        #                            if torch.cuda.is_available() else "cpu")

        self.mainLayer = nn.Sequential(
                # nn.Conv2d(3,12,kernel_size=3,stride= 1,padding=1),
                # nn.ReLU(),
                # nn.Conv2d(12,24,kernel_size=3,stride=1,padding=1),
                # nn.ReLU(),
                # nn.Conv2d(24,36,kernel_size=3,stride=1,padding=1),
                # nn.ReLU(),            
                nn.Flatten(start_dim=-3,end_dim=-1),
                nn.Linear(108,256),
                nn.ReLU(),
                nn.Linear(256,256),
                nn.ReLU(),
        ).to(self.device)
        
        # Action logits output.
        self.layer_out = nn.Linear(256, num_outputs).to(self.device)

        # "Value"-branch (single node output).
        # Used by several RLlib algorithms (e.g. PPO) to calculate an observation's value.
        self.value_branch = nn.Linear(256, 1).to(self.device)
        self.cur_value = None

    def forward(self, input_dict, state, seq_lens):
        """Custom-define your forard pass logic here."""
        # Pass inputs through our 2 layers.
        layer_1_out = self.mainLayer(input_dict["obs"])
        logits = self.layer_out(layer_1_out)

        # Calculate the "value" of the observation and store it for
        # when `value_function` is called.
        self.cur_value = self.value_branch(layer_1_out).squeeze(-1)

        return logits, state

    def value_function(self):
        """Implement the value branch forward pass logic here:
        
        We will just return the already calculated `self.cur_value`.
        """
        assert self.cur_value is not None, "Must call `forward()` first!"
        return self.cur_value

In [5]:
size = 6
test_model_torch = MyTorchModel(
   obs_space=gym.spaces.Box(0,1,shape=(3,size,size), dtype=np.float32),
   action_space=gym.spaces.Discrete(4),
   num_outputs=4,
   model_config={},
   name="MyModel",
)
#print("Torch-output={}".format(test_model_torch({"obs": torch.from_numpy(np.array([[0.5, 0.5]], dtype=np.float32))})))

obs = gym.spaces.Box(0,1,shape=(3,size,size)).sample()
test_model_torch({"obs": torch.from_numpy(obs)})

(tensor([ 0.0319,  0.0357,  0.1205, -0.0275], grad_fn=<AddBackward0>), [])

In [6]:
config = {
        "framework": "torch",
        "env":"myenv",  
        "env_config":{'size':6, 'numHoles': 6},
#        "num_workers": 6,
          "model": {
             "custom_model": MyTorchModel,  # for torch users: "custom_model": MyTorchModel
             "custom_model_config": {},
          },
 #       'num_envs_per_worker': 1,
        "create_env_on_driver": True,
}

In [None]:
from ray import tune
tune_config = config.copy()
tune_config["lr"] = tune.grid_search([0.0001])  # <- 0.5? again: ouch!

stop = {
    "training_iteration": 50,
    "episode_reward_mean": 1.95,
}

analysis =  tune.run(
    "PPO",
    config=tune_config,
    stop=stop,
    checkpoint_at_end=True,  
    checkpoint_freq=5,  
    local_dir="checkPoints/"
)

In [8]:
# The previous tune.run (the one we did before the exercise) returned an Analysis object, from which we can access any checkpoint
# (given we set checkpoint_freq or checkpoint_at_end to reasonable values) like so:
print(analysis)
# Get all trials (we only have one).
trials = analysis.trials
# Assuming, the first trial was the best, we'd like to extract this trial's best checkpoint "":
best_checkpoint = analysis.get_best_checkpoint(trial=trials[0], metric="episode_reward_mean", mode="max")
print(f"Found best checkpoint for trial #2: {best_checkpoint}")


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis object at 0x7f34f88079a0>
Found best checkpoint for trial #2: /home/ajit.kumar@SNU.IN/Desktop/myRLcases/frozenLake-rlLib/checkPoints/PPO/PPO_myenv_5306c_00000_0_lr=0.0001_2022-07-16_14-27-30/checkpoint_000018/checkpoint-18


In [9]:
rllib_config = tune_config.copy()
rllib_config["lr"] = 0.0001
#rllib_config["train_batch_size"] = 4159
#rllib_config["explore"] = False

# Restore a RLlib Trainer from the checkpoint.
new_trainer = PPOTrainer(config=rllib_config)
new_trainer.restore(best_checkpoint)

2022-07-16 14:30:52,599	INFO ppo.py:414 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-07-16 14:30:52,600	INFO trainer.py:903 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2022-07-16 14:30:52,716	INFO trainable.py:588 -- Restored on 10.13.62.8 from checkpoint: /home/ajit.kumar@SNU.IN/Desktop/myRLcases/frozenLake-rlLib/checkPoints/PPO/PPO_myenv_5306c_00000_0_lr=0.0001_2022-07-16_14-27-30/checkpoint_000018/checkpoint-18
2022-07-16 14:30:52,717	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 18, '_timesteps_total': None, '_time_total': 86.2672049999237, '_episodes_total': 7718}


In [None]:
new_trainer.evaluate()

In [11]:
won = 0
lost = 0
rewardList = []

for i in range(100):
    env = env_creator({'size':6, 'numHoles': 6}) #gym.make("myenv-v1")#"FrozenLake-v1")
    #, #FrozenLakeWrapped({})
    # Get the initial observation (should be: [0.0] for the starting position).
    obs = env.reset()
    done = False
    total_reward = 0.0
    # Play one episode.
    while not done:
        # plt.cla()
#        display.clear_output(wait=True)
        # print("num won: ", won, " played: ", i, "total reward: ", total_reward)
        # env.render()

        action = new_trainer.compute_single_action(obs,explore=False)
        obs, reward, done, info = env.step(action)
        #print(obs, reward, done)
        total_reward += reward
        # time.sleep(0.1)
        
        # display.display(plt.gcf())
        # plt.gcf()
        if done: 
            rewardList.append(total_reward)
    if reward > 0 :
        won +=1 
      
    
#print(rewardList, np.mean(rewardList))
print( np.mean(rewardList))

-0.46


In [19]:
env = env_creator({'size':6, 'numHoles': 6})# gym.make("FrozenLake-v1")
obs = env.reset()
done = False
total_reward = 0.0
# Play one episode.
while not done:
    display.clear_output(wait=True)
    obs, reward, done, _ = env.step(new_trainer.compute_single_action(obs,explore=False))
    env.render()
    time.sleep(0.1)
    if done: 
        rewardList.append(total_reward)
if reward > 0 :
    won +=1 

  (Right)
SFHFFF
FF[41mH[0mFFF
FHFFFF
FFHHFF
FFFHFF
HFFFFG
