In [None]:
import os
from datetime import datetime

import numpy as np
import torch
import wandb
from tqdm import trange

import scipy.io as sio

#rllib with ray is the primary framework that will be used for imitation learning 
from ray import air, tune, rllib

#RL option after imitation learning is done
from agilerl.algorithms.ppo import PPO
from agilerl.training.train_on_policy import train_on_policy
from agilerl.utils.utils import create_population, make_skill_vect_envs, make_vect_envs
from agilerl.wrappers.learning import Skill

from ray.rllib.algorithms.bc import BCConfig

In [None]:
# Load the data from MATLAB .mat files via scipy.io.
# NOTE(review): both calls pass no arguments; scipy.io.loadmat expects the path of
# the .mat file as its first argument, so the file names still have to be filled in.
# TODO confirm what `x` and `y` should hold (presumably the images and the expert RDMs).

x = sio.loadmat()
y = sio.loadmat()

In [None]:

import numpy as np
import gym
from gym import spaces
from ray.rllib.env.multi_agent_env import MultiAgentEnv

class CichyEnv(MultiAgentEnv):
    """Two-agent environment ("IT" and "EVC") that walks through a sequence of images.

    On every step both agents observe the current image together with the
    signal component (index 1) of the other agent's previous action, and are
    rewarded by how closely their action matches the expert RDM entry for the
    current step.

    Args:
        images: Indexable sequence of images; entry ``i`` is shown at step ``i``.
            Assumed to match the declared (224, 224, 3) uint8 image space --
            TODO confirm against the loaded data.
        expert_rdms: Mapping from agent id ("IT" / "EVC") to a per-step
            indexable of expert values that actions are compared against.

    NOTE(review): ``reset`` returns only the observation dict and ``step``
    returns a 4-tuple (old gym-style API). Check that this matches the API the
    installed RLlib version expects.
    """

    def __init__(self, images, expert_rdms):
        # Initialise the RLlib base class before adding our own attributes.
        super().__init__()

        self.images = images
        self.expert_rdms = expert_rdms
        self.num_agents = 2
        self.agent_ids = ["IT", "EVC"]

        self.observation_space = spaces.Dict({
            "image": spaces.Box(low=0, high=255, shape=(224, 224, 3), dtype=np.uint8),
            "other_action": spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        })

        # Continuous: 1. expending activity units 2. sending signal
        self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

    def reset(self):
        """Rewind to the first image and return the initial per-agent observations.

        Returns:
            Dict mapping each agent id to its observation dict. The
            "other_action" entry starts as a zero signal.
        """
        self.current_step = 0
        self.state = {
            agent: {
                "image": self.images[self.current_step],
                # Shape (1,) float32 so the value matches the declared Box space.
                "other_action": np.zeros(1, dtype=np.float32),
            }
            for agent in self.agent_ids
        }
        return self.state

    def step(self, action_dict):
        """Advance one image and compute observations and rewards for both agents.

        Args:
            action_dict: Mapping from agent id to that agent's action; index 1
                of each action is the signal shown to the other agent.

        Returns:
            Tuple ``(obs, rewards, dones, infos)`` of dicts keyed by agent id.
            ``dones`` additionally holds the "__all__" key.

        Raises:
            IndexError: If called after the last image has already been reached.
        """
        obs, rewards, dones, infos = {}, {}, {}, {}
        self.current_step += 1

        # Fail with a clear message instead of an opaque index error further down.
        if self.current_step >= len(self.images):
            raise IndexError(
                "step() called past the last image; call reset() before stepping again"
            )

        for agent_id in self.agent_ids:
            other_agent_id = "EVC" if agent_id == "IT" else "IT"
            # Wrap the scalar signal into a (1,) float32 array so the observation
            # conforms to the "other_action" Box space.
            other_signal = np.asarray(
                action_dict[other_agent_id][1], dtype=np.float32
            ).reshape(1)
            obs[agent_id] = {
                "image": self.images[self.current_step],
                "other_action": other_signal,
            }
            rewards[agent_id] = self._calculate_reward(agent_id, action_dict[agent_id])
            dones[agent_id] = self.current_step >= len(self.images) - 1
            infos[agent_id] = {}

        dones["__all__"] = all(dones.values())
        return obs, rewards, dones, infos

    def _calculate_reward(self, agent_id, action):
        """Return the negative sum of squared differences to the expert value.

        Placeholder reward; BIC is being considered as a replacement.
        """
        return -np.sum((self.expert_rdms[agent_id][self.current_step] - action) ** 2)





In [None]:

#image code converts pngs into arrays if they have not already been converted 

import numpy as np
from PIL import Image
# Open the PNG inside a context manager so the underlying file handle is
# released even if the conversion to an array fails.
with Image.open('lena.png') as img:
    # np.array() reads the pixel data while the file is still open.
    arr = np.array(img)

MVP (minimum viable product) checklist:
1. load rdm and image data: in progress

2. create imitation learner: in progress
 - define observation space: in progress
 - define action space: in progress
 - definitively settle on imitation learner architecture: DONE 

 RL architecture: MARWIL

 It supports multi-agent RL as well as continuous action and observation spaces. 

3. create environment: in progress

4. load the model after imitation learning training: 
note: for everything after step 4 we could possibly use agilerl instead of rllib
5. load the model and proceed with standard RL 

STRETCH
1. create more agents to carry out simulations at a more granular level. 

MISC

observation space: images, actions from other agents

action space: expending activity units; sending a signal, observable by the other agent, which also expends an activity unit

expert data: input: images, output: rdm

training stage: use rllib for imitation learning

save model in .pt after training

prediction stage: use agilerl for prediction

Why is more RL needed beyond just the imitation learning? Because the agents will also be able to interact with each other, and this interaction is not captured in the dataset that is being fed to the neural network.

environment

class

step()

reset()



In [None]:
"""
class raw_env(AECEnv, EzPickle):
    metadata = {
        "render_modes": ["human", "rgb_array"],
        "name": "connect_four_v3",
        "is_parallelizable": False,
        "render_fps": 2,
    }
"""
class outside(AECEnv,EzPickle):
    """Environment intended to encapsulate everything outside of a given agent.

    Design intent (not implemented yet): the environment covers the other
    agent's actions and the stimulus images. ``step`` is where the other
    agent's actions are applied and, after some number of steps, a new
    stimulus and the reward are updated. ``reset`` re-initialises everything
    to 0 or another default value.

    NOTE(review): ``AECEnv`` and ``EzPickle`` are not imported in the visible
    part of this notebook -- confirm where they are meant to come from.

    Args:
        render_mode: Forwarded to ``EzPickle.__init__``; otherwise unused so far.
        size: Forwarded to ``EzPickle.__init__``; otherwise unused so far.
    """

    def __init__(self, render_mode=None, size=5):
        # Hand EzPickle the actual constructor arguments. The previous call
        # passed an undefined name (`continuous`) and dropped `size`.
        EzPickle.__init__(
            self,
            render_mode,
            size,
        )
        super().__init__()
        print("initializing enviroment")

    def step(self, action=None):
        """Placeholder step; currently only logs that it was called."""
        print("step")

    def reset(self, seed=None, options=None):
        """Placeholder reset; currently only logs that it was called."""
        print("new step")

In [None]:
# Behavior cloning (BC) setup: training hyperparameters and the offline
# dataset are configured in one fluent chain.
config = (
    BCConfig()
    .training(lr=1e-5, gamma=0.99)
    .offline_data(input_="./rllib/tests/data/cartpole/large.json")
)



In [None]:
    
    # PPO multi-agent setup on CichyEnv, followed by a Tune run.
    # NOTE(review): `Dict`, `continuous`, `PPOConfig`, `args`, `FillInActions`,
    # `observer_space`, `action_space`, `central_critic_observer`,
    # `check_learning_achieved` and `ray` are not defined or imported in the
    # visible part of this notebook -- confirm they exist before running.
    # NOTE(review): `act_space` is built here but never used below; the policies
    # reference `action_space` instead -- confirm which one is intended.
    act_space = Dict(
        {
            "ext_controller": continuous(1),
            "inner_state": continuous(1),
        })
    config = (
        PPOConfig()
        # NOTE(review): CichyEnv's constructor takes (images, expert_rdms);
        # confirm how those arguments reach the environment when RLlib builds it.
        .environment(CichyEnv)
        .framework(args.framework)
        .env_runners(
            batch_mode="complete_episodes",
            num_env_runners=0,
            # TODO(avnishn) make a new example compatible w connectors.
            enable_connectors=False,
        )
        .callbacks(FillInActions)
        .training(model={"custom_model": "cc_model"})
        #.offline_data(input_="/tmp/cartpole-out")
        .multi_agent(
            policies={
                "pol1": (None, observer_space, action_space, {}),
                "pol2": (None, observer_space, action_space, {}),
            },
            # CichyEnv names its agents "IT" and "EVC"; comparing against 0
            # never matched, so every agent was mapped to "pol2".
            policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "pol1"
            if agent_id == "IT"
            else "pol2",
            observation_fn=central_critic_observer,
        )
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
    )

    # The config object itself has no `.fit()`; run it through a Tune Tuner and
    # keep the results, which the check below reads.
    # TODO(review): no stop criterion is configured; add one (e.g. through a
    # RunConfig) before launching a real run.
    results = tune.Tuner("PPO", param_space=config.to_dict()).fit()

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)

    ray.shutdown()

In [None]:
from ray.rllib.algorithms.marwil import MARWILConfig
# Run this from the ray directory root.
# MARWIL setup expressed as a single fluent chain: training hyperparameters,
# offline input data, then the two-policy multi-agent mapping.
# NOTE(review): `observer_space`, `action_space` and `central_critic_observer`
# are not defined in the visible part of this notebook.
config = (
    MARWILConfig()
    .training(beta=1.0, lr=1e-5, gamma=0.99)
    .offline_data(input_=["./rllib/tests/data/cartpole/large.json"])
    .multi_agent(
        policies={
            policy_name: (None, observer_space, action_space, {})
            for policy_name in ("pol1", "pol2")
        },
        policy_mapping_fn=(
            lambda agent_id, episode, worker, **kwargs:
            "pol1" if agent_id == 0 else "pol2"
        ),
        observation_fn=central_critic_observer,
    )
)
print(config.to_dict())

# Build an Algorithm object from the config and run 1 training iteration.
algo = config.build()
algo.train()

In [None]:
from ray.rllib.algorithms.marwil import MARWILConfig
from ray import tune
# MARWIL hyperparameter search with Ray Tune on the offline CartPole data.
config = MARWILConfig()
# Print out some default values.
print(config.beta)
# Update the config object.
config.training(lr=tune.grid_search(
    [0.001, 0.0001]), beta=0.75)
# Set the config object's data path.
# Run this from the ray directory root.
config.offline_data(
    input_=["./rllib/tests/data/cartpole/large.json"])
# Set the config object's env, used for evaluation.
config.environment(env="CartPole-v1")
# Use to_dict() to get the old-style python config dict
# when running with tune.
# NOTE(review): `stop` is not defined in the visible part of this notebook;
# make sure it is set before this cell runs.
tune.Tuner(
    "MARWIL",
    # The comma after this argument was missing, which made the cell a SyntaxError.
    run_config=air.RunConfig(stop=stop, verbose=2),
    param_space=config.to_dict(),
).fit()

In [None]:
# Prediction: query `agent` for a single action given the observation `obs`.
# NOTE(review): neither `agent` nor `obs` is defined in the visible cells; an
# earlier cell names the built algorithm `algo` -- confirm which object is meant.

action = agent.compute_single_action(obs)