In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import os
import glob
from tqdm import tqdm

from toddler.action_coding import mass_answers, force_answers
from toddler.models import ValueNetwork
from toddler.RecurrentWorker import train
from toddler.validate import validate

from isaac.models import ComplexRNNModel as YokedModel

from toddler.simulator.config import generate_every_world_configuration, generate_cond

from generate_passive_simulations import get_configuration_answer

In [2]:
# Property the agent is questioned about. It selects the checkpoint sub-folder
# and which of the mass/force answer sets is used during validation.
question_type = "force"

# Checkpoints are read from <model_directory><question_type>/<seed>/;
# the aggregated validation table is written under data_directory.
data_directory = "answer_questions_two_networks_v2_plots/"
model_directory = "models/answer_questions_two_networks_v2/"

In [3]:
# Use the first CUDA device when one is visible to torch, otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
# Show the selected device string as the cell output.
device

'cuda:0'

In [5]:
# Validate every saved (value network, yoked network) checkpoint pair on a fixed
# set of validation worlds and collect one summary row per checkpoint in
# `validation_dfs`. Replays returned by `validate` are written next to the checkpoint.

discount_factor = 0.95

# --- Stratified train / validation / test split over all world configurations ---
every_conf = generate_every_world_configuration()
every_world_answer = np.array(list(map(get_configuration_answer, every_conf)))
n_configurations = len(every_conf)

train_size = 0.7
val_size = 0.15
test_size = 0.15

all_indices = np.arange(n_configurations)
train_indices, not_train_indices = train_test_split(all_indices, train_size=train_size,
                                                    random_state=0, stratify=every_world_answer)
# The held-out 30% is split evenly (val_size == test_size), hence train_size=0.5.
val_indices, test_indices = train_test_split(not_train_indices, train_size=0.5,
                                             random_state=0,
                                             stratify=every_world_answer[not_train_indices])

# --- Fixed validation worlds ---
N_WORLDS = 10
timeout = 1800
print("N_WORLDS", N_WORLDS)

# Seed both libraries before sampling so the same worlds are drawn on every run.
torch.manual_seed(0)
np.random.seed(0)
repeated_val_indices = np.random.choice(val_indices, N_WORLDS, replace=True)
val_cond = generate_cond(every_conf[repeated_val_indices])

experience_replay = ()
agent_answers = ()

n_bodies = 2
action_repeat = 1
starting_step = 0
starting_episode = 0

# Only the questioned property keeps its answer set; the other one is passed as an
# empty dict. New names are used instead of rebinding the imported `mass_answers` /
# `force_answers`, so this cell can be re-run after changing `question_type`
# without re-running the import cell.
if question_type == "mass":
    val_mass_answers, val_force_answers = mass_answers, {}
else:
    val_mass_answers, val_force_answers = {}, force_answers

for cond in val_cond:
    cond["timeout"] = timeout
    if question_type == "mass":
        cond["lf"] = [[0, 0], [0, 0]]
    cond["svs"] = [{'y': 0, 'x': 0} for _ in range(n_bodies)]

# --- Loop-invariant model settings (hoisted out of the checkpoint loop) ---
torch_device = torch.device(device)
net_params = {"input_dim":17, "hidden_dim":25, "n_layers":4, "output_dim":7, "dropout":0.0}
yoked_params = {"input_dim":11, "hidden_dim":25, "n_layers":4, "output_dim":3, "dropout":0.5}
# NOTE(review): this cell never calls .eval() on either network although the yoked
# network is built with dropout=0.5 -- confirm that validate() switches to eval mode.

validation_dfs = []
for seed in [0, 42, 72]:

    this_seed_model_directory = model_directory + question_type + "/" + str(seed) + "/"

    for model in tqdm(glob.glob(this_seed_model_directory+"*_model")):
        # File names look like "<episode>_model"; basename() is used instead of
        # splitting on "/" so the parse does not depend on the OS path separator.
        episode_number = os.path.basename(model).split("_")[0]

        # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
        value_network = ValueNetwork(**net_params).to(torch_device)
        value_network.load_state_dict(torch.load(model, map_location=torch_device))

        yoked_path = this_seed_model_directory + episode_number + "_yokednet"
        yoked_network = YokedModel(**yoked_params).to(torch_device)
        yoked_network.load_state_dict(torch.load(yoked_path, map_location=torch_device))

        agent_cond = val_cond

        valArgs = {"value_network": value_network, "val_cond": agent_cond,
                   "timeout": timeout, "n_bodies": n_bodies, "yoked_network": yoked_network,
                   "action_repeat": action_repeat, "print_stats": False,
                   "device": device, "reward_control": False, "done_with_control": False,
                   "reward_not_controlling_negatively": False, "reward_not_answering_negatively": True,
                   "possible_actions": np.arange(0, 7), "mouse_exploration_frames": 600,
                   "mass_answers": val_mass_answers, "force_answers": val_force_answers,
                   "return_replays": True,
                   "force_answer_at_t": (timeout - 1)}

        validation_data, replays = validate(**valArgs)

        # One HDF5 file per checkpoint, one key per validation world replay.
        replay_path = this_seed_model_directory + episode_number + "_replays.h5"
        for i, replay in enumerate(replays):
            replay.to_hdf(replay_path, key="replay_"+str(i))

        # Mean and standard deviation of each tracked statistic, as single-row columns.
        validation_summary = {stat+"_"+attr: [f(validation_data[attr])]
                              for attr in ["control", "episode_length", "answers"]
                              for stat, f in zip(["avg", "std"], [np.mean, np.std])}

        df = pd.DataFrame.from_dict(validation_summary)
        df["seed"] = seed
        df["episode"] = episode_number
        validation_dfs.append(df)

  0%|          | 0/2 [00:00<?, ?it/s]

N_WORLDS 10


  policy = np.array(policy) / sum(policy)
  selected_action = np.random.choice(possible_actions, p=policy)
  policy = np.array(policy) / sum(policy)
  selected_action = np.random.choice(possible_actions, p=policy)
100%|██████████| 2/2 [00:17<00:00,  9.91s/it]
100%|██████████| 2/2 [00:19<00:00,  9.80s/it]
100%|██████████| 2/2 [00:19<00:00,  9.87s/it]


In [6]:
# Merge the per-checkpoint summary frames into one table. The guard keeps the cell
# re-runnable: after the first run `validation_dfs` is already a DataFrame, and
# passing a DataFrame (rather than a list of frames) to pd.concat raises.
if not isinstance(validation_dfs, pd.DataFrame):
    validation_dfs = pd.concat(validation_dfs)

# Make sure the output directory exists before writing the HDF5 file.
os.makedirs(data_directory, exist_ok=True)
validation_dfs.to_hdf(data_directory+"validation_data.h5", key="validation_data")

In [7]:
# Display the combined validation table (one row per seed/episode checkpoint).
validation_dfs

Unnamed: 0,avg_answers,avg_control,avg_episode_length,std_answers,std_control,std_episode_length,seed,episode
0,0.2,0.1,1799.0,0.4,0.3,0.0,0,99
0,0.5,0.0,1080.2,0.5,0.0,586.897742,0,0
0,0.2,0.0,1799.0,0.4,0.0,0.0,42,99
0,0.1,0.0,1799.0,0.3,0.0,0.0,42,0
0,0.4,0.0,1799.0,0.489898,0.0,0.0,72,99
0,0.1,0.0,1799.0,0.3,0.0,0.0,72,0
