In [10]:
# %reload_ext autoreload
# %autoreload 2

# import time
# from pathlib import Path
# from copy import copy, deepcopy
# import random
# import numpy as np
# import scipy as sp
# import torch
# from tqdm import tqdm
# import matplotlib.pyplot as plt
# import matplotlib as mpl
# from dominoes import leagueManager as lm
# from dominoes import gameplay as dg
# from dominoes import agents as da
# from dominoes import utils
# from dominoes import files as fm
# from dominoes.networks import transformers as transformers
# from dominoes import datasets
# from dominoes import training
# from dominoes.analysis import transformer_analysis as ta
# from dominoes import utils

# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

In [None]:
"""
Dominoes tasks:
1. Plot valueAgentELO results effectively, document it
2. Train a library of lineValueAgents on a server
3. Build a policy-only agent
   - learn network approximation of value function same way other agents do
   - process hand and game state to generate a context embedding
   - then use pointer networks to choose which option to play based on the option encoding and the context
   - policy updated with temporal-difference of the value function
4. Build a win-probability agent method (to be applied to all agents) 
   - I think this is going to involve a bit of cleaning and refactoring of my agents code

Dominoes Experiments:
1. Compare lineValueNetwork trained on only its turn vs. all turns vs. its turn and next turn
2. Compare value agents trained with hand-continuity of eligibility traces or resets

Dominoes Analysis Goals:
1. Study some of the key layers of agent value networks!
2. Initialize the game many times and correlate lineValueFeatures with final score output

Dominoes Coding Practice:
1. Still need to refactor some of the "experiment" scripts to make sure they divide results and plotting 
"""


"""
Pointer Immediate Tasks:
1. Use a separate value network to force encoding to predict the total rewards of the pointer output
2. Run an experiment where I take trained pointer networks and transition them to a dominoe based value 
function and a gamma < 1, and show that they can learn to prioritize playing high value dominoes first. Then...
3. Add the context vector that encodes the number of turns left (with uncertainty?)
    - so the full pointer network will get an extra context input that describes how many turns are left
    - 0 rewards will be given after the possible turns are over
    - so the network will have to learn to get as much value out as quickly as possible
4. Also apply these networks to the vehicle routing problem?
5. Analyze encoding space of pointer networks...
6. Do the encoder swap of different pointer layers...
7. Does the speed of learning for the different networks on the sequencer task come from true performance or just sensitivity to the temperature? 

Pointer Big Ideas:
1. Develop new tasks based on graphs, random forests, and complex rules and determine whether the sophisticated 
pointer layer architecture works better on them. 

Pointer Coding Practice:
1. Convert reward functions to a class 
2. Convert dataset preparation to a class

Pointer Mechanical Update:
1. Change it to learn an initial decoder to be used whenever an input position isn't provided! 
   - self.custom_tensor = nn.Parameter(torch.randn(input_size, output_size), requires_grad=True)
2. Add mechanism for storing hidden parameters to entire pointer network
"""

# THE CODE IS PRETTY UNORGANIZED :(
   # need to organize the dominoes agents better (especially with regards to whether they predict their final score or the win probability)

In [4]:
# TODO For refactoring
# at the moment there is no flexibility on the loss function control for the supervised dataset!!!!

# Add mechanism for printing the arguments used to build a pointernetwork so the user can see what they did. 

# Add documentation of baseline updates and performance etc
# Add some dataset specific summary plots and integrate into plotting code? 
# Get the supervised learning methods working for each dataset and task
# Checkpointing, figure making, logging, etc

# it worked!!! now trying without embedding bias...
# it works without embedding bias. It works (with different speeds per pointer layer!) with lower train temperature
 # (but of course that could be because of differential sensitivity to temperature..., should test that directly)
# now trying with 1 encoding layer. 

# :)


# TODO: 
# DOMINOES SEQUENCER Comparison of max to real reward:
# - Add target to batch (can do post-hoc, even if not requested)
# - Measure reward of target
# - Add a 2D vector comparing max and real reward for each batch element!!

# TSP Distance Traveled:
# - Explicitly measure the distance traveled by the agent in the TSP task
# - Compare to Held-Karp Solution


# TODO ASAP!!!!!!
# - Get test result plots in there for good plotting, then start running experiments with different parameters
#   so you can save / see the results!!!


In [2]:
%reload_ext autoreload
%autoreload 2

from time import time
import torch
from dominoes.datasets import get_dataset

In [15]:
# Build the dataset for the selected task and draw a small batch that includes its target.
task = "tsp"
dataset_kwargs = dict(build=True, num_cities=10)
dataset = get_dataset(task, **dataset_kwargs)
batch = dataset.generate_batch(batch_size=4, return_target=True)

# Show which fields the generated batch carries, one name per line.
for field_name in batch:
    print(field_name)

input
dists
init
num_cities
coord_dims
batch_size
return_target
ignore_index
threads
target


In [5]:
# Display the "target" entry of the batch (the batch was generated with return_target=True).
batch["target"]

tensor([[9, 7, 4, 2, 5, 6, 1, 0, 8, 3],
        [1, 9, 5, 6, 2, 0, 4, 8, 3, 7],
        [6, 3, 1, 9, 2, 0, 7, 5, 8, 4],
        [7, 5, 3, 9, 1, 6, 8, 2, 4, 0]])

In [4]:
reward = dataset.reward_function(batch["target"], batch)

# Walk through one batch element's target step by step and print the negated
# per-step reward next to the matching entry of batch["dists"], so the two
# numbers can be compared by eye.
ib = 0
start_city = batch["init"][ib]
city_dists = batch["dists"][ib]
final_step = batch["target"].size(1) - 1

prev_city = start_city
for step, city in enumerate(batch["target"][ib]):
    # On the final step, the leg from this city back to the initial one is added in.
    closing_leg = city_dists[city, start_city] if step == final_step else 0
    step_cost = -reward[ib][step]
    step_dist = city_dists[prev_city, city] + closing_leg
    print(f"From {prev_city} to {city} with reward {step_cost:.2f}, distance: {step_dist:.2f}")
    prev_city = city

From 7 to 6 with reward 0.15, distance: 0.15
From 6 to 9 with reward 0.32, distance: 0.32
From 9 to 0 with reward 0.34, distance: 0.34
From 0 to 5 with reward 0.57, distance: 0.57
From 5 to 8 with reward 0.29, distance: 0.29
From 8 to 3 with reward 0.28, distance: 0.28
From 3 to 2 with reward 0.56, distance: 0.56
From 2 to 4 with reward 0.48, distance: 0.48
From 4 to 1 with reward 0.40, distance: 0.40


In [2]:
# NOTE(review): `DominoeDataset` is not brought into scope by the import cell visible
# in this notebook (it imports only `time`, `torch` and `get_dataset`) — confirm where
# it should come from before relying on a fresh-kernel run.
highest_dominoe = 9
dataset = DominoeDataset("sequencer", highest_dominoe, hand_size=8, return_target=True)

# Draw a small evaluation batch and fetch the matching (train=False) dominoe set.
batch = dataset.generate_batch(train=False, batch_size=4, value_method="length")
dominoes = dataset.get_dominoe_set(train=False)

# Work on a copy of the target so the batch itself stays untouched; every -1 entry
# is re-pointed at index `hand_size`.
target_as_choice = batch["target"].clone()
null_mask = target_as_choice == -1
target_as_choice[null_mask] = dataset.prms["hand_size"]

# Score the target as if it were the chosen sequence, and keep the per-step direction.
reward, direction = dataset._measurereward_sequencer(target_as_choice, batch, return_direction=True)
print(reward)

tensor([[1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0.]])


In [7]:
ib = 1
# Dominoes selected for this batch element, followed by one extra (-1, -1) row.
# Presumably that row sits at index `hand_size`, which is where -1 targets were
# re-pointed — TODO confirm that the hand has exactly `hand_size` rows.
hand = dominoes[batch["selection"][ib]]
null_dominoe = torch.full((1, 2), -1.0)
hand_null = torch.cat((hand, null_dominoe), dim=0)

print(batch["available"][ib])
# Print each chosen dominoe together with the direction reported for that step.
for c, d in zip(target_as_choice[ib], direction[ib]):
    orientation = "reverse" if d != 0 else "forward"
    print(hand_null[c], orientation)

tensor(8)
tensor([6., 8.]) reverse
tensor([2., 6.]) reverse
tensor([2., 8.]) forward
tensor([1., 8.]) reverse
tensor([1., 4.]) forward
tensor([-1., -1.]) reverse
tensor([-1., -1.]) reverse
tensor([-1., -1.]) reverse
tensor([-1., -1.]) reverse
