In [3]:
import numpy as np
import torch 
import pandas as pd
from einops import rearrange, reduce, asnumpy, parse_shape


In [4]:
class SeqOptimisation:
    """Toy sequence-optimisation task scored by normalised Hamming distance.

    A hidden target sequence of ``seqSize`` integer tokens in
    ``[0, numberVars)`` is drawn at construction time. Candidates are scored
    by how many positions differ from the target; the best candidate seen so
    far is tracked in ``bseq`` / ``br``.

    Args:
        seqSize: Length of the target sequence.
        numberVars: Number of distinct token values.
        easy: If True the target is a single random token repeated
            ``seqSize`` times; otherwise every position is drawn
            independently.
    """

    def __init__(self, seqSize=10, numberVars=20, easy=True):
        self.seqSize = seqSize
        self.hseqSize = seqSize/2.0
        self.numberVars = numberVars
        self.easy = easy
        if self.easy:
            # Easy task: search for the same number repeated at every position.
            self.target_num1 = np.random.randint(0, numberVars, 1).item()
            target = np.full(seqSize, self.target_num1)
        else:
            # TODO: Need to finish implementing this harder task.
            # Could also search for sequences containing a target sub-pattern.
            target = np.random.randint(0, numberVars, seqSize)
        self.target_seq = torch.as_tensor(target).squeeze().long()
        self.br = None      # best reward observed so far
        self.bseq = None    # sequence that achieved ``br``
        print(f'Target Sequence is {self.target_seq}')
        self.evls = 0       # number of reward() evaluations performed

    def reward(self, seq):
        """Score one candidate and update the running best.

        Args:
            seq: Tensor holding one candidate; it is squeezed and cast to
                long before comparison with the target.

        Returns:
            ``(r, False)`` where ``r = tanh(-hamming / seqSize)`` (0 is
            optimal). The second element is always False in this task.
        """
        seq = seq.squeeze().long()
        r = -((self.target_seq != seq).sum().item()) / self.seqSize
        r = np.tanh(r)
        # A single branch handles both "first evaluation" and "improvement",
        # so an optimum found on the very first call is reported as well.
        if self.br is None or r > self.br:
            if r == 0:
                print(f'Found Optimal Sequence {seq} at evaluation {self.evls}')
            self.br = r
            # Clone: ``seq`` may be a view of the caller's tensor, which the
            # caller is free to modify in place afterwards.
            self.bseq = seq.clone()
        self.evls += 1
        return r, False

    def breward(self, seq):
        """Batched score: ``-hamming / seqSize`` for each row of ``seq``.

        NOTE(review): unlike ``reward`` this does not apply ``tanh`` —
        confirm whether that difference is intended.
        """
        return -((self.target_seq != seq).sum(1)) / self.seqSize

    def get_best(self):
        """Print the Hamming distance and reward of the best sequence so far."""
        print(f'with hamming distance {(self.target_seq!=self.bseq).sum()} and reward {self.br}')

In [5]:
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [6]:
# Detect CUDA if it is available ...
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ... but the choice is immediately overridden: everything below uses the CPU.
device = 'cpu'
# torch.cuda.set_device('cuda')
# Use torch.FloatTensor as the default tensor type for the rest of the notebook.
torch.set_default_tensor_type(torch.FloatTensor)

In [7]:
from torch.autograd import Variable
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable gain and bias.

    Args:
        features: Size of the last dimension of the inputs.
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last axis."""
        centred = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        # Multiply by the gain before dividing, matching the original order
        # of floating-point operations.
        gained = self.a_2 * centred
        return gained / spread + self.b_2
    
    
class SublayerConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    The normalisation is applied before the sublayer (not after), which
    keeps the code simple.

    Args:
        size: Feature size handed to ``LayerNorm``.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Run ``sublayer`` on the normalised input and add the residual."""
        normalised = self.norm(x)
        branch = self.dropout(sublayer(normalised))
        return x + branch
    
    
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to batch-first embeddings.

    Args:
        d_model: Embedding size (odd sizes are supported).
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angles to the number of odd-indexed slots.
        pe[:, 0, 1::2] = torch.cos(angles)[:, :d_model // 2]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]

        Returns:
            Tensor of the same shape with the encoding of each position added.
        """
        if x.dim() != 3:
            raise ValueError(f'expected a [batch, seq, emb] tensor, got shape {tuple(x.shape)}')
        # ``pe`` is stored as [max_len, 1, d_model]; swap its first two axes so
        # it broadcasts over the batch dimension of the batch-first input.
        return x + self.pe[:x.size(1)].transpose(0, 1)


In [8]:
def attention(q, k, v, d_k, dropout=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: Query, key and value tensors; the last two axes of ``q`` and
            ``k`` are multiplied as ``q @ k^T``.
        d_k: Key dimensionality; logits are divided by ``sqrt(d_k)``.
        dropout: Optional callable applied to the attention weights.

    Returns:
        The attention-weighted combination of ``v``.
    """
    logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    weights = F.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, v)

In [9]:
class MultiHeadAttention(nn.Module):
    """Attention block with learned q/k/v/output projections and one head.

    The head count is fixed at 1 (``self.h``), so the per-head size ``d_k``
    equals ``d_model``.

    Args:
        d_model: Feature size of inputs and outputs.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model
        self.h = 1

        # Creation order is kept so parameter initialisation consumes the
        # random stream in the same sequence as before.
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``; ``mask`` is accepted but not used."""
        bs = q.size(0)

        def as_heads(projected):
            # [bs, sl, d_model] -> [bs, 1, sl, d_k]
            return projected.view(bs, -1, 1, self.d_k).transpose(1, 2)

        keys = as_heads(self.k_linear(k))
        queries = as_heads(self.q_linear(q))
        values = as_heads(self.v_linear(v))

        attended = attention(queries, keys, values, self.d_k, self.dropout)

        # Merge the (single) head back into the feature axis and project.
        merged = attended.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(merged)

In [10]:
import gym
from gym import spaces
import numpy as np
import os
import subprocess
import pandas as pd
import csv

from itertools import groupby
import re
import time
from torch.distributions import Categorical
from itertools import count


In [11]:
class TransformerNN(nn.Module):
    """Token-sequence network: embedding -> positional encoding -> linear+ReLU -> flat head.

    Args:
        numberVars: Largest token id the embedding accepts; the table holds
            ``numberVars + 1`` rows.
        seqSize: Fixed sequence length; the head expects ``d_model * seqSize``
            flattened features.
        outputs: Number of output units.
        d_model: Embedding / hidden size.
        device: Stored on the instance; not otherwise used by this class.
        dropout: Accepted for interface compatibility; currently unused.
        PolicyGradients: If True, the output is passed through a softmax and
            ``saved_log_probs`` / ``rewards`` buffers are created.
    """

    def __init__(self, numberVars=20, seqSize=11, outputs=1, d_model=32, device='cpu', dropout=0.1, PolicyGradients=False):
        super().__init__()
        if PolicyGradients:
            # Episode buffers filled in by the policy-gradient action selector.
            self.saved_log_probs = []
            self.rewards = []

        self.PEncoder = PositionalEncoding(d_model, max_len=seqSize)
        # NOTE(review): max_norm=True is passed through unchanged; presumably
        # it is interpreted numerically as 1.0 — confirm.
        self.Embedder = nn.Embedding(numberVars+1, d_model, max_norm=True)
        self.numberVars = numberVars
        self.d_model = d_model
        self.lin1 = nn.Linear(d_model, d_model)
        self.head = nn.Linear(d_model*seqSize, outputs)
        self.device = device
        self.PolicyGradients = PolicyGradients

    def forward(self, x):
        """Map integer token ids of shape [batch, seqSize] to [batch, outputs]."""
        max_token = x.max().item()
        # The embedding table has numberVars + 1 rows, so the largest valid
        # index is numberVars itself.
        assert max_token <= self.numberVars, (
            f'Embedding more variables than we should: token id {max_token} > {self.numberVars}'
        )
        x = self.Embedder(x).float()
        x = self.PEncoder(x)
        x = F.relu(self.lin1(x))
        x = self.head(x.flatten(1))
        if self.PolicyGradients:
            x = F.softmax(x, dim=1)
        return x

In [12]:
# Record stored by ReplayMemory: one (state, action, next_state, reward) step.
# optimize_SubStr treats a next_state of None as a final state.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

def val_StrCritic(task, policy_net, NumClass, seqSize):
    """Mean absolute error of a structure critic on 1000 random sequences.

    Args:
        task: Object exposing ``breward(batch)`` that returns one score per row.
        policy_net: Callable mapping a [1000, seqSize] integer batch to one
            prediction per row.
        NumClass: Exclusive upper bound for the random token ids.
        seqSize: Length of each random sequence.

    Returns:
        Scalar tensor holding the L1 loss between predictions and scores.
    """
    with torch.no_grad():
        criterion = nn.L1Loss()
        TX = torch.randint(NumClass, (1000, seqSize))
        TestY = task.breward(TX)
        # The critic emits [N, 1] while breward returns [N]; without the
        # reshape L1Loss would broadcast both to [N, N] and average over
        # every prediction/target pair.
        preds = policy_net(TX).reshape(TestY.shape)
        loss = criterion(preds, TestY)
    return loss


# Record stored by CombReplayMemory: a complete structure and its reward.
CombTransition = namedtuple('CombTransition',
                        ('structure', 'reward'))

class CombReplayMemory(object):
    """Bounded FIFO buffer of ``CombTransition`` (structure, reward) records.

    Args:
        capacity: Maximum number of records kept; the oldest are dropped
            first. Integral floats (e.g. ``1e5``) are accepted.
    """

    def __init__(self, capacity):
        # Callers compute the capacity as min(num_episodes, 1e5), which can be
        # a float; deque only accepts an integer maxlen.
        self.memory = deque([], maxlen=int(capacity))

    def push(self, *args):
        """Save a Combinatorial Optimisation transition"""
        self.memory.append(CombTransition(*args))

    def sample(self, batch_size):
        """Return ``batch_size`` distinct records drawn uniformly at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class ReplayMemory(object):
    """Bounded FIFO buffer of ``Transition`` (state, action, next_state, reward) records.

    Args:
        capacity: Maximum number of records kept; the oldest are dropped
            first. Integral floats (e.g. ``1e5``) are accepted.
    """

    def __init__(self, capacity):
        # Callers compute the capacity as min(num_episodes, 1e5), which can be
        # a float; deque only accepts an integer maxlen.
        self.memory = deque([], maxlen=int(capacity))

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Return ``batch_size`` distinct records drawn uniformly at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
    
def getBestMemory(memory):
    """Return the stored structure with the highest reward, shaped [1, seq].

    Args:
        memory: Buffer whose ``memory`` attribute iterates over
            ``CombTransition`` records.
    """
    stacked = CombTransition(*zip(*memory.memory))
    all_structures = torch.cat(stacked.structure)
    all_rewards = torch.cat(stacked.reward)
    best_row = all_rewards.squeeze().argmax()
    return all_structures[best_row].unsqueeze(0)
    
def optimize_Str(BATCH_SIZE, policy_net, optimizer, memory, gradient_steps=2, seqS=11):
    """Regress the structure critic onto stored rewards with a Huber loss.

    Args:
        BATCH_SIZE: Number of records sampled per gradient step.
        policy_net: Critic mapping a batch of structures to one value each.
        optimizer: Optimizer over ``policy_net``'s parameters.
        memory: Buffer with ``__len__`` and ``sample(n)`` returning
            (structure, reward) records.
        gradient_steps: Number of sample/step iterations.
        seqS: Unused; kept for interface compatibility.

    Returns:
        The loss of the last gradient step as a float, or None when the
        buffer holds fewer than ``BATCH_SIZE`` records or no step was run.
    """
    if len(memory) < BATCH_SIZE:
        return None
    criterion = nn.SmoothL1Loss()
    loss = None  # stays None when gradient_steps <= 0
    for _ in range(gradient_steps):
        transitions = memory.sample(BATCH_SIZE)
        # Transpose the list of (structure, reward) records into two tuples.
        structures, rewards = zip(*transitions)
        structure_batch = torch.cat(structures)
        reward_batch = torch.cat(rewards)

        predicted = policy_net(structure_batch).squeeze()
        # Rewards may arrive as float64 (built from NumPy scalars); match the
        # critic's dtype and pass (input, target) in the conventional order.
        target = reward_batch.squeeze().to(predicted.dtype)
        loss = criterion(predicted, target)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 100)
        optimizer.step()
    return None if loss is None else loss.detach().item()
    

def optimize_SubStr(BATCH_SIZE, policy_net, optimizer, memory, gradient_steps=1, model='QL'):
    """Run Q-learning style updates of the sub-structure critic from replay.

    Args:
        BATCH_SIZE: Number of transitions sampled per gradient step.
        policy_net: Network mapping a batch of states to one value per action.
        optimizer: Optimizer over ``policy_net``'s parameters.
        memory: Buffer with ``__len__`` and ``sample(n)`` returning
            (state, action, next_state, reward) records; ``next_state`` is
            None for final states.
        gradient_steps: Number of sample/step iterations.
        model: ``'QL'`` bootstraps with ``reward + GAMMA * max_a Q(s', a)``;
            any other value regresses directly onto the reward.

    Returns:
        The loss of the last gradient step as a float, or None when the
        buffer holds fewer than ``BATCH_SIZE`` records or no step was run.
    """
    GAMMA = 0.99

    if len(memory) < BATCH_SIZE:
        return None
    criterion = nn.SmoothL1Loss()
    loss = None  # stays None when gradient_steps <= 0
    for _ in range(gradient_steps):
        transitions = memory.sample(BATCH_SIZE)
        # Transpose the batch: list of records -> one tuple per field.
        states, actions, next_states, rewards = zip(*transitions)

        state_batch = torch.cat(states)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards)

        # Q(s_t, a): evaluate every action, then pick the one actually taken.
        state_action_values = policy_net(state_batch).gather(1, action_batch)
        work_device = state_action_values.device

        # V(s_{t+1}) is max_a Q(s_{t+1}, a) from the same network (single
        # Q-learning) and 0 for final states.
        non_final_mask = torch.tensor([s is not None for s in next_states],
                                      device=work_device, dtype=torch.bool)
        next_state_values = torch.zeros(BATCH_SIZE, device=work_device)
        # A sampled batch can consist solely of final states; torch.cat on an
        # empty list would raise, so only bootstrap when there is something to
        # evaluate.
        if non_final_mask.any():
            non_final_next_states = torch.cat([s for s in next_states if s is not None])
            next_state_values[non_final_mask] = policy_net(non_final_next_states).max(1)[0].detach()

        # Compute the expected Q values
        if model == 'QL':
            expected_state_action_values = (next_state_values * GAMMA) + reward_batch
        else:
            expected_state_action_values = reward_batch

        # Huber loss; the target is cast to the prediction dtype because
        # rewards may be stored as float64.
        target = expected_state_action_values.unsqueeze(1).to(state_action_values.dtype)
        loss = criterion(state_action_values, target)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 100)
        optimizer.step()
    return None if loss is None else loss.detach().item()

    
def reset_state(seqSize, nClass):
    """Return a [1, seqSize] long tensor with every slot set to ``nClass``.

    ``nClass`` serves as the extra "empty" token id, one past the real classes.
    """
    blank = torch.ones((1, seqSize)) * nClass
    return blank.long()

def reset_state_naive():
    """Return a [1, 1] long tensor containing a single 0 as the start state."""
    start = torch.zeros((1, 1))
    return start.long()

def val_critic(task, policy_net, NumClass, seqSize):
    """Mean absolute error of the action critic's value for the final token.

    Draws 1000 random sequences, blanks the last position with the empty
    token ``NumClass``, and compares the critic's value for the action that
    was actually in that position with the task score of the full sequence.

    Args:
        task: Object exposing ``breward(batch)`` that returns one score per row.
        policy_net: Callable mapping a [1000, seqSize] integer batch to one
            value per action.
        NumClass: Number of real classes; also used as the empty-token id.
        seqSize: Length of each random sequence.

    Returns:
        Scalar tensor holding the L1 loss.
    """
    with torch.no_grad():
        criterion = nn.L1Loss()
        TX = torch.randint(NumClass, (1000, seqSize))
        TestY = task.breward(TX)
        # Remember the true last token before it is overwritten below.
        LastA = TX[:, -1].long().unsqueeze(1).clone()
        TX[:, -1] = NumClass
        actions = policy_net(TX)
        # gather yields [N, 1] while breward returns [N]; drop the extra axis
        # so L1Loss does not broadcast both to [N, N].
        preds = actions.gather(1, LastA).squeeze(1)
        loss = criterion(preds, TestY)
    return loss

class ActionSelection:
    """Action selectors: epsilon-greedy for Q-values and sampling for policy gradients.

    Epsilon decays exponentially from ``EPS_START`` to ``EPS_END`` with time
    constant ``EPS_DECAY`` as ``steps_done`` grows.

    Args:
        total_eps: Expected total number of selection steps; the decay
            constant is one tenth of it (at least 1).
    """

    def __init__(self, total_eps=10000):
        self.steps_done = 0
        self.EPS_START = 1.0
        self.EPS_END = 0.05
        # Clamp to 1 so that small budgets (< 10 steps) do not produce a zero
        # divisor in the decay exponent.
        self.EPS_DECAY = max(1, total_eps//10)
        self.action_samples = 0   # number of greedy (network) choices
        self.rejects = 0          # number of random exploratory choices

    def increment(self):
        """Advance the step counter that drives the epsilon decay."""
        self.steps_done += 1

    def _epsilon(self):
        """Current exploration probability for ``steps_done``."""
        return self.EPS_END + (self.EPS_START - self.EPS_END) * \
            math.exp(-1. * self.steps_done / self.EPS_DECAY)

    def select_action_pg(self, state, policy):
        """Sample an action from ``policy(state)`` and record its log-probability."""
        probs = policy(state)
        m = Categorical(probs)
        action = m.sample()
        policy.saved_log_probs.append(m.log_prob(action))
        return action.item()

    def select_action_pg_greedy(self, state, policy):
        """Return the most probable action under ``policy(state)`` without tracking gradients."""
        with torch.no_grad():
            probs = policy(state)
            action = probs.argmax()
            return action.item()

    def select_recurrent(self, q_vals, Nclass):
        """Epsilon-greedy choice from precomputed Q-values of shape [1, Nclass].

        Returns:
            A [1, 1] long tensor holding the chosen action.
        """
        sample = random.random()
        eps_threshold = self._epsilon()
        self.increment()
        if sample > eps_threshold:
            self.action_samples += 1
            with torch.no_grad():
                assert q_vals.shape[1]==Nclass
                return q_vals.max(1)[1].view(1, 1)
        self.rejects += 1
        return torch.randint(Nclass, (1,1), device=q_vals.device)

    def select_action(self, state, policy_net, Nclass, greedy=False):
        """Epsilon-greedy choice using ``policy_net(state)``.

        Args:
            state: Input batch of size 1 for ``policy_net``.
            policy_net: Network returning Q-values of shape [1, Nclass].
            Nclass: Number of actions.
            greedy: If True, always take the arg-max action (the step counter
                and random draw still advance).

        Returns:
            A [1, 1] long tensor holding the chosen action.
        """
        sample = random.random()
        eps_threshold = self._epsilon()
        self.increment()
        if sample > eps_threshold or greedy:
            self.action_samples += 1
            with torch.no_grad():
                # Evaluate the network once and reuse the result.
                q_vals = policy_net(state)
                assert q_vals.shape[1]==Nclass
                return q_vals.max(1)[1].view(1, 1)
        self.rejects += 1
        return torch.randint(Nclass, (1,1), device=state.device)
        

def greedy_perturbation(memory, SeqSize=11, NumClass=20):
    """Propose a one-position random change to the best structure in ``memory``.

    With an empty ``memory`` a uniformly random [1, SeqSize] sequence is
    returned instead.
    """
    if len(memory) > 0:
        candidate = getBestMemory(memory)
        # Draw the position first, then the new symbol.
        position = torch.randint(SeqSize, (1,))
        symbol = torch.randint(NumClass, (1,))
        candidate[0, position] = symbol
        return candidate
    return torch.randint(NumClass, (1, SeqSize))

def perturbation(seq, SeqSize=11, NumClass=20):
    """Return a copy of ``seq`` with one random position set to a random symbol.

    Args:
        seq: [1, SeqSize] integer tensor, or None to get a fully random
            sequence.
        SeqSize: Sequence length.
        NumClass: Exclusive upper bound for symbol values.

    Returns:
        A new [1, SeqSize] tensor; the input is left untouched.
    """
    if seq is None:
        return torch.randint(NumClass,(1,SeqSize))
    idx = torch.randint(SeqSize,(1,))
    val = torch.randint(NumClass, (1,))
    # Work on a copy: the caller's tensor may also be its current solution or
    # a replay-memory entry, and editing it in place would corrupt both.
    proposal = seq.clone()
    proposal[0,idx]=val
    return proposal
        
class StructureSelection:
    """Metropolis-Hastings style chooser between a candidate and an alternative.

    Args:
        t_init: Initial temperature. A higher temperature means more accepts,
            a lower one more rejects.
    """

    def __init__(self, t_init=1.):
        self.t_init = t_init
        self.rejects = 0
        self.steps_done = 0
        self.action_samples = 0   # number of accepted candidates

    def increment(self):
        """Advance the step counter used for temperature annealing."""
        self.steps_done += 1

    def _criterion(self, structure, nstructure, val1, val2, rew_std=None):
        """Accept ``structure`` (value ``val1``) or fall back to ``nstructure`` (value ``val2``).

        The candidate is accepted outright when ``val1 > val2`` and otherwise
        with probability ``exp(-(val2 - val1) / t)``, where ``t`` is
        ``rew_std`` if given and ``t_init / (steps_done + 1)`` otherwise.

        Returns:
            ``(structure, True, accept_probability)`` on acceptance,
            ``(nstructure, False, None)`` on rejection.
        """
        if not torch.is_tensor(val1):
            val1 = torch.tensor(np.array([val1*1.0])).float()
        if not torch.is_tensor(val2):
            val2 = torch.tensor(np.array([val2*1.0])).float()
        # TODO: Try both with and without annealing temperature.
        if rew_std is not None:
            t = rew_std
        else:
            t = self.t_init/(self.steps_done+1)
        diff = val2-val1
        acceptP = torch.clip(torch.exp(-diff/t), 0, 1).item()
        self.increment()
        # The random draw only happens when the candidate is not strictly better.
        if diff<0 or torch.rand(1).item() < acceptP:
            self.action_samples += 1
            return structure, True, acceptP
        self.rejects += 1
        return nstructure, False, None

    def select_MH(self, structure, nstructure, StrCritic, rew_std=None):
        """Choose between two structures using the critic's value estimates.

        Returns:
            The same 3-tuple as ``_criterion``. When ``nstructure`` is None
            the candidate is kept and the acceptance probability is None.
        """
        if nstructure is None:
            # Return three values like every other path so callers can always
            # unpack (structure, accepted, probability).
            return structure, True, None
        with torch.no_grad():
            val1 = StrCritic(structure).squeeze()
            val2 = StrCritic(nstructure).squeeze()
            return self._criterion(structure, nstructure, val1, val2, rew_std)

In [13]:
def backwardStore(SubStrmemory, Structure, rStructure, SeqSize, NumClass):
    """Decompose a finished structure into per-token transitions, last token first.

    For each position from the end, the state is ``Structure`` with that
    position and everything after it replaced by the empty token
    ``NumClass``, the action is the token originally at that position, and
    the next state is the state of the previous iteration (None for the
    final token). Every transition carries the same reward ``rStructure``.

    Args:
        SubStrmemory: Buffer exposing ``push(state, action, next_state, reward)``.
        Structure: [1, SeqSize] integer tensor; not modified.
        rStructure: Reward stored with every transition.
        SeqSize: Number of positions to unroll.
        NumClass: Id of the empty token.
    """
    a_next_state = None
    a_state = Structure.clone()
    for rstep in range(1, SeqSize+1):
        a_action = Structure[:, -rstep].unsqueeze(0)
        # Take a fresh copy before masking: the tensor pushed in the previous
        # iteration must not change when the next position is blanked.
        a_state = a_state.clone()
        a_state[:, -rstep] = NumClass
        SubStrmemory.push(a_state, a_action, a_next_state, rStructure)
        a_next_state = a_state.clone()
    
    

In [14]:
class SA:
    """Baseline searchers: simulated annealing ('SA'), greedy ('G') and random ('RS').

    Args:
        SeqSize: Length of the proposed sequences.
        NumClass: Number of symbols per position.
        exploration: Number of initial proposals that are purely random.
        num_episodes: Total number of proposals; bounds the replay memory.
    """

    def __init__(self, SeqSize, NumClass, exploration, num_episodes):
        self.curr_x = None      # current (accepted) structure
        self.curr_val = None    # reward of ``curr_x``
        self.SeqSize = SeqSize
        self.NumClass = NumClass
        self.exploration = exploration
        self.counter = 0        # number of proposals made so far
        # min(..., 1e5) may be a float; the memory needs an integer capacity.
        self.Strmemory = CombReplayMemory(int(min(num_episodes, 1e5)))

    def suggest(self, method):
        """Return the next proposal for ``method`` in {'SA', 'G', 'RS'}.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        if method=='SA':
            return self.suggest_sa()
        if method=='G':
            return self.suggest_greedy()
        if method=='RS':
            return self.suggest_random()
        raise ValueError(f'Unknown search method {method!r}')

    def _random_structure(self):
        """Uniformly random [1, SeqSize] sequence."""
        return torch.randint(self.NumClass, (1, self.SeqSize))

    def suggest_random(self):
        """Propose a uniformly random sequence."""
        self.counter += 1
        return self._random_structure()

    def suggest_greedy(self):
        """Random during exploration, then a one-position change of the best stored structure."""
        exploring = self.counter <= self.exploration
        self.counter += 1
        if exploring:
            return self._random_structure()
        return greedy_perturbation(self.Strmemory, self.SeqSize, self.NumClass)

    def suggest_sa(self):
        """Random during exploration, then a one-position change of the current structure."""
        exploring = self.counter <= self.exploration
        self.counter += 1
        if exploring or self.curr_x is None:
            return self._random_structure()
        # Perturb a copy so the incumbent (which is also held in Strmemory)
        # survives if the proposal is later rejected.
        return perturbation(self.curr_x.clone(), self.SeqSize, self.NumClass)

    def update(self, Structure, rStructure, sSelector):
        """Store the evaluated proposal and decide whether it becomes the incumbent.

        The search maximises the reward. ``sSelector._criterion`` makes the
        accept/reject decision and keeps its own accept/reject counters.
        """
        rStructure = torch.tensor([rStructure], device=device)
        self.Strmemory.push(Structure, rStructure)

        if self.curr_val is None:
            self.curr_val = rStructure
            self.curr_x = Structure
            return

        _, accepted, _ = sSelector._criterion(Structure, self.curr_x, rStructure, self.curr_val)
        if accepted:
            # _criterion has already counted this accept; counting it again
            # here would double the reported number.
            self.curr_val = rStructure
            self.curr_x = Structure


def run_random(taskF, num_episodes, SeqSize, NumClass, BATCH_SIZE, method='SA'):
    """Run one of the ``SA`` baseline searchers against a task.

    Args:
        taskF: Task exposing ``reward(structure) -> (reward, done)`` and
            ``get_best()``.
        num_episodes: Number of proposals to evaluate.
        SeqSize: Sequence length.
        NumClass: Number of symbols per position.
        BATCH_SIZE: Used as the number of initial purely random proposals.
        method: Search method name forwarded to ``SA.suggest``.

    Returns:
        Array with the running maximum of the observed rewards.
    """
    searcher = SA(SeqSize, NumClass, int(BATCH_SIZE), num_episodes)
    sSelector = StructureSelection()
    history = []

    for _ in range(num_episodes):
        proposal = searcher.suggest(method)
        score, finished = taskF.reward(proposal)
        searcher.update(proposal, score, sSelector)
        history.append(score)
        if finished:
            taskF.get_best()

    print(f'accepts {sSelector.action_samples} rejects {sSelector.rejects}')
    return np.maximum.accumulate(history)

def run_multiple_rl_seeds(df_init, nseeds=3, neps=2000, nclass=20, nseq=10, verbose=False, model='QL', BATCH_SIZE=32, gradient_steps=1, baseline=True, easy=False):
    """Run ``model`` for several seeds and append the learning curves to ``df_init``.

    Seeds continue after the largest seed already present for ``model`` in
    ``df_init``. Each run adds one row per episode with columns
    model / seed / step / max_reward / criterion / task / size.

    Args:
        df_init: DataFrame of earlier results (must have ``model`` and
            ``seed`` columns).
        nseeds: Number of new seeds to run.
        neps: Episodes per run.
        nclass: Number of symbols per position.
        nseq: Sequence length.
        verbose: Forwarded to the SQL and policy-gradient runners.
        model: One of 'QL', 'RQL', 'SQL', 'PG', 'PGB', 'SA', 'G', 'RS'.
        BATCH_SIZE: Batch size forwarded to the runner.
        gradient_steps: Gradient steps per update, forwarded to the runner.
        baseline: Forwarded to the policy-gradient runner.
        easy: Selects the easy or hard variant of the task.

    Returns:
        ``df_init`` with the new rows appended.

    Raises:
        NotImplementedError: If ``model`` is not a supported name.
    """
    supported = ('QL', 'RQL', 'SQL', 'PG', 'PGB', 'SA', 'G', 'RS')
    if model not in supported:
        raise NotImplementedError(f'Unknown model {model!r}; expected one of {supported}')

    df_model = df_init[df_init.model==model]
    start_seed = df_model.seed.max() + 1 if len(df_model) > 0 else 0

    for i in range(nseeds):
        seed = int(i + start_seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
        # Replay sampling and epsilon-greedy draws use the stdlib generator.
        random.seed(seed)
        task = SeqOptimisation(nseq, nclass, easy=easy)

        criterion = np.nan  # only the SQL runner reports an acceptance ratio
        if model in ('QL', 'RQL'):
            print(model)
            max_r = run_ql(task, num_episodes=neps, SeqSize=nseq, NumClass=nclass, BATCH_SIZE=BATCH_SIZE, gradient_steps=gradient_steps, model=model)
        elif model=='SQL':
            max_r, criterion = run_sql(task, num_episodes=neps, SeqSize=nseq, NumClass=nclass, BATCH_SIZE=BATCH_SIZE, verbose=verbose, gradient_steps=gradient_steps, model=model)
        elif model in ('PG', 'PGB'):
            max_r = run_reinforce(task, num_episodes=neps, SeqSize=nseq, NumClass=nclass, BATCH_SIZE=BATCH_SIZE, verbose=verbose, gradient_steps=gradient_steps, model=model, baseline=baseline)
        else:
            max_r = run_random(task, num_episodes=neps, SeqSize=nseq, NumClass=nclass, BATCH_SIZE=BATCH_SIZE, method=model)

        max_r = np.asarray(max_r)
        df = pd.DataFrame({
            'model': model,
            'seed': seed,
            'step': np.arange(0, max_r.shape[0], 1),
            # Rewards are <= 0; flip the sign so the stored curve is a cost.
            'max_reward': -1*max_r,
            'criterion': criterion,
            'task': 'easy' if easy else 'hard',
            'size': nseq,
        })
        # DataFrame.append no longer exists in pandas 2.x; concat is the
        # equivalent that works on old and new versions.
        df_init = pd.concat([df_init, df])
    return df_init

In [15]:
from tqdm import tqdm

def run_sql(taskF, num_episodes = 1000, SeqSize=11, NumClass=20, BATCH_SIZE=16, train_freq=1, verbose=False, saveD=False, d_model=32, gradient_steps=1, model='QL'):
    """Build sequences token by token with an action critic, filtered by a structure critic.

    Each episode starts from an all-empty state, fills one position per step
    with the action critic's choice and, at the last step, trains both
    critics, proposes a perturbed alternative from the structure memory and
    lets ``StructureSelection.select_MH`` pick which of the two is evaluated
    on the task. The evaluated structure is stored in both replay memories.

    Args:
        taskF: Task exposing ``reward(structure) -> (reward, done)`` and
            ``get_best()``. Rewards are maximised.
        num_episodes: Number of sequences to build and evaluate.
        SeqSize: Sequence length.
        NumClass: Number of symbols; ``NumClass`` itself is the empty token.
        BATCH_SIZE: Batch size for both critic updates.
        train_freq: Not used in this function.
        verbose: Only creates two local lists that are never filled here.
        saveD: Not used in this function.
        d_model: Hidden size of both critics.
        gradient_steps: Gradient steps per update of the action critic.
        model: Forwarded to ``optimize_SubStr``.

    Returns:
        ``(running_max_rewards, criterions)`` where ``criterions`` holds, per
        episode, the selector's accept count divided by the episode number.
    """
    # Needs to have task that maximises reward, can flip it after
    # Action critic: one value per symbol for the position being filled.
    SubStrCritic = TransformerNN(numberVars=NumClass+1, outputs=NumClass, d_model=d_model, seqSize=SeqSize, PolicyGradients=False).to(device)
    SubStroptimizer = optim.Adam(SubStrCritic.parameters(), lr=1e-4)
    # NOTE(review): min(num_episodes, 1e5) is a float once num_episodes exceeds 1e5.
    SubStrmemory = ReplayMemory(min(num_episodes, 1e5))
    aSelector = ActionSelection(num_episodes*SeqSize)
    
    # Structure critic: a single value for a complete sequence.
    StrCritic = TransformerNN(numberVars=NumClass+1, outputs=1, d_model=d_model, seqSize=SeqSize, PolicyGradients=False).to(device)
    Stroptimizer = optim.Adam(StrCritic.parameters(), lr=1e-4)    
    Strmemory = CombReplayMemory(min(num_episodes, 1e5))
    sSelector = StructureSelection()

    if verbose:
        val_loss = []
        train_loss = []
        
    rewards = []
    criterions = []
    total_steps = 0
    OptimalDone = False
    
    for i_episode in range(num_episodes):
        # Periodic progress report (episodes 10, 1010, 2010, ...).
        if (i_episode%1000)==10:
            taskF.get_best()
        # Initialize the environment and state
        state = reset_state(SeqSize, NumClass)
        assert state.max().item()<=(NumClass+1)
        # Non-final transitions of this episode, held back until the reward is known.
        tmp_buffer = []
        for step in range(SeqSize):
            # Select and perform an action
            # greedy=True: the arg-max action is always taken here.
            action = aSelector.select_action(state, SubStrCritic, NumClass, greedy=True)
            reward = 0.
            next_state = state.clone()
            next_state[0, step] = action
            
            if step==(SeqSize-1):
                done = True
                        
            else:
                done=False
                tmp_buffer.append((state, action, next_state, None))
                
            
            if done:
                # Perform n steps of the optimization (on the policy network)
                tloss = optimize_SubStr(BATCH_SIZE, SubStrCritic, SubStroptimizer, SubStrmemory, gradient_steps=gradient_steps, model=model)
                _ = optimize_Str(BATCH_SIZE, StrCritic, Stroptimizer, Strmemory, gradient_steps=2)
                # Alternative candidate: best stored structure with one position changed.
                nstructure = greedy_perturbation(Strmemory, SeqSize, NumClass)
                # Spread of past rewards is used as the selection temperature.
                # NOTE(review): rewards is empty on the first episode, so this is NaN then.
                rew_std = np.std(rewards)
                Structure, criterion, acceptP = sSelector.select_MH(next_state, nstructure, StrCritic, rew_std)
                
                # If Criterion Was Accepted, Add SubStr Optimisation Result to Memory, if not, add Random Structure Into SubStr Memory Instead 
                rStructure, OptimalDone = taskF.reward(Structure)
                
                # Add full structure to structure Critic
                rStructure = torch.tensor([rStructure], device=device)
                Strmemory.push(Structure, rStructure)
                if criterion:
                    # The agent's own sequence was evaluated: store its forward trajectory.
                    SubStrmemory.push(state, action, None, rStructure)
                    for (a_state, a_action, a_next_state, _) in tmp_buffer:
                        SubStrmemory.push(a_state, a_action, a_next_state, rStructure)
                    
                else:
                    # Final structure was not selected by agent.
                    backwardStore(SubStrmemory, Structure, rStructure, SeqSize, NumClass)

                rewards.append(rStructure.detach().item())
#                 criterions.append(acceptP)
                # Fraction of episodes so far in which the selector accepted the agent's sequence.
                criterions.append(sSelector.action_samples/(i_episode+1))
                total_steps += 1
                break
                
            # Move to the next state
            state = next_state
            # NOTE(review): the final step leaves the loop via `break` above, so this
            # check only runs on non-final steps, with the flag from an earlier episode.
            if OptimalDone:
                taskF.get_best()
                print(f'OPTIMAL with {i_episode} policy actions accepts {sSelector.action_samples} and rejects {sSelector.rejects}')
    
    taskF.get_best()
    print(f'Complete with {i_episode} policy actions accepts {sSelector.action_samples} and rejects {sSelector.rejects}')
    return np.maximum.accumulate(rewards), criterions


In [16]:
from tqdm import tqdm

def optimise_rnn(mem, main_model, BATCH_SIZE, optimizer, criterion, gradient_steps, GAMMA = 0.99):
    """Run Q-learning updates of a recurrent network on sampled episodes.

    Args:
        mem: Episode buffer with ``__len__`` and ``get_batch(bsize=...)``;
            each episode is a sequence of
            (state, action, reward, next_state) elements.
        main_model: Network exposing ``init_hidden_states(bsize=...)`` and
            ``forward(x, hidden_state=..., cell_state=...)``.
        BATCH_SIZE: Number of episodes per gradient step.
        optimizer: Optimizer over ``main_model``'s parameters.
        criterion: Loss applied to (Q(s, a), target).
        gradient_steps: Number of sample/step iterations.
        GAMMA: Discount factor for the bootstrapped target.

    Returns:
        The loss tensor of the last step, or None when the buffer holds fewer
        than ``BATCH_SIZE`` episodes.
    """
    if len(mem)<BATCH_SIZE:
        return None
    for _ in range(gradient_steps):
        hidden_batch, cell_batch = main_model.init_hidden_states(bsize=BATCH_SIZE)
        episodes = mem.get_batch(bsize=BATCH_SIZE)

        def column(field):
            # Gather one field of every element, keeping the episode nesting.
            return np.array([[element[field] for element in episode] for episode in episodes])

        current_states = column(0)
        acts = column(1)
        rewards = column(2)
        next_states = column(3)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        torch_current_states = torch.from_numpy(current_states).long().to(device)
        torch_acts = torch.from_numpy(acts).long().to(device)
        torch_rewards = torch.from_numpy(rewards).float().to(device).squeeze(-1)
        torch_next_states = torch.from_numpy(next_states).long().to(device)

        # Bootstrapped target: r + GAMMA * max_a Q(s', a), detached from the graph.
        Q_next, _ = main_model.forward(torch_next_states, hidden_state=hidden_batch, cell_state=cell_batch)
        Q_next_max, _ = Q_next.clone().detach().max(dim=-1)
        target_values = torch_rewards + (GAMMA * Q_next_max)

        # Value of the action actually taken in each state.
        Q_s, _ = main_model.forward(torch_current_states, hidden_state=hidden_batch, cell_state=cell_batch)
        Q_s_a = Q_s.gather(dim=-1, index=torch_acts).squeeze(-1)
        loss = criterion(Q_s_a, target_values)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(main_model.parameters(), 100)
        optimizer.step()

    return loss

    

class RecurrentMemory():
    """Bounded FIFO buffer of whole episodes for recurrent Q-learning.

    Args:
        memsize: Maximum number of episodes kept; the oldest are dropped
            first. Integral floats (e.g. ``1e5``) are accepted.
    """

    def __init__(self,memsize):
        # deque only accepts an integer maxlen.
        self.memsize = int(memsize)
        self.memory = deque(maxlen=self.memsize)
        self.currSize = 0

    def __len__(self):
        # Report what is actually stored: once the deque is full, old episodes
        # are evicted and a running counter would overstate the size.
        return len(self.memory)

    def add_episode(self,epsiode):
        """Append one episode, evicting the oldest if the buffer is full."""
        self.memory.append(epsiode)
        self.currSize = len(self.memory)

    def get_batch(self,bsize):
        """Return ``bsize`` distinct episodes drawn uniformly at random."""
        return random.sample(self.memory,bsize)

class LSTMQNetwork(nn.Module):
    """Recurrent Q-network: token embedding -> single-layer LSTM -> linear head.

    Args:
        input_size: Stored on the instance; the LSTM input size is ``d_model``.
        d_model: Embedding and LSTM hidden size.
        out_size: Number of outputs per time step.
        numberVars: Largest token id; the embedding has ``numberVars + 1`` rows.
    """

    def __init__(self,input_size=1,d_model=32,out_size=1, numberVars=20):
        super().__init__()
        self.Embedder = nn.Embedding(numberVars+1, d_model, max_norm=True)
        self.input_size = input_size
        self.out_size = out_size
        self.layers = 1
        self.d_model = d_model
        self.lstm_layer = nn.LSTM(input_size=d_model,hidden_size=d_model,num_layers=self.layers,batch_first=True)
        self.val = nn.Linear(in_features=d_model,out_features=out_size)

    def forward(self,x, hidden_state, cell_state):
        """Return per-step outputs and the updated ``(h_n, c_n)`` LSTM state.

        The second-to-last axis of the embedded input is squeezed away before
        the LSTM, which runs batch-first.
        """
        embedded = self.Embedder(x).squeeze(-2)
        out, (h_n, c_n) = self.lstm_layer(embedded, (hidden_state, cell_state))
        return self.val(out), (h_n, c_n)

    def init_hidden_states(self,bsize):
        """Zero hidden and cell states of shape [layers, bsize, d_model] on ``device``."""
        shape = (self.layers, bsize, self.d_model)
        h = torch.zeros(*shape).float().to(device)
        c = torch.zeros(*shape).float().to(device)
        return h,c

def run_ql(taskF, num_episodes = 1000, SeqSize=11, NumClass=20, BATCH_SIZE=16, train_freq=1, verbose=False, saveD=False, d_model=32, gradient_steps=1, model='QL'):
    """Train a Q-learning agent that fills one sequence position per step.

    Each episode builds a sequence of ``SeqSize`` actions. Intermediate steps
    are stored with reward 0; the finished sequence is scored once with
    ``taskF.reward`` on the last step, after which one optimisation call is made.

    Args:
        taskF: task object providing ``reward(seq) -> (reward, optimal_flag)``
            and ``get_best()``.
        num_episodes: number of sequences to build.
        SeqSize: number of positions (steps) per episode.
        NumClass: number of classes passed to the action selector and models.
        BATCH_SIZE: batch size forwarded to the optimisation routine.
        train_freq, verbose, saveD: accepted but not used by this function.
        d_model: model width.
        gradient_steps: forwarded to the optimisation routine.
        model: ``'RQL'`` selects the recurrent ``LSTMQNetwork`` agent; any other
            value selects the ``TransformerNN`` critic and is forwarded to
            ``optimize_SubStr``.

    Returns:
        Running maximum of the per-episode rewards (``np.maximum.accumulate``).
    """
    recurrent = (model == 'RQL')
    # ``min(n, 1e5)`` is a float once n exceeds 1e5, and RecurrentMemory's deque
    # needs an integer ``maxlen``; an integer capacity is used for both buffers.
    mem_capacity = int(min(num_episodes, 1e5))
    if recurrent:
        main_model = LSTMQNetwork(1, d_model, NumClass)
        optimizer = optim.Adam(main_model.parameters(), lr=1e-4)
        memory = RecurrentMemory(mem_capacity)
        criterion = nn.SmoothL1Loss()
    else:
        SubStrCritic = TransformerNN(numberVars=NumClass+1, outputs=NumClass, d_model=d_model, seqSize=SeqSize, PolicyGradients=False).to(device)
        SubStroptimizer = optim.Adam(SubStrCritic.parameters(), lr=1e-4)
        SubStrmemory = ReplayMemory(mem_capacity)

    aSelector = ActionSelection(num_episodes*SeqSize)

    rewards = []
    last_step = SeqSize - 1

    for i_episode in tqdm(range(num_episodes)):
        # Fresh partial sequence (and recurrent state) for every episode.
        local_memory = []
        Structure = reset_state(SeqSize, NumClass)
        if recurrent:
            state = reset_state_naive()
            hidden_state, cell_state = main_model.init_hidden_states(bsize=1)

        for step in range(SeqSize):
            # Select an action for the current position.
            if recurrent:
                q_out, (hidden_state, cell_state) = main_model.forward(state.long().unsqueeze(0), hidden_state=hidden_state, cell_state=cell_state)
                qvals = q_out[:,-1,:]
                action = aSelector.select_recurrent(qvals, NumClass)
                # The chosen action is the next input token of the recurrent agent.
                next_state = action
            else:
                action = aSelector.select_action(Structure, SubStrCritic, NumClass, greedy=False)

            if step != last_step:
                # Intermediate transition: zero reward, sequence not yet scored.
                reward = 0
                if recurrent:
                    local_memory.append((state.long().flatten().numpy(), np.array([action]), np.array([reward]), next_state.long().flatten().numpy()))
                    state = next_state
                else:
                    nStructure = Structure.clone()
                    nStructure[:, step] = action
                    rStructure = torch.tensor([reward], device=device)
                    SubStrmemory.push(Structure.clone(), action, nStructure, rStructure)
                Structure[:, step] = action
                continue

            # Terminal step: score the completed sequence and train.
            nStructure = Structure.clone()
            nStructure[:, step] = action
            reward, OptimalDone = taskF.reward(nStructure)
            if recurrent:
                # NOTE(review): the terminal transition stores the state as float
                # while intermediate ones store it as long -- confirm which dtype
                # optimise_rnn expects before unifying.
                local_memory.append((state.float().flatten().numpy(), np.array([action]), np.array([reward]), next_state.long().flatten().numpy()))
                memory.add_episode(local_memory)
                optimise_rnn(memory, main_model, BATCH_SIZE, optimizer, criterion, gradient_steps)
            else:
                rStructure = torch.tensor([reward], device=device)
                SubStrmemory.push(Structure, action, None, rStructure)
                optimize_SubStr(BATCH_SIZE, SubStrCritic, SubStroptimizer, SubStrmemory, gradient_steps=gradient_steps, model=model)

            rewards.append(reward)

            # Report in the episode that produced the flag. This check used to sit
            # after the terminal ``break``, so it was skipped for that episode and
            # then repeated on every step of the following one.
            if OptimalDone:
                taskF.get_best()
                print(f'OPTIMAL with {i_episode}')

    taskF.get_best()
    return np.maximum.accumulate(rewards)


In [74]:
# Reference epsilon schedule: exponential decay from EPS_START towards EPS_END
# with time constant EPS_DECAY, evaluated for the first 1000 steps.
EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY = 100
steps_done = np.arange(0, 1000, 1)
# Use the named constant rather than repeating the literal so the two cannot drift apart.
eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1. * steps_done/EPS_DECAY)

In [40]:
def get_eg(eps_threshold, steps_done):
    """Simulate one run of Bernoulli draws and return the running non-hit fraction.

    One Bernoulli sample is drawn per step with success probability
    ``eps_threshold``; the cumulative number of successes is divided by
    ``steps_done + 1`` and subtracted from 1.
    """
    draws = np.random.binomial(1, eps_threshold)
    hit_fraction = np.cumsum(draws) / (steps_done + 1)
    return 1.0 - hit_fraction

In [237]:
# Scratch check: str.format still fills the placeholder when the template is a raw string.
r'{}'.format(1)

'1'

In [342]:
import matplotlib as mpl
# Restore Matplotlib's default rcParams so settings left over from earlier runs
# (plot_seaborn below switches on text.usetex) do not leak into this session.
mpl.rcParams.update(mpl.rcParamsDefault)

def plot_seaborn(df_runs, colourD, markerD, save=False, easy=True, x_label='Protein Designs', y_label='max_reward', legend=False, numeps=1000, color='red', length=10):
    """Draw the three-panel summary figure for one task.

    Panels, top to bottom: the mean ``criterion`` of the ``'SQL'`` rows, a
    simulated epsilon-greedy curve built with ``get_eg``, and ``max_reward``
    (multiplied by ``length`` and rounded) per ``step`` for every model. The two
    upper panels are only filled when ``df_runs`` contains ``'SQL'`` rows.

    Args:
        df_runs: frame with at least ``model``, ``step``, ``max_reward`` and
            ``criterion`` columns.
        colourD: palette mapping passed to ``sns.lineplot``.
        markerD: marker mapping passed to ``sns.lineplot``.
        save: when true, write the figure as a PDF named after the title.
        easy: selects "Exploitation" (true) or "Exploration" in the title.
        x_label, y_label: accepted for compatibility; the axis labels are fixed.
        legend: forwarded to the bottom ``sns.lineplot``.
        numeps: number of steps of the simulated epsilon schedule.
        color: line colour of the two upper panels.
        length: sequence length; scales ``max_reward`` and appears in the title.
    """
    plt.rcParams['text.usetex'] = True

    fsize = 25
    # Use the ``length`` argument: reading the notebook-global ``SeqSize`` made
    # the title depend on whichever cell ran last.
    name = f'Synthetic {"Exploitation" if easy else "Exploration"} Task (Length {length})'
    matplotlib.rc('xtick', labelsize=fsize)
    matplotlib.rc('ytick', labelsize=fsize)
    cols = 1
    rows = 1.5
    fig, (axtop, axeps, ax) = plt.subplots(nrows=3, ncols=1, figsize=(6.2*cols, 5*rows),  gridspec_kw={'height_ratios': [2.5, 2.5, 10]})
    sns.set_style("white")

    # reset_index returns a new frame, so the caller's data is not modified below.
    df_runs = df_runs.reset_index()
    df_runs['max_reward'] *= length
    df_runs = df_runs.round({'max_reward': 0})

    print(df_runs.shape)
    greenpastel = sns.color_palette('pastel', 12)[2]
    redpastel = sns.color_palette('pastel', 12)[3]
    dfcrit = df_runs[df_runs.model=='SQL']
    if len(dfcrit) != 0:
        EPS_START = 1.0
        EPS_END = 0.05
        # Guard against a zero time constant when numeps < 10.
        EPS_DECAY = max(1, numeps//10)
        steps_done = np.arange(0, numeps, 1)

        # Select the column before averaging so string columns (model, task)
        # are never fed to ``mean``.
        crit_mean = dfcrit.groupby('step')['criterion'].mean()
        critval = crit_mean.to_numpy()
        ones = np.ones_like(critval)
        # x positions are the actual step values, matching the line drawn below.
        xs = crit_mean.index.to_numpy()
        print('Started Plotting Crit')
        axtop.fill_between(xs,critval, color=greenpastel)
        axtop.fill_between(xs,critval, ones,where=ones>=critval, color=redpastel)
        sns.lineplot(ax=axtop, data=dfcrit, x='step', y='criterion', color=color, linewidth=1, legend=False)
        axtop.set_xlabel('')
        axtop.set_ylabel(r"$\mathbf{\pi_{\mathcal{S}C}}$"
#           "\n"
#           r"$\textbf{Exploitation}$"
                         , fontsize=20, fontweight="bold", labelpad=15)
        axtop.set_xticks([])
        axtop.set_title(name, fontsize=fsize, fontweight="bold")
        for side in ("top", "right", "bottom"):
            axtop.spines[side].set_visible(False)
#         axeps.set_ylim([0, 1])
        axtop.set_yticks([])
        print('Finished Plotting Crit')

        print('Started Plotting Eps')
        eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1. * steps_done/EPS_DECAY)
        # Three independent simulated runs, melted to long form (one row per run and step).
        dfeps = pd.DataFrame(np.array([get_eg(eps_threshold, steps_done) for _ in range(3)]).T).melt()
        dfeps.columns = ['seed', 'criterion']
        dfeps['step'] = np.tile(steps_done, 3)
        eps_mean = dfeps.groupby('step')['criterion'].mean().to_numpy()
        # This panel has ``numeps`` points, which need not equal the number of
        # steps in the criterion panel, so it gets its own x / ceiling arrays.
        eps_ones = np.ones_like(eps_mean)

        sns.lineplot(ax=axeps, data=dfeps, x='step', y='criterion', color=color, linewidth=1, legend=False)
        axeps.fill_between(steps_done,eps_mean, color=greenpastel)
        axeps.fill_between(steps_done,eps_mean, eps_ones,where=eps_ones>=eps_mean, color=redpastel)
        axeps.set_xlabel('')
        axeps.set_ylabel(r"$\mathbf{\pi_{\epsilon}}$"
#           "\n"
#           r"$\textbf{Exploitation}$"
                         , fontsize=20, fontweight="bold", labelpad=15)
#         axeps.set_ylim([0, 1])
        axeps.set_xticks([])
        axeps.set_yticks([])
        for side in ("top", "right"):
            axeps.spines[side].set_visible(False)

        print('Finished Plotting Eps')

    else:
        # Without the upper panels' content the title goes on the main axes.
        ax.set_title(name, fontsize=fsize, fontweight="bold")


    ax.set_xlabel(r"$\textbf{Protein Designs}$", fontsize=20, fontweight="bold")
    ax.set_ylabel(r"$\textbf{Hamming Distance}$"
#                   "\n"
#                   r"$\textbf{Distance}$"
                  , fontsize=20, fontweight="bold", labelpad=15)
#     ax.yaxis.set_label_position("right")
    print('Started Plotting Curves')
    sns.lineplot(ax=ax, data=df_runs, x='step', y='max_reward', hue="model", legend=legend, linewidth=3, palette=colourD, markers=markerD, markevery=50, markersize=3)
    save_friendly = name.replace('\n','').replace(' ','_')
    plt.subplots_adjust(wspace=0, hspace=0)
#     s1 = ax.spines["top"]
#     s1.set_visible(False)
    ax.spines["right"].set_visible(False)
    # Shared vertical caption for the two upper panels (label typo corrected).
    fig.text(0, .75, r"$\textbf{Exploitation}$", va='center', rotation='vertical', fontsize=20)
#     ax.set_ylim(0, 0.8)
    if save:
        plt.savefig(save_friendly+f'_legend_{legend}.pdf', bbox_inches="tight")
#     plt.close()

In [343]:
# Short model/series codes mapped to the names shown in figures.
# Insertion order matters: colours and markers are assigned by position.
LabelDict = {
    'Criterion': 'Criterion',
    'RS': 'Random Search',
    'SA': 'Simulated Annealing',
    'AS': 'Active Search',
    'AMS': 'Attention Model Sampling',
    'AMG': 'Attention Model Greedy',
    'RQL': 'Recurrent Q Learning',
    'SQL': 'Structured Q Learning',
    'High Affinity': 'High Affinity',
    'Super Affinity': 'Super Affinity',
    'Super+ Affinity': 'Super+ Affinity',
}

In [344]:
from matplotlib.lines import Line2D

# One colour and one marker per LabelDict key, assigned by insertion order.
all_markers = ['.', 'o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']
colours = sns.color_palette('dark', len(LabelDict.keys()))
# Reverse lookup: display name -> short code.
LabelDictinv = {v: k for k, v in LabelDict.items()}

colourDINV = {}
markerDINV = {}
for i, key in enumerate(LabelDict):
    colourDINV[key] = colours[i]
    markerDINV[key] = all_markers[i]

In [345]:
# Experiment settings read by the run / plot cells below.
SeqSize = 10
easy = True

n_seeds = 5
neps = 10000

batch_size = 32
gradient_steps = 1 # 20 is good

# Passed as the ``save`` argument of plot_seaborn.
save = True


In [346]:

# Load previously saved run results; the first CSV column is used as the index.
df_mh = pd.read_csv('AllData.csv', index_col=0)

In [None]:
# Re-plot saved results. Widen the lists to [10, 100] / [True, False] to cover
# the other configurations.
# NOTE(review): rows are selected with task == 'easy' / 'hard' here, while the
# run cells below filter on str(easy) -- confirm which labels AllData.csv holds.
for SeqSize in [10]:
    for easy in [True]:
        df_task = df_mh[df_mh.task == ('easy' if easy else 'hard')]
        plot_seaborn(
            df_task,
            colourDINV,
            markerDINV,
            save,
            easy,
            legend=True,
            color=colourDINV['Criterion'],
            length=SeqSize,
            numeps=100,
        )

(400000, 8)
Started Plotting Crit


In [288]:
# Run every method on each (sequence length, task) pair, appending to df_mh,
# then plot the rows belonging to that pair.
# df_mh = pd.read_csv('AllData.csv', index_col=0)
# df_mh = pd.DataFrame({'model': [], 'seed': [], 'step': [], 'max_reward': [], 'criterion':[], 'task':[], 'size':[]})
for SeqSize in [10, 100]:
# for SeqSize in [10]:
    for easy in [True, False]:
#         df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, model='G', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, gradient_steps=gradient_steps, model='SQL', easy=easy, nseq=SeqSize)
#         df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, gradient_steps=gradient_steps, model='QL', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=int(batch_size//SeqSize)+1, gradient_steps=gradient_steps, model='RQL', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, model='RS', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, model='SA', easy=easy, nseq=SeqSize)
        df_task = df_mh[df_mh.task==str(easy)]
        df_task = df_task[df_task['size']==SeqSize]
        # plot_seaborn's fifth positional parameter is ``easy``: the title string
        # that used to be passed there was always truthy, so the hard task was
        # labelled as the easy one. Pass the flag, the real sequence length and
        # the episode count (assumes one row per episode -- TODO confirm).
        plot_seaborn(df_task, colourDINV, markerDINV, save, easy, legend=True, color=colourDINV['Criterion'], length=SeqSize, numeps=neps)
#         df_mh.to_csv('AllData.csv')

Target Sequence is tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
with hamming distance 10 and reward -0.7615941559557649


  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Found Optimal Sequence tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3]) at evaluation 172
with hamming distance 0 and reward 0.0
with hamming distance 0 and reward 0.0


KeyboardInterrupt: 

In [None]:
# Same sweep as the previous run cell, for sequence length 1000; results are
# written back to AllData.csv after every (length, task) pair.
# BATCH_SIZE = 1000
for SeqSize in [1000]:
# for SeqSize in [10]:
    for easy in [True, False]:
#         df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, model='G', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, gradient_steps=gradient_steps, model='SQL', easy=easy, nseq=SeqSize)
#         df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, gradient_steps=gradient_steps, model='QL', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=int(batch_size//SeqSize)+1, gradient_steps=gradient_steps, model='RQL', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, model='RS', easy=easy, nseq=SeqSize)
        df_mh = run_multiple_rl_seeds(df_mh, nseeds=n_seeds, neps=neps, BATCH_SIZE=batch_size, model='SA', easy=easy, nseq=SeqSize)
        df_task = df_mh[df_mh.task==str(easy)]
        df_task = df_task[df_task['size']==SeqSize]
        # plot_seaborn's fifth positional parameter is ``easy``: the title string
        # that used to be passed there was always truthy, and ``length`` fell
        # back to 10. Pass the flag, the real sequence length and the episode
        # count (assumes one row per episode -- TODO confirm).
        plot_seaborn(df_task, colourDINV, markerDINV, save, easy, legend=True, color=colourDINV['Criterion'], length=SeqSize, numeps=neps)
        df_mh.to_csv('AllData.csv')

Target Sequence is tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3

  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


with hamming distance 674 and reward -0.5876049206498948
with hamming distance 653 and reward -0.5736860827264039
with hamming distance 624 and reward -0.5539069192039144
with hamming distance 603 and reward -0.5391808600618703


In [None]:


def batch_zero(batch, Nseq, Nclass, numzeros=1):
    """Return a copy of ``batch`` with randomly chosen positions set to 0.

    For every row, ``max(numzeros, 1)`` positions are drawn independently
    (with replacement) from ``range(Nseq)``; each drawn position is replaced
    by 0. Repeated draws of the same position count once, so a row ends up
    with between 1 and ``numzeros`` zeroed positions.

    Args:
        batch: 2-D integer tensor with ``Nseq`` columns.
        Nseq: number of positions per row.
        Nclass: has no effect on the result; kept for interface compatibility.
            NOTE(review): the original multiplied a zero tensor by ``Nclass``,
            which may have been meant to write a padding class -- confirm.
        numzeros: number of position draws per row.

    Returns:
        Tensor shaped like ``batch`` with the selected entries replaced by 0.
    """
    n_rows = batch.shape[0]
    replace = torch.zeros_like(batch).long()

    def _draw_mask():
        # One random position per row, as a boolean one-hot row mask created
        # on the same device as ``batch``.
        idx = torch.randint(Nseq, (n_rows,), device=batch.device)
        return F.one_hot(idx, Nseq).bool()

    mask = _draw_mask()
    for _ in range(numzeros - 1):
        # OR the masks together: summing one-hots and testing ``== 1`` dropped
        # every position that happened to be drawn more than once.
        mask |= _draw_mask()

    neighs = torch.where(mask, replace, batch)
    return neighs

def batch_perturbation(Nseq, Nsamples, Nclass, best_s=None):
    """Return ``Nsamples`` single-position random perturbations of ``best_s``.

    Args:
        Nseq: number of positions per sequence.
        Nsamples: number of rows in ``best_s`` (one perturbation per row).
        Nclass: exclusive upper bound of the random replacement values.
        best_s: ``(Nsamples, Nseq)`` integer tensor to perturb. When omitted,
            a notebook-level ``best_s`` is used, matching the old behaviour of
            reading that name from the enclosing scope.

    Returns:
        Tensor shaped like ``best_s`` in which one randomly chosen position per
        row holds a random value from ``range(Nclass)`` (which may equal the
        original value).

    Raises:
        ValueError: if ``best_s`` is neither passed nor defined globally.
    """
    if best_s is None:
        best_s = globals().get('best_s')
        if best_s is None:
            raise ValueError('batch_perturbation needs best_s (pass it as the fourth argument)')
    # Draw indices on the same device as the data so torch.where sees one device.
    idx = torch.randint(Nseq,(Nsamples,), device=best_s.device)
    replace = torch.randint_like(best_s, Nclass)
    oneh = F.one_hot(idx, Nseq)
    neighs = torch.where(oneh==1, replace, best_s)
    return neighs

def optimize_structure(policy_net, memory, Nseq, Nclass, Nsamples=1000, BATCH_SIZE=32):
    """Propose the next structure by greedy search around the best one seen.

    While ``memory`` holds fewer than ``BATCH_SIZE`` items, two independent
    random structures are returned. Otherwise the stored structure with the
    highest reward is perturbed ``Nsamples`` times, candidates already present
    in memory are dropped, and the candidate with the highest ``policy_net``
    score is returned.

    Returns:
        ``(candidate, best_seen)``, both of shape ``(1, Nseq)``.
    """
    if len(memory) < BATCH_SIZE:
        return torch.randint(Nclass,(1,Nseq)), torch.randint(Nclass,(1,Nseq))
    #Greedy Search
    batch = CombTransition(*zip(*memory.memory))
    structure_batch = torch.cat(batch.structure)
    rewards_idmax = torch.cat(batch.reward).squeeze().argmax()
    best_s_single = structure_batch[rewards_idmax][None, :]
    best_s = best_s_single.clone().repeat(Nsamples,1)
    # The helper takes (Nseq, Nsamples, Nclass, best_s); the previous call passed
    # ``memory`` as an extra first argument and never supplied ``best_s``.
    neighs = batch_perturbation(Nseq, Nsamples, Nclass, best_s)
    # Remove observed structures. The mask used to be computed and indexed but
    # the result was thrown away. Note the comparison materialises an
    # (Nsamples, len(memory), Nseq) boolean tensor.
    unseen = (structure_batch==neighs[:, None]).all(2).any(1).logical_not()
    if unseen.any():
        # Keep the unfiltered candidates if every neighbour was already seen,
        # so argmax below never runs on an empty tensor.
        neighs = neighs[unseen]
    with torch.no_grad():
        npredsMAX = policy_net(neighs).squeeze().argmax()
    return neighs[npredsMAX][None, :], best_s_single

            
        

def run_sql(taskF, num_episodes = 1000, SeqSize=11, NumClass=20, BATCH_SIZE=32, train_freq=1, verbose=False, saveD=False, d_model=32):
    """Run the structured Q-learning search loop on ``taskF``.

    Every episode proposes a structure with ``optimize_structure``, scores it
    with ``taskF.reward``, stores ``(structure, reward)`` in the replay memory
    and performs one ``optimize_model_sql`` update.

    Args:
        taskF: task object providing ``reward(seq) -> (reward, flag)`` and
            ``get_best()``.
        num_episodes: number of structures to evaluate.
        SeqSize: sequence length.
        NumClass: number of classes per position.
        BATCH_SIZE: batch size for the critic update and the warm-up threshold
            of ``optimize_structure``.
        train_freq: accepted but not used.
        verbose: when true, also record train/validation losses and plot them
            with ``plot_rewards``.
        saveD: when true, evaluated structures are collected locally (they are
            not returned).
        d_model: critic width.

    Returns:
        Running maximum of the per-episode rewards (``np.maximum.accumulate``).
    """
    #Always add 1 to Q learning for padded class.
    policyNet = TransformerNN(numberVars=NumClass, outputs=1, d_model=d_model, seqSize=SeqSize).to(device)
    # No second critic is used; ``None`` is forwarded to the selector.
    policyNet2 = None
    optimizer = optim.Adam(policyNet.parameters(), lr=1e-4)
    memory = CombReplayMemory(min(num_episodes, 1e5))
    selector = StructureAction(num_episodes)
    rewards = []
    if verbose:
        val_loss = []
        train_loss = []
    if saveD:
        sequence_designs = []
    for i_episode in range(num_episodes):
        # Forward the configured batch size so the random warm-up phase ends when
        # the critic can actually be trained (previously the default 32 was used
        # regardless of BATCH_SIZE).
        structure, bests = optimize_structure(policyNet, memory, SeqSize, NumClass, BATCH_SIZE=BATCH_SIZE)
        nstructure = perturbation(bests, SeqSize, NumClass)
        # The selector's return value is not used here; the call is kept because
        # its counters are reported at the end of the run.
        selector.select_action(structure, nstructure, policyNet, policyNet2, NumClass, MH=False)
        if saveD:
            sequence_designs.append(structure)
        reward, _ = taskF.reward(structure)
        # Store the transition in memory
        reward = torch.tensor([reward], device=device)
        memory.push(structure, reward)
        tloss = optimize_model_sql(BATCH_SIZE, policyNet, optimizer, memory)
        if verbose:
            train_loss.append(tloss)
            val_loss.append(val_sql_critic(taskF, policyNet, NumClass, SeqSize))
        rewards.append(reward.detach().item())

        # Periodic progress report.
        if (i_episode%1000)==10:
            taskF.get_best()

    taskF.get_best()
    print(f'Complete with {i_episode} policy actions accepts {selector.action_samples} and rejects {selector.rejects}')
    if verbose:
        plot_rewards(rewards, val_loss, train_loss)
    return np.maximum.accumulate(rewards)


In [None]:
# Scratch call: up to 10 position draws per row on sequences of length 11 with 20 classes.
# NOTE(review): `structure_batch` is not defined at notebook level in the visible
# cells (it is a local of optimize_structure) -- this cell needs it defined first.
batch_zero(structure_batch, 11, 20, 10)

In [None]:
# Single structured Q-learning run with loss tracking and plots.
NumClass = 20
SeqSize = 11
task = SeqOptimisation(SeqSize, NumClass)
# Reuse the sizes defined above so the task and the agent cannot disagree.
rews = run_sql(task, num_episodes=2000, SeqSize=SeqSize, NumClass=NumClass, BATCH_SIZE=32, verbose=True)

In [None]:
# Distinct values of the `model` column in df_all.
df_all.model.unique()

In [None]:
# Run 10 seeds of the 'SQL' model for 2000 episodes each, starting from df_all.
df_sql = run_multiple_rl_seeds(df_all, nseeds=10, neps=2000, model='SQL', verbose=False)

In [None]:
# Distinct values of the `model` column after the SQL runs.
df_sql.model.unique()

In [None]:
# plot_seaborn requires the colour and marker mappings as its second and third
# arguments; the old two-argument call raised a TypeError.
plot_seaborn(df_sql, colourDINV, markerDINV, easy=True)

In [None]:
# Column-wise maxima over the rows whose model is 'SQL'.
df_sql[df_sql.model=='SQL'].max()

In [None]:
def plot_rewards(rewards, val_loss, train_loss):
    """Show four diagnostic plots in sequence, each redrawn on figure 1.

    In order: running maximum of ``rewards``, ``val_loss``, ``train_loss`` and
    the 100-step moving average of ``rewards`` (drawn only when at least 100
    rewards are available; the first 99 entries are padded with zeros).
    """
    reward_series = torch.tensor(rewards, dtype=torch.float)
    val_series = torch.tensor(val_loss, dtype=torch.float)

    def _start_panel(ylabel):
        # Reuse figure 1, wipe it, and apply the labels common to every panel.
        plt.figure(1)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel(ylabel)

    _start_panel('Max Rewards')
    plt.plot(np.maximum.accumulate(reward_series.numpy()), label='Max Reward')
    plt.show()

    _start_panel('Val Loss')
    plt.plot(val_series.numpy(), label='val loss')
    plt.show()

    _start_panel('Train Loss')
    plt.plot(np.array(train_loss), label='Train loss')
    plt.show()

    _start_panel('100 Step Average Reward')
    # Take 100 episode averages and plot them too
    if len(reward_series) >= 100:
        window_means = reward_series.unfold(0, 100, 1).mean(1).view(-1)
        padded = torch.cat((torch.zeros(99), window_means))
        plt.plot(padded.numpy(), label='100 steps Reward Average')

    plt.show()