## In this tutorial we'll show how to train a DDPG agent to learn optimal Kennedy receiver. This involve not only displacement optimization but also to learn a (trivial, yet demaning) guessing rule among the possible phases. We aim to clear the background so in the next tutorial we can consider complex displacements and thereby more phases

In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm_notebook as tqdm
tf.keras.backend.set_floatx('float64')
from collections import deque
from datetime import datetime
import random

#### this is the outcome probability, given by the overlap <0|\alpha - \beta>|¨^{2}
def Prob(alpha, beta, n):
    p0 = np.exp(-(alpha-beta)**2)
    if n == 0:
        return p0
    else:
        return 1-p0

### this is just p(R=1 | g, n; beta) = p((-1^{g} alpha | n)) = p(n|allpha) pr(alpha)(p(n))
def qval(beta, n, guess):
    #dolinar guessing rule (= max-likelihood for L=1, careful sign of \beta)
    alpha = 0.4
    pn = np.sum([Prob(g*alpha, beta, n) for g in [-1,1]])
    return Prob(guess*alpha, beta, n)/pn

def ps_maxlik(beta):
    #dolinar guessing rule (= max-likelihood for L=1, careful sign of \beta)
    alpha = 0.4
    p=0
    for n1 in [0,1]:
       p+=Prob(np.sign(beta)*(-1)**(n1)*alpha, beta, n1)
    return p/2


##### Now we create the replay memory, in which we store the transitions with the corresponding rewards, and also we add some data to test how the agent is doing

In [76]:
class ReplayBuffer():
    def __init__(self, buffer_size=10**3):
        self.buffer_size = buffer_size
        self.count = 0
        self.betas = np.arange(-1.5,1.5,0.01)
        self.buffer = deque()
        self.create_test_datasets()
        
    def create_test_datasets(self):
        dt_l0 = []
        dt_l1 = []
        
        for k in self.betas:
            dt_l0.append([k, ps_maxlik(k)])
            for n in [0.,1.]:
                for g in [-1.,1.]:
                    dt_l1.append([k, n, g, qval(k,n,g)])
        self.test_l0 = np.array(dt_l0)
        self.test_l1 = np.array(dt_l1)
        return
    
    def add(self, beta, outcome, guess, reward):
        experience = (beta, outcome, guess, reward)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample(self, batch_size):
        batch = []
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, int(batch_size))
        beta_batch, outcome_batch, guess_batch, r_batch= list(map(np.array, list(zip(*batch))))
        return np.array([beta_batch, outcome_batch, guess_batch, r_batch]).transpose()

    def clear(self):
        self.buffer.clear()
        self.count = 0

Next we define the networks! notice we add some features that may be unimportant for some objects

In [3]:

class Critic(tf.keras.Model):
    #input_dim: 1 if layer=0, 3 if layer= 2, for the Kennedy receiver ##
    def __init__(self, input_dim, valreg=0.01, seed_val=0.1):
        super(Critic,self).__init__()

        self.l1 = Dense(50, input_shape=(input_dim,),kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg))

        self.l2 = Dense(50, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))
        self.l3 = Dense(50, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        self.l4 = Dense(50, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        self.l5 = Dense(1, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        
        
    
    def update_target_parameters(self,primary_net, tau=0.01):
        #### only 
        prim_weights = primary_net.get_weights()
        targ_weights = self.get_weights()
        weights = []
        for i in tf.range(len(prim_weights)):
            weights.append(tau * prim_weights[i] + (1 - tau) * targ_weights[i])
        self.set_weights(weights)
        return

    def call(self, input):
        feat = tf.nn.relu(self.l1(input))
#        feat = tf.nn.dropout(feat, rate=0.01)
 #       feat = tf.nn.relu(self.l2(feat))
  #      feat = tf.nn.dropout(feat, rate=0.01)
   #     feat = tf.nn.relu(self.l3(feat))
        feat = tf.nn.relu(self.l4(feat))
        feat = tf.nn.sigmoid(self.l5(feat))
        return feat

    def calculate_greedy_from_batch(self, batch):
        """ this function is only to intended for Q(n, beta, guess).
        Assuming batch = np.array([[beta, n, guess], [beta1, n1, guess], ...])
        
        """
        a = batch.copy()
        preds1 = self(a)
        a[:,2] = -a[:,2]
        preds2 = self(a)
        both = tf.concat([preds1,preds2],1)
        maxs = np.squeeze(tf.math.reduce_max(both,axis=1))
        maxs = np.expand_dims(maxs, axis=1)
        return maxs
    
    def give_favourite_guess(self, beta, outcome):
        """"This funciton is only intended for Q(n, beta, guess)"""
        h1a2 = np.array([[beta, outcome,-1.]])
        pred_minus = self(h1a2)
        h1a2[:,2] = 1.
        pred_plus = self(h1a2)
        both = tf.concat([pred_plus,pred_minus],1)
        maxs = tf.argmax(both,axis=1)
        guess = (-1)**maxs.numpy()[0]
        return guess

    def __str__(self):
        return self.name

    
    
##### ACTOR CLASSS ####    
class Actor(tf.keras.Model):
    #input_dim: 1 if layer=0, 3 if layer= 2, for the Kennedy receiver ##
    def __init__(self, input_dim=1, valreg=0.01, seed_val=0.1):
        super(Actor,self).__init__()

        self.l1 = Dense(50, input_shape=(input_dim,),kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg))

        self.l2 = Dense(50, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))
        self.l3 = Dense(50, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        self.l4 = Dense(50, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        self.l5 = Dense(1, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        
    def update_target_parameters(self,primary_net, tau=0.01):
        #### only 
        prim_weights = primary_net.get_weights()
        targ_weights = self.get_weights()
        weights = []
        for i in tf.range(len(prim_weights)):
            weights.append(tau * prim_weights[i] + (1 - tau) * targ_weights[i])
        self.set_weights(weights)
        return

    def call(self, input):
        feat = tf.nn.relu(self.l1(input))
#        feat = tf.nn.dropout(feat, rate=0.01)
 #       feat = tf.nn.relu(self.l2(feat))
        feat = tf.nn.dropout(feat, rate=0.01)
   #     feat = tf.nn.relu(self.l3(feat))
        feat = tf.nn.relu(self.l4(feat))
        feat = tf.nn.tanh(self.l5(feat))
        return feat

    def calculate_greedy_from_batch(self, batch):
        """ this function is only to intended for Q(n, beta, guess)"""
        a = batch[1].copy()
        preds1 = self(a)
        a[:,2] = -a[:,2]
        preds2 = self(a)
        both = tf.concat([preds1,preds2],1)
        maxs = np.squeeze(tf.math.reduce_max(both,axis=1))
        maxs = np.expand_dims(maxs, axis=1)
        return maxs

    def __str__(self):
        return self.name


This are some functions for 1) compute the predictions and 2) optimization step

In [114]:
def testing_data(buffer,networks):
    actor_q0, critic_q0, critic_guess, target_guess = networks
    ### this is the test data for the guess network, defined in Dataset() classs
    predstest = critic_guess(buffer.test_l1[:,[0,1,2]])
    targets_1 = np.expand_dims(buffer.test_l1[:,3], axis=1)
    loss_test_l1 = tf.keras.losses.MSE(targets_1, predstest)
    loss_test_l1 = tf.reduce_mean(loss_test_l1)
    test_loss_l1(loss_test_l1)
    print("test loss!!!! l1!!!: ",loss_test_l1.numpy())
    
    ### this is the test data for the \hat{Q}('beta) #####
    preds_test_l0 = critic_q0(np.expand_dims(buffer.test_l0[:,0], axis=1))
    loss_y0 = tf.keras.losses.MSE(np.expand_dims(buffer.test_l0[:,1], axis=1), preds_test_l0)
    loss_y0 = tf.reduce_mean(loss_y0)
    test_loss_l0(loss_y0)

    return



def optimization_step(networks, optimizers, buffer, batch_size=500., tau=0.01, repetitions=1):
    actor_q0, critic_q0, critic_guess, target_guess = networks
    optimizer_critic_guess,  optimizer_actor_l0, optimizer_critic_l0 = optimizers
    for thoughts in range(repetitions):
        experiences = buffer.sample(batch_size)

        ##### update the critic guess according to rewards obtained 
        with tf.GradientTape() as tape:
            tape.watch(critic_guess.trainable_variables)
            preds_cguess = critic_guess(experiences[:,[0,1,2]])
            labels_cguess = np.expand_dims(experiences[:,3],axis=1)
            loss_prim_guess = tf.keras.losses.MSE(labels_cguess, preds_cguess)
            loss_prim_guess = tf.reduce_mean(loss_prim_guess)
            grads = tape.gradient(loss_prim_guess, critic_guess.trainable_variables)
            optimizer_critic_guess.apply_gradients(zip(grads, critic_guess.trainable_variables))
            train_loss_l1(loss_prim_guess)

        ##### update the target guess ######
        target_guess.update_target_parameters(critic_guess, tau=0.01) #check this value !

        #### obtain the labels for the update of Q(\beta)
        labels_critic_l0 = target_guess.calculate_greedy_from_batch(experiences[:,[0,1,2]]) #greedy from target; this is the label for net_0!!

        with tf.GradientTape() as tape:
            tape.watch(critic_q0.trainable_variables)
            preds0 = critic_q0(np.expand_dims(experiences[:,0],axis=1))
            loss_0 = tf.keras.losses.MSE(labels_critic_l0,preds0)
            loss_0 = tf.reduce_mean(loss_0)
            grads0 = tape.gradient(loss_0, critic_q0.trainable_variables)
            optimizer_critic_l0.apply_gradients(zip(grads0, critic_q0.trainable_variables))
        train_loss_l0(loss_0)

        #### obtain the components for the chain for the update of \pi( h_0 = nada!) = \beta
        with tf.GradientTape() as tape:
            actions = actor_q0(np.expand_dims(np.zeros(len(experiences)),axis=1))
            tape.watch(actions)
            qvals = critic_q0(actions)
            dq_da = tape.gradient(qvals, actions)

        ### update actor \pi( h_0) = \beta
        with tf.GradientTape() as tape:
            actions = actor_q0(np.expand_dims(np.zeros(len(experiences)),axis=1))
            da_dtheta = tape.gradient(actions, actor_q0.trainable_variables, output_gradients=-dq_da)
            optimizer_actor_l0.apply_gradients(zip(da_dtheta, actor_q0.trainable_variables))
    return

In [115]:
def BigPlot(buffer, rt, pt, history_betas, history_betas_would_have_done, histo_preds, losses, directory):

    plt.figure(figsize=(40,40), dpi=100)
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    T=len(rt)
    ax1=plt.subplot2grid((2,4),(0,0))
    ax2=plt.subplot2grid((2,4),(1,0))
    ax3=plt.subplot2grid((2,4),(0,1))
    ax4=plt.subplot2grid((2,4),(1,1))
    ax5=plt.subplot2grid((2,4),(0,2))
    ax6=plt.subplot2grid((2,4),(1,2))
    ax7=plt.subplot2grid((2,4),(0,3))
    ax8=plt.subplot2grid((2,4),(1,3))

    optimal = max([ps_maxlik(b) for b in buffer.betas])

    ### ploting the \Rt and \Pt ###
    ax1.plot(np.log10(np.arange(1,T+1)),rt, color="red", linewidth=15, alpha=0.8, label=r'$R_t$')
    ax1.plot(np.log10(np.arange(1,T+1)),optimal*np.ones(T), color="black",  linewidth=15,alpha=0.5, label="optimal")
    ax1.plot(np.log10(np.arange(1,T+1)),pt, color="blue", linewidth=8, alpha=0.3, label=r'$P_t (fluctuates!)$')

    ## ploting the histogram for betas ##
    optimal_beta = buffer.betas[np.where(ps_maxlik(buffer.betas) == max(ps_maxlik(buffer.betas)))[0][0]]
    ax2.hist(history_betas,bins=100, facecolor='r', alpha=0.6, edgecolor='blue', label="done")
    ax2.hist(history_betas_would_have_done,bins=100, facecolor='g', alpha=0.4, edgecolor='black', label="would have done")
    ax2.text(optimal_beta, 0, "*", size=30)
    ax2.text(-optimal_beta, 0, "*", size=30)

    ## ploting the history of betas ##
    ax3.plot(np.arange(1, len(history_betas)+1),history_betas, color="red", linewidth=15, alpha=0.8, label="done")
    ax3.plot(np.arange(1, len(history_betas)+1),history_betas_would_have_done, color="green", linewidth=15, alpha=0.8, label="would have done")
    ax3.plot(np.arange(1, len(history_betas)+1),np.ones(len(history_betas))*optimal_beta, color="black", linewidth=15, alpha=0.8, label="optimal-beta")
    ax3.plot(np.arange(1, len(history_betas)+1),-np.ones(len(history_betas))*optimal_beta, color="black", linewidth=15, alpha=0.8)#, label="optimal-beta")


    #### in here i plot the loss for the first Q(0), the test and the train. Notice they have different scale! I use different colors!
    c=0
    lab = ["train","test"]
    colors = ["tab:red","tab:blue"]
    for loss in losses[0]:
        color = colors[c]
        ax4.plot(np.arange(1,len(loss)+1),loss,'--',alpha=0.85,c=colors[c], linewidth=5, label="Preds Q(\beta) - "+lab[c])#, label="Q(n1=0,"+r'$\beta$'+"; g=-1)")
        ax4.scatter(np.arange(1,len(loss)+1),loss,s=150,alpha=0.85,c=colors[c], linewidth=5,label="Preds Q(\beta) - "+lab[c])#, label="Q(n1=0,"+r'$\beta$'+"; g=-1)")
        ax4.set_xlabel("epoch", size=20)
        ax4.set_ylabel("Loss Q(\beta_1)",size=20, color =colors[c])
        ax4.tick_params(axis='y', labelcolor=colors[c])
        ax4 = ax4.twinx()  # instantiate a second axes that shares the same x-axis
        c+=1

    #### in here i plot the loss for the first Q(\beta, n, guess ), the test and the train. Notice they have different scale! I use different colors!
    c=0
    for loss in losses[1]:
        ax5.plot(np.arange(1,len(loss)+1),loss,'--',alpha=0.85,c=colors[c], linewidth=5, label="Preds Q(n, \beta, guess) - "+lab[c])#, label="Q(n1=0,"+r'$\beta$'+"; g=-1)")
        ax5.scatter(np.arange(1,len(loss)+1),loss,s=150,alpha=0.85,c=colors[c], linewidth=5, label="Preds Q(n, \beta, guess) - "+lab[c])#, label="Q(n1=0,"+r'$\beta$'+"; g=-1)")
        ax5.set_xlabel("epoch", size=20)
        ax5.set_ylabel("Loss Q(\beta_1, n, guess)",size=20, color =colors[c])
        ax5.tick_params(axis='y', size=20, labelcolor=colors[c])
        ax5 = ax5.twinx()  # instantiate a second axes that shares the same x-axis
        c+=1
        #plt.tight_layout()  # otherwise the right y-label is slightly clipped    


    betas_train = buffer.betas
    for predictions in histo_preds["net_1"].values():
        ax7.plot(betas_train,predictions["values"]["0"],alpha=0.25, linewidth=5)#, label="epoch: "+str(predictions["epoch_number"])) #, label=r'$\hat{Q}$'+"(n1=0,"+r'$\beta$'+"; g=-1)")
        ax7.plot(betas_train,predictions["values"]["1"],alpha=0.25, linewidth=5)#, label="epoch: "+str(predictions["epoch_number"]))#,label=r'$\hat{Q}$'+"(n1=0,"+r'$\beta$'+"; g=1)")

        ax8.plot(betas_train,predictions["values"]["2"] ,alpha=0.25,  linewidth=5)#, label="epoch: "+str(predictions["epoch_number"]))#label=r'$\hat{Q}$'+"(n1=1,"+r'$\beta$'+"; g=-1)")
        ax8.plot(betas_train,predictions["values"]["3"] ,alpha=0.25,  linewidth=5)#, label="epoch: "+str(predictions["epoch_number"]))#,label=r'$\hat{Q}$'+"(n1=1,"+r'$\beta$'+"; g=1)")

    #Now we take the last and plot it in bold!
    ax7.plot(betas_train,predictions["values"]["0"],alpha=0.85, c="black",linewidth=8)#), label="epoch: "+str(predictions["epoch_number"])) #, label=r'$\hat{Q}$'+"(n1=0,"+r'$\beta$'+"; g=-1)")
    ax7.plot(betas_train,predictions["values"]["1"],alpha=0.85, c="purple", linewidth=8)#, label="epoch: "+str(predictions["epoch_number"]))#,label=r'$\hat{Q}$'+"(n1=0,"+r'$\beta$'+"; g=1)")
    ax7.scatter(betas_train,predictions["values"]["0"],alpha=0.85, c="black",s=150)
    ax7.scatter(betas_train,predictions["values"]["1"],alpha=0.85, c="purple",s=150)

    ax8.plot(betas_train,predictions["values"]["2"] ,alpha=0.85, c="black", linewidth=8)#, label="epoch: "+str(predictions["epoch_number"]))#label=r'$\hat{Q}$'+"(n1=1,"+r'$\beta$'+"; g=-1)")
    ax8.plot(betas_train,predictions["values"]["3"] ,alpha=0.85,  c="purple",linewidth=8)#, label="epoch: "+str(predictions["epoch_number"]))#,label=r'$\hat{Q}$'+"(n1=1,"+r'$\beta$'+"; g=1)")
    ax8.scatter(betas_train,predictions["values"]["2"],alpha=0.85, c="black",s=150)
    ax8.scatter(betas_train,predictions["values"]["3"],alpha=0.85, c="purple",s=150)


        ### we do the same for ax3:

    for predictions in histo_preds["net_0"].values():
        ax6.plot(betas_train,predictions["values"],alpha=0.15, linewidth=5)#, label="epoch: "+str(predictions["epoch_number"])) #, label=r'$\hat{Q}$'+"(n1=0,"+r'$\beta$'+"; g=-1)")

    #The last one black and bigger!
    ax6.plot(betas_train,predictions["values"],alpha=0.85,c="black", linewidth=5)#, label="epoch: "+str(predictions["epoch_number"])) #, label=r'$\hat{Q}$'+"(n1=0,"+r'$\beta$'+"; g=-1)")


    ##### here we plot the true values (that we want to learn!!!) ###
    ax7.plot(buffer.betas,[qval(b, 0, -1) for b in buffer.betas],'--',alpha=0.85,c="red", linewidth=8, label="Q(n1=0,"+r'$\beta$'+"; g=-1)")
    ax7.plot(buffer.betas,[qval(b, 0, 1) for b in buffer.betas],'--',alpha=0.85,c="blue",  linewidth=8,label="Q(n1=0,"+r'$\beta$'+"; g=1)")

    ax8.plot(buffer.betas,[qval(b, 1, -1) for b in buffer.betas],'--',alpha=0.85,c="red",  linewidth=8,label="Q(n1=1,"+r'$\beta$'+"; g=-1)")
    ax8.plot(buffer.betas,[qval(b, 1, 1) for b in buffer.betas],'--',alpha=0.85,c="blue",  linewidth=8,label="Q(n1=1,"+r'$\beta$'+"; g=1)")

    ax6.plot(buffer.betas,[ps_maxlik(b) for b in buffer.betas],'--',alpha=0.85,c="red", linewidth=8)
    ax6.set_ylabel(r'$P_s\; ( \beta )$', size=20)
    ##### here we plot the true values (that we want to learn!!!) ###



    for ax in [ax6, ax7, ax8]:
        ax.set_xlabel(r'$\beta$', size=20)

    for ax in [ax1, ax2, ax3,ax4,ax5,ax6, ax7, ax8]:
        ax.legend(prop={"size":30})

    plt.savefig(directory+"/big_plot.png")
    plt.close()
    return

In [116]:
def plot_inside_buffer(buffer, directory):
    

    plt.figure(figsize=(15,10))
    ax1 =  plt.subplot2grid((1,2),(0,0))
    ax2 =  plt.subplot2grid((1,2),(0,1))

    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    histo = {}
    number = {}

    data_collected = np.asarray(buffer.buffer)
    for k in data_collected[:,0]:
        for g in [-1.,1.]:
            for outcome in [0.,1.]:

                histo[str(np.round(k,2))+"n"+str(outcome)+"g"+str(g)] = 0
                number[str(np.round(k,2))+"n"+str(outcome)+"g"+str(g)] = 1

    for dato in data_collected:
        histo[str(np.round(dato[0],2))+"n"+str(dato[1])+"g"+str(dato[2])] += dato[3]
        number[str(np.round(dato[0],2))+"n"+str(dato[1])+"g"+str(dato[2])] += 1

    for k in data_collected[:,0]:
        for g in [-1.,1.]:
            for outcome in [0.,1.]:
                histo[str(np.round(k,2))+"n"+str(outcome)+"g"+str(g)] /=number[str(np.round(k,2))+"n"+str(outcome)+"g"+str(g)] 



    betas  = [np.round(b,2) for b in data_collected[:,0]]
    ax1.plot(betas,[histo[str(np.round(b,2))+"n0.0g-1.0"] for b in data_collected[:,0]],alpha=0.5,c="red", linewidth=5, label="Q(n1=0,"+r'$\beta$'+"; g=-1)")
    ax1.plot(betas,[histo[str(np.round(b,2))+"n0.0g1.0"] for b in data_collected[:,0]],alpha=0.5,c="blue", linewidth=5, label="Q(n1=0,"+r'$\beta$'+"; g=-1)")

    ax2.plot(betas,[histo[str(np.round(b,2))+"n1.0g-1.0"] for b in data_collected[:,0]],alpha=0.5,c="red", linewidth=5, label="Q(n1=1,"+r'$\beta$'+"; g=-1)")
    ax2.plot(betas,[histo[str(np.round(b,2))+"n1.0g1.0"] for b in data_collected[:,0]],alpha=0.5,c="blue", linewidth=5, label="Q(n1=1,"+r'$\beta$'+"; g=-1)")

    betas = np.arange(-1.5,1.5,.01)
    ax1.plot(betas,[qval(b, 0, -1) for b in betas],'--',alpha=0.5,c="red", linewidth=5, label="True Q(n1=0,"+r'$\beta$'+"; g=-1)")
    ax1.plot(betas,[qval(b, 0, 1) for b in betas],'--',alpha=0.5,c="blue",  linewidth=5,label="True Q(n1=0,"+r'$\beta$'+"; g=1)")

    ax2.plot(betas,[qval(b, 1, -1) for b in betas],'--',alpha=0.5,c="red",  linewidth=5,label="True Q(n1=1,"+r'$\beta$'+"; g=-1)")
    ax2.plot(betas,[qval(b, 1, 1) for b in betas],'--',alpha=0.5,c="blue",  linewidth=5,label="True Q(n1=1,"+r'$\beta$'+"; g=1)")


    for ax in [ax1, ax2]:
        ax.set_xlabel(r'$\beta$', size=20)
        ax.legend(prop={"size":15})

    plt.savefig(directory+"/inside_buffer.png")
    plt.close()
    return

###### if you want to delete the folder with data generated by tensorflow (and the program), you can do it with this command .... !rm -rf "logs/ddpg_results"

In [123]:
def ddpgKennedy(total_episodes = 10**3,buffer_size=500, batch_size=64, ep_guess=0.01, noise_displacement=0.5,lr_actor=0.01, lr_critic=0.001, tau=0.005, repetitions=1):

    amplitude = 0.4
    buffer = ReplayBuffer(buffer_size=buffer_size)
    
    critic_q0 = Critic(input_dim=1)
    actor_q0 = Actor(input_dim=1)
    critic_guess = Critic(input_dim=3)
    target_guess = Critic(input_dim=3)

    critic_q0(np.array([[0.],[1.]])) #initialize the network 0, arbitrary inputs.
    actor_q0(np.array([[0.],[1.]])) #initialize the network 0, arbitrary inputs.
    critic_guess(np.array([[0.,1.,1.]]))
    target_guess(np.array([[0.,1.,1.]]))
    #
    optimizer_critic_guess = tf.keras.optimizers.Adam(lr=lr_critic)
    optimizer_actor_l0 = tf.keras.optimizers.Adam(lr=lr_actor)
    optimizer_critic_l0 = tf.keras.optimizers.Adam(lr=lr_actor)



    
    rt = []
    pt = [] 

    global train_loss_l0, train_loss_l1, test_loss_l0, test_loss_l1 #define this global so i use them in a function defined above... optimizatin step and testing()
    train_loss_l0 = tf.keras.metrics.Mean('train_loss_l0', dtype=tf.float32)
    test_loss_l0 = tf.keras.metrics.Mean('test_loss_l0', dtype=tf.float32)
    train_loss_l1 = tf.keras.metrics.Mean('train_loss_l1', dtype=tf.float32)
    test_loss_l1 = tf.keras.metrics.Mean('test_loss_l1', dtype=tf.float32)


    current_run_and_time = "time-{}".format(datetime.now().strftime("%Y%m%d-%H%M"))
    directory = 'logs/ddpg_results/' + current_run_and_time 
    train_log_dir_0 = 'logs/ddpg_results/' + current_run_and_time + '/train_l0'
    test_log_dir_0 = 'logs/ddpg_results/' +  current_run_and_time + '/test_l0'
    train_log_dir_1 = 'logs/ddpg_results/' +  current_run_and_time + '/train_l1'
    test_log_dir_1 = 'logs/ddpg_results/' +  current_run_and_time + '/test_l1'
    train_summary_writer_0 = tf.summary.create_file_writer(train_log_dir_0)
    train_summary_writer_1 = tf.summary.create_file_writer(train_log_dir_1)
    test_summary_writer_0 = tf.summary.create_file_writer(test_log_dir_0)
    test_summary_writer_1 = tf.summary.create_file_writer(test_log_dir_1)

    info_optimizers = "optimizer_critic_guess: {} \nOptimizer_actor_l0: {}\nOptimizer_critic_l0: {}\n".format(optimizer_critic_guess.get_config(), optimizer_actor_l0.get_config(), optimizer_critic_l0)
    infor_buffer = "Buffer_size: {}\n Batch_size for sampling: {}\n".format(buffer.buffer_size, batch_size)
    info_epsilons= "epsilon-guess: {}\nepsilon_displacement_noise: {}".format(ep_guess,noise_displacement)
    
    data = "tau: {}, repetitions per optimization step (would be like epochs): {}".format(tau,repetitions) + "\n \n**** optimizers ***\n"+info_optimizers+"\n\n\n*** BUFFER ***\n"+infor_buffer+"\n\n\n *** NOISE PARAMETERS *** \n"+info_epsilons
    with open(directory+"/info.txt", 'w') as f:
        f.write(data)
        f.close()    
    
    print("Beggining to train! \n \n")
    print(data)
    print("starting time: {}".format(datetime.now().strftime("%Y%m%d-%H%M")))
    print("saving results in " + str(directory))
    avg_train_l0 = []
    avg_train_l1 = []
    avg_test_l0 = []
    avg_test_l1 = []

    history_betas = [] #to put in histogram
    history_betas_would_have_done=[] #to put in histogram
    histo_preds = {"net_0":{}, "net_1":{}} #here i save the predictions to plot in a "straightforward way"

    #######
    for episode in tqdm(range(total_episodes)):

        alice_phase = np.random.choice([-1.,1.],1)[0]
        beta_would_do = actor_q0(np.array([[0.]])).numpy()[0][0]
        beta =  beta_would_do + np.random.uniform(-noise_displacement, noise_displacement)
        proboutcome = Prob(alice_phase*amplitude,beta,0)
        outcome = np.random.choice([0.,1.],1,p=[proboutcome, 1-proboutcome])[0]

        history_betas.append(beta)
        history_betas_would_have_done.append(beta_would_do)
        #epsilon-greedy choice of the guessing! Do you imagine other way to do this? How would you apply UCB ? discretize?
        if np.random.random()< ep_guess:
            guess = np.random.choice([-1.,1.],1)[0]
        else:
            guess = critic_guess.give_favourite_guess(beta, outcome) 
        if guess == alice_phase:
            reward = 1.
        else:
            reward = 0.
        buffer.add(beta, outcome, guess, reward)


        ### optimization step and testing the generalization performance ! ####
        optimization_step(networks=[actor_q0, critic_q0, critic_guess, target_guess], 
                          optimizers = [optimizer_critic_guess,  optimizer_actor_l0, optimizer_critic_l0 ],buffer=buffer,
                          batch_size=batch_size,repetitions=repetitions)
        testing_data(buffer, networks=[actor_q0, critic_q0, critic_guess, target_guess])

        ### i append the losses to plot them later ###
        avg_train_l0.append(train_loss_l0.result().numpy())
        avg_train_l1.append(train_loss_l1.result().numpy())
        avg_test_l0.append(test_loss_l0.result().numpy())
        avg_test_l1.append(test_loss_l1.result().numpy())


        ### appending the reward to calculate cumulative! ###
        rt.append(reward)

        ### calculate success probability if the agent went greedy ###
        p=0
        for outcome in [0.,1.]:
            p+=Prob(critic_guess.give_favourite_guess(beta_would_do, outcome)*amplitude, beta_would_do,outcome)
        p/=2
        pt.append(p)
        

        
        with train_summary_writer_0.as_default():
            tf.summary.scalar('loss', train_loss_l0.result(), step=episode)
        with test_summary_writer_0.as_default():
            tf.summary.scalar('loss', test_loss_l0.result(), step=episode)
        with train_summary_writer_1.as_default():
            tf.summary.scalar('loss', train_loss_l1.result(), step=episode)
        with test_summary_writer_1.as_default():
            tf.summary.scalar('loss', test_loss_l1.result(), step=episode)

        if episode%(total_episodes/10) == 0: #this is for showing 10 results in total.

            template = 'Episode {}, \Rt: {}, \Pt: {}, Train loss_l1: {}, Test loss_l1: {}, Train Loss_l0: {}, Test Loss_l0: {}'
            print(template.format(episode+1,
                                np.sum(rt)/(episode+1),
                                  pt[-1],
                                 np.round(train_loss_l1.result().numpy(),5), 
                                 test_loss_l1.result().numpy(), 
                                 np.round(train_loss_l0.result().numpy(),15),
                                 np.round(test_loss_l0.result().numpy(),5))
                  )

            for nett in ["net_0","net_1"]: #net_0 will be critic_q0, net_1 will be critic_qguess

                histo_preds[nett][str(episode)] ={}
                histo_preds[nett][str(episode)]["episode"] = episode
                histo_preds[nett][str(episode)]["values"] = {}

                histo_preds["net_0"][str(episode)]["values"] = np.squeeze(critic_q0(np.expand_dims(buffer.betas,axis=1)))

            index=0
            for n1 in [0.,1.]:
                for guess in [-1.,1.]:
                    foo =np.array([[b,n1,guess] for b in buffer.betas]) #betas_train defined as global in create_dataset_l2()
                    histo_preds["net_1"][str(episode)]["values"][str(index)] = np.squeeze(critic_guess(foo))
                    index+=1


    rt = [np.sum(rt[:k]) for k in range(len(rt))]
    rt = rt/np.arange(1,len(rt)+1)
    losses = [[avg_train_l0, avg_test_l0], [ avg_train_l1, avg_test_l1]]
    BigPlot(buffer,rt, pt, history_betas, history_betas_would_have_done, histo_preds, losses, directory)
    plot_inside_buffer(buffer, directory)
    return #rt, pt, history_betas, history_betas_would_have_done, histo_preds, losses, name_directory-

In [124]:
ddpgKennedy(total_episodes=10**2, noise_displacement=0.1, tau=0.001, buffer_size=10**5, batch_size=500, lr_critic=0.0001, lr_actor=0.001)

Beggining to train! 
 

tau: 0.001, repetitions per optimization step (would be like epochs): 1
 
**** optimizers ***
optimizer_critic_guess: {'name': 'Adam', 'learning_rate': 0.0001, 'decay': 0.0, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False} 
Optimizer_actor_l0: {'name': 'Adam', 'learning_rate': 0.001, 'decay': 0.0, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}
Optimizer_critic_l0: <tensorflow.python.keras.optimizer_v2.adam.Adam object at 0x7fc90d1dbb38>



*** BUFFER ***
Buffer_size: 100000
 Batch_size for sampling: 500



 *** NOISE PARAMETERS *** 
epsilon-guess: 0.01
epsilon_displacement_noise: 0.1
starting time: 20200406-1758
saving results in logs/ddpg_results/time-20200406-1758


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


test loss!!!! l1!!!:  0.09226625959589313
[0.09226626]
Episode 1, \Rt: 1.0, \Pt: 0.5, Train loss_l1: 0.2748599946498871, Test loss_l1: 0.0922662615776062, Train Loss_l0: 0.0017411451553925872, Test Loss_l0: 0.058079998940229416
test loss!!!! l1!!!:  0.09226148452127304




[0.09226626, 0.09226388]
test loss!!!! l1!!!:  0.09225898237974116
[0.09226626, 0.09226388, 0.092262246]
test loss!!!! l1!!!:  0.09225674718090728
[0.09226626, 0.09226388, 0.092262246, 0.092260875]
test loss!!!! l1!!!:  0.09225413228502305
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526]
test loss!!!! l1!!!:  0.09225089479350025
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809]
test loss!!!! l1!!!:  0.09224759449046362
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659]
test loss!!!! l1!!!:  0.09224410923342456
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026]
test loss!!!! l1!!!:  0.0922406751630975
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.09225343]
test loss!!!! l1!!!:  0.09223755926842642
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.092253

test loss!!!! l1!!!:  0.09220124720096047
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.09225343, 0.09225185, 0.0922503, 0.092248835, 0.09224745, 0.09224613, 0.09224485, 0.0922436, 0.09224238, 0.09224121, 0.09224012, 0.092239074, 0.0922381, 0.09223716, 0.09223627, 0.09223542, 0.09223461, 0.09223383, 0.09223307, 0.0922323, 0.092231534, 0.09223076, 0.092229955, 0.09222914, 0.09222829]
test loss!!!! l1!!!:  0.09219811476130012
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.09225343, 0.09225185, 0.0922503, 0.092248835, 0.09224745, 0.09224613, 0.09224485, 0.0922436, 0.09224238, 0.09224121, 0.09224012, 0.092239074, 0.0922381, 0.09223716, 0.09223627, 0.09223542, 0.09223461, 0.09223383, 0.09223307, 0.0922323, 0.092231534, 0.09223076, 0.092229955, 0.09222914, 0.09222829, 0.09222741]
test loss!!!! l1!!!:  0.09219453896056916
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.0922595

test loss!!!! l1!!!:  0.0921346034639958
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.09225343, 0.09225185, 0.0922503, 0.092248835, 0.09224745, 0.09224613, 0.09224485, 0.0922436, 0.09224238, 0.09224121, 0.09224012, 0.092239074, 0.0922381, 0.09223716, 0.09223627, 0.09223542, 0.09223461, 0.09223383, 0.09223307, 0.0922323, 0.092231534, 0.09223076, 0.092229955, 0.09222914, 0.09222829, 0.09222741, 0.09222647, 0.09222548, 0.09222443, 0.09222331, 0.09222213, 0.092220895, 0.09221962, 0.09221831, 0.09221695, 0.09221556, 0.09221414, 0.092212684, 0.09221121, 0.0922097, 0.09220816]
test loss!!!! l1!!!:  0.09213047304286702
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.09225343, 0.09225185, 0.0922503, 0.092248835, 0.09224745, 0.09224613, 0.09224485, 0.0922436, 0.09224238, 0.09224121, 0.09224012, 0.092239074, 0.0922381, 0.09223716, 0.09223627, 0.09223542, 0.09223461, 0.09223383, 0.09223

test loss!!!! l1!!!:  0.0920893233979288
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.09225343, 0.09225185, 0.0922503, 0.092248835, 0.09224745, 0.09224613, 0.09224485, 0.0922436, 0.09224238, 0.09224121, 0.09224012, 0.092239074, 0.0922381, 0.09223716, 0.09223627, 0.09223542, 0.09223461, 0.09223383, 0.09223307, 0.0922323, 0.092231534, 0.09223076, 0.092229955, 0.09222914, 0.09222829, 0.09222741, 0.09222647, 0.09222548, 0.09222443, 0.09222331, 0.09222213, 0.092220895, 0.09221962, 0.09221831, 0.09221695, 0.09221556, 0.09221414, 0.092212684, 0.09221121, 0.0922097, 0.09220816, 0.09220661, 0.09220504, 0.09220345, 0.09220185, 0.092200235, 0.0921986, 0.092196964, 0.092195325, 0.092193685, 0.092192054, 0.09219042, 0.092188805, 0.092187196]
test loss!!!! l1!!!:  0.09208718485421716
[0.09226626, 0.09226388, 0.092262246, 0.092260875, 0.092259526, 0.09225809, 0.09225659, 0.092255026, 0.09225343, 0.09225185, 0.0922503, 0.092248835, 0.09224745, 

KeyboardInterrupt: 