# Section 5.2: Partial sensitivity modelling in (DP-)SGD

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from deuterium import Variable, to_vec, random_symbols, get_gradients
from sympy import sympify
from scipy.optimize import shgo


import symengine as se
from sklearn.metrics import accuracy_score
import sys
sys.setrecursionlimit(1_000_000)
import warnings
warnings.filterwarnings("ignore")

Define some utility functions, notably the loss functions and tempered sigmoid activation functions.

In [2]:
to_data = np.vectorize(lambda x: x.data)

def sigmoid(x, s=1, T=1, o=0):
        return (s/(1+np.exp(-T*x)))-o

def tanh(x):
    return sigmoid(x, 2, 2, 1)

bce_loss = lambda y_pred, y_true: -np.mean(np.multiply(y_true, np.log(y_pred)) + np.multiply((1 - y_true), np.log(1 - y_pred)))
normalize = lambda x: (x-x.min())/(x.max()-x.min())

Define the network architecture

In [3]:
IN=2
INTERMEDIATE=2

In [4]:
# This creates symbolic representations for all the layers
x = to_vec(np.array(random_symbols(IN, "x")).reshape((1,IN))) 
y = to_vec(np.array(random_symbols(1, "y")))

w1 = to_vec(np.array(random_symbols(IN*INTERMEDIATE, "w1")).reshape(IN, INTERMEDIATE))
b = to_vec(np.array(random_symbols(INTERMEDIATE, "b")).reshape(1, INTERMEDIATE))
w2 = to_vec(np.array(random_symbols(INTERMEDIATE, "w2")).reshape(INTERMEDIATE,1))

In [5]:
# This replaces some of the variables with *values*
# w1 (weights layer 1)
# w2 (weights layer 2)
# b (bias terms)
# y (label)

w1 = to_vec(np.random.normal(size=IN*INTERMEDIATE).reshape(IN, INTERMEDIATE))
b = to_vec(np.random.normal(size=INTERMEDIATE).reshape(1, INTERMEDIATE))
w2 = to_vec(np.random.normal(size=INTERMEDIATE).reshape(INTERMEDIATE,1))
y = to_vec(np.array(1))


Symbolically calculate the network output

In [6]:
layer_1 = sigmoid(x@w1)+b
y_pred = sigmoid(layer_1@w2)
loss = bce_loss(y_pred, y)

Obtain the gradients w.r.t all inputs

In [7]:
loss.backward()

In [8]:
x_grad = np.array([i.grad for i in x.flatten().tolist()])
y_grad = np.array([i.grad for i in y.flatten().tolist()])
w1_grad = np.array([i.grad for i in w1.flatten().tolist()])
b_grad = np.array([i.grad for i in b.flatten().tolist()])
w2_grad = np.array([i.grad for i in w2.flatten().tolist()])

full_grad = to_vec(np.concatenate((x_grad, y_grad, w1_grad, b_grad, w2_grad)))


In [9]:
# the gradient we care about is just wrt the weights

my_grad = to_vec(np.concatenate((w1_grad, b_grad, w2_grad)))
len(my_grad)

8

# Optimize the Gradient Norm one Element at a Time

In [11]:
%%time

print(np.sum(my_grad).data.free_symbols)
intervals = [(-1, 1) for _ in np.sum(my_grad).data.free_symbols]

elems = []
for g in my_grad:
    gp = g**2
    f = se.Lambdify(list(gp.data.free_symbols), gp.data)
    sol = shgo(f, intervals)
    elems.append(sol.fun)

print(elems)
np.sqrt(np.sum(elems))

{x_1, x_0}
[0.0, 0.0, 0.0, 0.0, 0.021051515390329727, 0.3168526122151559, 0.06569637159210578, 0.017294591974371054]
CPU times: user 14.2 ms, sys: 427 µs, total: 14.6 ms
Wall time: 13.6 ms


0.6487642801295109

# Optimize the Gradient Norm all at once

In [12]:
my_grad_norm = np.linalg.norm(my_grad, ord=2)
my_grad_norm.data.free_symbols
my_grad_norm_func = se.Lambdify(list(my_grad_norm.data.free_symbols), my_grad_norm.data)

In [13]:
%%time

print(my_grad_norm.data.free_symbols)
intervals = [(-1, 1) for _ in my_grad_norm.data.free_symbols]
sol = shgo(my_grad_norm_func, intervals)
sol.fun

{x_1, x_0}
CPU times: user 2.93 ms, sys: 0 ns, total: 2.93 ms
Wall time: 2.39 ms


0.6917728201880243