# 1. Import Libraries & Define Paths

In [1]:
# Check whether the notebook is running in Google Colab.
# Only the import of google.colab is used for detection, so only ImportError
# is treated as "not in Colab"; other errors are no longer silently swallowed.
try:
    import google.colab  # noqa: F401 -- imported only to detect Colab
except ImportError:
    # Not in Colab: rely on the local installation of ezkl.
    pass
else:
    # In Colab: install ezkl and onnx into the kernel's own interpreter.
    import subprocess
    import sys
    for package in ("ezkl", "onnx"):
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except subprocess.CalledProcessError as install_error:
            # Stay best-effort (the package may already be installed), but
            # make the failure visible instead of hiding it.
            print(f"WARNING: pip install {package} failed: {install_error}")


# here we create (and potentially train) a model

# make sure you have the dependencies required here already installed
from torch import nn
import ezkl
import os
import json
import torch
from PIL import Image

In [2]:
# Create the directory that holds every generated artifact.
# exist_ok=True keeps the cell safe to re-run (a plain `mkdir` errors when the
# directory already exists) and avoids depending on a shell command.
os.makedirs('generating_files', exist_ok=True)

In [3]:
# Locations of every file produced or consumed by the ezkl pipeline.
# os.path.join is given the directory and the file name separately, so it
# actually builds the path (a single pre-joined argument is returned as-is).
_artifact_dir = 'generating_files'

model_path = os.path.join(_artifact_dir, 'network.onnx')               # exported ONNX model
compiled_model_path = os.path.join(_artifact_dir, 'network.compiled')  # compiled circuit
pk_path = os.path.join(_artifact_dir, 'proving_key.pk')                # proving key
vk_path = os.path.join(_artifact_dir, 'verification_key.vk')           # verification key
settings_path = os.path.join(_artifact_dir, 'settings.json')           # circuit settings

witness_path = os.path.join(_artifact_dir, 'witness.json')             # generated witness
data_path = os.path.join(_artifact_dir, 'input.json')                  # model input for the witness

In [4]:
# Split an image batch into its four quadrants.
def divide_image_to_4_parts(x, w, h):
    """Return the four quadrants of a 4-D tensor as detached copies.

    Args:
        x: tensor indexed as [dim0, dim1, dim2, dim3]; it is detached and
            cloned first, so the returned parts carry no gradient history
            and do not share storage with ``x``.
        w: extent of dim 2; the cut along dim 2 is made at ``w // 2``.
        h: extent of dim 3; the cut along dim 3 is made at ``h // 2``.

    Returns:
        Tuple ``(top_left, top_right, bottom_left, bottom_right)`` where
        "top/bottom" refers to the first/second half of dim 2 and
        "left/right" to the first/second half of dim 3.
    """
    snapshot = x.detach().clone()

    half_w = w // 2
    half_h = h // 2

    # Slice objects for the first and second half of each spatial axis
    first_w, second_w = slice(None, half_w), slice(half_w, None)
    first_h, second_h = slice(None, half_h), slice(half_h, None)

    return (
        snapshot[:, :, first_w, first_h],
        snapshot[:, :, first_w, second_h],
        snapshot[:, :, second_w, first_h],
        snapshot[:, :, second_w, second_h],
    )

# 2. Create Merkle Tree

## Two parties jointly create the random seed for the pseudo-random number generator

In [7]:
# (Model Owner do) Model Owner generates its random contribution to the shared seed
import random

# SystemRandom draws from the operating system's entropy source instead of the
# default Mersenne Twister, whose output is predictable and therefore not
# suitable for a value that feeds a commitment protocol.
# The value range is unchanged: an integer in [1000, 100000000].
random_model_owner = random.SystemRandom().randint(1000, 100000000)
print(random_model_owner)

68671620


In [6]:
# (Data Owner do) The Data Owner generates its own random number.
# Replace the literal below with the value received from the Data Owner.
random_data_owner = 79_696_078

In [8]:
# (Both do) Seed torch's random generator with the sum of both parties' numbers,
# so that both sides draw the same pseudo-random values afterwards.
joint_seed = random_model_owner + random_data_owner
torch.manual_seed(joint_seed)

<torch._C.Generator at 0x78319a42cf50>

## Two parties create the commit convolutional layer (with weights and bias generated by the pseudo-random number generator)

In [9]:
# (Both do) Build the commit-convolutional layer. Any layer would do; here it is
# a Conv2d with 2 filters, kernel size 5 and stride 2.
commit_conv = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=5, stride=2)

# Draw the custom parameters from the seeded generator: weight first, then bias.
# Weight shape is (out_channels, in_channels, kernel_size, kernel_size);
# bias shape is (out_channels,).
commit_weight = torch.randn(2, 1, 5, 5)
commit_bias = torch.randn(2)

# Replace the layer's default parameters with the drawn values
commit_conv.weight = nn.Parameter(commit_weight)
commit_conv.bias = nn.Parameter(commit_bias)

## Data Owner processes each image segment through the commit convolutional layer

The Model Owner does not do this part.

# 3. Monte-Carlo Check

In [10]:
# Demo only: in the real protocol the top-left part is received from the Data
# Owner. Here it is cut directly from the original image instead.

# Model input: a grayscale image from the MNIST dataset
from keras.datasets import mnist

(train_X, train_y), (test_X, test_y) = mnist.load_data()

# Divide the pixel values by 255.0 and insert a channel axis at position 1
train_X = (torch.from_numpy(train_X) / 255.0).unsqueeze(1)

# Keep the first training image, with a leading batch axis of size 1
input_image = train_X[0].unsqueeze(0)
print(input_image.shape)

# Only the first quadrant (top-left) is needed for the check
top_left, _, _, _ = divide_image_to_4_parts(input_image, *input_image.shape[2:])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
torch.Size([1, 1, 28, 28])


In [24]:
# (Data Owner do) Send the mean conv result of each image part to the Model Owner.
# (Model Owner do) Query the Data Owner for one of those values, recompute it
# locally with the shared commit_conv layer, and compare the two.

# Example: check the 1st (top-left) part of the image.

# Local recomputation: mean of commit_conv over the top-left part
validate_value = torch.mean(commit_conv(top_left)).unsqueeze(0)

mean_tl = -0.3358482718 # Copy the value printed by the Data Owner and paste it here

tolerance = 1e-9

# NOTE(review): the difference is computed on the tensor, presumably in its
# float32 dtype, where 1e-9 is below the resolution near 0.33 -- so this is
# effectively an exact-match check. Confirm that this strictness is intended.
difference = abs(validate_value - mean_tl)

# Report both values on failure so a mismatch can be diagnosed
assert difference < tolerance, (
    f"Monte-Carlo check failed: recomputed {validate_value.item():.10f}, "
    f"Data Owner reported {mean_tl:.10f}"
)

print("Monte-Carlo check pass ==> Agree on dataset")

Monte-Carlo check pass ==> Agree on dataset


# 4. Define Circuit Structure & Generate ONNX File

In [27]:
# Hand-written parameters for the model's convolution: two 5x5 kernels
# (one input channel each) and one bias value per kernel.
_kernel_0 = [
    [ 0.4051, -1.0027, -0.6679,  0.5655, -0.1901],
    [-0.5042, -1.8444,  1.0797, -1.7020,  0.8695],
    [-1.1467, -0.4110,  2.3773, -0.2481, -1.2287],
    [-1.0960, -0.8917, -0.6548,  1.5678, -0.9284],
    [-1.2372, -1.4590,  0.5970, -0.9647,  3.1039],
]

_kernel_1 = [
    [-1.7351, -0.9498,  0.0699, -1.2860,  0.8400],
    [-0.3068,  0.2217, -0.3690, -0.1901, -0.0881],
    [-0.5473,  1.0560,  0.5286,  0.4584, -0.4555],
    [-1.7153,  0.0264,  0.2541, -0.1590, -0.4631],
    [-1.6218,  1.0727, -1.2665, -0.5445, -0.2996],
]

# Wrapping each kernel in a list adds the single input-channel axis,
# giving a tensor of shape (2, 1, 5, 5).
weights = torch.tensor([[_kernel_0], [_kernel_1]])

bias = torch.tensor([-0.7043, 0.0027])

In [29]:
# Circuit definition.
# The example model is one convolutional layer with 2 filters, kernel size 5
# and stride 2, combined with the shared commit-convolutional layer.
class MyModel(nn.Module):
    """Convolution whose output is prefixed with four commitment values.

    The forward pass returns a 1-D tensor: the mean of ``commit_conv`` over
    each of the four image quadrants (top-left, top-right, bottom-left,
    bottom-right), followed by the flattened output of ``conv``.
    """

    def __init__(self):
        super().__init__()

        # Main convolution, loaded with the module-level `weights` and `bias`
        conv = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=5, stride=2)
        conv.weight = nn.Parameter(weights)
        conv.bias = nn.Parameter(bias)
        self.conv = conv

        # Commit layer agreed on by both parties (module-level `commit_conv`)
        self.commit_conv = commit_conv

    def forward(self, x):
        # Quadrants come back detached, in the order tl, tr, bl, br
        quadrants = divide_image_to_4_parts(x, x.shape[2], x.shape[3])

        # One single-element tensor per quadrant: mean of the commit-conv output
        commit_means = [
            torch.mean(self.commit_conv(part)).unsqueeze(0)
            for part in quadrants
        ]

        # Main convolution over the full input, flattened to 1-D
        conv_flat = self.conv(x).flatten()

        # Commitment means first, then the flattened convolution output
        return torch.cat((*commit_means, conv_flat), dim=0)


circuit = MyModel()

In [30]:
# Dummy input used only for the ONNX export: the exporter has to run the model
# once in order to trace it.
x = torch.rand((1, 1, 28, 28))

In [31]:
# Put the network into inference mode before tracing it
circuit.eval()

print(circuit(x).shape)

# Keyword options for the ONNX exporter
export_options = dict(
    export_params=True,          # store the parameter weights inside the model file
    opset_version=18,            # ONNX opset version to export to
    do_constant_folding=False,   # constant-folding optimisation is switched off
    input_names=['input'],       # name given to the model input
    output_names=['output'],     # name given to the model output
    dynamic_axes={               # axis 0 of input and output is marked variable-length
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'},
    },
)

# Trace `circuit` with the dummy input `x` and write the result to model_path
torch.onnx.export(circuit, x, model_path, **export_options)


torch.Size([292])


In [32]:
# Inspect the exported ONNX file
import onnx

# Read the model back from disk
model = onnx.load(model_path)

# Render the graph in human-readable form and show it
graph_text = onnx.helper.printable_graph(model.graph)
print(graph_text)

graph main_graph (
  %input[FLOAT, batch_sizex1x28x28]
) initializers (
  %conv.weight[FLOAT, 2x1x5x5]
  %conv.bias[FLOAT, 2]
  %commit_conv.weight[FLOAT, 2x1x5x5]
  %commit_conv.bias[FLOAT, 2]
) {
  %/Constant_output_0 = Constant[value = <Scalar Tensor []>]()
  %/Shape_output_0 = Shape(%input)
  %/Constant_1_output_0 = Constant[value = <Scalar Tensor []>]()
  %/Gather_output_0 = Gather[axis = 0](%/Shape_output_0, %/Constant_1_output_0)
  %/Constant_2_output_0 = Constant[value = <Scalar Tensor []>]()
  %/Shape_1_output_0 = Shape(%input)
  %/Constant_3_output_0 = Constant[value = <Scalar Tensor []>]()
  %/Gather_1_output_0 = Gather[axis = 0](%/Shape_1_output_0, %/Constant_3_output_0)
  %/Constant_4_output_0 = Constant[value = <Scalar Tensor []>]()
  %/Div_output_0 = Div(%/Gather_output_0, %/Constant_4_output_0)
  %/Cast_output_0 = Cast[to = 7](%/Div_output_0)
  %/Cast_1_output_0 = Cast[to = 7](%/Cast_output_0)
  %/Constant_5_output_0 = Constant[value = <Scalar Tensor []>]()
  %/Div_1_ou

# 5. Setup Phase

We set the **input** to *private*, the **output** to *public*, and the model parameters to *fixed* (mean baked into the model).

In [33]:
# Build the run arguments that are passed to ezkl.gen_settings below.
py_run_args = ezkl.PyRunArgs()
# Visibility of each part of the circuit: input private, output public,
# model parameters fixed.
py_run_args.input_visibility = "private"
py_run_args.output_visibility = "public"
py_run_args.param_visibility = "fixed" # private by default
# NOTE(review): the scales are presumably fixed-point exponents used when
# quantising inputs and parameters -- confirm against the ezkl documentation.
py_run_args.input_scale = 40
py_run_args.param_scale = 43
py_run_args.scale_rebase_multiplier = 10

# Generate the settings for the ONNX model at model_path; a later cell asserts
# that settings_path exists on disk.
res = ezkl.gen_settings(model_path, settings_path, py_run_args=py_run_args)

# Stop here if settings generation did not report success
assert res == True

In [34]:
# Compile the ONNX model with the generated settings. compiled_model_path is
# what the later gen_witness and setup calls consume.
res = ezkl.compile_circuit(model_path, compiled_model_path, settings_path)
# Stop here if compilation did not report success
assert res == True

In [35]:
# Fetch the SRS for the circuit described by settings_path.
# Top-level `await` relies on the notebook kernel's support for awaiting in cells.
# NOTE(review): the return value is not checked here, unlike in the neighbouring
# cells -- confirm what get_srs returns before adding an assertion.
res = await ezkl.get_srs(settings_path)

In [37]:
# now generate the witness file

res = await ezkl.gen_witness(data_path, compiled_model_path, witness_path)
assert os.path.isfile(witness_path)

In [38]:

# Circuit setup: generate the verification key and the proving key for the
# compiled circuit, and time how long it takes.

import time

# Timer starts right before key generation
start_time = time.time()

res = ezkl.setup(compiled_model_path, vk_path, pk_path)
assert res == True

# Both keys and the settings file must exist on disk at this point
for required_file in (vk_path, pk_path, settings_path):
    assert os.path.isfile(required_file)

# Timer stops only after the sanity checks above
end_time = time.time()

# Report the elapsed wall-clock time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.6f} seconds")

Elapsed time: 32.256114 seconds


# 6. Proving Phase

NOTE: Because our input is an image with shape (1, 28, 28) and the convolutional layer we defined has 2 output channels, kernel size 5 and stride 2 (no padding), the output of the conv layer has shape (1, 2, 12, 12), since floor((28 - 5) / 2) + 1 = 12.

So when we flatten it, we have 2x12x12 = 288 elements. Plus 4 elements for the commit values, we have 292 elements in total.

Because we set the output to public, you will see 292 values in the "instances" of the proof - corresponding to the 292 elements.



In [None]:
## Model-owner don't do this part


# # GENERATE A PROOF

# # Start time
# start_proving_time = time.time()

# proof_path = os.path.join('generating_files/proof.pf')

# res = ezkl.prove(
#         witness_path,
#         compiled_model_path,
#         pk_path,
#         proof_path,

#         "single",
#     )

# print(res)
# assert os.path.isfile(proof_path)


# # End time
# end_proving_time = time.time()

# # Calculate elapsed time
# elapsed_proving_time = end_proving_time - start_proving_time
# print(f"Elapsed time: {elapsed_proving_time:.6f} seconds")

{'instances': [['010000f0d3764defc60b2a4354dad00f5d588181b64550b829a031e1724e6430', '010000f093cd528e156d576384d588075d588181b64550b829a031e1724e6430', '010000f0932d8426ab9ea42470a48f0d5d588181b64550b829a031e1724e6430', '010000f013c6916aeb1d6017b541150b5d588181b64550b829a031e1724e6430', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830a00000000000000000000000000000000', '00000000000000000000000080a3830

# 7. Verification Phase

In [39]:
# Copy the proof file received from the Data Owner and paste it at this location to verify it.
# os.path.join receives directory and file name separately so that it actually
# builds the path (a single pre-joined argument is returned unchanged).
proof_path = os.path.join('generating_files', 'proof.pf')

In [40]:
# Timestamp taken immediately before verification starts
start_verify_time = time.time()

# Check the proof against the circuit settings and the verification key
res = ezkl.verify(proof_path, settings_path, vk_path)
assert res == True
print("verified")

# Timestamp taken after the success message has been printed
end_verify_time = time.time()

# Report the elapsed wall-clock time
elapsed_verify_time = end_verify_time - start_verify_time
print(f"Elapsed time: {elapsed_verify_time:.6f} seconds")

verified
Elapsed time: 0.570134 seconds


# 8. Extract Data from Proof & Continue Training:

In this step, we will "translate" each value in the "instances" field of the proof to a float value - corresponding to the 292 elements (4 commit values + 288 result values).


![Proof-Instances](https://raw.githubusercontent.com/BaoNinh2808/cm24-projects/refs/heads/main/AI%20Training%20Data%20Selling%20with%20Privacy%20Protection%20(for%20client)/images/proof.png)

We will use the API `felt_to_float` to 'translate'.

In [41]:
# Field element for the mean of the commit-convolutional layer over the
# 1st (top-left) part of the image
top_left_commit_felt = '010000f0d3764defc60b2a4354dad00f5d588181b64550b829a031e1724e6430'

# NOTE(review): 126 is the scale passed to felt_to_float -- presumably
# input_scale + 2 * param_scale (40 + 43 + 43); confirm against settings.json.
print(ezkl.felt_to_float(top_left_commit_felt, 126))
# the result will be -0.38104580735147514

-0.38104580735147514


The output will equal the rescaled output that is pre-calculated in the proof.

But in real life, the Model Owner must 'translate' the output from the proof again and must not rely on the value that the Prover (Data Owner) has calculated and written in the proof, because the Prover can cheat at this point.

![Rescale-Output](https://raw.githubusercontent.com/BaoNinh2808/cm24-projects/refs/heads/main/AI%20Training%20Data%20Selling%20with%20Privacy%20Protection%20(for%20client)/images/rescaled_outputs.png)

In [None]:
# extract result from proof ....
# ...
# continue training ....