In [1]:
!git clone https://github.com/awslabs/privacy-preserving-xgboost-inference.git
%cd privacy-preserving-xgboost-inference
!pip install -r requirements.txt

fatal: destination path 'privacy-preserving-xgboost-inference' already exists and is not an empty directory.
/home/ec2-user/SageMaker/myAWSStudyBlog/privacy-preserving-ml/ppxgboost/privacy-preserving-xgboost-inference
Obtaining file:///home/ec2-user/SageMaker/myAWSStudyBlog/privacy-preserving-ml/ppxgboost/privacy-preserving-xgboost-inference (from -r requirements.txt (line 8))
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: ppxgboost
  Attempting uninstall: ppxgboost
    Found existing installation: ppxgboost 0.0.1
    Uninstalling ppxgboost-0.0.1:
      Successfully uninstalled ppxgboost-0.0.1
  Running setup.py develop for ppxgboost
Successfully installed ppxgboost-0.0.1


Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

( Run <code>jupyter notebook</code> under the project directory )

# XGBoost for the Breast Cancer Dataset

We use this example to demonstrate how to use ppxgboost to encrypt an xgboost model for binary
 prediction. We directly use the breast cancer data from scikit-learn, but one
 can go to https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic) to download the original dataset.


In [2]:
import sys
sys.path.append('third-party')

import time
import pandas as pd
import numpy as np
import xgboost as xgb
from secrets import token_bytes

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from ppxgboost import BoosterParser as boostparser
from ppxgboost import PPBooster as ppbooster
from ppxgboost.PPBooster import MetaData
from ppxgboost.PPKey import PPBoostKey
from ope.pyope.ope import OPE
from ppxgboost import PaillierAPI as paillier

In [3]:
# Load the breast cancer dataset that ships with scikit-learn.
bc = load_breast_cancer()
y = bc.target

# Label the feature columns with the dataset's own feature names up front,
# instead of leaving them unnamed.
feature_names = bc.feature_names
X = pd.DataFrame(bc.data, columns=feature_names)

### Remove Spaces in Column Names

Since the `BoosterParser` in ppxgboost cannot parse column names that contain spaces, we have to remove the spaces for the parser to work properly.

In [4]:
# Drop every space character from each column label.
X.columns = X.columns.map(lambda name: name.replace(" ", ""))

In [5]:
# Preview the first rows of the feature frame and its column labels.
X.head()

Unnamed: 0,meanradius,meantexture,meanperimeter,meanarea,meansmoothness,meancompactness,meanconcavity,meanconcavepoints,meansymmetry,meanfractaldimension,...,worstradius,worsttexture,worstperimeter,worstarea,worstsmoothness,worstcompactness,worstconcavity,worstconcavepoints,worstsymmetry,worstfractaldimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
# Display the label vector taken from bc.target.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [7]:
# Split the data into train (80%) and test (20%) sets.
# A fixed seed makes the split identical on every "Restart & Run All".
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Independent copy of the test features.
# This used to be pd.DataFrame(X_test, columns=feature_names), but
# feature_names still holds the original labels with spaces while X_test's
# columns have had their spaces removed, so pandas re-indexed on labels that
# no longer exist and filled those columns with NaN.
test_input_vector = X_test.copy()

In [8]:
# Fit a plaintext XGBoost booster on the training split.
params = {"eta": 0.1}
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(params, dtrain)

# Reference predictions from the unencrypted model on the test features;
# the privacy-preserving result is compared against these later.
dtest = xgb.DMatrix(X_test)
plaintext_predict = model.predict(dtest)

In [9]:
# Step 1: compute min/max metadata from the test features, then convert the
# trained booster into ppxgboost's internal tree structure. model_to_trees
# also returns the feature set and the min_max value used from here on.
start = time.time()
initial_min_max = boostparser.training_dataset_parser(X_test)
enc_tree, feature_set, min_max = boostparser.model_to_trees(model, initial_min_max)
elapsed = time.time() - start
print("Paramaeter Extraction Time: ", elapsed)

Paramaeter Extraction Time:  0.007462263107299805


In [10]:
# Step 2: generate the key material and bundle it into a PPBoostKey.
start = time.time()
prf_key = token_bytes(16)  # 16 random bytes
public_key, private_key = paillier.he_key_gen()
ope_key = token_bytes(16)  # separate 16 random bytes for the OPE encrypter
encrypter = OPE(ope_key)
ppBoostKey = PPBoostKey(public_key, prf_key, encrypter)
elapsed = time.time() - start
print("Encrypter and Key Generation: ", elapsed)

Encrypter and Key Generation:  0.08191537857055664


In [11]:
# Step 3: encrypt the parsed trees with the key bundle.
# The return value is not used; enc_tree is presumably modified in place
# (TODO confirm against ppxgboost).
start = time.time()
model_metadata = MetaData(min_max)
ppbooster.enc_xgboost_model(ppBoostKey, enc_tree, model_metadata)
elapsed = time.time() - start
print("Create Encrypted Tree: ", elapsed)

Create Encrypted Tree:  3.721642255783081


In [12]:
# Step 4: encrypt the test features for prediction, using the PRF key and the
# OPE encrypter, restricted to the model's feature set.
# The return value is not used; X_test is presumably modified in place
# (TODO confirm against ppxgboost).
start = time.time()
input_metadata = MetaData(min_max)
ppbooster.enc_input_vector(prf_key, encrypter, feature_set, X_test, input_metadata)
elapsed = time.time() - start
print("Create Encrypted Test Data: ", elapsed)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Create Encrypted Test Data:  14.867348670959473


In [13]:
# Step 5: evaluate the encrypted trees on the encrypted test features.
start = time.time()
values = ppbooster.predict_binary(enc_tree, X_test)
elapsed = time.time() - start
print("PP Predict Elapsed Time: ", elapsed)

PP Predict Elapsed Time:  0.1878814697265625


In [14]:
# Step 6: decrypt each encrypted score with the private key and round it to
# 7 decimal places.
start = time.time()
decryptions = np.array(
    [round(paillier.decrypt(private_key, ciphertext), 7) for ciphertext in values]
)
elapsed = time.time() - start
print("Decrypt Prediction Result Time: ", elapsed)

Decrypt Prediction Result Time:  0.44504714012145996


In [15]:
# Check that the privacy-preserving predictions agree with the plaintext ones.
# The two may differ slightly because of precision, so each pair is compared
# with an absolute tolerance instead of exact equality.
assert len(plaintext_predict) == len(decryptions)
assert all(
    abs(plain - decrypted) < 0.000001
    for plain, decrypted in zip(plaintext_predict, decryptions)
)

## Clean Up

Remove the ppxgboost folder!

In [16]:
# Step back out of the cloned repository, then delete the checkout.
%cd ..
!rm -rf privacy-preserving-xgboost-inference

/home/ec2-user/SageMaker/myAWSStudyBlog/privacy-preserving-ml/ppxgboost
