In [3]:
import sys
sys.path.append('/home/huyvu/workspace/thesis/SEAL-Python/')
from seal import *
from homomorphic_encryption_functions import dot_product_ciphertexts, eval_poly, sum_slots

from homomorphic_encryption_functions import setup_ckks_params, create_tools
from seal import *  # type: ignore

# CKKS parameters: polynomial modulus degree and the bit sizes of the
# coefficient-modulus primes (the chain sums to 420 bits).
poly_modulus_degree = 2**14
coeff_modulus_chain = [60, 50, 50, 50, 50, 50, 50, 60]

# Encoding scale; 2^50 matches the 50-bit primes in the middle of the chain.
scale = 2.0 ** 50
# Context and key/tool objects are built by project helpers imported from
# homomorphic_encryption_functions; the unpacking order below follows that helper.
context = setup_ckks_params(poly_modulus_degree, coeff_modulus_chain)
secret_key, public_key, relin_keys, galois_keys, encryptor, decryptor, evaluator, ckks_encoder = create_tools(context)
# Number of slots reported by the encoder; used later to pad vectors before encoding.
slot_count = ckks_encoder.slot_count()




/
| Encryption parameters
| scheme: ckks
| poly_modulus_degree: 16384
| coeff_modulus size: 420(60 + 50 + 50 + 50 + 50 + 50 + 50 + 60) bits
\


In [2]:
def encrypt_feature(df, encryptor, ckks_encoder, scale, slot_count):
    """Encrypt a 2-D feature matrix column by column, with a leading bias column.

    Every column is zero-padded to ``slot_count`` values, encoded with
    ``ckks_encoder.encode(values, scale)`` and encrypted with
    ``encryptor.encrypt``.

    Args:
        df: 2-D array-like of shape (num_observations, num_columns). Despite
            the name it is indexed positionally, so NumPy arrays and pandas
            DataFrames are both accepted (converted with ``np.asarray``).
        encryptor: object exposing ``encrypt(plaintext)``.
        ckks_encoder: object exposing ``encode(values, scale)``.
        scale: encoding scale forwarded to the encoder.
        slot_count: number of values packed into one plaintext.

    Returns:
        A list of ``num_columns + 1`` ciphertexts. Element 0 is the bias
        column (ones for every observation, zeros in the padding); element
        ``i + 1`` holds feature column ``i``.

    Raises:
        ValueError: if the input is not 2-D, or if it has more observations
            than ``slot_count`` (previously the padding silently became empty
            and an over-long vector was handed to the encoder).
    """
    data = np.asarray(df, dtype=float)
    if data.ndim != 2:
        raise ValueError(f"expected a 2-D feature matrix, got {data.ndim} dimension(s)")

    num_observations, num_columns = data.shape
    if num_observations > slot_count:
        raise ValueError(
            f"{num_observations} observations do not fit into {slot_count} slots"
        )

    # Zeros that fill the unused slots of every encoded column.
    padding = np.zeros(slot_count - num_observations)

    def _encrypt_column(values):
        # Pad to the full slot count, then encode and encrypt.
        plaintext = ckks_encoder.encode(np.concatenate((values, padding)), scale)
        return encryptor.encrypt(plaintext)

    # The bias column comes first so that weight index 0 is the intercept.
    encrypted_data = [_encrypt_column(np.ones(num_observations))]
    encrypted_data.extend(_encrypt_column(data[:, i]) for i in range(num_columns))
    return encrypted_data


In [4]:
import pandas as pd 
# Load the iDASH dataset from a local CSV-formatted text file and render it
# in the notebook output.
iDASH_df = pd.read_csv("./data/idash.txt")
display(iDASH_df)

Unnamed: 0,Cancer_status,BRCA_status,Family_history_2,SNP2,SNP7,SNP13,SNP20,SNP24,SNP25,SNP32,SNP36,SNP41,SNP55,SNP58,SNP68,SNP81,SNP87,SNP92,SNP93
0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0
1,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0
2,0,0,0,1,0,0,1,1,1,1,0,1,0,1,0,0,0,1,1
3,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1574,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,1
1575,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,1,0
1576,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1577,1,0,0,0,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0


In [80]:
import numpy as np

class LogisticRegression:
    """Plaintext logistic regression trained by gradient descent with look-ahead momentum.

    The activation is the cubic polynomial ``0.5 + 0.197*x - 0.004*x**3``
    rather than the exact logistic function. An intercept is learned by
    prepending a column of ones to the inputs, so ``weights[0]`` is the
    intercept term.
    """

    def __init__(self, learning_rate=0.002, num_iterations=100, momentum = 0.9):
        # Hyper-parameters.
        self.momentum = momentum
        self.num_iterations = num_iterations
        self.learning_rate = learning_rate
        # Learned state; `weights` is populated by fit().
        self.bias = None
        self.weights = None

    @staticmethod
    def sigmoid(x):
        """Cubic polynomial stand-in for the logistic function."""
        cubic_term = 0.004*(x**3)
        return 0.5 + 0.197*x - cubic_term

    @staticmethod
    def _with_intercept(X):
        """Return X with a leading column of ones."""
        return np.c_[np.ones((X.shape[0], 1)), X]

    def _scores(self, X):
        """Apply the activation to the linear model evaluated on X."""
        return self.sigmoid(np.dot(self._with_intercept(X), self.weights))

    def fit(self, X, y):
        """Train on features X and labels y, starting from zero weights.

        The gradient is evaluated at the look-ahead point
        ``weights + momentum * velocity`` and is the un-averaged sum over
        all samples.
        """
        design = self._with_intercept(X)
        self.weights = np.zeros(design.shape[1])
        self.bias = 0
        velocity = np.zeros_like(self.weights)

        for _ in range(self.num_iterations):
            probe = self.weights + self.momentum * velocity
            residual = self.sigmoid(np.dot(design, probe)) - y
            velocity = self.momentum * velocity - self.learning_rate * np.dot(residual, design)
            self.weights += velocity

    def predict(self, X):
        """Return a list of 0/1 labels, thresholding the score at 0.5."""
        return [1 if score > 0.5 else 0 for score in self._scores(X)]

    def predict_proba(self, X):
        """Return the activation output for each row of X."""
        return self._scores(X)

In [62]:
from homomorphic_encryption_functions import dot_product_ciphertexts, eval_poly, sum_slots, switch_cipher_modulus


class HELogisticRegression():
    """One-step momentum gradient descent for logistic regression on CKKS ciphertexts.

    Each call to :meth:`fit` performs a single update and returns fresh
    velocity and weight ciphertexts; the caller drives the iteration loop.
    """

    def __init__(self, learning_rate, momentum = 0.9):
        self.learning_rate = learning_rate
        self.momentum = momentum

    def fit(self, X_ctx, y_ctx, w_ctx, v_ctx, ckks_encoder, scale, evaluator, relin_keys, galois_keys, slot_count):
        """Run one encrypted update step.

        Args:
            X_ctx: list of feature ciphertexts (one per column).
            y_ctx: ciphertext of the labels.
            w_ctx: list of weight ciphertexts, indexed like ``X_ctx``.
            v_ctx: list of velocity ciphertexts, indexed like ``X_ctx``.
            ckks_encoder, evaluator, relin_keys, galois_keys: SEAL tool objects.
            scale: nominal encoding scale.
            slot_count: number of slots per plaintext.

        Returns:
            ``(new_v_ctx, new_w_ctx)`` -- note that velocity comes first.

        Raises:
            ValueError: if ``v_ctx`` is empty.
        """
        if len(v_ctx) == 0:
            # Previously this surfaced as an unbound loop variable further down.
            raise ValueError("v_ctx must contain at least one ciphertext")

        m_ptx = ckks_encoder.encode([self.momentum]*slot_count, scale)
        lookahead_weights = []
        momen_volocity_arr = []
        for i in range(len(v_ctx)):
            momen_volocity = evaluator.multiply_plain(v_ctx[i], m_ptx)  # m*v
            evaluator.relinearize_inplace(momen_volocity, relin_keys)
            evaluator.rescale_to_next_inplace(momen_volocity)
            # Force the scale back to the nominal value after rescaling.
            momen_volocity.scale(scale)
            momen_volocity_arr.append(momen_volocity)

            # Bring the weight to the same modulus level before adding.
            switch_weight = evaluator.mod_switch_to(w_ctx[i], momen_volocity.parms_id())
            lookahead_weight = evaluator.add(switch_weight, momen_volocity)  # w + m*v
            lookahead_weights.append(lookahead_weight)

        # Use the last look-ahead weight explicitly instead of relying on the
        # loop variable still being bound after the loop.
        switched_X_ctx = switch_cipher_modulus(X_ctx, lookahead_weights[-1].parms_id(), evaluator)
        X_W = dot_product_ciphertexts(switched_X_ctx, lookahead_weights, scale, evaluator, relin_keys)  # X(w + m*v)
        # NOTE(review): eval_poly presumably evaluates the polynomial sigmoid approximation -- confirm in the helper module.
        y_pred = eval_poly(X_W, ckks_encoder, scale, evaluator, relin_keys)  # y^
        switched_y_ctx = evaluator.mod_switch_to(y_ctx, y_pred.parms_id())
        diff = evaluator.sub(y_pred, switched_y_ctx)  # y^ - y

        # Loop-invariant: encode the learning rate once.
        # NOTE(review): a one-element list is encoded, so presumably only slot 0
        # carries the learning rate; decrypt_weights reads slot 0 -- confirm.
        lr_ptx = ckks_encoder.encode([self.learning_rate], scale)

        new_w_ctx = []
        new_v_ctx = []

        for i in range(len(X_ctx)):
            switched_X = evaluator.mod_switch_to(X_ctx[i], diff.parms_id())
            dw = dot_product_ciphertexts([diff], [switched_X], scale, evaluator, relin_keys)  # X(y^ - y)
            sumslots_dw = sum_slots(dw, evaluator, galois_keys, slot_count)

            switched_lr_ptx = evaluator.mod_switch_to(lr_ptx, sumslots_dw.parms_id())
            grad = evaluator.multiply_plain(sumslots_dw, switched_lr_ptx)  # alpha*X(y^ - y)
            evaluator.relinearize_inplace(grad, relin_keys)
            evaluator.rescale_to_next_inplace(grad)
            grad.scale(scale)

            # Align the operands of the final add/sub with the gradient's level.
            switched_m_v = evaluator.mod_switch_to(momen_volocity_arr[i], grad.parms_id())
            switched_w = evaluator.mod_switch_to(w_ctx[i], grad.parms_id())

            new_v = evaluator.sub(switched_m_v, grad)  # v = m*v - alpha*X(y^ - y)
            new_v_ctx.append(new_v)
            new_w = evaluator.add(switched_w, new_v)  # w = w + v
            new_w_ctx.append(new_w)

        return new_v_ctx, new_w_ctx

In [63]:
from sklearn.metrics import  roc_auc_score, accuracy_score
from sklearn.model_selection import KFold
from homomorphic_encryption_functions import encrypt_label, encrypt_feature, encrypt_weights, decrypt_weights

import numpy as np

def decrypt_weights(encrypted_weights, ckks_encoder, decryptor):
    """Decrypt each weight ciphertext and return the value held in its first slot.

    Args:
        encrypted_weights: iterable of ciphertexts.
        ckks_encoder: object exposing ``decode(plaintext)``.
        decryptor: object exposing ``decrypt(ciphertext)``.

    Returns:
        A list with one decoded value (slot 0) per ciphertext, in input order.
    """
    return [
        ckks_encoder.decode(decryptor.decrypt(ciphertext))[0]
        for ciphertext in encrypted_weights
    ]
def encrypt_weights(weights, ckks_encoder, scale, encryptor):
    """Encrypt each scalar weight as a ciphertext with the value repeated in every slot.

    Args:
        weights: iterable of scalar weights.
        ckks_encoder: object exposing ``slot_count()`` and ``encode(values, scale)``.
        scale: encoding scale forwarded to the encoder.
        encryptor: object exposing ``encrypt(plaintext)``.

    Returns:
        A list of ciphertexts, one per weight, in input order.
    """
    def _encrypt_scalar(value):
        # Replicate the scalar across all slots before encoding.
        replicated = [value] * ckks_encoder.slot_count()
        return encryptor.encrypt(ckks_encoder.encode(replicated, scale))

    return [_encrypt_scalar(value) for value in weights]

def sigmoid(x):
    """Exact logistic function ``1 / (1 + exp(-x))``, evaluated with NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def predict(X, weights):
    """Return a list of 0/1 labels for the rows of X under the given weights.

    A column of ones is prepended to X, so ``weights[0]`` acts as the
    intercept. A row is labelled 1 when its sigmoid output is strictly
    greater than 0.5.
    """
    design = np.c_[np.ones((X.shape[0], 1)), X]
    probabilities = sigmoid(np.dot(design, weights))
    return [int(p > 0.5) for p in probabilities]
    
def predict_proba(X, weights):
    """Return the sigmoid of the linear model for every row of X.

    A column of ones is prepended to X, so ``weights[0]`` acts as the intercept.
    """
    intercept_column = np.ones((X.shape[0], 1))
    design = np.c_[intercept_column, X]
    return sigmoid(np.dot(design, weights))
def average_accuracy_and_auc_score(model, X, y):
    """Return mean accuracy and mean ROC-AUC of ``model`` over a 10-fold split.

    The split is ``KFold(n_splits=10, shuffle=True, random_state=0)``. For
    each fold the model is re-fit on the training part and scored on the
    held-out part using ``predict`` (accuracy) and ``predict_proba`` (AUC).

    Args:
        model: object exposing ``fit(X, y)``, ``predict(X)`` and ``predict_proba(X)``.
        X: feature matrix (NumPy array or pandas DataFrame).
        y: labels (NumPy array or pandas Series).

    Returns:
        ``(average_accuracy, average_auc)``.
    """
    def _take_rows(data, positions):
        # KFold yields positions, not labels. Plain ``series[positions]`` is
        # label-based on pandas objects and breaks when the index is not
        # 0..n-1, so use .iloc whenever it is available.
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    acc_scores = []
    auc_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

        model.fit(X_train, y_train)
        y_prob = model.predict_proba(X_test)
        y_pred = model.predict(X_test)
        acc_scores.append(accuracy_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_prob))

    average_acc = np.mean(acc_scores)
    average_auc = np.mean(auc_scores)

    return average_acc, average_auc


def average_accuracy_and_auc_score_helr(model, n, X, y, encryptor, decryptor, ckks_encoder, scale, evaluator, relin_keys, galois_keys, slot_count):
    """Return mean accuracy and mean ROC-AUC of the encrypted trainer over a 10-fold split.

    For every fold the training part is encrypted, ``model.fit`` is called
    ``n`` times, and after each call the returned velocity/weight ciphertexts
    are decrypted and re-encrypted before the next step. The final plaintext
    weights are scored on the held-out part with the plaintext ``predict`` /
    ``predict_proba`` helpers.

    Args:
        model: object exposing ``fit(X_ctx, y_ctx, w_ctx, v_ctx, ...)`` that
            returns ``(new_v_ctx, new_w_ctx)``.
        n: number of encrypted update steps per fold. ``0`` scores the
            all-zero initial weights instead of failing on an unbound name.
        X: feature matrix (NumPy array or pandas DataFrame).
        y: labels (NumPy array or pandas Series).
        encryptor, decryptor, ckks_encoder, evaluator, relin_keys, galois_keys:
            SEAL tool objects forwarded to the helpers.
        scale: encoding scale.
        slot_count: number of slots per plaintext.

    Returns:
        ``(average_accuracy, average_auc)``.
    """
    def _take_rows(data, positions):
        # KFold yields positions; use .iloc on pandas objects so selection is
        # positional even when the index is not 0..n-1.
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    auc_scores = []
    acc_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)
        X_ctx = encrypt_feature(X_train, encryptor, ckks_encoder, scale, slot_count)
        y_ctx = encrypt_label(y_train, encryptor, ckks_encoder, scale)

        # One weight per encrypted column (bias included), all starting at zero.
        weights = [0] * len(X_ctx)
        w_ctx = encrypt_weights(weights, ckks_encoder, scale, encryptor)
        # The initial velocity is also all zeros, so the same ciphertexts are reused.
        v_ctx = w_ctx
        for _ in range(n):
            new_v_ctx, new_w_ctx = model.fit(X_ctx, y_ctx, w_ctx, v_ctx, ckks_encoder, scale, evaluator, relin_keys, galois_keys, slot_count)
            velocity = decrypt_weights(new_v_ctx, ckks_encoder, decryptor)
            weights = decrypt_weights(new_w_ctx, ckks_encoder, decryptor)

            # Decrypt and re-encrypt so the next step starts from fresh ciphertexts.
            w_ctx = encrypt_weights(weights, ckks_encoder, scale, encryptor)
            v_ctx = encrypt_weights(velocity, ckks_encoder, scale, encryptor)

        # `weights` already holds the plaintext of the last step; no extra decrypt needed.
        y_prob = predict_proba(X_test, weights)
        y_pred = predict(X_test, weights)
        acc_scores.append(accuracy_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_prob))

    average_acc = np.mean(acc_scores)
    average_auc = np.mean(auc_scores)

    return average_acc, average_auc



In [64]:
import time
from logistic_regression import average_accuracy_and_auc_score

def prepare_data(df, target):
    """Split a DataFrame into a feature matrix and a target column.

    Args:
        df: pandas DataFrame containing the target column.
        target: name of the target column.

    Returns:
        ``(X, y)`` where ``X`` is a NumPy array of every column except
        ``target`` (original column order) and ``y`` is the target as a
        pandas Series.

    Raises:
        ValueError: if ``target`` is not one of the columns.
    """
    feature_columns = df.columns.tolist()
    feature_columns.remove(target)
    features = df[feature_columns]
    labels = df[target]
    return features.values, labels


# Split the iDASH frame into features and the 'Cancer_status' target.
X_idash, y_idash = prepare_data(iDASH_df, target = 'Cancer_status')

import time
def train_and_evaluate_models_performance(X, y, best_params, encryptor, ckks_encoder, scale, evaluator, relin_keys, galois_keys, slot_count, decryptor=None):
    """Train and report the plaintext and the encrypted logistic regression.

    Prints total time, average accuracy and average AUC for
    ``LogisticRegression`` and then for ``HELogisticRegression``, both scored
    through the cross-validation helpers.

    Args:
        X, y: features and labels.
        best_params: dict with keys ``'learning_rate'``, ``'num_iterations'``
            and ``'momentum'``.
        encryptor, ckks_encoder, evaluator, relin_keys, galois_keys: SEAL tool objects.
        scale: encoding scale.
        slot_count: number of slots per plaintext.
        decryptor: decryptor used by the encrypted evaluation. Optional for
            backward compatibility: when omitted, the module-level
            ``decryptor`` is used, which is what the function previously
            relied on implicitly.

    Raises:
        NameError: if ``decryptor`` is omitted and no module-level
            ``decryptor`` exists.
    """
    if decryptor is None:
        # Fall back to the notebook-global object the original code read silently.
        try:
            decryptor = globals()['decryptor']
        except KeyError:
            raise NameError("name 'decryptor' is not defined; pass decryptor=... explicitly") from None

    print("- Logistic Regression over unencrypted dataset...")
    lr = LogisticRegression(learning_rate=best_params['learning_rate'], num_iterations= best_params['num_iterations'], momentum=best_params['momentum'])
    start_time = time.time()
    avg_accuracy, avg_auc = average_accuracy_and_auc_score(lr, X, y)
    end_time = time.time()
    print(f"\t+ Total training time: {(end_time - start_time):.2f}s")
    print(f"\t+ Average accuracy score: {(avg_accuracy*100):.2f}%")
    print(f"\t+ Average auc score: {avg_auc:.4f}")

    print("\n- Logistic Regression over encrypted dataset...\n")
    num_iterations = best_params['num_iterations']
    helr  = HELogisticRegression(learning_rate=best_params['learning_rate'], momentum=best_params['momentum'])
    start_time = time.time()
    avg_accuracy_helr, avg_auc_helr = average_accuracy_and_auc_score_helr(helr, num_iterations, X, y, encryptor, decryptor, ckks_encoder, scale, evaluator, relin_keys, galois_keys, slot_count)
    end_time = time.time()
    # Report the encrypted run as minutes + seconds since it is much slower.
    execution_time = end_time - start_time
    minutes = int(execution_time // 60)
    seconds = execution_time % 60
    print(f"\t+ Total training time: {minutes}m{seconds:.2f}s")
    print(f"\t+ Average accuracy score: {(avg_accuracy_helr*100):.2f}%")
    print(f"\t+ Average auc score: {avg_auc_helr:.4f}")


In [65]:
# Hyper-parameters shared by the plaintext and the encrypted model for the iDASH run.
best_params = {'learning_rate': 0.001, 'num_iterations': 20, 'momentum': 0.9}

In [66]:
# Compare plaintext vs. encrypted training on iDASH; the function prints its own report.
train_and_evaluate_models_performance(X_idash, y_idash, best_params, encryptor, ckks_encoder, scale, evaluator, relin_keys, galois_keys, slot_count)

- Logistic Regression over unencrypted dataset...
	+ Total training time: 0.44s
	+ Average accuracy score: 62.19%
	+ Average auc score: 0.6906

- Logistic Regression over encrypted dataset...

	+ Total training time: 7m18.52s
	+ Average accuracy score: 62.19%
	+ Average auc score: 0.6910


In [25]:
from sklearn.preprocessing import StandardScaler
# Load the heart dataset and standardize the four listed columns.
heart_df = pd.read_csv("./data/heart.csv")
columns_name = ['age', 'trtbps', 'chol', 'thalachh']

filled_heart_df = heart_df[columns_name]
# NOTE(review): `scaled_filled_diabetes` holds heart data here -- the name looks
# like a copy-paste leftover. The scaler is fit on the whole dataset, i.e.
# before any cross-validation split is made.
scaled_filled_diabetes = np.round(StandardScaler().fit_transform(filled_heart_df), 2)
scaled_filled_heart_df = pd.DataFrame(scaled_filled_diabetes, columns=columns_name)
# Overwrite the selected columns of heart_df with the standardized values rounded to 2 decimals.
heart_df[columns_name] = scaled_filled_heart_df
# NOTE(review): bare expression that is not the cell's last line, so it displays nothing.
heart_df
X_heart, y_heart = prepare_data(heart_df, 'output')

In [26]:

# Rebinds the global `best_params` (previously set for iDASH) for the heart run,
# then compares plaintext vs. encrypted training on the heart dataset.
best_params = {'learning_rate': 0.001, 'num_iterations': 40, 'momentum': 0.3}
train_and_evaluate_models_performance(X_heart, y_heart, best_params, encryptor, ckks_encoder, scale, evaluator, relin_keys, galois_keys, slot_count)

- Logistic Regression over unencrypted dataset...
	+ Total training time: 0.11s
	+ Average accuracy score: 81.18%
	+ Average auc score: 0.8849

- Logistic Regression over encrypted dataset...

	+ Total training time: 12m28.60s
	+ Average accuracy score: 81.19%
	+ Average auc score: 0.8835


In [81]:
# Column names for the header-less CSV: 'label' first, then Pixel_1 .. Pixel_196.
columns = ['label' if i == 0 else f'Pixel_{i}' for i in range(197)]
MNIST_df = pd.read_csv("./data/mnist.csv", header=None, names = columns)
# Recode the label: -1 becomes 0, every other value becomes 1.
new_label = MNIST_df['label'].apply(lambda x: 0 if x == -1 else 1)
MNIST_df['label'] = new_label
def normalizeMNISTData(df, label_col='label', divisor=32):
    """Quantize every non-label column of ``df`` by dividing and truncating to int32.

    The function now operates on the DataFrame it is given; previously it
    ignored ``df`` and always read and modified the global ``MNIST_df``.

    Args:
        df: pandas DataFrame containing ``label_col`` plus the pixel columns.
            It is modified in place.
        label_col: name of the column left untouched (default ``'label'``).
        divisor: value each pixel column is divided by before truncation
            (default ``32``).

    Returns:
        The same ``df`` object, for convenient chaining/assignment.

    Raises:
        ValueError: if ``label_col`` is not a column of ``df``.
    """
    pixel_columns = df.columns.tolist()
    pixel_columns.remove(label_col)

    for col in pixel_columns:
        # True division followed by an int32 cast truncates toward zero.
        df[col] = (df[col] / divisor).astype('int32')

    return df

# Quantize the pixel columns and show summary statistics of the result.
normalized_MNIST_df = normalizeMNISTData(MNIST_df)
normalized_MNIST_df.describe()


Unnamed: 0,label,Pixel_1,Pixel_2,Pixel_3,Pixel_4,Pixel_5,Pixel_6,Pixel_7,Pixel_8,Pixel_9,...,Pixel_187,Pixel_188,Pixel_189,Pixel_190,Pixel_191,Pixel_192,Pixel_193,Pixel_194,Pixel_195,Pixel_196
count,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,...,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0,1984.0
mean,0.509073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004032,0.002016,0.00252,0.003024,0.003528,0.001008,0.000504,0.0,0.0,0.0
std,0.500044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.095189,0.05497,0.067322,0.083969,0.092523,0.044901,0.022451,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,3.0,3.0,2.0,1.0,0.0,0.0,0.0


In [84]:
# Cross-validated evaluation of the plaintext model only on MNIST
# (this cell does not run the encrypted model).
X_mnist, y_mnist = prepare_data(normalized_MNIST_df, 'label')
print("- Logistic Regression over unencrypted dataset...")
lr = LogisticRegression(learning_rate=1e-5, num_iterations= 10, momentum=0.5)
# Wall-clock timing around the whole cross-validation run.
start_time = time.time()
avg_accuracy, avg_auc = average_accuracy_and_auc_score(lr, X_mnist, y_mnist) 
end_time = time.time()
print(f"\t+ Total training time: {(end_time - start_time):.2f}s")
print(f"\t+ Average accuracy score: {(avg_accuracy*100):.2f}%")
print(f"\t+ Average auc score: {avg_auc:.4f}")


- Logistic Regression over unencrypted dataset...
	+ Total training time: 0.50s
	+ Average accuracy score: 94.46%
	+ Average auc score: 0.9859


In [18]:
from sklearn.preprocessing import StandardScaler
# Load the Framingham dataset and drop rows with missing values. The index is
# reset so it lines up with the default index of the scaled frame built below.
framingham_df = pd.read_csv("./data/framingham.csv")
framingham_df = framingham_df.dropna().reset_index(drop=True)
# Columns to standardize: everything except the names dropped here
# (those columns and the target are left unscaled).
columns_name = framingham_df.columns
columns_name = columns_name.drop(['male', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'TenYearCHD'])

filled_framingham_df = framingham_df[columns_name]
# NOTE(review): `scaled_filled_diabetes` holds Framingham data here -- the name
# looks like a copy-paste leftover. The scaler is fit on the whole dataset.
scaled_filled_diabetes = np.round(StandardScaler().fit_transform(filled_framingham_df), 2)
scaled_filled_framingham_df = pd.DataFrame(scaled_filled_diabetes, columns=columns_name)
# Overwrite the selected columns with the standardized values rounded to 2 decimals.
framingham_df[columns_name] = scaled_filled_framingham_df
framingham_df

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,-1.23,4.0,0,-0.76,0.0,0,0,0,-0.95,-1.19,-1.08,0.29,0.36,-0.20,0
1,0,-0.42,2.0,0,-0.76,0.0,0,0,0,0.30,-0.51,-0.16,0.72,1.61,-0.24,0
2,1,-0.18,1.0,1,0.92,0.0,0,0,0,0.18,-0.22,-0.24,-0.11,-0.06,-0.50,0
3,0,1.34,3.0,1,1.76,0.0,0,1,0,-0.27,0.80,1.01,0.69,-0.90,0.88,1
4,0,-0.42,3.0,1,1.17,0.0,0,0,0,1.09,-0.11,0.09,-0.66,0.77,0.13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3651,1,0.99,3.0,0,-0.76,0.0,0,1,0,-1.13,0.39,-0.16,-0.20,0.36,-0.04,0
3652,1,2.15,1.0,0,-0.76,0.0,0,1,0,-1.38,1.61,1.18,-0.65,-1.31,-0.12,1
3653,1,0.05,1.0,1,-0.67,0.0,0,1,0,1.73,2.11,0.76,0.05,-0.81,0.17,1
3654,1,0.17,3.0,1,2.85,0.0,0,0,0,-0.68,-0.27,-0.24,-1.49,-0.90,-0.58,0


In [19]:
from functions import prepare_data
from imblearn.over_sampling import SMOTE

# Split into features and the 'TenYearCHD' target, then oversample with SMOTE.
X_framingham, y_framingham = prepare_data(framingham_df, 'TenYearCHD')
# Seed SMOTE so the synthetic samples (and every score derived from them) are
# reproducible; 0 matches the random_state used for KFold in this notebook.
# NOTE(review): resampling happens before the cross-validation split, so
# synthetic samples can end up in test folds -- confirm this is intended.
resampled_X_framingham, resampled_y_framingham = SMOTE(random_state=0).fit_resample(X_framingham, y_framingham)

In [20]:

# Cross-validated evaluation of the plaintext model on the resampled Framingham
# data; momentum is left at the LogisticRegression default.
print("- Logistic Regression over unencrypted dataset...")
lr = LogisticRegression(learning_rate=0.0001, num_iterations= 10)
# Wall-clock timing around the whole cross-validation run.
start_time = time.time()
avg_accuracy, avg_auc = average_accuracy_and_auc_score(lr, resampled_X_framingham, resampled_y_framingham) 
end_time = time.time()
print(f"\t+ Total training time: {(end_time - start_time):.2f}s")
print(f"\t+ Average accuracy score: {(avg_accuracy*100):.2f}%")
print(f"\t+ Average auc score: {avg_auc:.4f}")

- Logistic Regression over unencrypted dataset...
	+ Total training time: 0.66s
	+ Average accuracy score: 66.99%
	+ Average auc score: 0.7356


In [21]:
# Load the diabetes dataset and standardize every column except the two
# dropped below ('DiabetesPedigreeFunction' and the 'Outcome' target).
diabetes_df = pd.read_csv("./data/diabetes.csv")
columns_name = diabetes_df.columns
columns_name = columns_name.drop(['DiabetesPedigreeFunction', 'Outcome'])

filled_diabetes_df = diabetes_df[columns_name]
# The scaler is fit on the whole dataset; values are rounded to 3 decimals.
scaled_filled_diabetes = np.round(StandardScaler().fit_transform(filled_diabetes_df),3)
scaled_filled_diabetes_df = pd.DataFrame(scaled_filled_diabetes, columns=columns_name)
# Overwrite the selected columns with their standardized values.
diabetes_df[columns_name] = scaled_filled_diabetes_df
X_diabetes, y_diabetes = prepare_data(diabetes_df, 'Outcome')
# Seed SMOTE so the synthetic samples (and every score derived from them) are
# reproducible; 0 matches the random_state used for KFold in this notebook.
# NOTE(review): resampling happens before the cross-validation split, so
# synthetic samples can end up in test folds -- confirm this is intended.
resampled_X_diabetes, resampled_y_diabetes = SMOTE(random_state=0).fit_resample(X_diabetes, y_diabetes)

In [22]:
# Cross-validated evaluation of the plaintext model on the resampled diabetes
# data; momentum is left at the LogisticRegression default.
print("- Logistic Regression over unencrypted dataset...")
lr = LogisticRegression(learning_rate=0.0001, num_iterations= 10)
# Wall-clock timing around the whole cross-validation run.
start_time = time.time()
avg_accuracy, avg_auc = average_accuracy_and_auc_score(lr, resampled_X_diabetes, resampled_y_diabetes) 
end_time = time.time()
print(f"\t+ Total training time: {(end_time - start_time):.2f}s")
print(f"\t+ Average accuracy score: {(avg_accuracy*100):.2f}%")
print(f"\t+ Average auc score: {avg_auc:.4f}")

- Logistic Regression over unencrypted dataset...
	+ Total training time: 0.06s
	+ Average accuracy score: 73.00%
	+ Average auc score: 0.8142
