# Homomorphic Encryption Test

Needed libraries<br>
SEAL-Python: https://github.com/Huelse/SEAL-Python <br>
SimpleFHE: https://github.com/wgxli/simple-fhe <br>
Note: The code has to be in the same folder where SEAL-Python is installed; otherwise Python cannot find the SEAL library.

This project is a proof of concept for using homomorphic encryption with a Linear Regression model and with an Autoregressive model for time series.<br> The data owner (referred to as the Client) encrypts their data and sends it to an ML company, which uses its model to compute the predictions for the Client.

### Imports

In [1]:
import numpy as np
import pandas as pd
from time import time
from simplefhe import (
    encrypt, decrypt,
    generate_keypair,
    set_public_key, set_private_key, set_relin_keys,
    initialize, display_config,
    encrypt, load_public_key, load_relin_keys,
    load_encrypted_value
)
import simplefhe
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 

## Client side
Actions that take place at the side of the data owner

### Functions

In [2]:
class Dummy_data():
    """
    Helpers for creating, reshaping and encrypting the dummy salary data set.

    Every helper is a static method: it can be called on the class itself
    (``Dummy_data.create_dummy_ds(100)``) or on an instance.
    """
    # Column layout shared by the plain and the encrypted data frame.
    _COLUMNS = ["intercept", "age", "education_level",
                "years_of_experience", "gender", "salary"]

    @staticmethod
    def create_dummy_ds (count = 10):
        """
        Create a data frame filled with random dummy rows.

        :count: Number of rows to generate.
        :return: Data frame with the columns intercept, age, education_level,
                 years_of_experience, gender and salary. The intercept is
                 always 1; every other value is a random draw truncated to int.
        """
        df = pd.DataFrame(columns = Dummy_data._COLUMNS)
        # One random generator per column; the dictionary order is the order
        # in which the values are drawn for every row.
        samplers = {"age" : lambda: np.random.normal(45,5),
                    "education_level" : lambda: np.random.normal(5,1),
                    "years_of_experience" : lambda: np.random.normal(25,5),
                    "gender" : lambda: np.random.randint(1,3),
                    "salary" : lambda: np.random.normal(90000,15000)}
        for i in range(count):
            # Draw the whole row first, then store it cell by cell.
            drawn = {name: int(sample()) for name, sample in samplers.items()}
            df.loc[i, "intercept"] = 1
            for name, value in drawn.items():
                df.loc[i, name] = value
        return df

    @staticmethod
    def create_input_data(df):
        """
        Flatten a data frame into the dictionary format used for training.

        For the row with index label ``r`` the result holds ``y-r`` (salary)
        and ``x0-r`` .. ``x4-r`` (intercept, age, education_level,
        years_of_experience, gender).

        :df: Plain or encrypted data frame with the dummy-data columns.
        :return: Dictionary mapping the keys described above to cell values.
        """
        # Position in this list is the number used in the "x<number>-<row>" key.
        features = ["intercept", "age", "education_level",
                    "years_of_experience", "gender"]
        inputs = {}
        for index, row in df.iterrows():
            inputs[f"y-{index}"] = row["salary"]
            for position, column in enumerate(features):
                inputs[f"x{position}-{index}"] = row[column]
        return inputs

    @staticmethod
    def encrypt_df(df):
        """
        Create the encrypted version of a data frame, cell by cell.

        Note: the index of ``df`` is reset IN PLACE to 0..n-1, so the caller's
        frame is modified as well.

        :df: Plain data frame with the dummy-data columns.
        :return: New data frame of the same layout holding encrypted values.
        """
        # The frame usually comes from train_test_split and carries shuffled
        # index labels, so the labels are renumbered first.
        df.reset_index(inplace = True, drop = True)
        df_enc = pd.DataFrame(columns = Dummy_data._COLUMNS)
        for index in df.index:
            for column in Dummy_data._COLUMNS:
                df_enc.loc[index, column] = encrypt(df.loc[index, column])
        return df_enc

### Client-side preparation

In [3]:
# Configure simplefhe for floating-point values.
# NOTE(review): presumably this has to run before the keys are generated - confirm against the simplefhe docs.
initialize("float")

# Generate the public key, the private key and the relinearization keys
public_key, private_key, relin_keys = generate_keypair()

# Register all three keys with simplefhe in this process.
# In this proof of concept the client and the company share one process,
# so the private key is registered here as well.
set_private_key(private_key)
set_public_key(public_key)
set_relin_keys(relin_keys)

In [4]:
# create 100 rows of dummy data
df = Dummy_data.create_dummy_ds(100)
# number of training rows: 80% of the data, which matches the
# test_size=0.2 split applied by train_test_split
data_length = int(len(df)*0.8)

In [5]:
# split the data set into a train and a test part (fixed seed for reproducibility)
X_train, X_test = train_test_split(df, test_size=0.2, random_state=0)
# create encrypted versions of the train and test data.
# encrypt_df also resets the index of X_train / X_test in place; the input
# dictionaries built afterwards rely on the rows being labelled 0..n-1.
X_train_enc = Dummy_data.encrypt_df(X_train)
X_test_enc = Dummy_data.encrypt_df(X_test)

In [6]:
# flatten the plain and the encrypted training data into the
# "y-<row>" / "x<feature>-<row>" dictionaries consumed by the model
inputs = Dummy_data.create_input_data(X_train)
inputs_enc = Dummy_data.create_input_data(X_train_enc)

## LR Modeling company side

### Functions

In [7]:
class Linear_Regression:
    """
    Least-squares linear regression fitted through the normal equations.

    The model accumulates the sums X'X and X'Y one data point at a time using
    only additions and multiplications, so the same accumulation code is used
    for plain numbers (``train``) and for encrypted values (``train_enc``).
    It runs on the side of the company that receives encrypted data from the
    data owner.
    """
    def __init__(self, dimension):
        """
        :dimension: Count of independent variables (including the intercept column).
        """
        self.dimension = dimension
        self.coefficients = [[0] * dimension]
        self.coefficients_enc = [[0] * dimension]
        self._reset()

    def _reset(self):
        """Clear the accumulated X'X and X'Y sums."""
        self.XtX = [[0] * self.dimension for _ in range(self.dimension)]
        self.XtY = [0] * self.dimension

    def _accumulate(self, inputs, nrow_data):
        """Feed rows 0..nrow_data-1 of the input dictionary into ``update``."""
        for i in range(nrow_data):
            y = inputs[f"y-{i}"]
            xs = [inputs[f"x{j}-{i}"] for j in range(self.dimension)]
            self.update(xs, y)

    def _coefficients_from(self, outputs, convert=None):
        """
        Solve X'X b = X'Y for b using the sums stored in a ``dump`` dictionary.

        :outputs: Dictionary with the keys ``XtY-i`` and ``XtX-i-j``.
        :convert: Optional function applied to every entry before it is stored
                  as a float (used to decrypt encrypted sums).
        :return: 1-D numpy array with one coefficient per dimension.
        """
        XtX = np.zeros(shape=[self.dimension, self.dimension])
        XtY = np.zeros(shape=self.dimension)
        for i in range(self.dimension):
            value = outputs[f"XtY-{i}"]
            XtY[i] = value if convert is None else convert(value)
            for j in range(self.dimension):
                value = outputs[f"XtX-{i}-{j}"]
                XtX[i, j] = value if convert is None else convert(value)
        # Solving the linear system directly is more accurate (and cheaper)
        # than forming the explicit inverse and multiplying by it.
        return np.linalg.solve(XtX, XtY)

    def _predict_with(self, data_point, coefficients):
        """Return the dot product of every row in ``data_point`` with ``coefficients``."""
        pred = []
        for row in data_point:
            prediction = 0
            for j in range(self.dimension):
                prediction += row[j] * coefficients[j]
            pred.append(prediction)
        return pred

    def update(self, xs, y):
        """
        Update the model with a new datapoint.

        :xs: Feature values of the data point (one per dimension).
        :y: Target value of the data point.
        """
        for i in range(self.dimension):
            self.XtY[i] += xs[i] * y
            for j in range(self.dimension):
                self.XtX[i][j] += xs[i] * xs[j]

    def dump(self) -> dict:
        """
        Export the accumulated sums.

        :return: Dictionary with ``XtY-i`` and ``XtX-i-j`` entries.
        """
        output = {}
        for i in range(self.dimension):
            output[f"XtY-{i}"] = self.XtY[i]
            for j in range(self.dimension):
                output[f"XtX-{i}-{j}"] = self.XtX[i][j]
        return output

    def get_matrix_for_coeff(self, outputs):
        """
        Mimic the step that has to happen on the data owner's side when
        training on encrypted data: decrypt the X'X and X'Y sums, solve for
        the coefficients in the clear and encrypt the coefficients again.

        :outputs: Dictionary of encrypted sums as produced by ``dump``.
        :return: List with one encrypted coefficient per dimension.
        """
        coefficients = self._coefficients_from(outputs, decrypt)
        return [encrypt(elem) for elem in coefficients]

    def train(self, inputs, nrow_data):
        """
        Train the model on plain data and store the result in ``coefficients``.

        Sums left over from an earlier training call are discarded first, so
        the fit reflects only the rows passed in here.

        :inputs: Dictionary with ``y-<row>`` and ``x<feature>-<row>`` entries.
        :nrow_data: Number of rows (0..nrow_data-1) to read from ``inputs``.
        """
        self._reset()
        self._accumulate(inputs, nrow_data)
        self.coefficients = self._coefficients_from(self.dump())

    def train_enc(self, inputs, nrow_data):
        """
        Train the model on encrypted data and store the result in
        ``coefficients_enc``.

        This is a matrix-wise implementation of linear regression, and the
        encrypted learning can proceed all the way until the inverse of X'X is
        needed. Because the Microsoft SEAL homomorphic encryption library does
        not support division of encrypted numbers, the system cannot be solved
        without decrypting the sums; that step is delegated to
        ``get_matrix_for_coeff``. The lack of division severely limits what
        computations can be performed on encrypted numbers.

        Sums left over from an earlier training call are discarded first, so
        no plaintext sums from a previous ``train`` call leak into this fit.

        :inputs: Dictionary with encrypted ``y-<row>`` / ``x<feature>-<row>`` entries.
        :nrow_data: Number of rows (0..nrow_data-1) to read from ``inputs``.
        """
        self._reset()
        self._accumulate(inputs, nrow_data)
        # Mimics sending the X'X and X'Y sums to the data owner for decryption
        # and coefficient calculation, after which the coefficients are
        # encrypted and returned to the model.
        self.coefficients_enc = self.get_matrix_for_coeff(self.dump())

    def predict(self, data_point):
        """
        Predict with the coefficients obtained from training on plain data.

        :data_point: Sequence of rows, each holding one value per dimension.
        :return: List with one prediction per row.
        """
        return self._predict_with(data_point, self.coefficients)

    def predict_enc(self, data_point):
        """
        Predict with the coefficients obtained from training on encrypted data.

        :data_point: Sequence of rows, each holding one value per dimension.
        :return: List with one prediction per row.
        """
        return self._predict_with(data_point, self.coefficients_enc)

### Create sklearn model to check for coefficient accuracy

In [8]:
def sklearn_benchmark(df, x_train):
    """
    Fit an sklearn linear model that serves as the reference for our own model.

    :df: Not used by this function; kept in the signature for existing callers.
    :x_train: Training frame containing the feature columns plus "salary"
              and "intercept".
    :return: The fitted LinearRegression estimator.
    """
    # sklearn adds its own intercept, so the constant column is dropped
    # together with the target.
    features = x_train.drop(columns=["salary", "intercept"])
    target = x_train.salary
    return LinearRegression().fit(features, target)

### Show SKlearn can be used to predict for encrypted values
An sklearn model that is trained on plain data can also be used for the prediction of encrypted values <br>
by taking the intercept and coefficients of the sklearn model and multiplying them with the encrypted values

In [9]:
# reference model fitted on the plain training data
skl_bench = sklearn_benchmark(df,X_train)
# one sample (age, education_level, years_of_experience, gender):
# once with every value encrypted, once as a plain one-row data frame
params = [encrypt(25),encrypt(4), encrypt(10), encrypt(1)]
sk_df = pd.DataFrame(columns = ["age","education_level", "years_of_experience", "gender"])
sk_df.loc[0] = [25,4,10,1]
# intercept + sum(coefficient * encrypted value), evaluated on the ciphertexts
pred_benc = skl_bench.intercept_
for enc_value, coef in zip(params, skl_bench.coef_):
    pred_benc += enc_value*coef
# the prediction is itself an encrypted value because it was built from
# encrypted values multiplied by plain numbers
dec_benc = decrypt(pred_benc)
plain_benc = skl_bench.predict(sk_df)[0]
print(f"Prediction on the encrypted variable was {dec_benc}")
print(f"Prediction on the plain variable was {plain_benc}")
print(f"Difference between predictions is {round(plain_benc - dec_benc,7)}")

Prediction on the encrypted variable was 79048.82308857857
Prediction on the plain variable was 79048.82254604307
Difference between predictions is -0.0005425


### Train on plain data

In [10]:
# initialize our Linear Regression model with 5 dimensions:
# the intercept column plus the four feature columns
Model = Linear_Regression(5)
# train our LR model on the plain training rows 0..data_length-1
Model.train(inputs, data_length)

In [11]:
# compare our plain-trained model with the sklearn benchmark on one sample
sk_df.loc[0] = [64,1,20,1]
# our model expects the intercept as first column, hence the leading 1
data_point = [[1,64,1,20,1]]
model_plain_pred = Model.predict(data_point)[0]
sklearn_pred = skl_bench.predict(sk_df)[0]
print(f"Prediction by our model when trained on plain data is {model_plain_pred}")
print(f"Prediction by sklearn for this data point is {sklearn_pred}")
print(f"Difference between our prediction and sklearn benchmark is {sklearn_pred-model_plain_pred}")

Prediction by our model when trained on plain data is 96462.1967745889
Prediction by sklearn for this data point is 96462.1967745873
Difference between our prediction and sklearn benchmark is -1.6007106751203537e-09


### Train on Encrypted data

In [12]:
# train our LR model on the encrypted training rows; the result is stored
# in Model.coefficients_enc (Model.coefficients is left untouched)
Model.train_enc(inputs_enc, data_length)

### Compare results

In [13]:
# create a data frame for the comparison of the three models
df_comp = pd.DataFrame(columns = ["Sklearn_Prediction","Model_plain","Model_enc"])
# sklearn predictions on the plain test features (sklearn has its own intercept)
df_comp["Sklearn_Prediction"] = skl_bench.predict(X_test.drop(["intercept","salary"],axis=1))
# predictions of our plain-trained model (intercept column is kept as a feature)
df_comp["Model_plain"] = Model.predict(X_test.drop(["salary"],axis=1).to_numpy())
# predictions of our encrypted-trained model on the encrypted test rows
enc_pred = Model.predict_enc(X_test_enc.drop(["salary"],axis=1).to_numpy())
# the predictions are ciphertexts, so decrypt them before comparing
enc_pred = [decrypt(x) for x in enc_pred]
# fill the encrypted model predictions
df_comp["Model_enc"] = enc_pred
# absolute difference between the sklearn benchmark and our encrypted prediction
df_comp["Diff"] = df_comp["Sklearn_Prediction"] - df_comp["Model_enc"]
# the same difference as a percentage of the sklearn prediction
df_comp["perc_change"] = (df_comp["Diff"]/df_comp["Sklearn_Prediction"])*100
print(f"The mean difference between encrypted prediction and sklearn benchmark is {df_comp['Diff'].mean()}")
print(f"The mean % difference between encrypted prediction and sklearn benchmark is {df_comp['perc_change'].mean()}")

The mean difference between encrypted prediction and sklearn benchmark is -0.01195331497438019
The mean % difference between encrypted prediction and sklearn benchmark is -1.3427768541051226e-05


In [14]:
# display the comparison table
df_comp

Unnamed: 0,Sklearn_Prediction,Model_plain,Model_enc,Diff,perc_change
0,82266.800996,82266.800996,82266.812141,-0.011145,-1.4e-05
1,84555.347811,84555.347811,84555.359236,-0.011425,-1.4e-05
2,94944.918681,94944.918681,94944.931385,-0.012704,-1.3e-05
3,86458.44494,86458.44494,86458.456449,-0.011509,-1.3e-05
4,85942.320871,85942.320871,85942.332297,-0.011426,-1.3e-05
5,89111.815242,89111.815242,89111.827282,-0.01204,-1.4e-05
6,91783.77737,91783.77737,91783.789509,-0.012139,-1.3e-05
7,88413.551251,88413.551251,88413.56333,-0.012079,-1.4e-05
8,85556.871184,85556.871184,85556.882527,-0.011342,-1.3e-05
9,84969.487889,84969.487889,84969.499423,-0.011534,-1.4e-05


# Timeseries AutoRegression

The difference between encrypted and plain data performance of the model on a timeseries using autoregression was also tested.

In [15]:
class AutoRegression():
    """
    Rolling linear-trend forecaster built on top of Linear_Regression.

    Each forecast step fits a straight line (intercept + slope * time_period)
    to the most recent rows of the series and extrapolates it one period ahead.
    """
    def __init__(self, dimension = 2):
        """
        :dimension: Count of independent variables.
        """
        self.dimension = dimension

    @staticmethod
    def _window_inputs(data, period_window, intercept):
        """Build the training dictionary from the last ``period_window`` rows of ``data``."""
        # renumber the window rows 0..period_window-1 so they match the keys
        # the regression reads during training
        window = data[-period_window:].reset_index(drop = True)
        inputs = {}
        for index, row in window.iterrows():
            inputs[f"y-{index}"] = row["time_series"]
            inputs[f"x{0}-{index}"] = intercept
            inputs[f"x{1}-{index}"] = row["time_period"]
        return inputs

    def compute(self,data, period_window, pred_num):
        """
        Forecast on plain data.

        :data: Frame with the columns "time_series" and "time_period" (in that
               order); it is extended in place, one row per forecast.
        :period_window: Number of most recent rows used for every fit.
        :pred_num: Number of forecasts to append.
        :return: The same ``data`` object including the appended forecasts.
        """
        for _ in range(pred_num):
            # the index labels double as the time axis
            last_period = max(data.index.values)
            inputs = self._window_inputs(data, period_window, 1)

            model = Linear_Regression(self.dimension)
            model.train(inputs, period_window)
            coefs = model.coefficients

            next_time_period = last_period + 1
            y_pred = coefs[0] +coefs[1] * next_time_period
            data.loc[next_time_period] = [y_pred,next_time_period]
        return data

    def compute_encrypted(self, data, period_window, pred_num, inter):
        """
        Forecast on encrypted data.

        :data: Frame whose "time_series" and "time_period" columns (in that
               order) hold encrypted values; extended in place, one row per
               forecast.
        :period_window: Number of most recent rows used for every fit.
        :pred_num: Number of forecasts to append.
        :inter: Encrypted 1, used as intercept feature and to build the next
                encrypted time period.
        :return: The same ``data`` object including the appended forecasts.
        """
        for _ in range(pred_num):
            # the index labels stay plain and double as the time axis
            last_period = max(data.index.values)
            inputs = self._window_inputs(data, period_window, inter)

            model = Linear_Regression(self.dimension)
            model.train_enc(inputs, period_window)
            coefs = model.coefficients_enc

            # encrypted coefficients combined with the plain next period
            y_pred = coefs[0] +coefs[1] * (last_period + 1)
            # adding the encrypted 1 yields the next period as an encrypted value
            next_time_period = last_period + inter
            data.loc[last_period + 1] = [y_pred,next_time_period]

        return data

In [16]:
def create_dfs(file, n_rows = 100):
    """
    Create the plain and the encrypted data frame for the autoregression.

    :file: Path of a CSV file that contains a "time_series" column.
    :n_rows: Number of leading rows of the file to use (default 100).
    :return: Tuple ``(ts_plain, ts_enc)``. Both frames get an additional
             "time_period" column holding the row index; in ``ts_enc`` the
             "time_series" and "time_period" values are encrypted.
    """
    one_series = pd.read_csv(file)
    ts_plain = one_series.iloc[0:n_rows].copy()
    ts_enc = one_series.iloc[0:n_rows].copy()
    ts_plain["time_period"] = ts_plain.index
    # Replace/create whole columns with object dtype instead of writing the
    # encrypted objects cell by cell into a numeric column: the latter relies
    # on an implicit dtype upcast that pandas has deprecated.
    ts_enc["time_series"] = pd.Series(
        [encrypt(value) for value in ts_enc["time_series"]],
        index = ts_enc.index, dtype = object)
    ts_enc["time_period"] = pd.Series(
        [encrypt(index) for index in ts_enc.index],
        index = ts_enc.index, dtype = object)

    return ts_plain, ts_enc

In [17]:
# we used the secret_training_data csv that was given to the students for the time series project
# NOTE(review): the file loaded here is named "time_series_test.csv" - confirm it is the data set the comment refers to
ts_plain,ts_enc = create_dfs("time_series_test.csv")

In [18]:
# initialize the AutoRegression class (intercept + one time feature)
AR = AutoRegression(2)
# plain autoregression: fit on the last 10 rows, append 15 forecasts to ts_plain
plain_ar = AR.compute(ts_plain, 10, 15)
# encrypted 1: used as the intercept feature and to advance the encrypted time period
inter = encrypt(1)
# encrypted autoregression with the same window and number of forecasts
enc_ar = AR.compute_encrypted(ts_enc, 10, 15, inter)
# decrypt the series; compute_encrypted returns the frame it was given,
# so this also replaces the "time_series" column of ts_enc
list_pred = enc_ar["time_series"]
list_pred = [decrypt(x) for x in list_pred]
enc_ar["time_series"] = list_pred

In [19]:
# create the data frame for the results comparison
df_autoreg = pd.DataFrame(columns = ["Model_plain","Model_enc","Diff"])
# extract the 15 appended forecasts (rows 100-114) from both series and add them to the data frame
plain_pred = ts_plain.iloc[100:115].time_series
df_autoreg["Model_plain"] = plain_pred
enc_pred = ts_enc.iloc[100:115].time_series
df_autoreg["Model_enc"] = enc_pred
# absolute and percentage difference between the plain and the encrypted forecast
df_autoreg["Diff"] = df_autoreg["Model_plain"]-df_autoreg["Model_enc"]
df_autoreg["perc_Diff"] = abs(df_autoreg["Diff"]/df_autoreg["Model_plain"]*100)
# NOTE(review): the message mentions the "sklearn benchmark", but the value printed
# is the difference between the plain and the encrypted autoregression - consider rewording
print(f"The mean % difference between encrypted prediction and sklearn benchmark is {df_autoreg['perc_Diff'].mean()}")

The mean % difference between encrypted prediction and sklearn benchmark is 7.672573264757181e-06


In [20]:
# display the autoregression comparison table
df_autoreg

Unnamed: 0,Model_plain,Model_enc,Diff,perc_Diff
100,260.563866,260.563894,-2.7e-05,1.1e-05
101,260.545205,260.545239,-3.5e-05,1.3e-05
102,259.588835,259.588844,-9e-06,4e-06
103,262.105774,262.105795,-2.1e-05,8e-06
104,262.511969,262.511992,-2.3e-05,9e-06
105,262.687434,262.687458,-2.4e-05,9e-06
106,263.971641,263.971671,-3e-05,1.1e-05
107,263.31406,263.314086,-2.5e-05,1e-05
108,262.619029,262.619048,-1.9e-05,7e-06
109,265.190796,265.190822,-2.6e-05,1e-05
