In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import sklearn.base as skb
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import sklearn.utils as sku
import sklearn.linear_model as sklm
import sklearn.neighbors as skn
import sklearn.ensemble as ske
import scipy.stats as sstats
import random

seed = 24
np.random.seed(seed)

# sns.set_style('whitegrid')
# plt.style.use('fivethirtyeight')
# plt.rcParams['font.family'] = 'sans-serif'
# plt.rcParams['font.serif'] = 'Ubuntu'
# plt.rcParams['font.monospace'] = 'Ubuntu Mono'
# plt.rcParams['font.size'] = 10
# plt.rcParams['axes.labelsize'] = 12
# plt.rcParams['axes.titlesize'] = 12
# plt.rcParams['xtick.labelsize'] = 8
# plt.rcParams['ytick.labelsize'] = 8
# plt.rcParams['legend.fontsize'] = 12
# plt.rcParams['figure.titlesize'] = 14
# plt.rcParams['figure.figsize'] = (12, 8)

pd.options.mode.chained_assignment = None
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)

from datetime import date

import time

In [2]:
data_dir = './data/'
train_file = data_dir + "train.csv"
test_file = data_dir + "test.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
# SET TARGET FEATURES
target_disorder = 'Genetic Disorder'
target_subclass = 'Disorder Subclass'

# REMOVE PatientID and duplicates from train data
train_df.drop(['Patient Id'], inplace=True, axis=1)
train_df.drop_duplicates(inplace=True)

# REMOVE INRELEVANT ATTRIBUTES
irrelevant_cols = [
    'Patient First Name',
    'Family Name',
    "Father's name",
    "Father's age",
    "Mother's age",
    'Institute Name',
    'Location of Institute',
    'Status',
    'Parental consent',
    'Autopsy shows birth defect (if applicable)',
    'Place of birth',
    'No. of previous abortion',
    "Test 1","Test 2","Test 3","Test 4","Test 5",
]


train_df.drop(irrelevant_cols, inplace=True, axis=1)
test_df.drop(irrelevant_cols, inplace=True, axis=1)
train_df.dropna(axis=1, how="all", inplace=True)
train_df.dropna(axis=0, how="all", inplace=True)
train_df.dropna(subset=[target_disorder, target_subclass], how='any', inplace=True)

test_df.dropna(axis=1, how="all", inplace=True)

fill_nan_dict = {
    'Assisted conception IVF/ART': 'Yes',
    'H/O radiation exposure (x-ray)': '-',
    'Respiratory Rate (breaths/min)': 'Normal (30-60)',
    'Folic acid details (peri-conceptional)': 'Yes',
    'H/O serious maternal illness': 'No',
    'Birth asphyxia': 'Not available',
    'Birth defects': 'Singular',
    'Blood test result': 'inconclusive',
    'H/O substance abuse': '-',
    'White Blood cell count (thousand per microliter)': train_df['White Blood cell count (thousand per microliter)'].mean(),
    'History of anomalies in previous pregnancies': 'No',
    'Inherited from father': 'No',
    'Gender': 'Ambiguous',
    'Follow-up': 'Low',
    'Maternal gene': 'No',
    'Patient Age': train_df['Patient Age'].mean(),
    'Symptom 1': train_df['Symptom 1'].mode()[0],
    'Symptom 2': train_df['Symptom 2'].mode()[0],
    'Symptom 3': train_df['Symptom 3'].mode()[0],
    'Symptom 4': train_df['Symptom 4'].mode()[0],
    'Symptom 5': train_df['Symptom 5'].mode()[0],
    'Heart Rate (rates/min': train_df['Heart Rate (rates/min'].mode()[0]
}

for i in fill_nan_dict:
    train_df[i].fillna(fill_nan_dict[i], inplace=True)
    test_df[i].fillna(fill_nan_dict[i], inplace=True)

#ONE HOT ENCODING
one_hot = [
    "Genes in mother's side",
    'Inherited from father',
    'Maternal gene',
    'Paternal gene',
    'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min',
    'Follow-up',
    'Gender',
    'Birth asphyxia',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness',
    'H/O radiation exposure (x-ray)',
    'H/O substance abuse',
    'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies',
    'Birth defects',
    'Blood test result'
]

for feat in one_hot:
    train_dummies = pd.get_dummies(train_df[feat], drop_first=True, prefix=feat+"_", dtype=float)
    train_df = pd.concat([train_df, train_dummies], axis=1)
    train_df.drop(feat, axis=1, inplace=True)

    test_dummies = pd.get_dummies(test_df[feat], drop_first=True, prefix=feat+"_", dtype=float)
    test_df = pd.concat([test_df, test_dummies], axis=1)
    test_df.drop(feat, axis=1, inplace=True)
train_df_disorder = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
train_df_subclass = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)

train_df_disorder.drop(target_subclass, inplace=True, axis=1)

le = skp.LabelEncoder()
train_df_subclass[target_disorder] = le.fit_transform(train_df_subclass[target_disorder])
train_df_disorder_shuffle = train_df_disorder.sample(frac=1, random_state=seed).reset_index(drop=True)

train_df_disorder_y = train_df_disorder_shuffle.pop(target_disorder)
train_df_disorder_X = train_df_disorder_shuffle

disorder_label = skp.LabelEncoder()
train_df_disorder_y = disorder_label.fit_transform(train_df_disorder_y)

X_disorder_train, X_disorder_test, y_disorder_train, y_disorder_test = skms.train_test_split(train_df_disorder_X, train_df_disorder_y, train_size=0.7, random_state=seed)
train_df_subclass_shuffle = train_df_subclass.sample(frac=1, random_state=seed).reset_index(drop=True)

train_df_subclass_y = train_df_subclass_shuffle.pop(target_subclass)
train_df_subclass_X = train_df_subclass_shuffle

subclass_label = skp.LabelEncoder()
train_df_subclass_y = subclass_label.fit_transform(train_df_subclass_y)

X_subclass_train, X_subclass_test, y_subclass_train, y_subclass_test = skms.train_test_split(train_df_subclass_X, train_df_subclass_y, train_size=0.7, random_state=seed)
# FEATURE SCALING
numerical_features = train_df.select_dtypes(include=[np.number])
X_disorder_train.reset_index(drop=True, inplace=True)
X_disorder_test.reset_index(drop=True, inplace=True)
X_disorder_train.index[:5]

scaler = skp.StandardScaler()

X_disorder_train[numerical_features.columns] = pd.DataFrame(scaler.fit_transform(X_disorder_train[numerical_features.columns]), columns=numerical_features.columns)

X_disorder_test[numerical_features.columns] = pd.DataFrame(scaler.transform(X_disorder_test[numerical_features.columns]), columns=numerical_features.columns)

# view sample data
X_disorder_train.describe()
numerical_features = train_df.select_dtypes(include=[np.number])

X_subclass_train.reset_index(drop=True, inplace=True)
X_subclass_test.reset_index(drop=True, inplace=True)
X_subclass_train.index[:5]

scaler = skp.StandardScaler()

X_subclass_train[numerical_features.columns] = pd.DataFrame(scaler.fit_transform(X_subclass_train[numerical_features.columns]), columns=numerical_features.columns)

X_subclass_test[numerical_features.columns] = pd.DataFrame(scaler.transform(X_subclass_test[numerical_features.columns]), columns=numerical_features.columns)

X_subclass_train.describe()
class_weights_disorder = sku.class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_disorder_train), y=y_disorder_train)
class_weights_disorder = dict(enumerate(class_weights_disorder))
sample_weights_disorder = sku.class_weight.compute_sample_weight('balanced', y_disorder_train)
sample_weights_disorder
class_weights_subclass = sku.class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_subclass_train), y=y_subclass_train)
class_weights_subclass = dict(enumerate(class_weights_subclass))
sample_weights_subclass = sku.class_weight.compute_sample_weight('balanced', y_subclass_train)
sample_weights_subclass

array([0.50615058, 0.42948456, 0.50615058, ..., 3.44853945, 0.42948456,
       1.58953064])

In [3]:
from concrete.ml.deployment import FHEModelClient, FHEModelServer, FHEModelDev

disorder_fhe_dir = './tmp/disorder/fhe_client_server_files/'
disorder_key_dir = "./tmp/disorder/keys_client"

disorder_client = FHEModelClient(path_dir=disorder_fhe_dir, key_dir=disorder_key_dir)
disorder_server = FHEModelServer(path_dir=disorder_fhe_dir)
disorder_server.load()

subclass_fhe_dir = './tmp/sub_class/fhe_client_server_files/'
subclass_key_dir = "./tmp/sub_class/keys_client"

subclass_client = FHEModelClient(path_dir=subclass_fhe_dir, key_dir=subclass_key_dir)

subclass_server = FHEModelServer(path_dir=subclass_fhe_dir)
subclass_server.load()


subclass_serialized_evaluation_keys = subclass_client.get_serialized_evaluation_keys()
disorder_serialized_evaluation_keys = disorder_client.get_serialized_evaluation_keys()


In [4]:
# encrypted_disorder = disorder_client.quantize_encrypt_serialize(np.array([X_disorder_test.iloc[0].values]))
# encrypted_subclass = subclass_client.quantize_encrypt_serialize(np.array([X_subclass_test.iloc[0].values]))
# encrypted_result_disorder = disorder_server.run(encrypted_disorder, disorder_serialized_evaluation_keys)
# encrypted_result_subclass = subclass_server.run(encrypted_subclass, subclass_serialized_evaluation_keys)
# # # # Client decrypts the result
# disorder_result = np.array(disorder_client.deserialize_decrypt_dequantize(encrypted_result_disorder)).argmax(axis=1)
# subclass_result = np.array(subclass_client.deserialize_decrypt_dequantize(encrypted_result_subclass)).argmax(axis=1)


In [5]:
# print(f"Decrypted Result, { disorder_label.inverse_transform(disorder_result) }")
# print(f"True Result, { disorder_label.inverse_transform([y_disorder_test[0]]) }")

In [6]:
# print(f"Decrypted Result, { subclass_label.inverse_transform(subclass_result) }")
# print(f"True Result, { subclass_label.inverse_transform([y_subclass_test[0]]) }")

In [7]:
# import textwrap

# print(f"Disorder Encrypted Result:\n {textwrap.fill(encrypted_result_disorder.hex()[0:200], 20)}")
# print(f"Subclass Encrypted Result:\n {textwrap.fill(encrypted_result_subclass.hex()[0:200], 20)}")

In [29]:
N_SAMPLE_FHE = 3

# Pick N_SAMPLE_FHE random samples from the test set
idx_test = np.random.choice(X_disorder_test.shape[0], N_SAMPLE_FHE, replace=False)
X_disorder_test_fhe = X_disorder_test.iloc[idx_test].values
y_disorder_test_fhe = y_disorder_test[idx_test]


time_begin = time.time()
encrypted_records = []
encrypted_results = []

# # Compute the predictions using the Concrete ML (quantized) model in the clear
for i in X_disorder_test_fhe:
    encrypted_disorder = disorder_client.quantize_encrypt_serialize(np.array([i]))
    encrypted_records.append(encrypted_disorder)
    disorder_pred = disorder_server.run(encrypted_disorder, disorder_serialized_evaluation_keys)
    encrypted_results.append(disorder_pred)

# # Compute the predictions using the Concrete ML model in FHE
print(f"FHE execution time (Genetic Disorder): {(time.time() - time_begin) / len(X_disorder_test_fhe):.2f} seconds per sample")

FHE execution time (Genetic Disorder): 997.50 seconds per sample


In [45]:
import textwrap

for i in encrypted_disorders:
    print(f"Encrypted Records:\n {textwrap.fill(i.hex()[80:120], 20)}")


Encrypted Records:
 b4010000010001000100
00000e00000001000000
Encrypted Records:
 cc842500010001000100
00000e00000001000000
Encrypted Records:
 b4010000010001000100
00000e00000001000000
Encrypted Records:
 cc842500010001000100
00000e00000001000000
Encrypted Records:
 b4010000010001000100
00000e00000001000000
Encrypted Records:
 cc842500010001000100
00000e00000001000000


In [None]:
disorder_results = []
for i in encrypted_results:
    disorder_result = np.array(disorder_client.deserialize_decrypt_dequantize(i)).argmax(axis=1)
    disorder_results.append(disorder_result)
    print(f"Decrypted Result, { disorder_label.inverse_transform(disorder_result) }")


for i in y_disorder_test_fhe:
    print(f"True Result, { disorder_label.inverse_transform([i]) }")

Decrypted Result, ['Multifactorial genetic inheritance disorders']
Decrypted Result, ['Single-gene inheritance diseases']
Decrypted Result, ['Single-gene inheritance diseases']
True Result, ['Mitochondrial genetic inheritance disorders']
True Result, ['Single-gene inheritance diseases']
True Result, ['Single-gene inheritance diseases']


[]

In [None]:
# # Pick N_SAMPLE_FHE random samples from the test set
# idx_test = np.random.choice(X_subclass_test.shape[0], N_SAMPLE_FHE, replace=False)
# X_subclass_test_fhe = X_subclass_test.iloc[idx_test]
# y_subclass_test_fhe = y_subclass_test[idx_test]

# # Compute the predictions using the Concrete ML (quantized) model in the clear
# y_preds_subclass_clear = concrete_subclass_model.predict(X_subclass_test_fhe)

# # Compute the predictions using the Concrete ML model in FHE
# time_begin = time.time()
# y_preds_subclass_fhe = concrete_subclass_model.predict(X_subclass_test_fhe, fhe="execute")
# print(f"FHE execution time: {(time.time() - time_begin) / len(X_subclass_test_fhe):.2f} seconds per sample")
# # Compare the clear quantized inference vs FHE inference
# print(
#     f"{(y_preds_subclass_fhe == y_preds_subclass_clear).sum()}/{N_SAMPLE_FHE} "
#     "Disorder Subclass: FHE predictions match the clear quantized predictions"
# )