In [1]:
import pandas  as pd
from sklearn.preprocessing import StandardScaler
from piheaan.math import approx
import math

Training data loading

In [2]:
train = pd.read_csv("train.csv")
features = train.columns
train_x = train[features[1:-1]]
train_y = train[features[-1]]

In [3]:
train_x, train_y

(      LVR  REF  INSUR   RATE   AMOUNT  CREDIT  TERM  ARM
 0    69.8    1      1  7.400  1.78000     558    30    1
 1    95.0    0      0  9.625  2.96065     508    30    1
 2    85.0    1      1  7.700  0.97750     549    30    1
 3    80.0    0      0  7.700  1.48000     602    30    1
 4    43.4    1      1  6.450  1.15000     533    30    1
 ..    ...  ...    ...    ...      ...     ...   ...  ...
 507  76.2    1      1  7.900  1.29600     529    30    1
 508  85.0    1      0  6.450  2.38000     636    30    0
 509  63.8    1      0  7.700  1.50000     582    30    1
 510  85.0    1      0  8.500  2.04000     614    30    1
 511  80.0    1      0  8.650  1.00800     588    30    1
 
 [512 rows x 8 columns],
 0      0
 1      0
 2      0
 3      1
 4      0
       ..
 507    0
 508    1
 509    1
 510    0
 511    0
 Name: DELINQUENT, Length: 512, dtype: int64)

Preprocessing

In [4]:
from sklearn.preprocessing import StandardScaler

col_names = ['LVR', 'RATE', 'AMOUNT', 'CREDIT', 'TERM']
feature = train_x[col_names]

scaler = StandardScaler()
scaler.fit(feature)
s_train_x = scaler.transform(feature)

train_x = pd.DataFrame(s_train_x, columns=col_names)

In [5]:
train_x

Unnamed: 0,LVR,RATE,AMOUNT,CREDIT,TERM
0,-0.028868,-0.298899,0.057371,-1.039018,0.354729
1,1.055468,1.066247,1.269561,-1.860504,0.354729
2,0.625176,-0.114834,-0.766568,-1.186886,0.354729
3,0.410030,-0.114834,-0.250644,-0.316111,0.354729
4,-1.164840,-0.881770,-0.589459,-1.449761,0.354729
...,...,...,...,...,...
507,0.246519,0.007875,-0.439559,-1.515480,0.354729
508,0.625176,-0.881770,0.673399,0.242499,0.354729
509,-0.287044,-0.114834,-0.230109,-0.644705,0.354729
510,0.625176,0.376005,0.324317,-0.118955,0.354729


Test data loading

In [6]:
test = pd.read_csv("test.csv")

In [7]:
test_x = test[features[1:-1]]
test_y = test[features[-1]]

In [8]:
col_names = ['LVR', 'RATE', 'AMOUNT', 'CREDIT', 'TERM']
feature = test_x[col_names]

scaler = StandardScaler()
scaler.fit(feature)
s_test_x = scaler.transform(feature)

test_x = pd.DataFrame(s_test_x, columns=col_names)

In [9]:
test_x

Unnamed: 0,LVR,RATE,AMOUNT,CREDIT,TERM
0,-2.192733,0.846046,-1.357284,0.767113,-2.560727
1,0.391852,0.540178,0.305117,-1.810741,0.414520
2,0.391852,-0.388651,-0.758820,-0.934271,0.414520
3,0.391852,-0.276406,1.236061,-0.556185,0.414520
4,1.240458,0.060330,1.568541,0.320285,0.414520
...,...,...,...,...,...
507,0.391852,-0.854468,-0.553632,0.028128,0.414520
508,0.745079,-0.192222,1.055572,0.045314,0.414520
509,0.391852,-1.146306,0.046732,1.042084,0.414520
510,-0.081988,-0.079977,-0.757870,-0.642114,0.414520


Params init

In [10]:
import numpy as np

logslots = 9
numslots = 2 ** logslots
datanum = train.shape[0]
weightnum = 9
weight = np.random.uniform(size = weightnum)
weight

array([0.58830922, 0.30808678, 0.19313997, 0.85589473, 0.50503738,
       0.43283021, 0.92742636, 0.84969859, 0.75405047])

bias를 새로 변수로 만들어 사용하지 않고, constant 열을 추가해서 사용?

In [11]:
cons = pd.DataFrame({'constant': [1 for i in range(datanum) ]})
train_x = pd.concat([cons, train_x], axis=1)
test_x = pd.concat([cons, test_x], axis=1)

In [12]:
train_x

Unnamed: 0,constant,LVR,RATE,AMOUNT,CREDIT,TERM
0,1,-0.028868,-0.298899,0.057371,-1.039018,0.354729
1,1,1.055468,1.066247,1.269561,-1.860504,0.354729
2,1,0.625176,-0.114834,-0.766568,-1.186886,0.354729
3,1,0.410030,-0.114834,-0.250644,-0.316111,0.354729
4,1,-1.164840,-0.881770,-0.589459,-1.449761,0.354729
...,...,...,...,...,...,...
507,1,0.246519,0.007875,-0.439559,-1.515480,0.354729
508,1,0.625176,-0.881770,0.673399,0.242499,0.354729
509,1,-0.287044,-0.114834,-0.230109,-0.644705,0.354729
510,1,0.625176,0.376005,0.324317,-0.118955,0.354729


Key init

In [13]:
import piheaan as heaan

params = heaan.ParameterPreset.FVa
context = heaan.make_context(params)
heaan.make_bootstrappable(context)

In [14]:
key_dir_path = "./key_new"
secret_key = heaan.SecretKey(context)
secret_key.save("./secret_key.bin")
key_generator = heaan.KeyGenerator(context, secret_key)
key_generator.gen_common_keys()
key_generator.save(key_dir_path)
public_key = key_generator.keypack

In [15]:
encryptor = heaan.Encryptor(context)
decryptor = heaan.Decryptor(context)
evaluator = heaan.HomEvaluator(context, public_key)

In [16]:
ct = []
features_num = train_x.shape[1]
weightnum = features_num
print(weightnum)

for i in range(features_num):
    message = heaan.Message(logslots)
    for j in range(numslots):
        # message[j] = train_x.loc[j][i]
        message[j] = train_x.iloc[j, i]
    ciphertext = heaan.Ciphertext(context)
    encryptor.encrypt(message, public_key, ciphertext)
    ct.append(ciphertext)

6


In [17]:
for i in range(features_num):
    print(ct[i])

(level: 29, log(num slots): 9, data: [ (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), ..., (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j) ])
(level: 29, log(num slots): 9, data: [ (-0.028868+0.000000j), (1.055468+0.000000j), (0.625176+0.000000j), (0.410030+0.000000j), (-1.164840+0.000000j), ..., (0.246519+0.000000j), (0.625176+0.000000j), (-0.287044+0.000000j), (0.625176+0.000000j), (0.410030+0.000000j) ])
(level: 29, log(num slots): 9, data: [ (-0.298899+0.000000j), (1.066247+0.000000j), (-0.114834+0.000000j), (-0.114834+0.000000j), (-0.881770+0.000000j), ..., (0.007875+0.000000j), (-0.881770+0.000000j), (-0.114834+0.000000j), (0.376005+0.000000j), (0.468037+0.000000j) ])
(level: 29, log(num slots): 9, data: [ (0.057371+0.000000j), (1.269561+0.000000j), (-0.766568+0.000000j), (-0.250644+0.000000j), (-0.589459+0.000000j), ..., (-0.439559+0.000000j), (0.673399+0.0

In [18]:
ans = heaan.Message(logslots)
for i in range(numslots):
    ans[i] = train_y.loc[i]

ans_ct = heaan.Ciphertext(context)
encryptor.encrypt(ans, public_key, ans_ct)

In [19]:
ans_ct

(level: 29, log(num slots): 9, data: [ (0.000000+0.000000j), (0.000000+0.000000j), (0.000000+0.000000j), (1.000000+0.000000j), (0.000000+0.000000j), ..., (0.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (0.000000+0.000000j), (0.000000+0.000000j) ])

In [20]:
def cal_avg(ciphertext, N, logslots):
    evaluator = heaan.HomEvaluator(context, public_key)   
    ciphertext_out_avg = heaan.Ciphertext(context, logslots)
    
    for i in range(N):
        evaluator.add(ciphertext, ciphertext_out_avg, ciphertext_out_avg)
        evaluator.right_rotate(ciphertext, 1, ciphertext)
    
    evaluator.mult(ciphertext_out_avg, 1/N, ciphertext_out_avg)
    return ciphertext_out_avg

In [21]:
def hypothesis(eval, ct, weight_ct):
    sum_ct = heaan.Ciphertext(context, logslots)
    for i in range(weightnum):
        mult_ct = heaan.Ciphertext(context, logslots)
        evaluator.mult(ct[i], weight_ct[i], mult_ct)
        evaluator.add(sum_ct, mult_ct, sum_ct)
    hy_ct = heaan.Ciphertext(context, logslots)
    approx.sigmoid(evaluator, sum_ct, hy_ct, 1.0)
    
    return hy_ct

In [22]:
def loss_fn_cross_entropy(eval, hy_ct, ans_ct):
    # loss = -((Y * log(Hypothesis)) + ((1 - Y) * log(1 - Hypothesis)))
    loss_l = heaan.Ciphertext(context, logslots)        # Y * log(Hypothesis)
    loss_r = heaan.Ciphertext(context, logslots)        # (1 - Y) * log(1 - Hypothesis)
    one_miuns_y = heaan.Ciphertext(context, logslots)   # 1 - Y
    
    # Y * log(Hypothesis)
    approx.log_2(eval, hy_ct, loss_l)
    eval.mult(ans_ct, loss_l, loss_l)
    
    # 1 - Y
    eval.sub(ans_ct, 1, one_miuns_y)
    eval.negate(one_miuns_y, one_miuns_y)
    
    # log(1 - Hypothesis)
    eval.sub(hy_ct, 1, loss_r)
    eval.negate(loss_r, loss_r)
    approx.log_2(eval, loss_r, loss_r)
    
    # (1 - Y) * log(1 - Hypothesis)
    eval.mult(loss_r, one_miuns_y, loss_r)
    
    # -((Y * log(Hypothesis)) + ((1 - Y) * log(1 - Hypothesis)))
    eval.add(loss_l, loss_r, loss_l)
    eval.negate(loss_l, loss_l)
    
    return loss_l
    
    
def fit_cross_entropy(ct, ans_ct, epoch=30, lr=0.05):
    evaluator = heaan.HomEvaluator(context, public_key)
    encryptor = heaan.Encryptor(context)

    weight = np.random.uniform(size = weightnum)
    weight_ct = []

    # weight_ct를 6x1의 벡터로 만드려 했으나,
    # CipherText는 index로 접근할 수 없어서 6x512의 행렬로 만들었습니다.
    for i in range(weightnum):
        message = heaan.Message(logslots)
        for j in range(numslots):
            message[j] = weight[i]
        weight_ct.append(heaan.Ciphertext(context, logslots))
        encryptor.encrypt(message, public_key, weight_ct[i])

    # weight_ct_new = weight_ct - avg(((hypothesis - Y) * lr) * X)
    for j in range(epoch):
        # 가설 함수
        hy_ct = hypothesis(evaluator, ct, weight_ct)
        
        # hy_ct = (hypothesis - Y) * lr
        evaluator.sub(hy_ct, ans_ct, hy_ct)
        evaluator.mult(hy_ct, lr, hy_ct)

        # 편미분 값 계산 및 가중치 업데이트
        grad = []
        for i in range(weightnum):
            grad.append(heaan.Ciphertext(context))
            
            # hy_ct = (hypothesis - Y) * lr
            # grad[i] = avg(hy_ct * X)
            evaluator.mult(ct[i], hy_ct, grad[i])
            grad[i] = cal_avg(grad[i], numslots, logslots)
            
            # weight_ct_new[i] = weight_ct[i] - grad[i]
            evaluator.sub(weight_ct[i], grad[i], weight_ct[i])
            
            # 부트스트랩의 적절한 시점을 잘 모르겠습니다.
            # 6보다 작을 때 부트스트랩 시행 기준
            # 많은 횟수의 트레이닝에서 간혹 nan이 발생하는 것을 확인했습니다.
            # 또한, 부트스트랩의 이론적 원리를 모르는 상태이므로 help를 이용해서
            # 함수의 호출 방법만 확인하고 사용했습니다.
            if(weight_ct[i].level < 6):
                evaluator.bootstrap(weight_ct[i], weight_ct[i])
                # print("bootstrap", i)
                
        print("cost", j, ": ", loss_fn_cross_entropy(evaluator, hy_ct, ans_ct))
        
    return weight_ct

In [23]:
def loss_fn_mse(eval, loss):
    eval.mult(loss, loss, loss)
    loss = cal_avg(loss, numslots, logslots)
    return loss

def fit_mse(ct, ans_ct, epoch=30, lr=0.05):
    evaluator = heaan.HomEvaluator(context, public_key)
    encryptor = heaan.Encryptor(context)

    weight = np.random.uniform(size = weightnum)
    weight_ct = []

    # weight_ct를 6x1의 벡터로 만드려 했으나,
    # CipherText는 index로 접근할 수 없어서 6x512의 행렬로 만들었습니다.
    for i in range(weightnum):
        message = heaan.Message(logslots)
        for j in range(numslots):
            message[j] = weight[i]
        weight_ct.append(heaan.Ciphertext(context, logslots))
        encryptor.encrypt(message, public_key, weight_ct[i])

    # weight_ct_new = MSE
    for j in range(epoch):
        hy_ct = hypothesis(evaluator, ct, weight_ct)

        error_ct = heaan.Ciphertext(context, logslots)
        evaluator.sub(hy_ct, ans_ct, error_ct)

        # 기울기 및 가중치 업데이트
        for i in range(weightnum):
            grad_ct = heaan.Ciphertext(context, logslots)
            evaluator.mult(ct[i], error_ct, grad_ct)

            avg_grad_ct = cal_avg(grad_ct, numslots, logslots)
            evaluator.mult(avg_grad_ct, 2.0 / numslots, avg_grad_ct)

            evaluator.mult(avg_grad_ct, lr, avg_grad_ct)
            evaluator.sub(weight_ct[i], avg_grad_ct, weight_ct[i])

            if weight_ct[i].level < 6:
                evaluator.bootstrap(weight_ct[i], weight_ct[i])
        print("cost", j, ": ", loss_fn_mse(evaluator, error_ct))
        if error_ct.level < 6:
            evaluator.bootstrap(error_ct, error_ct)
        
    return weight_ct

In [24]:
weight_ct = fit_cross_entropy(ct, ans_ct, epoch=30, lr=0.05)
# weight_ct = fit_mse(ct, ans_ct, epoch=30, lr=0.05)

cost 0 :  (level: 9, log(num slots): 9, data: [ (0.046220+0.000000j), (0.066698+0.000000j), (0.043473+0.000000j), (-900702204359787028371632571667668453351266621839358846026052040012438890239508902279621305868590468479090842297616582772074202376522452970671913385746431524256915019038638424279564377688761030625224831036292710777749486469823124483081735245806403030311960591234702152321466368.000000+0.000000j), (0.017511+0.000000j), ..., (0.040491+0.000000j), (-1524049813806299346810983132359713662561673901202472960.000000+0.000000j), (nan+nanj), (0.062480+0.000000j), (0.046218+0.000000j) ])
cost 1 :  (level: 9, log(num slots): 9, data: [ (0.045737+0.000000j), (0.066478+0.000000j), (0.042959+0.000000j), (-456979039686074012575933336407554064974728557243375570447431044150845717731320961581210642362833871716471634426072327144086140677166278150538833628238806981254170318373430611727699980464983107484667424053935577040769533646436941927586093858899972136685887549415160456204474422195109662556

In [25]:
weight_ct

[(level: 7, log(num slots): 9, data: [ (0.301161+0.000000j), (0.301161+0.000000j), (0.301161+0.000000j), (0.301161+0.000000j), (0.301161+0.000000j), ..., (0.301161+0.000000j), (0.301161+0.000000j), (0.301161+0.000000j), (0.301161+0.000000j), (0.301161+0.000000j) ]),
 (level: 7, log(num slots): 9, data: [ (0.575665+0.000000j), (0.575665+0.000000j), (0.575665+0.000000j), (0.575665+0.000000j), (0.575665+0.000000j), ..., (0.575665+0.000000j), (0.575665+0.000000j), (0.575665+0.000000j), (0.575665+0.000000j), (0.575665+0.000000j) ]),
 (level: 7, log(num slots): 9, data: [ (0.291466+0.000000j), (0.291466+0.000000j), (0.291466+0.000000j), (0.291466+0.000000j), (0.291466+0.000000j), ..., (0.291466+0.000000j), (0.291466+0.000000j), (0.291466+0.000000j), (0.291466+0.000000j), (0.291466+0.000000j) ]),
 (level: 7, log(num slots): 9, data: [ (0.668427+0.000000j), (0.668427+0.000000j), (0.668427+0.000000j), (0.668427+0.000000j), (0.668427+0.000000j), ..., (0.668427+0.000000j), (0.668427+0.000000j), (

In [26]:
for i in range(weightnum):
    print(ct[i])

(level: 29, log(num slots): 9, data: [ (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), ..., (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j) ])
(level: 29, log(num slots): 9, data: [ (-0.028868+0.000000j), (1.055468+0.000000j), (0.625176+0.000000j), (0.410030+0.000000j), (-1.164840+0.000000j), ..., (0.246519+0.000000j), (0.625176+0.000000j), (-0.287044+0.000000j), (0.625176+0.000000j), (0.410030+0.000000j) ])
(level: 29, log(num slots): 9, data: [ (-0.298899+0.000000j), (1.066247+0.000000j), (-0.114834+0.000000j), (-0.114834+0.000000j), (-0.881770+0.000000j), ..., (0.007875+0.000000j), (-0.881770+0.000000j), (-0.114834+0.000000j), (0.376005+0.000000j), (0.468037+0.000000j) ])
(level: 29, log(num slots): 9, data: [ (0.057371+0.000000j), (1.269561+0.000000j), (-0.766568+0.000000j), (-0.250644+0.000000j), (-0.589459+0.000000j), ..., (-0.439559+0.000000j), (0.673399+0.0

In [27]:
sum_ct = heaan.Ciphertext(context, logslots)
for i in range(weightnum):
    mult_ct = heaan.Ciphertext(context, logslots)
    evaluator.mult(ct[i], weight_ct[i], mult_ct)
    evaluator.add(sum_ct, mult_ct, sum_ct)

In [28]:
sum_ct

(level: 6, log(num slots): 9, data: [ (-0.158526+0.000000j), (1.413346+0.000000j), (-0.326000+0.000000j), (0.171133+0.000000j), (-1.544959+0.000000j), ..., (-0.393831+0.000000j), (0.866243+0.000000j), (-0.320621+0.000000j), (0.884886+0.000000j), (-0.055846+0.000000j) ])

In [29]:
pred_ct = heaan.Ciphertext(context)
approx.sigmoid(evaluator, sum_ct, pred_ct, 1.0)

In [30]:
pred_ct

(level: 10, log(num slots): 9, data: [ (0.460452+0.000000j), (0.804293+0.000000j), (0.419216+0.000000j), (0.542678+0.000000j), (0.175817+0.000000j), ..., (0.402797+0.000000j), (0.703965+0.000000j), (0.420526+0.000000j), (0.707835+0.000000j), (0.486043+0.000000j) ])

In [31]:
ans_ct

(level: 29, log(num slots): 9, data: [ (0.000000+0.000000j), (0.000000+0.000000j), (0.000000+0.000000j), (1.000000+0.000000j), (0.000000+0.000000j), ..., (0.000000+0.000000j), (1.000000+0.000000j), (1.000000+0.000000j), (0.000000+0.000000j), (0.000000+0.000000j) ])

In [32]:
pred_pt = heaan.Message(logslots)
decryptor.decrypt(pred_ct, secret_key, pred_pt)

pred = []
for i in range(numslots):
    pred.append(pred_pt[i])

In [33]:
for i in range(len(pred)):
    pred[i] = pred[i].real

In [34]:
threshold = 0.95
prediction = [1 if i > threshold else 0 for i in pred]

In [35]:
from sklearn.metrics import accuracy_score

# epoch 30 기준, 약 0.8의 정확도를 보였습니다.
# 예상치에 가까운 것인지 확인 부탁드립니다.
accuracy_score(train_y, prediction)

0.802734375

In [36]:
test_x, test_y

(     constant       LVR      RATE    AMOUNT    CREDIT      TERM
 0           1 -2.192733  0.846046 -1.357284  0.767113 -2.560727
 1           1  0.391852  0.540178  0.305117 -1.810741  0.414520
 2           1  0.391852 -0.388651 -0.758820 -0.934271  0.414520
 3           1  0.391852 -0.276406  1.236061 -0.556185  0.414520
 4           1  1.240458  0.060330  1.568541  0.320285  0.414520
 ..        ...       ...       ...       ...       ...       ...
 507         1  0.391852 -0.854468 -0.553632  0.028128  0.414520
 508         1  0.745079 -0.192222  1.055572  0.045314  0.414520
 509         1  0.391852 -1.146306  0.046732  1.042084  0.414520
 510         1 -0.081988 -0.079977 -0.757870 -0.642114  0.414520
 511         1  1.037999  0.354973  3.606169  0.045314  0.414520
 
 [512 rows x 6 columns],
 0      0
 1      1
 2      0
 3      1
 4      0
       ..
 507    0
 508    0
 509    0
 510    1
 511    1
 Name: DELINQUENT, Length: 512, dtype: int64)

In [37]:
test_ct = []
features_num = test_x.shape[1]

for i in range(features_num):
    message = heaan.Message(logslots)
    for j in range(numslots):
        message[j] = test_x.iloc[j, i]
    ciphertext = heaan.Ciphertext(context)
    encryptor.encrypt(message, public_key, ciphertext)
    test_ct.append(ciphertext)

In [38]:
test_ans = heaan.Message(logslots)
for i in range(numslots):
    test_ans[i] = test_y.loc[i]

test_ans_ct = heaan.Ciphertext(context)
encryptor.encrypt(test_ans, public_key, test_ans_ct)

In [39]:
test_sum_ct = heaan.Ciphertext(context, logslots)
for i in range(weightnum):
    mult_ct = heaan.Ciphertext(context, logslots)
    evaluator.mult(test_ct[i], weight_ct[i], mult_ct)
    evaluator.add(test_sum_ct, mult_ct, test_sum_ct)

In [40]:
test_pred_ct = heaan.Ciphertext(context)
approx.sigmoid(evaluator, test_sum_ct, test_pred_ct, 1.0)

In [41]:
pred_pt = heaan.Message(logslots)
decryptor.decrypt(test_pred_ct, secret_key, pred_pt)

pred = []
for i in range(numslots):
    pred.append(pred_pt[i])

In [42]:
for i in range(len(pred)):
    pred[i] = pred[i].real

In [43]:
prediction = [1 if i > threshold else 0 for i in pred]

In [44]:
from sklearn.metrics import accuracy_score

# 여기서 정확도가 안나옵니다....
# 0.4~5정도로... 떨어집니다...
accuracy_score(train_y, prediction)

0.798828125