In [2]:
# Pre-train용
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-5]  # n번째 레이어의 hidden states를 반환합니다.
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("output1.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 저장 경로
model_path = "Pre-trained.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

print("X_train\n", X_train[:10])
print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 모델이 이미 저장되어 있는지 확인하고, 저장된 모델이 있으면 불러오고 없으면 새로운 모델 생성
if os.path.exists(model_path):
    # 저장된 모델이 있을 경우 불러오기
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path))
    print("Pre-train model loaded.")
else:
    # 저장된 모델이 없을 경우 새로운 모델 생성
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    print("New model generated.")

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 및 데이터로더 생성
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = 0.8
train_dataset, val_dataset = train_test_split(dataset, test_size=1-train_size, random_state=42)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 옵티마이저 및 학습률 설정
# 기본 학습률 : 2e-6
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# 에폭 설정
epochs = 10

# 학습 루프
hidden_states_list = []  # 모든 에폭에 대한 hidden state를 저장할 리스트
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs[1]  # loss가 outputs의 두 번째 값입니다.
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}')
        # hidden state를 저장합니다.
        #hidden_states = outputs[2]
        #hidden_states_list.append(hidden_states)

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}')

# 모든 에폭에 대한 hidden state를 합쳐서 CSV 파일로 저장합니다.
#hidden_states_concat = torch.cat(hidden_states_list, dim=0)
#hidden_states_concat = hidden_states_concat[:, 0, :].cpu().detach().numpy()
#hidden_states_df = pd.DataFrame(hidden_states_concat)
#hidden_states_df.to_csv("hidden_states_all_epochs.csv", index=False)

# 모델 저장
torch.save(model.state_dict(), model_path)

# 모델 평가
model.eval()
val_accuracy = 0
for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()

print(f'Validation Accuracy: {val_accuracy / len(val_dataloader)}')


X_train
 ['7/29/1966, nan, 999-91-3709, S99988287, X5601074X, Mrs., Celia938, Roberts511, nan, Mayert710, M, white, hispanic, F, Agawam  Massachusetts  US, 362 Pacocha Gateway Apt 1, Northborough, Massachusetts, Worcester County, 1532.0, 42.27341123, -71.63243239, 1166971.45, 13416.2, 4/20/1989, nan, 5cfda74f-b462-4c73-aa96-d90da4002f8a, 40055000.0, Chronic sinusitis (disorder), Body mass index 30+ - obesity (finding), Miscarriage in first trimester, Prediabetes, Hyperlipidemia, Nasal congestion (finding), Cough (finding), Sore throat symptom (finding), Sputum finding (finding), Muscle pain (finding), Joint pain (finding), Fever (finding)', '12/19/1965, 3/1/2020, 999-70-4989, S99948277, X2560575X, Mrs., Kala987, Prohaska837, nan, Gleason633, M, white, nonhispanic, F, Boston  Massachusetts  US, 310 Effertz Promenade, Gloucester, Massachusetts, Essex County, 1930.0, 42.63072986, -70.6443488, 1229943.52, 20003.74, 2/12/1984, nan, 3fc7077f-903c-4601-8078-a016e9b5a630, 59621000.0, Hypertens

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pre-train model loaded.




True
Epoch 1/10, Batch Loss: 0.2224544733762741
Epoch 1/10, Batch Loss: 0.61092209815979
Epoch 1/10, Batch Loss: 0.34972819685935974
Epoch 1/10, Batch Loss: 0.8476120829582214
Epoch 1/10, Batch Loss: 0.10035208612680435
Epoch 1/10, Batch Loss: 0.6608784794807434
Epoch 1/10, Batch Loss: 0.30939456820487976
Epoch 1/10, Batch Loss: 0.3346942365169525
Epoch 1/10, Batch Loss: 0.5650626420974731
Epoch 1/10, Batch Loss: 0.578464925289154
Epoch 1/10, Batch Loss: 0.36261409521102905
Epoch 1/10, Batch Loss: 0.2115771621465683
Epoch 1/10, Batch Loss: 0.21938490867614746
Epoch 1/10, Batch Loss: 0.22662068903446198
Epoch 1/10, Batch Loss: 0.32792675495147705
Epoch 1/10, Batch Loss: 0.3328045904636383
Epoch 1/10, Batch Loss: 0.13131974637508392
Epoch 1/10, Batch Loss: 0.3114399015903473
Epoch 1/10, Batch Loss: 0.1542816460132599
Epoch 1/10, Batch Loss: 0.4733436405658722
Epoch 1/10, Batch Loss: 0.36347755789756775
Epoch 1/10, Batch Loss: 0.39281314611434937
Epoch 1/10, Batch Loss: 0.3246540725231170

Epoch 1/10, Batch Loss: 0.2373313456773758
Epoch 1/10, Batch Loss: 0.3686026334762573
Epoch 1/10, Batch Loss: 0.3457343280315399
Epoch 1/10, Batch Loss: 0.20839367806911469
Epoch 1/10, Batch Loss: 0.31242799758911133
Epoch 1/10, Batch Loss: 0.5432679653167725
Epoch 1/10, Batch Loss: 0.2275206744670868
Epoch 1/10, Batch Loss: 0.05415671691298485
Epoch 1/10, Batch Loss: 0.30209338665008545
Epoch 1/10, Batch Loss: 0.2133215069770813
Epoch 1/10, Batch Loss: 0.09964485466480255
Epoch 1/10, Batch Loss: 0.07021492719650269
Epoch 1/10, Batch Loss: 0.48086994886398315
Epoch 1/10, Batch Loss: 0.19752547144889832
Epoch 1/10, Batch Loss: 0.32628199458122253
Epoch 1/10, Batch Loss: 0.45301252603530884
Epoch 1/10, Batch Loss: 0.22692584991455078
Epoch 1/10, Batch Loss: 0.4718497693538666
Epoch 1/10, Batch Loss: 0.3242536783218384
Epoch 1/10, Batch Loss: 0.10194378346204758
Epoch 1/10, Batch Loss: 0.32626795768737793
Epoch 1/10, Batch Loss: 0.2514084577560425
Epoch 1/10, Batch Loss: 0.227509900927543

Epoch 1/10, Batch Loss: 0.3723561465740204
Epoch 1/10, Batch Loss: 0.5005619525909424
Epoch 1/10, Batch Loss: 0.21948672831058502
Epoch 1/10, Batch Loss: 0.22568240761756897
Epoch 1/10, Batch Loss: 0.40080708265304565
Epoch 1/10, Batch Loss: 0.2373209297657013
Epoch 1/10, Batch Loss: 0.20352917909622192
Epoch 1/10, Batch Loss: 0.526936411857605
Epoch 1/10, Batch Loss: 0.48447179794311523
Epoch 1/10, Batch Loss: 0.3735070824623108
Epoch 1/10, Batch Loss: 0.3571159541606903
Epoch 1/10, Batch Loss: 0.24067972600460052
Epoch 1/10, Batch Loss: 0.4480568468570709
Epoch 1/10, Batch Loss: 0.2514423131942749
Epoch 1/10, Batch Loss: 0.37656915187835693
Epoch 1/10, Batch Loss: 0.13212615251541138
Epoch 1/10, Batch Loss: 0.23788152635097504
Epoch 1/10, Batch Loss: 0.5627189874649048
Epoch 1/10, Batch Loss: 0.47493571043014526
Epoch 1/10, Batch Loss: 0.7663746476173401
Epoch 1/10, Batch Loss: 0.37973570823669434
Epoch 1/10, Batch Loss: 0.22075727581977844
Epoch 1/10, Batch Loss: 0.12476903945207596

Epoch 2/10, Batch Loss: 0.20738749206066132
Epoch 2/10, Batch Loss: 0.46665915846824646
Epoch 2/10, Batch Loss: 0.4229746162891388
Epoch 2/10, Batch Loss: 0.22229087352752686
Epoch 2/10, Batch Loss: 0.27650949358940125
Epoch 2/10, Batch Loss: 0.399880051612854
Epoch 2/10, Batch Loss: 0.24302999675273895
Epoch 2/10, Batch Loss: 0.4890400767326355
Epoch 2/10, Batch Loss: 0.31711915135383606
Epoch 2/10, Batch Loss: 0.3862016797065735
Epoch 2/10, Batch Loss: 0.17959998548030853
Epoch 2/10, Batch Loss: 0.2487514466047287
Epoch 2/10, Batch Loss: 0.17972181737422943
Epoch 2/10, Batch Loss: 0.23113267123699188
Epoch 2/10, Batch Loss: 0.08648546785116196
Epoch 2/10, Batch Loss: 0.21407397091388702
Epoch 2/10, Batch Loss: 0.40804383158683777
Epoch 2/10, Batch Loss: 0.33770328760147095
Epoch 2/10, Batch Loss: 0.43659508228302
Epoch 2/10, Batch Loss: 0.24192583560943604
Epoch 2/10, Batch Loss: 0.1726953685283661
Epoch 2/10, Batch Loss: 0.6607251167297363
Epoch 2/10, Batch Loss: 0.28177574276924133

Epoch 2/10, Batch Loss: 0.39453792572021484
Epoch 2/10, Batch Loss: 0.3537467420101166
Epoch 2/10, Batch Loss: 0.1410859227180481
Epoch 2/10, Batch Loss: 0.1369086503982544
Epoch 2/10, Batch Loss: 0.34454333782196045
Epoch 2/10, Batch Loss: 0.11968555301427841
Epoch 2/10, Batch Loss: 0.36554983258247375
Epoch 2/10, Batch Loss: 0.11095035821199417
Epoch 2/10, Batch Loss: 0.4661902189254761
Epoch 2/10, Batch Loss: 0.32972145080566406
Epoch 2/10, Batch Loss: 0.4301767945289612
Epoch 2/10, Batch Loss: 0.22969795763492584
Epoch 2/10, Batch Loss: 0.2506070137023926
Epoch 2/10, Batch Loss: 0.3505370020866394
Epoch 2/10, Batch Loss: 0.25205373764038086
Epoch 2/10, Batch Loss: 0.4997600317001343
Epoch 2/10, Batch Loss: 0.22185149788856506
Epoch 2/10, Batch Loss: 0.3239201307296753
Epoch 2/10, Batch Loss: 0.2178657352924347
Epoch 2/10, Batch Loss: 0.18845349550247192
Epoch 2/10, Batch Loss: 0.5245582461357117
Epoch 2/10, Batch Loss: 0.33304762840270996
Epoch 2/10, Batch Loss: 0.8669389486312866


Epoch 3/10, Batch Loss: 0.22291871905326843
Epoch 3/10, Batch Loss: 0.4734683036804199
Epoch 3/10, Batch Loss: 0.2330523431301117
Epoch 3/10, Batch Loss: 0.36611366271972656
Epoch 3/10, Batch Loss: 0.2312304973602295
Epoch 3/10, Batch Loss: 0.39279425144195557
Epoch 3/10, Batch Loss: 0.4664580821990967
Epoch 3/10, Batch Loss: 0.1967255026102066
Epoch 3/10, Batch Loss: 0.2507661283016205
Epoch 3/10, Batch Loss: 0.34037187695503235
Epoch 3/10, Batch Loss: 0.46635836362838745
Epoch 3/10, Batch Loss: 0.13014058768749237
Epoch 3/10, Batch Loss: 0.25046584010124207
Epoch 3/10, Batch Loss: 0.13622882962226868
Epoch 3/10, Batch Loss: 0.2308262586593628
Epoch 3/10, Batch Loss: 0.5681148171424866
Epoch 3/10, Batch Loss: 0.27237769961357117
Epoch 3/10, Batch Loss: 0.3527573347091675
Epoch 3/10, Batch Loss: 0.3726541996002197
Epoch 3/10, Batch Loss: 0.35074958205223083
Epoch 3/10, Batch Loss: 0.23180115222930908
Epoch 3/10, Batch Loss: 0.10260964930057526
Epoch 3/10, Batch Loss: 0.5015523433685303

Epoch 3/10, Batch Loss: 0.3584944009780884
Epoch 3/10, Batch Loss: 0.5303889513015747
Epoch 3/10, Batch Loss: 0.07076764851808548
Epoch 3/10, Batch Loss: 0.08924995362758636
Epoch 3/10, Batch Loss: 0.0881311297416687
Epoch 3/10, Batch Loss: 0.21575069427490234
Epoch 3/10, Batch Loss: 0.20200087130069733
Epoch 3/10, Batch Loss: 0.20139852166175842
Epoch 3/10, Batch Loss: 0.45408838987350464
Epoch 3/10, Batch Loss: 0.5376970767974854
Epoch 3/10, Batch Loss: 0.24108092486858368
Epoch 3/10, Batch Loss: 0.4729190170764923
Epoch 3/10, Batch Loss: 0.28835803270339966
Epoch 3/10, Batch Loss: 0.10293862223625183
Epoch 3/10, Batch Loss: 0.36750197410583496
Epoch 3/10, Batch Loss: 0.3543170690536499
Epoch 3/10, Batch Loss: 0.18227095901966095
Epoch 3/10, Batch Loss: 0.34655144810676575
Epoch 3/10, Batch Loss: 0.08235028386116028
Epoch 3/10, Batch Loss: 0.34704622626304626
Epoch 3/10, Batch Loss: 0.08116459846496582
Epoch 3/10, Batch Loss: 0.4642983675003052
Epoch 3/10, Batch Loss: 0.1985692679882

Epoch 4/10, Batch Loss: 0.46718448400497437
Epoch 4/10, Batch Loss: 0.21365371346473694
Epoch 4/10, Batch Loss: 0.23759396374225616
Epoch 4/10, Batch Loss: 0.24336108565330505
Epoch 4/10, Batch Loss: 0.24696390330791473
Epoch 4/10, Batch Loss: 0.5338044166564941
Epoch 4/10, Batch Loss: 0.2365412563085556
Epoch 4/10, Batch Loss: 0.1638067215681076
Epoch 4/10, Batch Loss: 0.2126343846321106
Epoch 4/10, Batch Loss: 0.3515832722187042
Epoch 4/10, Batch Loss: 0.24009360373020172
Epoch 4/10, Batch Loss: 0.37792670726776123
Epoch 4/10, Batch Loss: 0.3484695255756378
Epoch 4/10, Batch Loss: 0.5287376046180725
Epoch 4/10, Batch Loss: 0.3767853081226349
Epoch 4/10, Batch Loss: 0.21197845041751862
Epoch 4/10, Batch Loss: 0.6105616092681885
Epoch 4/10, Batch Loss: 0.3474291265010834
Epoch 4/10, Batch Loss: 0.6549737453460693
Epoch 4/10, Batch Loss: 0.4542202949523926
Epoch 4/10, Batch Loss: 0.5652185082435608
Epoch 4/10, Batch Loss: 0.4047989249229431
Epoch 4/10, Batch Loss: 0.369689404964447
Epoc

Epoch 4/10, Batch Loss: 0.4167378544807434
Epoch 4/10, Batch Loss: 0.3715907335281372
Epoch 4/10, Batch Loss: 0.34327441453933716
Epoch 4/10, Batch Loss: 0.2631523609161377
Epoch 4/10, Batch Loss: 0.22640274465084076
Epoch 4/10, Batch Loss: 0.3546596169471741
Epoch 4/10, Batch Loss: 0.33961421251296997
Epoch 4/10, Batch Loss: 0.10956845432519913
Epoch 4/10, Batch Loss: 0.24467721581459045
Epoch 4/10, Batch Loss: 0.24468713998794556
Epoch 4/10, Batch Loss: 0.23308435082435608
Epoch 4/10, Batch Loss: 0.49766501784324646
Epoch 4/10, Batch Loss: 0.1223982498049736
Epoch 4/10, Batch Loss: 0.17765238881111145
Epoch 4/10, Batch Loss: 0.69331294298172
Epoch 4/10, Batch Loss: 0.21201060712337494
Epoch 4/10, Batch Loss: 0.688835620880127
Epoch 4/10, Batch Loss: 0.2079402655363083
Epoch 4/10, Batch Loss: 0.23929378390312195
Epoch 4/10, Batch Loss: 0.3432232737541199
Epoch 4/10, Batch Loss: 0.5563232898712158
Epoch 4/10, Batch Loss: 0.26989254355430603
Epoch 4/10, Batch Loss: 0.14131183922290802
E

Epoch 5/10, Batch Loss: 0.44403135776519775
Epoch 5/10, Batch Loss: 0.35174259543418884
Epoch 5/10, Batch Loss: 0.4111660122871399
Epoch 5/10, Batch Loss: 0.10457801818847656
Epoch 5/10, Batch Loss: 0.2279200255870819
Epoch 5/10, Batch Loss: 0.2009902000427246
Epoch 5/10, Batch Loss: 0.35749006271362305
Epoch 5/10, Batch Loss: 0.23516854643821716
Epoch 5/10, Batch Loss: 0.31568625569343567
Epoch 5/10, Batch Loss: 0.45225656032562256
Epoch 5/10, Batch Loss: 0.31991976499557495
Epoch 5/10, Batch Loss: 0.2415027618408203
Epoch 5/10, Batch Loss: 0.2417469322681427
Epoch 5/10, Batch Loss: 0.23894117772579193
Epoch 5/10, Batch Loss: 0.33792588114738464
Epoch 5/10, Batch Loss: 0.24479053914546967
Epoch 5/10, Batch Loss: 0.10839473456144333
Epoch 5/10, Batch Loss: 0.2149769514799118
Epoch 5/10, Batch Loss: 0.3496516942977905
Epoch 5/10, Batch Loss: 0.3299885094165802
Epoch 5/10, Batch Loss: 0.47632402181625366
Epoch 5/10, Batch Loss: 0.3224867582321167
Epoch 5/10, Batch Loss: 0.281435906887054

Epoch 5/10, Batch Loss: 0.0832480639219284
Epoch 5/10, Batch Loss: 0.35476967692375183
Epoch 5/10, Batch Loss: 0.23887082934379578
Epoch 5/10, Batch Loss: 0.31088536977767944
Epoch 5/10, Batch Loss: 0.3707977533340454
Epoch 5/10, Batch Loss: 0.09058969467878342
Epoch 5/10, Batch Loss: 0.7050354480743408
Epoch 5/10, Batch Loss: 0.46029534935951233
Epoch 5/10, Batch Loss: 0.09827812016010284
Epoch 5/10, Batch Loss: 0.4401039481163025
Epoch 5/10, Batch Loss: 0.20853286981582642
Epoch 5/10, Batch Loss: 0.11472657322883606
Epoch 5/10, Batch Loss: 0.33614620566368103
Epoch 5/10, Batch Loss: 0.5970723628997803
Epoch 5/10, Batch Loss: 0.23304010927677155
Epoch 5/10, Batch Loss: 0.35844019055366516
Epoch 5/10, Batch Loss: 0.380012571811676
Epoch 5/10, Batch Loss: 0.4173162281513214
Epoch 5/10, Batch Loss: 0.2428840696811676
Epoch 5/10, Batch Loss: 0.46589329838752747
Epoch 5/10, Batch Loss: 0.3165496587753296
Epoch 5/10, Batch Loss: 0.34303486347198486
Epoch 5/10, Batch Loss: 0.2430555522441864

Epoch 5/10, Batch Loss: 0.19347825646400452
Epoch 5/10, Batch Loss: 0.3641938269138336
Epoch 5/10, Batch Loss: 0.19471339881420135
Epoch 5/10, Batch Loss: 0.4175054728984833
Epoch 5/10, Batch Loss: 0.5059149265289307
Epoch 5/10, Batch Loss: 0.135090172290802
Epoch 5/10, Batch Loss: 0.21852697432041168
Epoch 5/10, Batch Loss: 0.3162970542907715
Epoch 5/10, Batch Loss: 0.156815767288208
Epoch 5/10, Batch Loss: 0.4100978970527649
Epoch 5/10, Batch Loss: 0.12090781331062317
Epoch 5/10, Batch Loss: 0.21627968549728394
Epoch 5/10, Batch Loss: 0.5112671256065369
Epoch 5/10, Batch Loss: 0.19867344200611115
Epoch 5/10, Batch Loss: 0.11143852770328522
Epoch 5/10, Batch Loss: 0.23553134500980377
Epoch 5/10, Batch Loss: 0.25079935789108276
Epoch 5/10, Batch Loss: 0.2647550106048584
Epoch 5/10, Batch Loss: 0.2804287374019623
Epoch 5/10, Batch Loss: 0.33459731936454773
Epoch 5/10, Batch Loss: 0.33474695682525635
Epoch 5/10, Batch Loss: 0.37237435579299927
Epoch 5/10, Batch Loss: 0.6163833737373352
E

Epoch 6/10, Batch Loss: 0.23925118148326874
Epoch 6/10, Batch Loss: 0.3754851520061493
Epoch 6/10, Batch Loss: 0.1274423599243164
Epoch 6/10, Batch Loss: 0.14964675903320312
Epoch 6/10, Batch Loss: 0.38333380222320557
Epoch 6/10, Batch Loss: 0.26360824704170227
Epoch 6/10, Batch Loss: 0.316202312707901
Epoch 6/10, Batch Loss: 0.2401118129491806
Epoch 6/10, Batch Loss: 0.30779868364334106
Epoch 6/10, Batch Loss: 0.2073042243719101
Epoch 6/10, Batch Loss: 0.4800170660018921
Epoch 6/10, Batch Loss: 0.1969524621963501
Epoch 6/10, Batch Loss: 0.12576282024383545
Epoch 6/10, Batch Loss: 0.3719923496246338
Epoch 6/10, Batch Loss: 0.2409115433692932
Epoch 6/10, Batch Loss: 0.2599295377731323
Epoch 6/10, Batch Loss: 0.42932236194610596
Epoch 6/10, Batch Loss: 0.21912194788455963
Epoch 6/10, Batch Loss: 0.3679872751235962
Epoch 6/10, Batch Loss: 0.2970195412635803
Epoch 6/10, Batch Loss: 0.2836991548538208
Epoch 6/10, Batch Loss: 0.33931824564933777
Epoch 6/10, Batch Loss: 0.3291545510292053
Epo

Epoch 6/10, Batch Loss: 0.10984572768211365
Epoch 6/10, Batch Loss: 0.34169015288352966
Epoch 6/10, Batch Loss: 0.21815624833106995
Epoch 6/10, Batch Loss: 0.25065878033638
Epoch 6/10, Batch Loss: 0.1035163551568985
Epoch 6/10, Batch Loss: 0.4545993506908417
Epoch 6/10, Batch Loss: 0.208897665143013
Epoch 6/10, Batch Loss: 0.19061145186424255
Epoch 6/10, Batch Loss: 0.3231864869594574
Epoch 6/10, Batch Loss: 0.23040559887886047
Epoch 6/10, Batch Loss: 0.10944069921970367
Epoch 6/10, Batch Loss: 0.32650646567344666
Epoch 6/10, Batch Loss: 0.22390635311603546
Epoch 6/10, Batch Loss: 0.2931367754936218
Epoch 6/10, Batch Loss: 0.46170833706855774
Epoch 6/10, Batch Loss: 0.3479589819908142
Epoch 6/10, Batch Loss: 0.3465971052646637
Epoch 6/10, Batch Loss: 0.8943715691566467
Epoch 6/10, Batch Loss: 0.21544009447097778
Epoch 6/10, Batch Loss: 0.208082377910614
Epoch 6/10, Batch Loss: 0.19972041249275208
Epoch 6/10, Batch Loss: 0.3521263599395752
Epoch 6/10, Batch Loss: 0.4888973832130432
Epoc

Epoch 7/10, Batch Loss: 0.20156081020832062
Epoch 7/10, Batch Loss: 0.3268127739429474
Epoch 7/10, Batch Loss: 0.31132224202156067
Epoch 7/10, Batch Loss: 0.3542063236236572
Epoch 7/10, Batch Loss: 0.3474976718425751
Epoch 7/10, Batch Loss: 0.4719310700893402
Epoch 7/10, Batch Loss: 0.3237827718257904
Epoch 7/10, Batch Loss: 0.5106930732727051
Epoch 7/10, Batch Loss: 0.3820764422416687
Epoch 7/10, Batch Loss: 0.11275889724493027
Epoch 7/10, Batch Loss: 0.47205424308776855
Epoch 7/10, Batch Loss: 0.11337550729513168
Epoch 7/10, Batch Loss: 0.538521945476532
Epoch 7/10, Batch Loss: 0.18029648065567017
Epoch 7/10, Batch Loss: 0.478936105966568
Epoch 7/10, Batch Loss: 0.35005983710289
Epoch 7/10, Batch Loss: 0.25008291006088257
Epoch 7/10, Batch Loss: 0.19574056565761566
Epoch 7/10, Batch Loss: 0.26765909790992737
Epoch 7/10, Batch Loss: 0.19863517582416534
Epoch 7/10, Batch Loss: 0.3032838702201843
Epoch 7/10, Batch Loss: 0.3666616380214691
Epoch 7/10, Batch Loss: 0.11576836556196213
Epoc

Epoch 7/10, Batch Loss: 0.11199504882097244
Epoch 7/10, Batch Loss: 0.12128095328807831
Epoch 7/10, Batch Loss: 0.28054529428482056
Epoch 7/10, Batch Loss: 0.4404809772968292
Epoch 7/10, Batch Loss: 0.21189382672309875
Epoch 7/10, Batch Loss: 0.20928151905536652
Epoch 7/10, Batch Loss: 0.27893751859664917
Epoch 7/10, Batch Loss: 0.31124886870384216
Epoch 7/10, Batch Loss: 0.20142032206058502
Epoch 7/10, Batch Loss: 0.24379383027553558
Epoch 7/10, Batch Loss: 0.43473827838897705
Epoch 7/10, Batch Loss: 0.25917574763298035
Epoch 7/10, Batch Loss: 0.49182409048080444
Epoch 7/10, Batch Loss: 0.4791925251483917
Epoch 7/10, Batch Loss: 0.11606312543153763
Epoch 7/10, Batch Loss: 0.2028460055589676
Epoch 7/10, Batch Loss: 0.5509321689605713
Epoch 7/10, Batch Loss: 0.5614410638809204
Epoch 7/10, Batch Loss: 0.4732372462749481
Epoch 7/10, Batch Loss: 0.19722871482372284
Epoch 7/10, Batch Loss: 0.37518545985221863
Epoch 7/10, Batch Loss: 0.22223015129566193
Epoch 7/10, Batch Loss: 0.332170337438

Epoch 8/10, Batch Loss: 0.5715301036834717
Epoch 8/10, Batch Loss: 0.3275301158428192
Epoch 8/10, Batch Loss: 0.5932462215423584
Epoch 8/10, Batch Loss: 0.5429047346115112
Epoch 8/10, Batch Loss: 0.34538087248802185
Epoch 8/10, Batch Loss: 0.34326663613319397
Epoch 8/10, Batch Loss: 0.11492317169904709
Epoch 8/10, Batch Loss: 0.19588467478752136
Epoch 8/10, Batch Loss: 0.10985209792852402
Epoch 8/10, Batch Loss: 0.28353646397590637
Epoch 8/10, Batch Loss: 0.19812442362308502
Epoch 8/10, Batch Loss: 0.1571941375732422
Epoch 8/10, Batch Loss: 0.23397932946681976
Epoch 8/10, Batch Loss: 0.5171393156051636
Epoch 8/10, Batch Loss: 0.30847039818763733
Epoch 8/10, Batch Loss: 0.33851075172424316
Epoch 8/10, Batch Loss: 0.14610107243061066
Epoch 8/10, Batch Loss: 0.6441299915313721
Epoch 8/10, Batch Loss: 0.19701319932937622
Epoch 8/10, Batch Loss: 0.5176031589508057
Epoch 8/10, Batch Loss: 0.11957620829343796
Epoch 8/10, Batch Loss: 0.3324393033981323
Epoch 8/10, Batch Loss: 0.899983346462249

Epoch 8/10, Batch Loss: 0.34922152757644653
Epoch 8/10, Batch Loss: 0.36657047271728516
Epoch 8/10, Batch Loss: 0.36237290501594543
Epoch 8/10, Batch Loss: 0.5310914516448975
Epoch 8/10, Batch Loss: 0.20745135843753815
Epoch 8/10, Batch Loss: 0.1952439844608307
Epoch 8/10, Batch Loss: 0.26021817326545715
Epoch 8/10, Batch Loss: 0.3355104923248291
Epoch 8/10, Batch Loss: 0.22890064120292664
Epoch 8/10, Batch Loss: 0.23035280406475067
Epoch 8/10, Batch Loss: 0.276667058467865
Epoch 8/10, Batch Loss: 0.35672011971473694
Epoch 8/10, Batch Loss: 0.47706761956214905
Epoch 8/10, Batch Loss: 0.13798168301582336
Epoch 8/10, Batch Loss: 0.374277263879776
Epoch 8/10, Batch Loss: 0.15821105241775513
Epoch 8/10, Batch Loss: 0.2301255762577057
Epoch 8/10, Batch Loss: 0.4628627896308899
Epoch 8/10, Batch Loss: 0.13299965858459473
Epoch 8/10, Batch Loss: 0.3258333206176758
Epoch 8/10, Batch Loss: 0.2504276633262634
Epoch 8/10, Batch Loss: 0.21323052048683167
Epoch 8/10, Batch Loss: 0.20698456466197968

Epoch 8/10, Batch Loss: 0.35662198066711426
Epoch 8/10, Batch Loss: 0.12470541894435883
Epoch 8/10, Batch Loss: 0.3377637267112732
Epoch 8/10, Average Training Loss: 0.3071500606432627
Epoch 9/10, Batch Loss: 0.238241046667099
Epoch 9/10, Batch Loss: 0.43449535965919495
Epoch 9/10, Batch Loss: 0.4586159288883209
Epoch 9/10, Batch Loss: 0.28483933210372925
Epoch 9/10, Batch Loss: 0.37352994084358215
Epoch 9/10, Batch Loss: 0.2600248456001282
Epoch 9/10, Batch Loss: 0.575630247592926
Epoch 9/10, Batch Loss: 0.2638565003871918
Epoch 9/10, Batch Loss: 0.11216528713703156
Epoch 9/10, Batch Loss: 0.35119545459747314
Epoch 9/10, Batch Loss: 0.36615434288978577
Epoch 9/10, Batch Loss: 0.44191408157348633
Epoch 9/10, Batch Loss: 0.2031460404396057
Epoch 9/10, Batch Loss: 0.3266652524471283
Epoch 9/10, Batch Loss: 0.3505755364894867
Epoch 9/10, Batch Loss: 0.19902074337005615
Epoch 9/10, Batch Loss: 0.21570123732089996
Epoch 9/10, Batch Loss: 0.33732324838638306
Epoch 9/10, Batch Loss: 0.3764620

Epoch 9/10, Batch Loss: 0.09251391887664795
Epoch 9/10, Batch Loss: 0.3044837713241577
Epoch 9/10, Batch Loss: 0.32071226835250854
Epoch 9/10, Batch Loss: 0.21877840161323547
Epoch 9/10, Batch Loss: 0.2898990213871002
Epoch 9/10, Batch Loss: 0.19581566751003265
Epoch 9/10, Batch Loss: 0.18782351911067963
Epoch 9/10, Batch Loss: 0.34704503417015076
Epoch 9/10, Batch Loss: 0.19471146166324615
Epoch 9/10, Batch Loss: 0.230526402592659
Epoch 9/10, Batch Loss: 0.2667618989944458
Epoch 9/10, Batch Loss: 0.22542788088321686
Epoch 9/10, Batch Loss: 0.3829575777053833
Epoch 9/10, Batch Loss: 0.1983136087656021
Epoch 9/10, Batch Loss: 0.1608833223581314
Epoch 9/10, Batch Loss: 0.5476006269454956
Epoch 9/10, Batch Loss: 0.2114293873310089
Epoch 9/10, Batch Loss: 0.20217309892177582
Epoch 9/10, Batch Loss: 0.08656350523233414
Epoch 9/10, Batch Loss: 0.4602908194065094
Epoch 9/10, Batch Loss: 0.2927863597869873
Epoch 9/10, Batch Loss: 0.10105888545513153
Epoch 9/10, Batch Loss: 0.5501708388328552
E

Epoch 9/10, Batch Loss: 0.4900374710559845
Epoch 9/10, Batch Loss: 0.5474048852920532
Epoch 9/10, Batch Loss: 0.5033097267150879
Epoch 9/10, Batch Loss: 0.6577704548835754
Epoch 9/10, Batch Loss: 0.5047888159751892
Epoch 9/10, Batch Loss: 0.4845760762691498
Epoch 9/10, Batch Loss: 0.580308198928833
Epoch 9/10, Batch Loss: 0.23677301406860352
Epoch 9/10, Batch Loss: 0.3024064898490906
Epoch 9/10, Batch Loss: 0.8652738332748413
Epoch 9/10, Batch Loss: 0.5206798911094666
Epoch 9/10, Batch Loss: 0.5424063205718994
Epoch 9/10, Batch Loss: 0.4223833978176117
Epoch 9/10, Batch Loss: 0.5856106281280518
Epoch 9/10, Batch Loss: 0.6198649406433105
Epoch 9/10, Batch Loss: 0.5618358254432678
Epoch 9/10, Batch Loss: 0.5532774925231934
Epoch 9/10, Batch Loss: 0.47674092650413513
Epoch 9/10, Batch Loss: 0.5554748177528381
Epoch 9/10, Batch Loss: 0.3794559836387634
Epoch 9/10, Batch Loss: 0.4722762405872345
Epoch 9/10, Batch Loss: 0.22172755002975464
Epoch 9/10, Batch Loss: 0.41038474440574646
Epoch 9/

Epoch 10/10, Batch Loss: 0.3619917631149292
Epoch 10/10, Batch Loss: 0.5387510061264038
Epoch 10/10, Batch Loss: 0.5358129739761353
Epoch 10/10, Batch Loss: 0.3221402168273926
Epoch 10/10, Batch Loss: 0.3864707350730896
Epoch 10/10, Batch Loss: 0.26760026812553406
Epoch 10/10, Batch Loss: 0.4835289716720581
Epoch 10/10, Batch Loss: 0.19346991181373596
Epoch 10/10, Batch Loss: 0.42713236808776855
Epoch 10/10, Batch Loss: 0.28597360849380493
Epoch 10/10, Batch Loss: 0.18880991637706757
Epoch 10/10, Batch Loss: 0.18001525104045868
Epoch 10/10, Batch Loss: 0.6498491168022156
Epoch 10/10, Batch Loss: 0.29519158601760864
Epoch 10/10, Batch Loss: 0.22380614280700684
Epoch 10/10, Batch Loss: 0.20249642431735992
Epoch 10/10, Batch Loss: 0.23022764921188354
Epoch 10/10, Batch Loss: 0.21721002459526062
Epoch 10/10, Batch Loss: 0.2241346687078476
Epoch 10/10, Batch Loss: 0.12554530799388885
Epoch 10/10, Batch Loss: 0.4686098098754883
Epoch 10/10, Batch Loss: 0.3458091914653778
Epoch 10/10, Batch L

Epoch 10/10, Batch Loss: 0.17407234013080597
Epoch 10/10, Batch Loss: 0.5319429636001587
Epoch 10/10, Batch Loss: 0.28673872351646423
Epoch 10/10, Batch Loss: 0.34259361028671265
Epoch 10/10, Batch Loss: 0.18686959147453308
Epoch 10/10, Batch Loss: 0.20316222310066223
Epoch 10/10, Batch Loss: 0.10606621950864792
Epoch 10/10, Batch Loss: 0.264108806848526
Epoch 10/10, Batch Loss: 0.4244577884674072
Epoch 10/10, Batch Loss: 0.3237057328224182
Epoch 10/10, Batch Loss: 0.2715621292591095
Epoch 10/10, Batch Loss: 0.21866166591644287
Epoch 10/10, Batch Loss: 0.12138989567756653
Epoch 10/10, Batch Loss: 0.13153524696826935
Epoch 10/10, Batch Loss: 0.22592182457447052
Epoch 10/10, Batch Loss: 0.29778769612312317
Epoch 10/10, Batch Loss: 0.269702285528183
Epoch 10/10, Batch Loss: 0.16220787167549133
Epoch 10/10, Batch Loss: 0.35376012325286865
Epoch 10/10, Batch Loss: 0.1239369660615921
Epoch 10/10, Batch Loss: 0.14155469834804535
Epoch 10/10, Batch Loss: 0.23612108826637268
Epoch 10/10, Batch 

In [51]:
# Fine-tune용
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-5]  # n번째 레이어의 hidden states를 반환합니다.
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("output5.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 불러오는 경로
model_path = "Pre-trained.pt"
# 모델 저장경로
model_path2 = "Fine-tuned.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

print("X_train\n", X_train[:10])
print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 모델이 이미 저장되어 있는지 확인하고, 저장된 모델이 있으면 불러오고 없으면 새로운 모델 생성
if os.path.exists(model_path):
    # 저장된 모델이 있을 경우 불러오기
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path))
    print("Pre-train model loaded.")
else:
    # 저장된 모델이 없을 경우 새로운 모델 생성
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    print("New model generated.")

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 및 데이터로더 생성
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = 0.8
train_dataset, val_dataset = train_test_split(dataset, test_size=1-train_size, random_state=42)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 옵티마이저 및 학습률 설정
# 기본 학습률 : 2e-6
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# 에폭 설정
epochs = 10

# 학습 루프
hidden_states_list = []  # 모든 에폭에 대한 hidden state를 저장할 리스트
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs[1]  # loss가 outputs의 두 번째 값입니다.
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}')
        # hidden state를 저장합니다.
        #hidden_states = outputs[2]
        #hidden_states_list.append(hidden_states)

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}')

    # 모델 저장 - 각 에폭마다 모델을 저장합니다.
    model_save_path = f"Fine-tuned_epoch{epoch + 1}.pt"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved for epoch {epoch + 1} at {model_save_path}")

    # 모든 에폭에 대한 hidden state를 합쳐서 CSV 파일로 저장합니다.
    #hidden_states_concat = torch.cat(hidden_states_list, dim=0)
    #hidden_states_concat = hidden_states_concat[:, 0, :].cpu().detach().numpy()
    #hidden_states_df = pd.DataFrame(hidden_states_concat)
    #hidden_states_df.to_csv("hidden_states_all_epochs.csv", index=False)

# 모델 저장
torch.save(model.state_dict(), model_path2)


# 모델 평가
model.eval()
val_accuracy = 0
for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()

print(f'Validation Accuracy: {val_accuracy / len(val_dataloader)}')


X_train
 ['3/23/1963, nan, 999-45-1210, S99916197, X23361434X, Mrs., Malia984, Barrows492, nan, Wolf938, M, asian, nonhispanic, F, Hanoi  Hà Đông  VN, 693 Hettinger Underpass Unit 8, Shrewsbury, Massachusetts, Worcester County, nan, 42.3182333, -71.68403957, 1456657.22, 7867.64, 11/11/1974, nan, 82bec8e4-6c53-4af4-acbf-80f3cf64f756, 40055000.0, Chronic sinusitis (disorder), Miscarriage in first trimester, Body mass index 30+ - obesity (finding), Nasal congestion (finding), Chill (finding), Fever (finding), Loss of taste (finding)', '3/2/1966, nan, 999-86-8438, S99986421, X56721264X, Mr., Adam631, Wiegand701, nan, nan, M, white, nonhispanic, M, Boston  Massachusetts  US, 822 Rosenbaum Green Apt 60, Malden, Massachusetts, Middlesex County, 2155.0, 42.47884498, -71.03499439, 33743.09, 500.5, 3/12/2008, nan, 586916e9-2a18-45be-88bf-28355e86b749, 162864005.0, Body mass index 30+ - obesity (finding), Prediabetes, Anemia (disorder), Sputum finding (finding), Fever (finding)', '12/10/1997, nan

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pre-train model loaded.
True
Epoch 1/10, Batch Loss: 0.4232475161552429
Epoch 1/10, Batch Loss: 0.2300320416688919
Epoch 1/10, Batch Loss: 0.6249696016311646
Epoch 1/10, Batch Loss: 0.3087734580039978
Epoch 1/10, Batch Loss: 0.17127984762191772
Epoch 1/10, Batch Loss: 0.3642699122428894
Epoch 1/10, Batch Loss: 0.2243185043334961
Epoch 1/10, Batch Loss: 0.3155520558357239
Epoch 1/10, Batch Loss: 0.3468567728996277
Epoch 1/10, Batch Loss: 0.39337626099586487
Epoch 1/10, Batch Loss: 0.14118275046348572
Epoch 1/10, Batch Loss: 0.3188472092151642
Epoch 1/10, Batch Loss: 0.2033919245004654
Epoch 1/10, Batch Loss: 0.1943192481994629
Epoch 1/10, Batch Loss: 0.19697944819927216
Epoch 1/10, Batch Loss: 0.5061054825782776
Epoch 1/10, Batch Loss: 0.13903680443763733
Epoch 1/10, Batch Loss: 0.33504804968833923
Epoch 1/10, Batch Loss: 0.4369771480560303
Epoch 1/10, Batch Loss: 0.3791814148426056
Epoch 1/10, Batch Loss: 0.47323691844940186
Epoch 1/10, Batch Loss: 0.11763240396976471
Epoch 1/10, Batch

Epoch 3/10, Batch Loss: 0.000809699238743633
Epoch 3/10, Average Training Loss: 0.30946087200220884
Model saved for epoch 3 at Fine-tuned_epoch3.pt
Epoch 4/10, Batch Loss: 0.4706595838069916
Epoch 4/10, Batch Loss: 0.5331329107284546
Epoch 4/10, Batch Loss: 0.35690075159072876
Epoch 4/10, Batch Loss: 0.28329983353614807
Epoch 4/10, Batch Loss: 0.5339346528053284
Epoch 4/10, Batch Loss: 0.20539148151874542
Epoch 4/10, Batch Loss: 0.2117503136396408
Epoch 4/10, Batch Loss: 0.233424574136734
Epoch 4/10, Batch Loss: 0.19133375585079193
Epoch 4/10, Batch Loss: 0.23693428933620453
Epoch 4/10, Batch Loss: 0.2425420731306076
Epoch 4/10, Batch Loss: 0.31854507327079773
Epoch 4/10, Batch Loss: 0.3699486255645752
Epoch 4/10, Batch Loss: 0.37834635376930237
Epoch 4/10, Batch Loss: 0.34158384799957275
Epoch 4/10, Batch Loss: 0.2334730178117752
Epoch 4/10, Batch Loss: 0.21030980348587036
Epoch 4/10, Batch Loss: 0.10733845829963684
Epoch 4/10, Batch Loss: 0.4417895972728729
Epoch 4/10, Batch Loss: 0.

Epoch 6/10, Batch Loss: 0.358774334192276
Epoch 6/10, Batch Loss: 0.09551792591810226
Epoch 6/10, Batch Loss: 0.28310972452163696
Epoch 6/10, Batch Loss: 0.16148804128170013
Epoch 6/10, Batch Loss: 0.009739971719682217
Epoch 6/10, Average Training Loss: 0.274368975777179
Model saved for epoch 6 at Fine-tuned_epoch6.pt
Epoch 7/10, Batch Loss: 0.08621811121702194
Epoch 7/10, Batch Loss: 0.1724848598241806
Epoch 7/10, Batch Loss: 0.3063663840293884
Epoch 7/10, Batch Loss: 0.3452546000480652
Epoch 7/10, Batch Loss: 0.16663019359111786
Epoch 7/10, Batch Loss: 0.18014267086982727
Epoch 7/10, Batch Loss: 0.09905797243118286
Epoch 7/10, Batch Loss: 0.17634201049804688
Epoch 7/10, Batch Loss: 0.07192659378051758
Epoch 7/10, Batch Loss: 0.8749650120735168
Epoch 7/10, Batch Loss: 0.1606248915195465
Epoch 7/10, Batch Loss: 0.16483110189437866
Epoch 7/10, Batch Loss: 0.2545137107372284
Epoch 7/10, Batch Loss: 0.583075225353241
Epoch 7/10, Batch Loss: 0.06438113003969193
Epoch 7/10, Batch Loss: 0.08

Epoch 9/10, Batch Loss: 0.3693647086620331
Epoch 9/10, Batch Loss: 0.21517285704612732
Epoch 9/10, Batch Loss: 0.13209421932697296
Epoch 9/10, Batch Loss: 0.2920198440551758
Epoch 9/10, Batch Loss: 0.19674335420131683
Epoch 9/10, Batch Loss: 0.11262106150388718
Epoch 9/10, Batch Loss: 0.055466096848249435
Epoch 9/10, Batch Loss: 0.31480687856674194
Epoch 9/10, Batch Loss: 0.05361969769001007
Epoch 9/10, Average Training Loss: 0.16774539463222027
Model saved for epoch 9 at Fine-tuned_epoch9.pt
Epoch 10/10, Batch Loss: 0.06825008243322372
Epoch 10/10, Batch Loss: 0.022520754486322403
Epoch 10/10, Batch Loss: 0.08019760251045227
Epoch 10/10, Batch Loss: 0.03416089713573456
Epoch 10/10, Batch Loss: 0.20871035754680634
Epoch 10/10, Batch Loss: 0.07354604452848434
Epoch 10/10, Batch Loss: 0.012246831320226192
Epoch 10/10, Batch Loss: 0.06370289623737335
Epoch 10/10, Batch Loss: 0.0160989947617054
Epoch 10/10, Batch Loss: 0.3342176675796509
Epoch 10/10, Batch Loss: 0.12726929783821106
Epoch 1

In [16]:
# 데이터 랜덤분할(500/500/250)
import pandas as pd
import numpy as np

def sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500):
    # CSV 파일을 읽어옵니다.
    data = pd.read_csv(input_file)
    
    # 데이터를 랜덤하게 샘플링합니다.
    sampled_data_750 = data.sample(n=n_500, random_state=42)
    
    # 첫 250개 데이터를 output_file_500과 output_file_100에 순서대로 삽입합니다.
    first_250 = sampled_data_750[:250]
    first_250.to_csv(output_file_500, index=False)
    first_250.to_csv(output_file_100, index=False)
    
    # 나머지 500개 데이터를 절반으로 나누어 각각 output_file_500과 output_file_100에 추가합니다.
    remaining_500 = sampled_data_750[250:]
    split_idx = len(remaining_500) // 2
    second_250_500 = remaining_500[:split_idx]
    second_250_100 = remaining_500[split_idx:]
    
    # 파일에 추가합니다.
    second_250_500.to_csv(output_file_500, mode='a', header=False, index=False)
    second_250_100.to_csv(output_file_100, mode='a', header=False, index=False)

# 입력 CSV 파일 경로
input_file = "output6.csv"

# 출력 CSV 파일 경로
output_file_500 = "random_500_D.csv"
output_file_100 = "random_500_C.csv"

# 랜덤하게 추출할 데이터 개수
n_500 = 750

# 함수 호출
sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500)



In [15]:
# 데이터 랜덤분할(300/500)
import pandas as pd

def sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500):
    # CSV 파일을 읽어옵니다.
    data = pd.read_csv(input_file)
    
    # 데이터를 랜덤하게 샘플링합니다.
    sampled_data_500 = data.sample(n=n_500, random_state=42)
    
    # 샘플링된 500개의 데이터를 CSV 파일로 내보냅니다.
    sampled_data_500.to_csv(output_file_500, index=False)
    
    # sampled_data_500에서 첫 100개의 데이터를 선택합니다.
    sampled_data_100 = sampled_data_500.head(300)
    
    # 선택된 첫 100개의 데이터를 CSV 파일로 내보냅니다.
    sampled_data_100.to_csv(output_file_100, index=False)

# 입력 CSV 파일 경로
input_file = "output6.csv"

# 출력 CSV 파일 경로
output_file_500 = "random_500.csv"
output_file_100 = "random_300.csv"

# 랜덤하게 추출할 데이터 개수
n_500 = 500

# 함수 호출
sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500)


In [23]:
# smashed data 생성 (500/server side)
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-5]  # n번째 레이어의 hidden states를 반환합니다.
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("random_500_D.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 저장 경로
model_path = "Pre-trained.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

print("X_train\n", X_train[:10])
print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 모델이 이미 저장되어 있는지 확인하고, 저장된 모델이 있으면 불러오고 없으면 새로운 모델 생성
if os.path.exists(model_path):
    # 저장된 모델이 있을 경우 불러오기
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path))
    print("Pre-train model loaded.")
else:
    # 저장된 모델이 없을 경우 새로운 모델 생성
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    print("New model generated.")

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 생성
dataset = TensorDataset(input_ids, attention_masks, labels)

# 데이터로더 생성
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 모델 평가
model.eval()
val_accuracy = 0
hidden_states_list = []  # 평가할 때 hidden state를 저장할 리스트
for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()
    # hidden state를 저장합니다.
    hidden_states = outputs[2]
    hidden_states_list.append(hidden_states)
hidden_states_concat = torch.cat(hidden_states_list, dim=0)
hidden_states_concat = hidden_states_concat[:, 0, :].cpu().detach().numpy()
hidden_states_df = pd.DataFrame(hidden_states_concat)
hidden_states_df.to_csv("Dictionary_smashed_data.csv", index=False)

print(f'Validation Accuracy: {val_accuracy / len(dataloader)}')


X_train
 ['6/6/1971, nan, 999-71-5643, S99989143, X60757569X, Mr., Tracy345, Skiles927, nan, nan, M, white, nonhispanic, M, Marblehead  Massachusetts  US, 884 Auer Annex, Uxbridge, Massachusetts, Worcester County, nan, 42.10399442, -71.60337809, 1307003.54, 4997.12, 6/5/1974, nan, 5921ab74-99b6-49c9-a7d3-ad8349070ca8, 128613002.0, Seizure disorder, History of single seizure (situation), Hypertension, Body mass index 30+ - obesity (finding), Cough (finding), Sore throat symptom (finding), Viral sinusitis (disorder)', '10/28/1915, 3/3/1990, 999-77-6721, S99996606, X31277871X, Mrs., Londa304, Hessel84, nan, Weimann465, M, white, nonhispanic, F, Somerville  Massachusetts  US, 136 Ratke Manor, Dighton, Massachusetts, Bristol County, nan, 41.81765972, -71.18273423, 1416726.1, 356060.14, 12/21/1933, nan, e9e9254b-75fb-4c28-8694-4ac7c5d8f076, 59621000.0, Hypertension, Cardiac Arrest, History of cardiac arrest (situation), Prediabetes, Anemia (disorder), Smokes tobacco daily, Atrial Fibrillatio

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pre-train model loaded.




True
Validation Accuracy: 0.87890625


In [33]:
# smashed data 생성 (100/client side)
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-5]  # n번째 레이어의 hidden states를 반환합니다.
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("random_500_C.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 저장 경로
model_path = "Fine-tuned_epoch10.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

print("X_train\n", X_train[:10])
print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 모델이 이미 저장되어 있는지 확인하고, 저장된 모델이 있으면 불러오고 없으면 새로운 모델 생성
if os.path.exists(model_path):
    # 저장된 모델이 있을 경우 불러오기
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path))
    print("Pre-train model loaded.")
else:
    # 저장된 모델이 없을 경우 새로운 모델 생성
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    print("New model generated.")

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 생성
dataset = TensorDataset(input_ids, attention_masks, labels)

# 데이터로더 생성
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 모델 평가
model.eval()
val_accuracy = 0
hidden_states_list = []  # 평가할 때 hidden state를 저장할 리스트
for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()
    # hidden state를 저장합니다.
    hidden_states = outputs[2]
    hidden_states_list.append(hidden_states)
hidden_states_concat = torch.cat(hidden_states_list, dim=0)
hidden_states_concat = hidden_states_concat[:, 0, :].cpu().detach().numpy()
hidden_states_df = pd.DataFrame(hidden_states_concat)
hidden_states_df.to_csv("Client_smashed_data_epoch10.csv", index=False)

print(f'Validation Accuracy: {val_accuracy / len(dataloader)}')


X_train
 ['6/6/1971, nan, 999-71-5643, S99989143, X60757569X, Mr., Tracy345, Skiles927, nan, nan, M, white, nonhispanic, M, Marblehead  Massachusetts  US, 884 Auer Annex, Uxbridge, Massachusetts, Worcester County, nan, 42.10399442, -71.60337809, 1307003.54, 4997.12, 6/5/1974, nan, 5921ab74-99b6-49c9-a7d3-ad8349070ca8, 128613002.0, Seizure disorder, History of single seizure (situation), Hypertension, Body mass index 30+ - obesity (finding), Cough (finding), Sore throat symptom (finding), Viral sinusitis (disorder)', '10/28/1915, 3/3/1990, 999-77-6721, S99996606, X31277871X, Mrs., Londa304, Hessel84, nan, Weimann465, M, white, nonhispanic, F, Somerville  Massachusetts  US, 136 Ratke Manor, Dighton, Massachusetts, Bristol County, nan, 41.81765972, -71.18273423, 1416726.1, 356060.14, 12/21/1933, nan, e9e9254b-75fb-4c28-8694-4ac7c5d8f076, 59621000.0, Hypertension, Cardiac Arrest, History of cardiac arrest (situation), Prediabetes, Anemia (disorder), Smokes tobacco daily, Atrial Fibrillatio

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pre-train model loaded.
True
Validation Accuracy: 0.88671875


In [76]:
# 유클리드 거리 유사도
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def calculate_accuracy_and_distance(client_file, dictionary_file, original_file_client, original_file_dictionary, n=5):
    # 변환된 파일을 읽어옵니다.
    client_data = pd.read_csv(client_file)
    dictionary_data = pd.read_csv(dictionary_file)
    
    # 원본 파일을 읽어옵니다.
    original_client_data = pd.read_csv(original_file_client)
    original_dictionary_data = pd.read_csv(original_file_dictionary)
    
    # 데이터 포인트 간의 유클리드 거리를 계산합니다.
    distances = euclidean_distances(client_data.values, dictionary_data.values)
    
    # Top@n 유사도를 찾습니다.
    topn_similarities = np.argsort(distances, axis=1)[:, :n]
    topn_values = np.sort(distances, axis=1)[:, :n]
    
    # 모든 결과를 출력하고 정확도를 계산합니다.
    successful_distances = []
    unsuccessful_distances = []
    successes = 0
    success_indices = []  # 성공한 인덱스를 저장할 리스트
    success_ranks_count = {rank: 0 for rank in range(1, n+1)}  # 각 성공한 서버 측 랭크의 수를 저장할 딕셔너리
    for i, (indices, scores) in enumerate(zip(topn_similarities, topn_values)):
        print(f"\nTop {n} inferences for client {i + 1}:")
        for rank, (idx, score) in enumerate(zip(indices, scores), 1):
            print(f"Server {idx + 1} with distance {score}")
            if original_client_data.iloc[i].equals(original_dictionary_data.iloc[idx]):
                successes += 1
                successful_distances.append(score)
                success_indices.append((i + 1, rank))  # 성공한 인덱스를 추가
                success_ranks_count[rank] += 1  # 해당 랭크의 수를 증가시킴
            else:
                unsuccessful_distances.append(score)
        if successes == 0:
            print("No successful match found.")
    
    # 정확도 계산
    accuracy = successes / len(client_data)
    
    # 성공적으로 일치하는 데이터 포인트와 클라이언트 데이터 포인트, 그리고 일치하지 않는 데이터 포인트와 클라이언트 데이터 포인트 간의 평균 거리를 계산합니다.
    successful_mean_distance = np.mean(successful_distances)
    unsuccessful_mean_distance = np.mean(unsuccessful_distances)
    
    # 평균 거리의 분산 계산
    successful_distance_variance = np.var(successful_distances)
    unsuccessful_distance_variance = np.var(unsuccessful_distances)
    
    return accuracy, successful_mean_distance, unsuccessful_mean_distance, success_indices, successful_distance_variance, unsuccessful_distance_variance, success_ranks_count

# 변환된 파일 경로
dictionary_file = "Dictionary_smashed_data.csv"

# 원본 파일 경로
original_file_client = "random_500_C.csv"
original_file_dictionary = "random_500_D.csv"

# Top n 설정
n = 5

# 정확도 계산 및 평균 거리 계산
for i in range(1, 11):
    client_file = f'Client_smashed_data_epoch{i}.csv'
    accuracy, successful_mean_distance, unsuccessful_mean_distance, success_indices, successful_distance_variance, unsuccessful_distance_variance, success_ranks_count = calculate_accuracy_and_distance(client_file, dictionary_file, original_file_client, original_file_dictionary, n)

    print("\nFor file:", client_file)
    print("Accuracy:", accuracy)
    print("Successful Mean Distance:", successful_mean_distance)
    print("Unsuccessful Mean Distance:", unsuccessful_mean_distance)

    # 분산 출력
    print("Successful Distance Variance:", successful_distance_variance)
    print("Unsuccessful Distance Variance:", unsuccessful_distance_variance)

    # 성공한 인덱스들을 출력합니다.
    print("Success Indices:", success_indices)

    # 각 성공한 서버 측 랭크의 수를 출력합니다.
    print("Success Ranks Count:")
    for rank, count in success_ranks_count.items():
        print(f"Rank {rank}: {count} successes")



Top 5 inferences for client 1:
Server 1 with distance 2.7247970296745616
Server 284 with distance 3.6963992048509695
Server 98 with distance 3.8378456825929135
Server 361 with distance 3.855834733057886
Server 182 with distance 3.8609895446124995

Top 5 inferences for client 2:
Server 2 with distance 0.724735359141168
Server 228 with distance 2.028347170460485
Server 345 with distance 2.1298753154352243
Server 453 with distance 2.2715492251690943
Server 123 with distance 2.2911883283675314

Top 5 inferences for client 3:
Server 97 with distance 1.9867703240100762
Server 432 with distance 1.9887874566723371
Server 122 with distance 2.0140766694107395
Server 287 with distance 2.079045109070839
Server 341 with distance 2.123982410335399

Top 5 inferences for client 4:
Server 287 with distance 2.235145193404765
Server 107 with distance 2.3410859743374486
Server 146 with distance 2.363398620656569
Server 4 with distance 2.3729412267207084
Server 462 with distance 2.3746641913631192

Top 5 

Top 5 inferences for client 127:
Server 127 with distance 0.776055041693363
Server 312 with distance 1.669524794588886
Server 131 with distance 1.871908388008971
Server 465 with distance 1.8944568803391741
Server 172 with distance 1.9585033523812079

Top 5 inferences for client 128:
Server 128 with distance 1.1915899711044882
Server 439 with distance 1.7890302509609408
Server 96 with distance 1.8820089439882284
Server 256 with distance 1.9790964995256062
Server 120 with distance 1.985717581505245

Top 5 inferences for client 129:
Server 223 with distance 1.9522299774271574
Server 303 with distance 1.9654485601206144
Server 122 with distance 1.999888282280062
Server 154 with distance 2.1545366524063225
Server 281 with distance 2.161321376890651

Top 5 inferences for client 130:
Server 130 with distance 0.966806350478851
Server 481 with distance 1.986938674193881
Server 437 with distance 2.0717143199010257
Server 116 with distance 2.132109480553832
Server 67 with distance 2.1569416780980

Server 274 with distance 2.4525161162296225

Top 5 inferences for client 245:
Server 96 with distance 1.6463118360444937
Server 221 with distance 1.6993069631485
Server 370 with distance 1.7473346570630164
Server 323 with distance 1.7616803133119998
Server 439 with distance 1.77137971465431

Top 5 inferences for client 246:
Server 246 with distance 0.7529366375846757
Server 290 with distance 1.155507719631635
Server 413 with distance 1.2437959715510165
Server 76 with distance 1.305719032855389
Server 478 with distance 1.4611638945951184

Top 5 inferences for client 247:
Server 247 with distance 1.4379404484725926
Server 491 with distance 1.4833054789702353
Server 256 with distance 1.6089424893224322
Server 96 with distance 1.6366798571177203
Server 120 with distance 1.6813524363646368

Top 5 inferences for client 248:
Server 323 with distance 1.7579427213790868
Server 96 with distance 1.8032647152547334
Server 132 with distance 1.8549986741841828
Server 452 with distance 1.880391557884

Top 5 inferences for client 353:
Server 197 with distance 1.1259328163998568
Server 148 with distance 1.380595607373654
Server 315 with distance 1.3997577904254066
Server 103 with distance 1.4366229411922427
Server 484 with distance 1.6348758691988587

Top 5 inferences for client 354:
Server 365 with distance 3.1400670185474455
Server 374 with distance 3.336686597969172
Server 48 with distance 3.3495649486409733
Server 417 with distance 3.4391971889842505
Server 227 with distance 3.4614540751871146

Top 5 inferences for client 355:
Server 500 with distance 2.6392838055945558
Server 267 with distance 2.6766717747400413
Server 464 with distance 2.7450342324327055
Server 462 with distance 2.7575264685503877
Server 22 with distance 2.762718852878185

Top 5 inferences for client 356:
Server 391 with distance 1.456999860788813
Server 96 with distance 1.5477340251935154
Server 256 with distance 1.6329783452514453
Server 120 with distance 1.6888804470717242
Server 439 with distance 1.689808481

Top 5 inferences for client 470:
Server 306 with distance 4.317188482592018
Server 207 with distance 4.617343467155756
Server 487 with distance 4.714990788773338
Server 436 with distance 4.907022947318057
Server 237 with distance 4.966130927531157

Top 5 inferences for client 471:
Server 299 with distance 1.6050336518254567
Server 123 with distance 1.618871093927311
Server 165 with distance 1.6746044638953965
Server 99 with distance 1.74953126626129
Server 430 with distance 1.7515896574400567

Top 5 inferences for client 472:
Server 68 with distance 2.1869445922396347
Server 417 with distance 2.2099129518282195
Server 365 with distance 2.475863440486534
Server 498 with distance 2.480495248747086
Server 328 with distance 2.4846236310714707

Top 5 inferences for client 473:
Server 315 with distance 1.0319337123080081
Server 148 with distance 1.151218761986499
Server 197 with distance 1.1796877000570634
Server 103 with distance 1.2639898540483834
Server 484 with distance 1.324789512075465

In [66]:
# 코사인 유사도
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calculate_accuracy_and_similarity(client_file, dictionary_file, original_file_client, original_file_dictionary, n=5):
    # 변환된 파일을 읽어옵니다.
    client_data = pd.read_csv(client_file)
    dictionary_data = pd.read_csv(dictionary_file)
    
    # 원본 파일을 읽어옵니다.
    original_client_data = pd.read_csv(original_file_client)
    original_dictionary_data = pd.read_csv(original_file_dictionary)
    
    # 데이터 포인트 간의 코사인 유사도를 계산합니다.
    similarities = cosine_similarity(client_data.values, dictionary_data.values)
    
    # Top@n 유사도를 찾습니다.
    topn_similarities = np.argsort(similarities, axis=1)[:, :-n-1:-1]  # 역순으로 정렬하여 상위 n개를 얻습니다.
    topn_values = -np.sort(-similarities, axis=1)[:, :-n-1:-1]  # 역순으로 정렬하여 상위 n개의 값만 얻습니다.
    
    # 모든 결과를 출력하고 정확도를 계산합니다.
    successful_distances = []
    unsuccessful_distances = []
    successes = 0
    success_indices = []  # 성공한 인덱스를 저장할 리스트
    for i, (indices, scores) in enumerate(zip(topn_similarities, topn_values)):
        """print(f"\nTop {n} inferences for client {i + 1}:")"""
        for rank, (idx, score) in enumerate(zip(indices, scores), 1):
            """print(f"Server {idx + 1} with similarity score {score}")"""
            if original_client_data.iloc[i].equals(original_dictionary_data.iloc[idx]):
                successes += 1
                successful_distances.append(score)
                success_indices.append((i + 1, rank))  # 성공한 인덱스를 추가
            else:
                unsuccessful_distances.append(score)
        if successes == 0:
            print("No successful match found.")
    
    # 정확도 계산
    accuracy = successes / len(client_data)
    
    # 성공적으로 일치하는 데이터 포인트와 클라이언트 데이터 포인트, 그리고 일치하지 않는 데이터 포인트와 클라이언트 데이터 포인트 간의 평균 유사도를 계산합니다.
    successful_mean_similarity = np.mean(successful_distances)
    unsuccessful_mean_similarity = np.mean(unsuccessful_distances)
    
    # 유사도의 분산 계산
    successful_similarity_variance = np.var(successful_distances)
    unsuccessful_similarity_variance = np.var(unsuccessful_distances)
    
    return accuracy, successful_mean_similarity, unsuccessful_mean_similarity, success_indices, successful_similarity_variance, unsuccessful_similarity_variance

# 변환된 파일 경로
#client_file = "Client_smashed_data_epoch10.csv"
dictionary_file = "Dictionary_smashed_data.csv"

# 원본 파일 경로
original_file_client = "random_500_C.csv"
original_file_dictionary = "random_500_D.csv"

# Top n 설정
n = 5

# 정확도 계산 및 평균 유사도 계산
for i in range(1, 10):
    client_file = f'Client_smashed_data_epoch{i}.csv'
    accuracy, successful_mean_similarity, unsuccessful_mean_similarity, success_indices, successful_similarity_variance, unsuccessful_similarity_variance = calculate_accuracy_and_similarity(client_file, dictionary_file, original_file_client, original_file_dictionary, n)

    print("\nAccuracy:", accuracy)
    print("Successful Mean Similarity:", successful_mean_similarity)
    print("Unsuccessful Mean Similarity:", unsuccessful_mean_similarity)

    # 분산 출력
    print("Successful Similarity Variance:", successful_similarity_variance)
    print("Unsuccessful Similarity Variance:", unsuccessful_similarity_variance)

    # 성공한 인덱스들을 출력합니다.
    print("Success Indices:", success_indices)



Accuracy: 0.374
Successful Mean Similarity: 0.4345162819923329
Unsuccessful Mean Similarity: 0.45544617929380526
Successful Similarity Variance: 0.0041208159004049805
Unsuccessful Similarity Variance: 0.003390901478672084
Success Indices: [(1, 1), (2, 1), (4, 4), (5, 1), (6, 1), (7, 1), (8, 1), (11, 2), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 5), (23, 1), (24, 1), (25, 1), (26, 1), (27, 3), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 2), (35, 1), (36, 1), (37, 4), (38, 1), (39, 1), (42, 1), (45, 1), (49, 1), (50, 4), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (57, 1), (59, 1), (60, 1), (62, 1), (63, 1), (64, 1), (65, 1), (67, 1), (68, 3), (69, 1), (71, 1), (72, 1), (74, 4), (76, 1), (77, 1), (78, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 5), (89, 1), (90, 1), (92, 4), (93, 1), (94, 1), (96, 1), (97, 1), (98, 1), (99, 1), (101, 1), (103, 1), (104, 2), (105, 1), (107, 2), (108, 4), (110, 5), (112, 1), (113, 1), (


Accuracy: 0.268
Successful Mean Similarity: 0.44991214372953753
Unsuccessful Mean Similarity: 0.4520375451721119
Successful Similarity Variance: 0.0036980723070263945
Unsuccessful Similarity Variance: 0.003571237387735291
Success Indices: [(1, 1), (2, 1), (6, 4), (7, 1), (9, 1), (10, 1), (15, 1), (16, 1), (17, 1), (19, 2), (20, 1), (23, 1), (24, 1), (25, 2), (27, 4), (28, 1), (29, 1), (31, 3), (36, 1), (38, 2), (39, 1), (47, 1), (49, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (59, 1), (62, 1), (63, 1), (65, 2), (67, 1), (69, 1), (72, 1), (76, 1), (78, 2), (80, 1), (82, 1), (83, 1), (85, 1), (89, 4), (90, 1), (92, 5), (93, 1), (94, 1), (96, 1), (97, 4), (99, 1), (101, 1), (103, 1), (105, 5), (112, 1), (113, 1), (115, 1), (116, 2), (118, 4), (120, 2), (123, 1), (126, 1), (127, 1), (128, 1), (131, 1), (132, 1), (133, 1), (134, 2), (135, 5), (136, 2), (140, 2), (142, 2), (145, 1), (146, 3), (147, 1), (148, 2), (149, 1), (150, 1), (151, 1), (152, 1), (153, 4), (154, 5), (156, 1), (15

In [22]:
# 레벤슈타인 유사도
import pandas as pd
import numpy as np
import Levenshtein as lev

def calculate_accuracy_and_distance(client_file, dictionary_file, original_file_client, original_file_dictionary):
    # 변환된 파일을 읽어옵니다.
    client_data = pd.read_csv(client_file)
    dictionary_data = pd.read_csv(dictionary_file)
    
    # 원본 파일을 읽어옵니다.
    original_client_data = pd.read_csv(original_file_client)
    original_dictionary_data = pd.read_csv(original_file_dictionary)
    
    # Top@5 유사도를 찾을 레벤슈타인 거리를 계산합니다.
    distances = np.array([[lev.distance(client, dictionary) for dictionary in dictionary_data.values] for client in client_data.values])
    
    # Top@5 유사도를 찾습니다.
    top5_similarities = np.argsort(distances, axis=1)[:, :5]
    top5_distances = np.sort(distances, axis=1)[:, :5]
    
    # 모든 결과를 출력하고 정확도를 계산합니다.
    successful_distances = []
    unsuccessful_distances = []
    successes = 0
    success_indices = []  # 성공한 인덱스를 저장할 리스트
    for i, (indices, scores) in enumerate(zip(top5_similarities, top5_distances)):
        print(f"\nTop 5 inferences for client {i + 1}:")
        for rank, (idx, score) in enumerate(zip(indices, scores), 1):
            print(f"Server {idx + 1} with Levenshtein distance {score}")
            if original_client_data.iloc[i].equals(original_dictionary_data.iloc[idx]):
                successes += 1
                successful_distances.append(score)
                success_indices.append((i + 1, rank))  # 성공한 인덱스를 추가
            else:
                unsuccessful_distances.append(score)
        if successes == 0:
            print("No successful match found.")
    
    # 정확도 계산
    accuracy = successes / len(client_data)
    
    # 성공적으로 일치하는 데이터 포인트와 클라이언트 데이터 포인트, 그리고 일치하지 않는 데이터 포인트와 클라이언트 데이터 포인트 간의 평균 거리를 계산합니다.
    successful_mean_distance = np.mean(successful_distances)
    unsuccessful_mean_distance = np.mean(unsuccessful_distances)
    
    # 거리의 분산 계산
    successful_distance_variance = np.var(successful_distances)
    unsuccessful_distance_variance = np.var(unsuccessful_distances)
    
    return accuracy, successful_mean_distance, unsuccessful_mean_distance, success_indices, successful_distance_variance, unsuccessful_distance_variance

# 변환된 파일 경로
client_file = "Client_smashed_data_epoch1.csv"
dictionary_file = "Dictionary_smashed_data.csv"

# 원본 파일 경로
original_file_client = "random_500_C.csv"
original_file_dictionary = "random_500_D.csv"

# 정확도 계산 및 평균 거리 계산
accuracy, successful_mean_distance, unsuccessful_mean_distance, success_indices, successful_distance_variance, unsuccessful_distance_variance = calculate_accuracy_and_distance(client_file, dictionary_file, original_file_client, original_file_dictionary)

print("\nAccuracy:", accuracy)
print("Successful Mean Distance:", successful_mean_distance)
print("Unsuccessful Mean Distance:", unsuccessful_mean_distance)

# 분산 출력
print("Successful Distance Variance:", successful_distance_variance)
print("Unsuccessful Distance Variance:", unsuccessful_distance_variance)

# 성공한 인덱스들을 출력합니다.
print("Success Indices:", success_indices)



Top 5 inferences for client 1:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 2:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 3:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 4:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 5:
Server 1 with Levenshtein di

Server 339 with Levenshtein distance 768

Top 5 inferences for client 129:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 130:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 131:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 132:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 infe

Server 339 with Levenshtein distance 768

Top 5 inferences for client 251:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 252:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 253:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 254:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 infe

Server 339 with Levenshtein distance 768

Top 5 inferences for client 367:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 368:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 369:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 370:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 infe

Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 490:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 491:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 492:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 with Levenshtein distance 768
Server 339 with Levenshtein distance 768

Top 5 inferences for client 493:
Server 1 with Levenshtein distance 768
Server 342 with Levenshtein distance 768
Server 341 with Levenshtein distance 768
Server 340 

In [6]:
import pandas as pd
import numpy as np

def euclidean_distance(point1, point2):
    return np.linalg.norm(point1 - point2)

def calculate_mean_and_variance(distances):
    mean_distance = np.mean(distances)
    variance_distance = np.var(distances)
    return mean_distance, variance_distance

def main():
    # CSV 파일 읽기
    file1 = "Client_smashed_data_epoch10.csv"
    file2 = "Dictionary_smashed_data.csv"
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # 같은 행 번호를 가지는 record끼리 pair 형성
    paired_records = zip(df1.values, df2.values)

    # 유클리드 거리 계산 및 저장
    euclidean_distances = []
    for record1, record2 in paired_records:
        euclidean_distances.append(euclidean_distance(record1, record2))

    # 평균과 분산 계산
    mean_distance, variance_distance = calculate_mean_and_variance(euclidean_distances)

    # 결과 출력
    print("평균 유클리드 거리:", mean_distance)
    print("유클리드 거리 분산:", variance_distance)

if __name__ == "__main__":
    main()


평균 유클리드 거리: 3.4949114077812755
유클리드 거리 분산: 0.9717998198424425


In [14]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# 파일 경로 설정
client_file = "Client_smashed_data_epoch10_C.csv"
dictionary_file = "Dictionary_smashed_data_C.csv"

# CSV 파일 읽기
client_data = pd.read_csv(client_file)
dictionary_data = pd.read_csv(dictionary_file)

# 결측값 확인
print("Client Data Null Values:\n", client_data.isnull().sum())
print("Dictionary Data Null Values:\n", dictionary_data.isnull().sum())

# 결측값 제거
client_data.dropna(inplace=True)
dictionary_data.dropna(inplace=True)

# 유클리드 거리 계산 함수
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# top@5 accuracy 측정을 위한 함수
def top_k_accuracy(client_data, dictionary_data, k=5):
    # 가장 가까운 이웃 찾기
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(dictionary_data)
    distances, indices = nbrs.kneighbors(client_data)

    total_matches = 0
    for i in range(len(client_data)):
        client_point = client_data.iloc[i]
        nearest_indices = indices[i]
        nearest_distances = distances[i]

        for j in range(len(nearest_indices)):
            nearest_index = nearest_indices[j]
            nearest_distance = nearest_distances[j]
            dictionary_point = dictionary_data.iloc[nearest_index]

            # 가장 가까운 이웃의 인덱스와 거리 출력
            print(f"Client data point {i+1}, Top {j+1} nearest dictionary point index: {nearest_index}, Distance: {nearest_distance}")

            # 실제 데이터와 일치하는지 확인
            if client_point.equals(dictionary_point):
                total_matches += 1
                break  # 정확히 일치하는 경우 다음 클라이언트 데이터로 넘어감

    accuracy = total_matches / len(client_data)
    return accuracy

# top1 record 거리의 평균과 분산 계산 함수
def calculate_distance_stats(client_data, dictionary_data):
    distances = []
    for i in range(len(client_data)):
        client_point = client_data.iloc[i]
        distances_to_dictionary = []

        for j in range(len(dictionary_data)):
            dictionary_point = dictionary_data.iloc[j]
            distance = euclidean_distance(client_point, dictionary_point)
            distances_to_dictionary.append(distance)

        # top1 거리만을 고려
        min_distance = min(distances_to_dictionary)
        distances.append(min_distance)

    distances = np.array(distances)
    mean_distance = np.mean(distances)
    variance_distance = np.var(distances)

    return mean_distance, variance_distance

# Top@5 Accuracy 측정
accuracy = top_k_accuracy(client_data, dictionary_data)
print(f"Top@5 Accuracy: {accuracy}")

# top1 record 거리의 평균과 분산 계산
mean_distance, variance_distance = calculate_distance_stats(client_data, dictionary_data)
print(f"Mean Distance: {mean_distance}")
print(f"Variance Distance: {variance_distance}")


Client Data Null Values:
 0      0
1      0
2      0
3      0
4      0
      ..
763    0
764    0
765    0
766    0
767    0
Length: 768, dtype: int64
Dictionary Data Null Values:
 0      250
1      250
2      250
3      250
4      250
      ... 
763    250
764    250
765    250
766    250
767    250
Length: 768, dtype: int64
Client data point 1, Top 1 nearest dictionary point index: 0, Distance: 2.5770112934297447
Client data point 1, Top 2 nearest dictionary point index: 43, Distance: 2.7279617684618924
Client data point 1, Top 3 nearest dictionary point index: 180, Distance: 2.969499020894492
Client data point 1, Top 4 nearest dictionary point index: 105, Distance: 3.1936031851734037
Client data point 1, Top 5 nearest dictionary point index: 218, Distance: 3.2443628725167213
Client data point 2, Top 1 nearest dictionary point index: 1, Distance: 3.899252066743445
Client data point 2, Top 2 nearest dictionary point index: 245, Distance: 4.342273499649014
Client data point 2, Top 3 ne

Client data point 113, Top 5 nearest dictionary point index: 168, Distance: 3.083083024412013
Client data point 114, Top 1 nearest dictionary point index: 43, Distance: 2.8565243369850446
Client data point 114, Top 2 nearest dictionary point index: 113, Distance: 2.913413273519672
Client data point 114, Top 3 nearest dictionary point index: 0, Distance: 3.035964419051852
Client data point 114, Top 4 nearest dictionary point index: 180, Distance: 3.173901112225749
Client data point 114, Top 5 nearest dictionary point index: 218, Distance: 3.359555056522005
Client data point 115, Top 1 nearest dictionary point index: 114, Distance: 3.093335446867474
Client data point 115, Top 2 nearest dictionary point index: 53, Distance: 4.197067892701378
Client data point 115, Top 3 nearest dictionary point index: 58, Distance: 4.321369992615134
Client data point 115, Top 4 nearest dictionary point index: 22, Distance: 4.334502810489262
Client data point 115, Top 5 nearest dictionary point index: 189,

Mean Distance: 3.1507278565745485
Variance Distance: 0.5401144611116975
