In [1]:
import pandas as pd
import numpy as np
import os
import koreanize_matplotlib
import seaborn as sns
%config InlineBackend.figure_format = 'retina'  #선명하게 만들기
#한글폰트 가져오기
from matplotlib import rc
rc('font', family='NanumGothic')

# 결측치 확인하는 라이브러리
import missingno as msno
import chardet

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
from sklearn.metrics import euclidean_distances

# Data directory. It can be overridden with the MEDICAL_DATA_DIR environment variable so the
# notebook also runs on other machines; the default is the original hard-coded location.
path = os.environ.get('MEDICAL_DATA_DIR', 'C:/Users/sim/Desktop/medical/data')
# All later relative file names (CSV reads/writes) resolve against this directory.
os.chdir(path)
print(os.getcwd())

C:\Users\sim\Desktop\medical\data


### 스케일링

코사인 유사도를 구하기 위한 데이터 정규화

범주형 특성은 다음과 같음
- 성별코드: 남성(1), 여성(2)
- 요단백: 정상(1.0) ~ (6.0) ;높을수록 나쁨
- 흡연상태: 비흡연(1), 끊음(2), 흡연중(3)
- 음주여부: 비음주(0), 음주(1)
- 청력: 정상(1), 질환의심(2)
- BMI_WC_risk: 위험도낮음(0)~(5) ;높을수록 나쁨
- WHtR_risk: 정상(1), 저체중(0)~비만(3)

그러나, 성별코드 변수를 제외하면 범주형 변수들도 미리 모두 수치화 해놓았기 때문에 원-핫 인코딩 작업 필요하지 않음

In [2]:
# Read the dataset once and keep an untouched copy (df_origin) next to the working frame (df).
df_origin = pd.read_csv('final_ex.csv')
df = df_origin.copy()

In [3]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104819 entries, 0 to 104818
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   성별코드         104819 non-null  int64  
 1   수축기혈압        104819 non-null  float64
 2   이완기혈압        104819 non-null  float64
 3   식전혈당(공복혈당)   104819 non-null  float64
 4   혈색소          104819 non-null  float64
 5   요단백          104819 non-null  float64
 6   혈청크레아티닌      104819 non-null  float64
 7   혈청지오티(AST)   104819 non-null  float64
 8   혈청지피티(ALT)   104819 non-null  float64
 9   감마지티피        104819 non-null  float64
 10  흡연상태         104819 non-null  float64
 11  음주여부         104819 non-null  float64
 12  시력           104819 non-null  float64
 13  청력           104819 non-null  float64
 14  BMI_WC_risk  104819 non-null  int64  
 15  WHtR_risk    104819 non-null  int64  
dtypes: float64(13), int64(3)
memory usage: 12.8 MB


In [4]:
# Create the two healthy reference rows: "Ga-gyeong" (gg, gender code 2 = female)
# and "Cheol-su" (cs, gender code 1 = male).
gg = {
    '성별코드': 2,
    '수축기혈압': 105,
    '이완기혈압': 70.0,
    '식전혈당(공복혈당)': 90.0,
    '혈색소': 13.75,
    '요단백': 1.0,
    '혈청크레아티닌': 1.25,
    '혈청지오티(AST)': 20.0,
    '혈청지피티(ALT)': 17.5,
    '감마지티피': 21.5,
    '흡연상태': 1,
    '음주여부': 0,
    '시력': 0.6,
    '청력': 1.0,
    'BMI_WC_risk': 0,
    'WHtR_risk': 1
}
# cs differs from gg only in gender code, haemoglobin and gamma-GTP; every other value
# (and the key order, hence the column order) is inherited from gg.
cs = {**gg, '성별코드': 1, '혈색소': 14.75, '감마지티피': 37}
healthy = pd.DataFrame([gg, cs])

# Append to the untouched copy (df_origin) instead of to df itself, so re-running this cell
# never stacks additional copies of the reference rows at the end of df.
df = pd.concat([df_origin, healthy], ignore_index=True)
df.tail()

Unnamed: 0,성별코드,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
104816,2,100.0,61.0,93.0,13.6,1.0,0.8,20.0,14.0,12.0,1.0,1.0,1.1,1.0,1,0
104817,2,98.0,67.0,96.0,13.2,3.0,1.0,21.0,14.0,25.0,1.0,0.0,1.2,1.0,1,0
104818,1,125.0,80.0,92.0,16.9,1.0,1.1,30.0,20.0,16.0,1.0,1.0,0.8,1.0,2,0
104819,2,105.0,70.0,90.0,13.75,1.0,1.25,20.0,17.5,21.5,1.0,0.0,0.6,1.0,0,1
104820,1,105.0,70.0,90.0,14.75,1.0,1.25,20.0,17.5,37.0,1.0,0.0,0.6,1.0,0,1


성별 기준으로 데이터셋 분리

In [5]:
# Split the rows on the gender code: 1 -> male_group, 2 -> female_group.
gender_code = df['성별코드']
male_group = df.loc[gender_code.eq(1)]
female_group = df.loc[gender_code.eq(2)]

In [6]:
# The frames are already split by gender, so the gender-code column is dropped from both.
male_group, female_group = (
    group.drop('성별코드', axis='columns') for group in (male_group, female_group)
)

In [7]:
# NOTE: every line in this cell is commented out, so nothing here is executed.
# Define the categorical and continuous variables
# categorical_columns = ['요단백', '흡연상태', '음주여부', '청력', 'BMI_WC_risk', 'WHtR_risk']
# continuous_columns = ['수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '혈색소', '혈청크레아티닌', '혈청지오티(AST)','시력', '혈청지피티(ALT)', '감마지티피']

# 1. One-hot encode the categorical variables
#encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first') # drop the first column to avoid multicollinearity
#encoded_male_group = encoder.fit_transform(male_group[categorical_columns])
#encoded_female_group = encoder.transform(female_group[categorical_columns])

# 2. Standardise the continuous variables
#scaler = StandardScaler()
#scaled_male_continuous = scaler.fit_transform(male_group[continuous_columns])
#scaled_female_continuous = scaler.transform(female_group[continuous_columns])

# Convert the one-hot encoded categorical result to a DataFrame
#encoded_male_df = pd.DataFrame(encoded_male_group, columns=encoder.get_feature_names_out(categorical_columns))
#encoded_female_df = pd.DataFrame(encoded_female_group, columns=encoder.get_feature_names_out(categorical_columns))

# Convert the standardised continuous result to a DataFrame
#scaled_male_df = pd.DataFrame(scaled_male_continuous, columns=continuous_columns)
#scaled_female_df = pd.DataFrame(scaled_female_continuous, columns=continuous_columns)

# Combine the categorical and continuous variables into the final dataset
#enc_male = pd.concat([encoded_male_df, scaled_male_df], axis=1)
#enc_female = pd.concat([encoded_female_df, scaled_female_df], axis=1)

In [8]:
# Standardise each gender group with StandardScaler.
scaler = StandardScaler()

# fit_transform re-fits the scaler on every call, so each group is scaled with its own
# statistics; after this cell `scaler` holds the fit of the female group (the last call).
scaled_male_data, scaled_female_data = (
    scaler.fit_transform(group) for group in (male_group, female_group)
)

# Wrap the scaled arrays back into DataFrames carrying the original column names.
enc_male = pd.DataFrame(data=scaled_male_data, columns=male_group.columns)
enc_female = pd.DataFrame(data=scaled_female_data, columns=female_group.columns)

일반 사용자와 건강한 행 분리

In [9]:
# The healthy reference row was appended last, so it is the final row of each scaled frame.
# Guard against running this cell twice: once trimmed, the frames are one row shorter than
# their source groups and the "last row" would silently be an ordinary user.
assert len(enc_male) == len(male_group) and len(enc_female) == len(female_group), (
    "enc_male / enc_female were already split - re-run the scaling cell first"
)

# Healthy reference rows (kept as one-row DataFrames)
enc_healthy_male = enc_male.iloc[-1:]
enc_healthy_female = enc_female.iloc[-1:]
# Ordinary user rows: everything except the trailing reference row
# (derived positionally instead of hard-coding the row counts 51095 / 53724).
enc_male = enc_male.iloc[:-1]
enc_female = enc_female.iloc[:-1]

uc_similarity_male

In [10]:
# Per-feature distance of every male user from the healthy male reference row.
# For a single feature the Euclidean distance is simply |x - reference|, so it is computed
# directly for all columns at once instead of calling euclidean_distances column by column
# (that routine expands the square, which is less precise for nearly equal values).
healthy_male_reference = enc_healthy_male.iloc[0]  # Series indexed by feature name
uc_similarity_male = (enc_male - healthy_male_reference).abs()

In [11]:
# Display the per-feature distance table for male users.
uc_similarity_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.258414,0.106113,0.138930,0.158573,0.0,1.522616,0.475003,0.990115,0.379034,0.000000,2.720519,0.680267,0.0,2.820355,1.317652
1,0.419471,0.000000,0.416790,0.158573,0.0,0.138420,0.345457,0.113620,0.234640,2.181657,2.720519,2.040800,0.0,1.410177,0.000000
2,2.516828,1.697804,0.625186,0.898581,0.0,0.415259,0.734096,0.892727,0.397083,2.181657,2.720519,0.544213,0.0,2.115266,0.000000
3,2.516828,0.955014,0.555720,1.532874,0.0,0.692098,0.000000,0.243471,0.072197,0.000000,0.000000,0.952373,0.0,2.115266,0.000000
4,0.755048,0.848902,0.486255,0.475720,0.0,0.968937,0.086364,0.016231,0.360985,2.181657,2.720519,0.408160,0.0,0.000000,1.317652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51090,1.174520,0.530564,0.694651,1.215728,0.0,0.692098,0.431821,0.113620,0.198542,0.000000,0.000000,2.448960,0.0,2.115266,0.000000
51091,0.587260,0.106113,0.347325,0.052858,0.0,1.522616,0.129546,0.633024,0.270739,1.090829,2.720519,1.088427,0.0,1.410177,0.000000
51092,0.167789,0.636676,0.069465,1.532874,0.0,0.415259,0.172728,0.016231,0.415133,0.000000,2.720519,2.448960,0.0,0.705089,1.317652
51093,1.342308,0.212225,0.486255,1.321443,0.0,1.245777,1.209099,2.808032,1.714678,0.000000,2.720519,1.632640,0.0,1.410177,0.000000


In [23]:
# Display the standardised healthy male reference row.
enc_healthy_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
51095,-1.383548,-0.399997,-0.207158,-0.761042,-0.25853,0.836762,-0.2899,-0.432968,0.071062,-0.873948,-2.28238,-1.172618,-0.031918,-1.669866,-0.130254


In [12]:
# Summary statistics of the per-feature distances for male users.
uc_similarity_male.describe()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
count,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0
mean,1.445273,0.833527,0.570629,1.009421,0.258535,0.861448,0.384141,0.520579,0.371137,0.873965,2.282425,1.284402,0.031918,1.669899,0.579643
std,0.908552,0.682101,0.846954,0.748529,1.000019,0.978842,0.967738,0.95734,0.931313,1.000012,0.999969,0.851707,1.00002,0.999992,0.825235
min,0.0,0.0,0.0,0.052858,0.0,0.13842,0.0,0.016231,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.755048,0.318338,0.208395,0.370004,0.0,0.692098,0.086364,0.081157,0.180492,0.0,2.720519,0.81632,0.0,0.705089,0.0
50%,1.258414,0.742789,0.41679,0.898581,0.0,0.968937,0.172728,0.211008,0.306837,0.0,2.720519,1.088427,0.0,1.410177,0.0
75%,2.097357,1.061127,0.694651,1.427159,0.0,1.245777,0.345457,0.535636,0.397083,2.181657,2.720519,1.63264,0.0,2.115266,1.317652
max,9.64784,7.427891,36.747015,10.94155,11.830417,193.095372,46.463957,42.769731,179.806567,2.181657,2.720519,25.305916,31.362659,3.525443,2.635303


uc_similarity_female

In [13]:
# Per-feature distance of every female user from the healthy female reference row.
# For a single feature the Euclidean distance is simply |x - reference|, so it is computed
# directly for all columns at once; the result takes its columns from enc_female itself
# (the original seeded the frame with enc_male.columns, a copy-paste slip).
healthy_female_reference = enc_healthy_female.iloc[0]  # Series indexed by feature name
uc_similarity_female = (enc_female - healthy_female_reference).abs()

In [14]:
# Display the per-feature distance table for female users.
uc_similarity_female

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.147266,1.102393,0.224958,1.567079,0.000000,3.800461,0.260650,0.192566,0.619813,0.000000,2.731847,1.429693,0.0,0.000000,1.345029
1,0.353005,0.330718,0.000000,0.050551,0.000000,3.800461,0.521301,0.281442,0.619813,0.000000,2.731847,1.715631,0.0,0.786243,0.000000
2,4.147807,2.315025,1.799664,0.050551,0.000000,3.215775,0.065163,0.222191,0.163109,0.000000,2.731847,1.143754,0.0,2.358728,0.000000
3,0.794261,0.330718,0.974818,4.903441,0.000000,3.800461,0.130325,0.162940,0.946031,0.000000,2.731847,0.571877,0.0,0.786243,1.345029
4,1.676773,0.881914,0.074986,1.162672,0.000000,3.215775,0.195488,0.162940,0.358839,0.000000,0.000000,1.000785,0.0,0.786243,1.345029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53719,1.323768,0.000000,0.899832,0.252755,0.000000,3.215775,0.065163,0.222191,0.358839,0.000000,2.731847,2.573447,0.0,0.000000,1.345029
53720,2.029778,0.771675,0.224958,0.556060,0.000000,1.461716,0.130325,0.251817,0.293596,0.000000,2.731847,0.142969,0.0,0.786243,0.000000
53721,1.853275,0.992153,0.374930,2.578098,0.000000,3.215775,0.260650,0.014813,0.032622,3.517855,2.731847,1.429693,0.0,2.358728,0.000000
53722,0.441256,0.992153,0.224958,0.151653,0.000000,2.631088,0.000000,0.103689,0.619813,0.000000,2.731847,1.429693,0.0,0.786243,1.345029


In [15]:
# Summary statistics of the per-feature distances for female users.
uc_similarity_female.describe()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
count,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0
mean,0.916948,0.796253,0.546369,0.869274,0.314508,3.295614,0.341059,0.246998,0.624419,0.366132,2.296421,1.191312,0.02324,1.131347,0.801574
std,0.741093,0.611962,0.837566,0.759939,1.000018,0.950149,0.940062,0.970177,0.83326,1.000017,0.99997,0.77729,1.000019,1.000007,0.756001
min,0.0,0.0,0.0,0.050551,0.0,0.292343,0.0,0.014813,0.032622,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.441256,0.330718,0.224958,0.353857,0.0,2.631088,0.130325,0.103689,0.358839,0.0,2.731847,0.714846,0.0,0.786243,0.0
50%,0.70601,0.661436,0.37493,0.657162,0.0,3.215775,0.26065,0.192566,0.55457,0.0,2.731847,1.143754,0.0,0.786243,1.345029
75%,1.323768,1.102393,0.674874,1.162672,0.0,3.800461,0.390976,0.251817,0.7503,0.0,2.731847,1.429693,0.0,1.572485,1.345029
max,8.383865,6.504117,38.31785,10.362943,9.537484,99.689017,92.009591,186.033389,49.878645,3.517855,2.731847,26.592281,43.053301,3.931213,2.690057


In [16]:
# Save the male distance table to CSV; index=False leaves the row index out of the file.
uc_similarity_male.to_csv("uc_similarity_male.csv", index=False)

In [17]:
# Save the female distance table to CSV; index=False leaves the row index out of the file.
uc_similarity_female.to_csv("uc_similarity_female.csv", index=False)

-----------------------------

## enc_male과 enc_healthy_male의 크기 비교
- 일반행이 철수행보다 크면 True(+), 더 작으면 False(-)로 표현

In [25]:
# Display the standardised male user rows (reference row already split off).
enc_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,-0.125135,-0.506110,-0.068228,-0.602469,-0.25853,-0.685853,0.185104,0.557147,-0.307972,-0.873948,0.438139,-0.492352,-0.031918,1.150488,1.187398
1,-0.964077,-0.399997,0.209632,-0.602469,-0.25853,0.698343,0.055557,-0.319348,-0.163578,1.307709,0.438139,0.868181,-0.031918,-0.259689,-0.130254
2,1.133279,1.297807,0.418028,0.137540,-0.25853,0.421504,0.444196,0.459759,0.468145,1.307709,0.438139,-1.716832,-0.031918,0.445400,-0.130254
3,1.133279,0.555018,0.348563,0.771832,-0.25853,0.144664,-0.289900,-0.189497,-0.001135,-0.873948,-2.282380,-2.124992,-0.031918,0.445400,-0.130254
4,-2.138597,-1.248899,-0.693413,-0.285322,-0.25853,-0.132175,-0.203535,-0.449199,-0.289923,1.307709,0.438139,-0.764458,-0.031918,-1.669866,-1.447906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51090,-0.209029,0.130567,0.487493,-1.976770,-0.25853,0.144664,0.141922,-0.319348,-0.127480,-0.873948,-2.282380,1.276341,-0.031918,0.445400,-0.130254
51091,-0.796289,-0.506110,-0.554483,-0.708184,-0.25853,-0.685853,-0.160353,0.200056,-0.199677,0.216881,0.438139,-0.084192,-0.031918,-0.259689,-0.130254
51092,-1.551337,-1.036673,-0.137693,0.771832,-0.25853,0.421504,-0.462628,-0.416737,-0.344071,-0.873948,0.438139,1.276341,-0.031918,-0.964778,-1.447906
51093,-0.041240,-0.612222,0.279097,0.560401,-0.25853,-0.409014,0.919200,2.375064,1.785740,-0.873948,0.438139,0.460021,-0.031918,-0.259689,-0.130254


In [26]:
# Display the standardised healthy male reference row for comparison with enc_male.
enc_healthy_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
51095,-1.383548,-0.399997,-0.207158,-0.761042,-0.25853,0.836762,-0.2899,-0.432968,0.071062,-0.873948,-2.28238,-1.172618,-0.031918,-1.669866,-0.130254


In [30]:
# Column labels of the male user frame.
enc_male.columns

Index(['수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '혈색소', '요단백', '혈청크레아티닌', '혈청지오티(AST)',
       '혈청지피티(ALT)', '감마지티피', '흡연상태', '음주여부', '시력', '청력', 'BMI_WC_risk',
       'WHtR_risk'],
      dtype='object')

In [29]:
# Column labels of the healthy male reference frame.
enc_healthy_male.columns

Index(['수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '혈색소', '요단백', '혈청크레아티닌', '혈청지오티(AST)',
       '혈청지피티(ALT)', '감마지티피', '흡연상태', '음주여부', '시력', '청력', 'BMI_WC_risk',
       'WHtR_risk'],
      dtype='object')

In [32]:
# Sanity check before comparing: shapes and dtypes of the user frame and the reference frame.
print(enc_male.shape, enc_healthy_male.shape)
print(enc_male.dtypes)
print(enc_healthy_male.dtypes)

# Check the first few rows of both DataFrames
print(enc_male.head())
print(enc_healthy_male.head())

(51095, 15) (1, 15)
수축기혈압          float64
이완기혈압          float64
식전혈당(공복혈당)     float64
혈색소            float64
요단백            float64
혈청크레아티닌        float64
혈청지오티(AST)     float64
혈청지피티(ALT)     float64
감마지티피          float64
흡연상태           float64
음주여부           float64
시력             float64
청력             float64
BMI_WC_risk    float64
WHtR_risk      float64
dtype: object
수축기혈압          float64
이완기혈압          float64
식전혈당(공복혈당)     float64
혈색소            float64
요단백            float64
혈청크레아티닌        float64
혈청지오티(AST)     float64
혈청지피티(ALT)     float64
감마지티피          float64
흡연상태           float64
음주여부           float64
시력             float64
청력             float64
BMI_WC_risk    float64
WHtR_risk      float64
dtype: object
      수축기혈압     이완기혈압  식전혈당(공복혈당)       혈색소      요단백   혈청크레아티닌  혈청지오티(AST)  \
0 -0.125135 -0.506110   -0.068228 -0.602469 -0.25853 -0.685853    0.185104   
1 -0.964077 -0.399997    0.209632 -0.602469 -0.25853  0.698343    0.055557   
2  1.133279  1.297807    0.4

In [38]:
# Repeat the single healthy-male row once per user row so both frames carry identical
# row/column labels. np.repeat builds this in one step instead of concatenating
# len(enc_male) separate one-row DataFrames.
enc_healthy_male_expanded = pd.DataFrame(
    np.repeat(enc_healthy_male.to_numpy(), len(enc_male), axis=0),
    columns=enc_healthy_male.columns,
)

# True where the user's standardised value is strictly greater than the healthy reference.
TF = enc_male > enc_healthy_male_expanded

# Preview the result
TF.head()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,True,False,True,True,False,False,True,True,False,False,True,True,False,True,True
1,True,False,True,True,False,False,True,True,False,True,True,True,False,True,False
2,True,True,True,True,False,False,True,True,True,True,True,False,False,True,False
3,True,True,True,True,False,False,False,True,False,False,False,False,False,True,False
4,False,False,False,True,False,False,True,False,False,True,True,True,False,False,False


In [39]:
# Display the unsigned male distance table before the sign prefix is attached.
uc_similarity_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.258414,0.106113,0.138930,0.158573,0.0,1.522616,0.475003,0.990115,0.379034,0.000000,2.720519,0.680267,0.0,2.820355,1.317652
1,0.419471,0.000000,0.416790,0.158573,0.0,0.138420,0.345457,0.113620,0.234640,2.181657,2.720519,2.040800,0.0,1.410177,0.000000
2,2.516828,1.697804,0.625186,0.898581,0.0,0.415259,0.734096,0.892727,0.397083,2.181657,2.720519,0.544213,0.0,2.115266,0.000000
3,2.516828,0.955014,0.555720,1.532874,0.0,0.692098,0.000000,0.243471,0.072197,0.000000,0.000000,0.952373,0.0,2.115266,0.000000
4,0.755048,0.848902,0.486255,0.475720,0.0,0.968937,0.086364,0.016231,0.360985,2.181657,2.720519,0.408160,0.0,0.000000,1.317652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51090,1.174520,0.530564,0.694651,1.215728,0.0,0.692098,0.431821,0.113620,0.198542,0.000000,0.000000,2.448960,0.0,2.115266,0.000000
51091,0.587260,0.106113,0.347325,0.052858,0.0,1.522616,0.129546,0.633024,0.270739,1.090829,2.720519,1.088427,0.0,1.410177,0.000000
51092,0.167789,0.636676,0.069465,1.532874,0.0,0.415259,0.172728,0.016231,0.415133,0.000000,2.720519,2.448960,0.0,0.705089,1.317652
51093,1.342308,0.212225,0.486255,1.321443,0.0,1.245777,1.209099,2.808032,1.714678,0.000000,2.720519,1.632640,0.0,1.410177,0.000000


In [40]:
# TF 데이터셋에서 True면 +를, False면 -를 붙여 uc_similarity_male의 값에 적용
uc_similarity_male_TF = TF.replace({True: '+', False: '-'}).astype(str) + uc_similarity_male.astype(str)

# 결과 확인
uc_similarity_male_TF.head()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.2584139249703226,-0.1061127218315764,0.1389301125326772,0.1585731813146979,-0.0,-1.5226158370115992,0.4750032788103909,0.9901152181859892,-0.3790341195295987,-0.0,2.7205193956730778,0.6802665625644876,-0.0,2.820354556828889,1.3176515803321769
1,0.4194713083234407,-0.0,0.4167903375980318,0.1585731813146979,-0.0,-0.138419621546509,0.3454569300439206,0.113619779136097,-0.2346401692326087,2.181657218060172,2.7205193956730778,2.0407996876934624,-0.0,1.4101772784144446,-0.0
2,2.516827849940645,1.697803549305221,0.6251855063970477,0.8985813607832844,-0.0,-0.4152588646395268,0.7340959763433313,0.8927268360693345,0.3970833633167224,2.181657218060172,2.7205193956730778,-0.5442132500515902,-0.0,2.115265917621667,-0.0
3,2.516827849940645,0.9550144964841868,0.555720450130709,1.532874086042073,-0.0,-0.6920981077325451,-0.0,0.2434709552916366,-0.072196975148495,-0.0,-0.0,-0.952373187590282,-0.0,2.115265917621667,-0.0
4,-0.755048354982193,-0.8489017746526103,-0.4862553938643705,0.4757195439440912,-0.0,-0.9689373508255632,0.0863642325109801,-0.0162313970194424,-0.360984875742475,2.181657218060172,2.7205193956730778,0.4081599375386922,-0.0,-0.0,-1.3176515803321769


In [None]:
# Display the sign-prefixed male distance table.
uc_similarity_male_TF

In [42]:
# Save the sign-prefixed male table to CSV; index=False leaves the row index out of the file.
uc_similarity_male_TF.to_csv("uc_similarity_male_TF.csv", index=False)

In [43]:
# Repeat the single healthy-female row once per female user row so both frames carry
# identical row/column labels (built in one step with np.repeat rather than by
# concatenating len(enc_female) one-row DataFrames).
enc_healthy_female_expanded = pd.DataFrame(
    np.repeat(enc_healthy_female.to_numpy(), len(enc_female), axis=0),
    columns=enc_healthy_female.columns,
)

# True where the user's standardised value is strictly greater than the healthy reference.
# NOTE: this reuses the name TF and therefore overwrites the male mask.
TF = enc_female > enc_healthy_female_expanded

# Prefix each female distance with '+' (TF is True) or '-' (TF is False); values become strings.
uc_similarity_female_TF = TF.replace({True: '+', False: '-'}).astype(str) + uc_similarity_female.astype(str)

# Preview the result
uc_similarity_female_TF.head()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,-1.1472657830838944,-1.1023927229988988,0.2249580232139847,-1.5670792156462787,-0.0,-3.80046105601394,-0.2606503996544725,-0.1925658144208086,-0.619813110395924,-0.0,2.731846668598941,1.4296925367585371,-0.0,-0.0,-1.3450287289852407
1,0.3530048563335058,-0.3307178168996696,-0.0,-0.0505509424402038,-0.0,-3.80046105601394,-0.5213007993089452,-0.2814423441534895,-0.619813110395924,-0.0,2.731846668598941,1.7156310441102445,-0.0,0.7862426475266237,-0.0
2,4.147807061918695,2.315024718297688,1.799664185711878,-0.0505509424402038,-0.0,-3.215774739704103,0.0651625999136181,0.2221913243317022,-0.1631087132620853,-0.0,2.731846668598941,1.1437540294068296,-0.0,2.358727942579871,-0.0
3,0.7942609267503884,-0.3307178168996696,-0.974818100593934,-4.903441416699644,-0.0,-3.80046105601394,-0.1303251998272363,-0.1629403045099149,-0.9460305369200946,-0.0,2.731846668598941,0.5718770147034149,-0.0,0.7862426475266237,-1.3450287289852407
4,1.6767730675841532,0.8819141783991192,-0.0749860077379949,1.162671676124658,-0.0,-3.215774739704103,0.1954877997408544,0.162940304509915,-0.3588391691765876,-0.0,-0.0,1.0007847757309758,-0.0,0.7862426475266237,-1.3450287289852407


In [44]:
# Save the sign-prefixed female table to CSV; index=False leaves the row index out of the file.
uc_similarity_female_TF.to_csv("uc_similarity_female_TF.csv", index=False)