In [90]:
import pandas as pd
import numpy as np
import os
import koreanize_matplotlib
import seaborn as sns
%config InlineBackend.figure_format = 'retina'  #선명하게 만들기
#한글폰트 가져오기
from matplotlib import rc
rc('font', family='NanumGothic')

# 결측치 확인하는 라이브러리
import missingno as msno
import chardet

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
from sklearn.metrics import euclidean_distances



### 스케일링

코사인 유사도를 구하기 위한 데이터 정규화

범주형 특성은 다음과 같음
- 성별코드: 남성(1), 여성(2)
- 요단백: 정상(1.0) ~ (6.0) ;높을수록 나쁨
- 흡연상태: 비흡연(1), 끊음(2), 흡연중(3)
- 음주여부: 비음주(0), 음주(1)
- 청력: 정상(1), 질환의심(2)
- BMI_WC_risk: 위험도낮음(0)~(5) ;높을수록 나쁨
- WHtR_risk: 정상(1), 저체중(0)~비만(3)

그러나, 성별코드 변수를 제외하면 범주형 변수들도 미리 모두 수치화 해놓았기 떄문에 원-핫 인코딩 작업 필요하지 않음

In [91]:
#"C:\Users\jeong\Documents\카카오톡 받은 파일\final_ex.csv"
df = pd.read_csv("C:/Users/jeong/Documents/카카오톡 받은 파일/Final_ex.csv")
df_origin = df.copy()

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104819 entries, 0 to 104818
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   성별코드         104819 non-null  int64  
 1   수축기혈압        104819 non-null  float64
 2   이완기혈압        104819 non-null  float64
 3   식전혈당(공복혈당)   104819 non-null  float64
 4   혈색소          104819 non-null  float64
 5   요단백          104819 non-null  float64
 6   혈청크레아티닌      104819 non-null  float64
 7   혈청지오티(AST)   104819 non-null  float64
 8   혈청지피티(ALT)   104819 non-null  float64
 9   감마지티피        104819 non-null  float64
 10  흡연상태         104819 non-null  float64
 11  음주여부         104819 non-null  float64
 12  시력           104819 non-null  float64
 13  청력           104819 non-null  float64
 14  BMI_WC_risk  104819 non-null  int64  
 15  WHtR_risk    104819 non-null  int64  
dtypes: float64(13), int64(3)
memory usage: 12.8 MB


In [93]:
#######################################가경이행 철수행 생성
gg = {
    '성별코드': 2,
    '수축기혈압': 105,
    '이완기혈압': 70.0,
    '식전혈당(공복혈당)': 90.0,
    '혈색소': 13.75,
    '요단백': 1.0,
    '혈청크레아티닌': 1.25,
    '혈청지오티(AST)': 20.0,
    '혈청지피티(ALT)': 17.5,
    '감마지티피': 21.5,
    '흡연상태': 1,  
    '음주여부': 0,  
    '시력': 0.6,
    '청력': 1.0,
    'BMI_WC_risk': 0,
    'WHtR_risk': 1
}
cs = {
    '성별코드': 1,
    '수축기혈압': 105,
    '이완기혈압': 70.0,
    '식전혈당(공복혈당)': 90.0,
    '혈색소': 14.75,
    '요단백': 1.0,
    '혈청크레아티닌': 1.25,
    '혈청지오티(AST)': 20.0,
    '혈청지피티(ALT)': 17.5,
    '감마지티피': 37,
    '흡연상태': 1,  
    '음주여부': 0,  
    '시력': 0.6,
    '청력': 1.0,
    'BMI_WC_risk': 0,
    'WHtR_risk': 1
}
healthy= pd.DataFrame([gg, cs])

df = pd.concat([df, healthy], ignore_index=True)
df.tail()

Unnamed: 0,성별코드,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
104816,2,100.0,61.0,93.0,13.6,1.0,0.8,20.0,14.0,12.0,1.0,1.0,1.1,1.0,1,0
104817,2,98.0,67.0,96.0,13.2,3.0,1.0,21.0,14.0,25.0,1.0,0.0,1.2,1.0,1,0
104818,1,125.0,80.0,92.0,16.9,1.0,1.1,30.0,20.0,16.0,1.0,1.0,0.8,1.0,2,0
104819,2,105.0,70.0,90.0,13.75,1.0,1.25,20.0,17.5,21.5,1.0,0.0,0.6,1.0,0,1
104820,1,105.0,70.0,90.0,14.75,1.0,1.25,20.0,17.5,37.0,1.0,0.0,0.6,1.0,0,1


성별 기준으로 데이터셋 분리

In [94]:
male_group = df[df['성별코드'] == 1]
female_group = df[df['성별코드'] == 2]

In [95]:
# 나눴으므로 성별코드 제거
male_group = male_group.drop(columns=['성별코드'])
female_group = female_group.drop(columns=['성별코드'])

In [96]:
#범주형 및 연속형 변수 정의
categorical_columns = ['요단백', '흡연상태', '음주여부', '청력', 'BMI_WC_risk', 'WHtR_risk']
continuous_columns = ['수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '혈색소', '혈청크레아티닌', '혈청지오티(AST)','시력', '혈청지피티(ALT)', '감마지티피']

#1. 범주형 변수 원-핫 인코딩
#encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first') # 다중 공선성 문제를 피하기 위해 첫 번쨰 열 제거
#encoded_male_group = encoder.fit_transform(male_group[categorical_columns])
#encoded_female_group = encoder.transform(female_group[categorical_columns])

# 2. 연속형 변수 표준화
#scaler = StandardScaler()
#scaled_male_continuous = scaler.fit_transform(male_group[continuous_columns])
#scaled_female_continuous = scaler.transform(female_group[continuous_columns])

# 범주형 변수 원-핫 인코딩된 결과를 데이터프레임으로 변환
#encoded_male_df = pd.DataFrame(encoded_male_group, columns=encoder.get_feature_names_out(categorical_columns))
#encoded_female_df = pd.DataFrame(encoded_female_group, columns=encoder.get_feature_names_out(categorical_columns))

# 연속형 변수 표준화된 결과를 데이터프레임으로 변환
#scaled_male_df = pd.DataFrame(scaled_male_continuous, columns=continuous_columns)
#scaled_female_df = pd.DataFrame(scaled_female_continuous, columns=continuous_columns)

# 범주형, 연속형 변수를 결합하여 최종 데이터셋 생성
#enc_male = pd.concat([encoded_male_df, scaled_male_df], axis=1)
#enc_female = pd.concat([encoded_female_df, scaled_female_df], axis=1)

In [97]:
# StandardScaler로 정규화
scaler = StandardScaler()
scaled_male_data = scaler.fit_transform(male_group)
scaled_female_data = scaler.fit_transform(female_group)

# 데이터프레임으로 변환
enc_male = pd.DataFrame(scaled_male_data, columns=male_group.columns)
enc_female = pd.DataFrame(scaled_female_data, columns=female_group.columns)

일반 사용자와 건강한 행 분리

In [98]:
# 건강한 행
enc_healthy_male = enc_male[-1:]
enc_healthy_female = enc_female[-1:]
# 일반 사용자 행
enc_male = enc_male[:51095]
enc_female = enc_female[:53724]

uc_similarity_male

In [99]:
# 변수별 유클리디안 거리를 저장할 빈 데이터프레임 생성
uc_similarity_male = pd.DataFrame(columns=enc_male.columns)

# 각 변수별 유클리디안 거리 계산
for column in enc_male.columns:
    reference_value = np.array(enc_healthy_male[column]).reshape(1, -1)
    euclidean_scores = euclidean_distances(enc_male[[column]], reference_value).flatten()
    uc_similarity_male[column] = euclidean_scores

In [100]:
uc_similarity_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.258414,0.106113,0.138930,0.158573,0.0,1.522616,0.475003,0.990115,0.379034,0.000000,2.720519,0.680267,0.0,2.820355,1.317652
1,0.419471,0.000000,0.416790,0.158573,0.0,0.138420,0.345457,0.113620,0.234640,2.181657,2.720519,2.040800,0.0,1.410177,0.000000
2,2.516828,1.697804,0.625186,0.898581,0.0,0.415259,0.734096,0.892727,0.397083,2.181657,2.720519,0.544213,0.0,2.115266,0.000000
3,2.516828,0.955014,0.555720,1.532874,0.0,0.692098,0.000000,0.243471,0.072197,0.000000,0.000000,0.952373,0.0,2.115266,0.000000
4,0.755048,0.848902,0.486255,0.475720,0.0,0.968937,0.086364,0.016231,0.360985,2.181657,2.720519,0.408160,0.0,0.000000,1.317652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51090,1.174520,0.530564,0.694651,1.215728,0.0,0.692098,0.431821,0.113620,0.198542,0.000000,0.000000,2.448960,0.0,2.115266,0.000000
51091,0.587260,0.106113,0.347325,0.052858,0.0,1.522616,0.129546,0.633024,0.270739,1.090829,2.720519,1.088427,0.0,1.410177,0.000000
51092,0.167789,0.636676,0.069465,1.532874,0.0,0.415259,0.172728,0.016231,0.415133,0.000000,2.720519,2.448960,0.0,0.705089,1.317652
51093,1.342308,0.212225,0.486255,1.321443,0.0,1.245777,1.209099,2.808032,1.714678,0.000000,2.720519,1.632640,0.0,1.410177,0.000000


In [101]:
enc_healthy_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
51095,-1.383548,-0.399997,-0.207158,-0.761042,-0.25853,0.836762,-0.2899,-0.432968,0.071062,-0.873948,-2.28238,-1.172618,-0.031918,-1.669866,-0.130254


In [102]:
uc_similarity_male.describe()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
count,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0,51095.0
mean,1.445273,0.833527,0.570629,1.009421,0.258535,0.861448,0.384141,0.520579,0.371137,0.873965,2.282425,1.284402,0.031918,1.669899,0.579643
std,0.908552,0.682101,0.846954,0.748529,1.000019,0.978842,0.967738,0.95734,0.931313,1.000012,0.999969,0.851707,1.00002,0.999992,0.825235
min,0.0,0.0,0.0,0.052858,0.0,0.13842,0.0,0.016231,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.755048,0.318338,0.208395,0.370004,0.0,0.692098,0.086364,0.081157,0.180492,0.0,2.720519,0.81632,0.0,0.705089,0.0
50%,1.258414,0.742789,0.41679,0.898581,0.0,0.968937,0.172728,0.211008,0.306837,0.0,2.720519,1.088427,0.0,1.410177,0.0
75%,2.097357,1.061127,0.694651,1.427159,0.0,1.245777,0.345457,0.535636,0.397083,2.181657,2.720519,1.63264,0.0,2.115266,1.317652
max,9.64784,7.427891,36.747015,10.94155,11.830417,193.095372,46.463957,42.769731,179.806567,2.181657,2.720519,25.305916,31.362659,3.525443,2.635303


uc_similarity_male

In [103]:
# 변수별 유클리디안 거리를 저장할 빈 데이터프레임 생성
uc_similarity_female = pd.DataFrame(columns=enc_male.columns)

# 각 변수별 유클리디안 거리 계산
for column in enc_female.columns:
    reference_value = np.array(enc_healthy_female[column]).reshape(1, -1)
    euclidean_scores = euclidean_distances(enc_female[[column]], reference_value).flatten()
    uc_similarity_female[column] = euclidean_scores

In [104]:
uc_similarity_female

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.147266,1.102393,0.224958,1.567079,0.000000,3.800461,0.260650,0.192566,0.619813,0.000000,2.731847,1.429693,0.0,0.000000,1.345029
1,0.353005,0.330718,0.000000,0.050551,0.000000,3.800461,0.521301,0.281442,0.619813,0.000000,2.731847,1.715631,0.0,0.786243,0.000000
2,4.147807,2.315025,1.799664,0.050551,0.000000,3.215775,0.065163,0.222191,0.163109,0.000000,2.731847,1.143754,0.0,2.358728,0.000000
3,0.794261,0.330718,0.974818,4.903441,0.000000,3.800461,0.130325,0.162940,0.946031,0.000000,2.731847,0.571877,0.0,0.786243,1.345029
4,1.676773,0.881914,0.074986,1.162672,0.000000,3.215775,0.195488,0.162940,0.358839,0.000000,0.000000,1.000785,0.0,0.786243,1.345029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53719,1.323768,0.000000,0.899832,0.252755,0.000000,3.215775,0.065163,0.222191,0.358839,0.000000,2.731847,2.573447,0.0,0.000000,1.345029
53720,2.029778,0.771675,0.224958,0.556060,0.000000,1.461716,0.130325,0.251817,0.293596,0.000000,2.731847,0.142969,0.0,0.786243,0.000000
53721,1.853275,0.992153,0.374930,2.578098,0.000000,3.215775,0.260650,0.014813,0.032622,3.517855,2.731847,1.429693,0.0,2.358728,0.000000
53722,0.441256,0.992153,0.224958,0.151653,0.000000,2.631088,0.000000,0.103689,0.619813,0.000000,2.731847,1.429693,0.0,0.786243,1.345029


In [105]:
uc_similarity_female.describe()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
count,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0,53724.0
mean,0.916948,0.796253,0.546369,0.869274,0.314508,3.295614,0.341059,0.246998,0.624419,0.366132,2.296421,1.191312,0.02324,1.131347,0.801574
std,0.741093,0.611962,0.837566,0.759939,1.000018,0.950149,0.940062,0.970177,0.83326,1.000017,0.99997,0.77729,1.000019,1.000007,0.756001
min,0.0,0.0,0.0,0.050551,0.0,0.292343,0.0,0.014813,0.032622,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.441256,0.330718,0.224958,0.353857,0.0,2.631088,0.130325,0.103689,0.358839,0.0,2.731847,0.714846,0.0,0.786243,0.0
50%,0.70601,0.661436,0.37493,0.657162,0.0,3.215775,0.26065,0.192566,0.55457,0.0,2.731847,1.143754,0.0,0.786243,1.345029
75%,1.323768,1.102393,0.674874,1.162672,0.0,3.800461,0.390976,0.251817,0.7503,0.0,2.731847,1.429693,0.0,1.572485,1.345029
max,8.383865,6.504117,38.31785,10.362943,9.537484,99.689017,92.009591,186.033389,49.878645,3.517855,2.731847,26.592281,43.053301,3.931213,2.690057


In [107]:
uc_similarity_male.to_csv("uc_similarity_male.csv", index=False)

In [18]:
uc_similarity_female.to_csv("uc_similarity_female.csv", index=False)

-----------------------------

## enc_male과 enc_healthy_male의 크기 비교
- 일반행이 철수행보다 크면 True(+), 더 작으면 False(-)로 표현

In [19]:
enc_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,-0.125135,-0.506110,-0.068228,-0.602469,-0.25853,-0.685853,0.185104,0.557147,-0.307972,-0.873948,0.438139,-0.492352,-0.031918,1.150488,1.187398
1,-0.964077,-0.399997,0.209632,-0.602469,-0.25853,0.698343,0.055557,-0.319348,-0.163578,1.307709,0.438139,0.868181,-0.031918,-0.259689,-0.130254
2,1.133279,1.297807,0.418028,0.137540,-0.25853,0.421504,0.444196,0.459759,0.468145,1.307709,0.438139,-1.716832,-0.031918,0.445400,-0.130254
3,1.133279,0.555018,0.348563,0.771832,-0.25853,0.144664,-0.289900,-0.189497,-0.001135,-0.873948,-2.282380,-2.124992,-0.031918,0.445400,-0.130254
4,-2.138597,-1.248899,-0.693413,-0.285322,-0.25853,-0.132175,-0.203535,-0.449199,-0.289923,1.307709,0.438139,-0.764458,-0.031918,-1.669866,-1.447906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51090,-0.209029,0.130567,0.487493,-1.976770,-0.25853,0.144664,0.141922,-0.319348,-0.127480,-0.873948,-2.282380,1.276341,-0.031918,0.445400,-0.130254
51091,-0.796289,-0.506110,-0.554483,-0.708184,-0.25853,-0.685853,-0.160353,0.200056,-0.199677,0.216881,0.438139,-0.084192,-0.031918,-0.259689,-0.130254
51092,-1.551337,-1.036673,-0.137693,0.771832,-0.25853,0.421504,-0.462628,-0.416737,-0.344071,-0.873948,0.438139,1.276341,-0.031918,-0.964778,-1.447906
51093,-0.041240,-0.612222,0.279097,0.560401,-0.25853,-0.409014,0.919200,2.375064,1.785740,-0.873948,0.438139,0.460021,-0.031918,-0.259689,-0.130254


In [20]:
enc_healthy_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
51095,-1.383548,-0.399997,-0.207158,-0.761042,-0.25853,0.836762,-0.2899,-0.432968,0.071062,-0.873948,-2.28238,-1.172618,-0.031918,-1.669866,-0.130254


In [21]:
enc_male.columns

Index(['수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '혈색소', '요단백', '혈청크레아티닌', '혈청지오티(AST)',
       '혈청지피티(ALT)', '감마지티피', '흡연상태', '음주여부', '시력', '청력', 'BMI_WC_risk',
       'WHtR_risk'],
      dtype='object')

In [22]:
enc_healthy_male.columns

Index(['수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '혈색소', '요단백', '혈청크레아티닌', '혈청지오티(AST)',
       '혈청지피티(ALT)', '감마지티피', '흡연상태', '음주여부', '시력', '청력', 'BMI_WC_risk',
       'WHtR_risk'],
      dtype='object')

In [23]:
print(enc_male.shape, enc_healthy_male.shape)
print(enc_male.dtypes)
print(enc_healthy_male.dtypes)

# 두 데이터프레임의 첫 몇 줄 확인
print(enc_male.head())
print(enc_healthy_male.head())

(51095, 15) (1, 15)
수축기혈압          float64
이완기혈압          float64
식전혈당(공복혈당)     float64
혈색소            float64
요단백            float64
혈청크레아티닌        float64
혈청지오티(AST)     float64
혈청지피티(ALT)     float64
감마지티피          float64
흡연상태           float64
음주여부           float64
시력             float64
청력             float64
BMI_WC_risk    float64
WHtR_risk      float64
dtype: object
수축기혈압          float64
이완기혈압          float64
식전혈당(공복혈당)     float64
혈색소            float64
요단백            float64
혈청크레아티닌        float64
혈청지오티(AST)     float64
혈청지피티(ALT)     float64
감마지티피          float64
흡연상태           float64
음주여부           float64
시력             float64
청력             float64
BMI_WC_risk    float64
WHtR_risk      float64
dtype: object
      수축기혈압     이완기혈압  식전혈당(공복혈당)       혈색소      요단백   혈청크레아티닌  혈청지오티(AST)  \
0 -0.125135 -0.506110   -0.068228 -0.602469 -0.25853 -0.685853    0.185104   
1 -0.964077 -0.399997    0.209632 -0.602469 -0.25853  0.698343    0.055557   
2  1.133279  1.297807    0.4

In [24]:
# enc_healthy_male을 51095개의 행으로 복사하여 새로운 DataFrame 생성
enc_healthy_male_expanded = pd.concat([enc_healthy_male] * len(enc_male), ignore_index=True)

# enc_male과 enc_healthy_male_expanded 비교
TF = enc_male > enc_healthy_male_expanded

# 결과 확인
TF.head()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,True,False,True,True,False,False,True,True,False,False,True,True,False,True,True
1,True,False,True,True,False,False,True,True,False,True,True,True,False,True,False
2,True,True,True,True,False,False,True,True,True,True,True,False,False,True,False
3,True,True,True,True,False,False,False,True,False,False,False,False,False,True,False
4,False,False,False,True,False,False,True,False,False,True,True,True,False,False,False


In [25]:
uc_similarity_male

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.258414,0.106113,0.138930,0.158573,0.0,1.522616,0.475003,0.990115,0.379034,0.000000,2.720519,0.680267,0.0,2.820355,1.317652
1,0.419471,0.000000,0.416790,0.158573,0.0,0.138420,0.345457,0.113620,0.234640,2.181657,2.720519,2.040800,0.0,1.410177,0.000000
2,2.516828,1.697804,0.625186,0.898581,0.0,0.415259,0.734096,0.892727,0.397083,2.181657,2.720519,0.544213,0.0,2.115266,0.000000
3,2.516828,0.955014,0.555720,1.532874,0.0,0.692098,0.000000,0.243471,0.072197,0.000000,0.000000,0.952373,0.0,2.115266,0.000000
4,0.755048,0.848902,0.486255,0.475720,0.0,0.968937,0.086364,0.016231,0.360985,2.181657,2.720519,0.408160,0.0,0.000000,1.317652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51090,1.174520,0.530564,0.694651,1.215728,0.0,0.692098,0.431821,0.113620,0.198542,0.000000,0.000000,2.448960,0.0,2.115266,0.000000
51091,0.587260,0.106113,0.347325,0.052858,0.0,1.522616,0.129546,0.633024,0.270739,1.090829,2.720519,1.088427,0.0,1.410177,0.000000
51092,0.167789,0.636676,0.069465,1.532874,0.0,0.415259,0.172728,0.016231,0.415133,0.000000,2.720519,2.448960,0.0,0.705089,1.317652
51093,1.342308,0.212225,0.486255,1.321443,0.0,1.245777,1.209099,2.808032,1.714678,0.000000,2.720519,1.632640,0.0,1.410177,0.000000


In [26]:
# TF 데이터셋에서 True면 +를, False면 -를 붙여 uc_similarity_male의 값에 적용
uc_similarity_male_TF = TF.replace({True: '+', False: '-'}).astype(str) + uc_similarity_male.astype(str)

# 결과 확인
uc_similarity_male_TF.head()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,1.2584139249703226,-0.1061127218315764,0.1389301125326772,0.1585731813146979,-0.0,-1.5226158370115992,0.4750032788103909,0.9901152181859892,-0.3790341195295987,-0.0,2.7205193956730778,0.6802665625644876,-0.0,2.820354556828889,1.3176515803321769
1,0.4194713083234407,-0.0,0.4167903375980318,0.1585731813146979,-0.0,-0.138419621546509,0.3454569300439206,0.113619779136097,-0.2346401692326087,2.181657218060172,2.7205193956730778,2.0407996876934624,-0.0,1.4101772784144446,-0.0
2,2.516827849940645,1.697803549305221,0.6251855063970477,0.8985813607832844,-0.0,-0.4152588646395268,0.7340959763433313,0.8927268360693345,0.3970833633167224,2.181657218060172,2.7205193956730778,-0.5442132500515902,-0.0,2.115265917621667,-0.0
3,2.516827849940645,0.9550144964841868,0.555720450130709,1.532874086042073,-0.0,-0.6920981077325451,-0.0,0.2434709552916366,-0.072196975148495,-0.0,-0.0,-0.952373187590282,-0.0,2.115265917621667,-0.0
4,-0.755048354982193,-0.8489017746526103,-0.4862553938643705,0.4757195439440912,-0.0,-0.9689373508255632,0.0863642325109801,-0.0162313970194424,-0.360984875742475,2.181657218060172,2.7205193956730778,0.4081599375386922,-0.0,-0.0,-1.3176515803321769


In [27]:
uc_similarity_male_TF

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,+1.2584139249703226,-0.10611272183157647,+0.13893011253267726,+0.15857318131469797,-0.0,-1.5226158370115992,+0.4750032788103909,+0.9901152181859891,-0.37903411952959876,-0.0,+2.7205193956730778,+0.6802665625644876,-0.0,+2.8203545568288892,+1.3176515803321767
1,+0.4194713083234407,-0.0,+0.4167903375980318,+0.15857318131469797,-0.0,-0.138419621546509,+0.34545693004392064,+0.11361977913609707,-0.23464016923260872,+2.1816572180601717,+2.7205193956730778,+2.0407996876934624,-0.0,+1.4101772784144446,-0.0
2,+2.516827849940645,+1.697803549305221,+0.6251855063970477,+0.8985813607832844,-0.0,-0.4152588646395268,+0.7340959763433313,+0.8927268360693345,+0.39708336331672245,+2.1816572180601717,+2.7205193956730778,-0.5442132500515902,-0.0,+2.115265917621667,-0.0
3,+2.516827849940645,+0.9550144964841868,+0.555720450130709,+1.5328740860420733,-0.0,-0.6920981077325451,-0.0,+0.24347095529163668,-0.072196975148495,-0.0,-0.0,-0.9523731875902819,-0.0,+2.115265917621667,-0.0
4,-0.755048354982193,-0.8489017746526103,-0.4862553938643705,+0.4757195439440912,-0.0,-0.9689373508255631,+0.08636423251098017,-0.016231397019442404,-0.360984875742475,+2.1816572180601717,+2.7205193956730778,+0.40815993753869223,-0.0,-0.0,-1.3176515803321769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51090,+1.1745196633056343,+0.5305636091578816,+0.6946505626633863,-1.21572772341268,-0.0,-0.6920981077325451,+0.4318211625549008,+0.11361977913609707,-0.19854168165836122,-0.0,-0.0,+2.4489596252321544,-0.0,+2.115265917621667,-0.0
51091,+0.5872598316528171,-0.10611272183157647,-0.34732528133169316,+0.052857727104900956,-0.0,-1.5226158370115992,+0.12954634876647017,+0.6330244837582554,-0.2707386568068562,+1.0908286090300858,+2.7205193956730778,+1.0884265001031799,-0.0,+1.4101772784144446,-0.0
51092,-0.16778852332937724,-0.6366763309894579,+0.06946505626633866,+1.5328740860420733,-0.0,-0.4152588646395268,-0.17272846502196035,+0.016231397019443258,-0.41513260710384625,-0.0,+2.7205193956730778,+2.4489596252321544,-0.0,+0.7050886392072224,-1.3176515803321769
51093,+1.3423081866350106,-0.21222544366315266,+0.48625539386437044,+1.3214431776224778,-0.0,-1.2457765939185812,+1.209099255153722,+2.808031684363543,+1.714678159776756,-0.0,+2.7205193956730778,+1.6326397501547696,-0.0,+1.4101772784144446,-0.0


In [28]:
uc_similarity_male_TF.to_csv("uc_similarity_male_TF.csv", index=False)

In [29]:
# enc_healthy_male을 51095개의 행으로 복사하여 새로운 DataFrame 생성
enc_healthy_female_expanded = pd.concat([enc_healthy_female] * len(enc_female), ignore_index=True)

# enc_male과 enc_healthy_male_expanded 비교
TF = enc_female > enc_healthy_female_expanded

# 결과 확인
TF.head()

# TF 데이터셋에서 True면 +를, False면 -를 붙여 uc_similarity_male의 값에 적용
uc_similarity_female_TF = TF.replace({True: '+', False: '-'}).astype(str) + uc_similarity_female.astype(str)

# 결과 확인
uc_similarity_female_TF.head()

Unnamed: 0,수축기혈압,이완기혈압,식전혈당(공복혈당),혈색소,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,시력,청력,BMI_WC_risk,WHtR_risk
0,-1.1472657830838944,-1.1023927229988988,0.2249580232139847,-1.5670792156462787,-0.0,-3.80046105601394,-0.2606503996544725,-0.1925658144208086,-0.619813110395924,-0.0,2.731846668598941,1.4296925367585371,-0.0,-0.0,-1.3450287289852407
1,0.3530048563335058,-0.3307178168996696,-0.0,-0.0505509424402038,-0.0,-3.80046105601394,-0.5213007993089452,-0.2814423441534895,-0.619813110395924,-0.0,2.731846668598941,1.7156310441102445,-0.0,0.7862426475266237,-0.0
2,4.147807061918695,2.315024718297688,1.799664185711878,-0.0505509424402038,-0.0,-3.215774739704103,0.0651625999136181,0.2221913243317022,-0.1631087132620853,-0.0,2.731846668598941,1.1437540294068296,-0.0,2.358727942579871,-0.0
3,0.7942609267503884,-0.3307178168996696,-0.974818100593934,-4.903441416699644,-0.0,-3.80046105601394,-0.1303251998272363,-0.1629403045099149,-0.9460305369200946,-0.0,2.731846668598941,0.5718770147034149,-0.0,0.7862426475266237,-1.3450287289852407
4,1.6767730675841532,0.8819141783991192,-0.0749860077379949,1.162671676124658,-0.0,-3.215774739704103,0.1954877997408544,0.162940304509915,-0.3588391691765876,-0.0,-0.0,1.0007847757309758,-0.0,0.7862426475266237,-1.3450287289852407


In [30]:
uc_similarity_female_TF.to_csv("uc_similarity_female_TF.csv", index=False)

해당 과정을 통해서 최종으로 행별로 거리 기반으로 각 질병의 위험도를 나타냄

### 감성분석을 통해서 같은 종류의 영양제 중 최적의 영양제 추천하기

In [31]:
df = pd.read_csv("C:/Users/jeong/Desktop/baf/foremotion.csv")

In [32]:
df = df.drop(columns=['MorphemeResults_modi_jaegumae','ProcessedMorphemes'])

In [33]:
df

Unnamed: 0,product,ReviewText
0,164 루테인지아잔틴 알티지오메가3 GR,효과가 있으면 좋겠습니다
1,38 발효효소,가성비 좋아요 효과는 먹어봐야 알겠죠
2,38 발효효소,금일 배송 받았습니다
3,곡물효소 프로바이오틱스,바나바 혈당 케어 플러스가 정확하게 배달 되었습니다 감사 합니다
4,곡물효소 프로바이오틱스,집사람이 먹은지 한달넘었는데 당뇨약하고 같이 먹고있는데요 일단 그전애 너무 입맛없어...
...,...,...
7638,화이링워터,아직 먹기전이라 알 수 없지만 기대를 합니다
7639,화이링워터,잠을 깊게 잘수있어요
7640,화이링워터,아직복용전입니다효과가있었으면 좋겠어요 나중에 다시올릴께요
7641,화이링워터,별로 좋은지 모르겠어요


In [34]:
from datasets import load_dataset
imdb = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import numpy as np
import math
import random

In [36]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True  # 크기 불일치 무시
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [38]:
random.seed(427)
random_number = random.sample(range(0,(len(df)-1)), 50)
print(random_number)
df_ex = df.iloc[random_number]

[1340, 5024, 6680, 3997, 6569, 2959, 373, 1362, 2401, 749, 5520, 5733, 7204, 6, 4109, 5756, 5842, 3957, 2755, 3774, 5847, 2580, 3483, 4103, 5479, 2764, 4944, 6074, 43, 2588, 7482, 5413, 1356, 6589, 3324, 4457, 7422, 5216, 1310, 3967, 4921, 4892, 1360, 2415, 2622, 4535, 3649, 1832, 674, 5330]


In [39]:
list = df_ex['ReviewText'].tolist()

In [40]:
clean = []
for i in range(0,len(list)):
    if(list[i] != np.nan):
        print(list[i])
        clean.append(list[i])

광고보고 약사몰에서 확인하고착한가격에 구입했네요남녀다먹어두 된다해서한포를 물에타서 먹으니 젤편하고맛두좋네요피곤두가시구몸두 가벼워진 느낌
쿠팡이 더 싸네
은행잎 추출물
먹기편해용 굿 굿
두번째 구매입니다  효과가 있는것 같아서 다시 주문합니다  잘먹을께요 
먹자마자 세한느낌먼가농축된느낌 두번먹었는데 몸전체가 비타민씨로가득찬 느낌이네요
좋아요 잘쓰고 있습니다
효과정말좋아요계속 재구매의사있습니다
어제 상품 잘 도착했어요 꾸준히 잘 섭취해볼꼐요
배송 빠르게 잘 도착
배송빠르고 좋아요
생유산균 가격대비 넘좋아요요즘 변비가 좀심해서 락토뭐시기사먹다가 잘안들어서 프로 바이오스틱 2틀복용했는데 정말좋아요정말 약사님들 감사합니다최곱니다
가성비가아주아주좋아요하루2알씩먹어요영양성분이우수해서주위에많은분들에게권장하고있어요
잘받았습니다 효과는 먹어봐야 알거같아요
잘 먹고 있습니다저령합니다다른제품 보다 먹기좋아요
열심히 먹어보겠습니다
처음 구입해서 이제 두알을사용했어요구매 전에 약사몰을 통해서호감이 갔기에이거다 하는 마음으로 구입했어요
비타민 외에도 다른 영양소도 많네요
배송빠르고 저렴히 잘 산거 같아요
주문 하자마자 다음날 바로 배달되네요 가격도 착하고  자주 이용할듯 싶네요
빠른 배송감사드립니다
좋아요좋아요
확실히 금박이라 있어보여요선물용으로도 좋을것 같네요신맛이 강하지 않고 적당한듯유통기한도 넉넉하네요많이파시곶 빠른배송 감사 합니다
구입해서 5일정도 먹고있는데 최곱니다 뭐라고 말하긴 뭐하지만 몸에 느낌이 좋고 희한하게 술을 마셔도 속이 편하고 숙취가 없내요 계속 복용 예정입니다
확실히 덜 피곤하네요 잘 먹고 있습니다
좋은 제품 저렴하게 잘 구매했습니다사업 번창하세요
혈당관리에 효과가 있는것같다 식후 혈당조절에 효과를 보고 있다 지속적으로 복용하겠다
좋은제품이라해서 믿고메습니다
첫 구매 합니다배송비 무료에 타사몰 보다 저렴 하네요먹어보고 재구매예정입니다
뱃살에 도움된다 하여 구매합니다 
혈압약 먹는게 무서워서 신청해서 먹고 있어요 이제 며칠 먹어서 잘 모르겠지만 심정적으로는 혈압이 

In [41]:
# 감정 분석 수행 예시
results = sentiment_analyzer(list)
for text, result in zip(list, results):
    print(f"Text: {text}\nSentiment: {result['label']}, Score: {result['score']:.2f}\n")

Text: 광고보고 약사몰에서 확인하고착한가격에 구입했네요남녀다먹어두 된다해서한포를 물에타서 먹으니 젤편하고맛두좋네요피곤두가시구몸두 가벼워진 느낌
Sentiment: LABEL_1, Score: 0.61

Text: 쿠팡이 더 싸네
Sentiment: LABEL_1, Score: 0.58

Text: 은행잎 추출물
Sentiment: LABEL_0, Score: 0.50

Text: 먹기편해용 굿 굿
Sentiment: LABEL_1, Score: 0.51

Text: 두번째 구매입니다  효과가 있는것 같아서 다시 주문합니다  잘먹을께요 
Sentiment: LABEL_1, Score: 0.54

Text: 먹자마자 세한느낌먼가농축된느낌 두번먹었는데 몸전체가 비타민씨로가득찬 느낌이네요
Sentiment: LABEL_1, Score: 0.64

Text: 좋아요 잘쓰고 있습니다
Sentiment: LABEL_0, Score: 0.58

Text: 효과정말좋아요계속 재구매의사있습니다
Sentiment: LABEL_1, Score: 0.50

Text: 어제 상품 잘 도착했어요 꾸준히 잘 섭취해볼꼐요
Sentiment: LABEL_0, Score: 0.57

Text: 배송 빠르게 잘 도착
Sentiment: LABEL_0, Score: 0.56

Text: 배송빠르고 좋아요
Sentiment: LABEL_0, Score: 0.60

Text: 생유산균 가격대비 넘좋아요요즘 변비가 좀심해서 락토뭐시기사먹다가 잘안들어서 프로 바이오스틱 2틀복용했는데 정말좋아요정말 약사님들 감사합니다최곱니다
Sentiment: LABEL_1, Score: 0.63

Text: 가성비가아주아주좋아요하루2알씩먹어요영양성분이우수해서주위에많은분들에게권장하고있어요
Sentiment: LABEL_0, Score: 0.54

Text: 잘받았습니다 효과는 먹어봐야 알거같아요
Sentiment: LABEL_0, Score: 0.52

Text: 잘 먹고 있습니다저령합니다다른제품 보다 먹기좋아요
Se

In [42]:
data1 = { "text": df_ex['ReviewText'],"label": [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]  # 1: 긍정, 0: 부정
}

In [43]:
df1 = pd.DataFrame(data1)

In [44]:
from datasets import Dataset

# 데이터프레임을 Hugging Face Dataset으로 변환
dataset1 = Dataset.from_pandas(df1)

In [45]:
#train_test_split = dataset1.train_test_split(test_size=0.2)
#train_dataset = train_test_split['train']
#test_dataset = train_test_split['test']

In [46]:
#def tokenize_function(examples):
 #   return tokenizer(examples["text"], padding="max_length", truncation=True)

# 데이터셋 토큰화
#tokenized_train = train_dataset.map(tokenize_function, batched=True)
#tokenized_test = test_dataset.map(tokenize_function, batched=True)

In [47]:
# from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train,
#     eval_dataset=tokenized_test,
# )

In [48]:
#trainer.train()

In [49]:
#eval_results = trainer.evaluate()

In [50]:
#print(eval_results) 

eval_loss가 0.3422로 상대적으로 낮은 값이다. 따라서 해당 모델로 예측을 진행한다.

In [51]:
###training_args = TrainingArguments(
  #  output_dir='./results',
 #   per_device_eval_batch_size=8,
#)
#trainer = Trainer(
  #  model=model,
 #   args=training_args,
#)

In [52]:
#test_results= trainer.predict(tokenized_test)

In [53]:
#predictions = test_results.predictions.argmax(axis=-1)  # 가장 높은 확률의 레이블 선택
#labels = test_results.label_ids  # 실제 레이블 (있다면)

In [54]:
#print("예측 레이블:", predictions)
#if labels is not None:
#    print("실제 레이블:", labels)

In [55]:
#df.drop(columns=['product'],inplace=True)

전체 데이터 예측하기

In [56]:
#data2 = {"text":df['ReviewText']}
#dataset2 = Dataset.from_pandas(df2)

In [57]:
#pred_data2 = dataset2.map(tokenize_function, batched=True)

In [58]:
#data2_results= trainer.predict(pred_data2)

In [59]:
#pred_data2 = data2_results.predictions.argmax(axis=-1)

In [60]:
#pred_data2

In [61]:
#df['score'] = pred_data2

In [62]:
#df.to_csv("C:/Users/jeong/Desktop/baf/predict_review.csv",encoding='utf-8-sig',index=False)

### 모델 기반으로 영양제 추천하기

모델로 예측한 긍정/부정에서 긍정에 1을 부여하여 각 영양제별로 긍정 점수를 매긴다. 그리고 아래가 그 점수를 매긴 최종 영양제 데이터셋이다.

In [110]:
product = pd.read_csv("C:/Users/jeong/Desktop/baf/product.csv")

product 데이터에는 각 영양제 별로 도움이 되는 질환의 열을 1로 기록하였다.

In [111]:
product

Unnamed: 0,product,Vision_group,Cardiovascular_group,Smoking_Alcohol_group,Obesity_group,Liver_Kidney_group,Anemia_group,Diabetes_group,Hypertension_group,Hearing_group,score
0,164 루테인지아잔틴 알티지오메가3 GR,1.0,1.0,,,1.0,,,1.0,1.0,1
1,38 발효효소,,,,1.0,,,,,,2
2,곡물효소 프로바이오틱스,,,,1.0,,,,,,145
3,관절엔 소연골 뮤코다당단백 콘드로이친,,1.0,1.0,,1.0,,,1.0,,14
4,눈 건강엔 루테인 프리미엄,1.0,,,,,,,,1.0,272
5,닥터팜 코큐텐 밀크씨슬,,1.0,,,1.0,,,,,75
6,데일리 비타민C 1000 골드 플러스,,,1.0,,,,,,,14
7,데일리 혈행케어 초임계 rTG오메가3,,1.0,,,1.0,,,1.0,,1
8,류신 단백질 프리미엄 플러스,,,,,1.0,,,,,162
9,리얼 밀크씨슬,,,,,1.0,,,,,11


In [112]:
product.columns

Index(['product', 'Vision_group', 'Cardiovascular_group',
       'Smoking_Alcohol_group', 'Obesity_group', 'Liver_Kidney_group',
       'Anemia_group', 'Diabetes_group', 'Hypertension_group', 'Hearing_group',
       'score'],
      dtype='object')

In [113]:
df = pd.read_csv("C:/Users/jeong/Downloads/Final_grouping.csv")

In [114]:
df

Unnamed: 0,성별코드,Blood_Pressure_group,Anemia_group,Liver_Kidney_group,Obesity_group,Smoking_Alcohol_group,Vision_group,Hearing_group,Diabetes_group,Cardiovascular_group
0,1,Hypertension,Suspected Anemia,Suspected liver/kidney disease,Weight loss needed,Alcohol,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular
1,1,Normal,Suspected Anemia,Suspected liver/kidney disease,Normal,Smoking and Alcohol,Normal,Normal,Suspected Diabetes,Normal
2,1,Hypertension,Normal,Suspected liver/kidney disease,Weight loss needed,Smoking and Alcohol,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular
3,1,Hypertension,Normal,Suspected liver/kidney disease,Weight loss needed,healthy,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular
4,1,Normal,Suspected Anemia,Normal,Weight gain needed,Smoking and Alcohol,Vision risk,Normal,Normal,Normal
...,...,...,...,...,...,...,...,...,...,...
104814,2,Hypertension,Normal,Suspected liver/kidney disease,Weight gain needed,Alcohol,Normal,Normal,Suspected Diabetes,Suspected Cardiovascular
104815,2,Hypertension,Normal,Suspected liver/kidney disease,Normal,Alcohol,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular
104816,2,Hypertension,Suspected Anemia,Suspected liver/kidney disease,Weight loss needed,Smoking and Alcohol,Normal,Normal,Suspected Diabetes,Suspected Cardiovascular
104817,2,Normal,Normal,Suspected liver/kidney disease,Weight gain needed,Alcohol,Normal,Normal,Suspected Diabetes,Normal


In [115]:
# 고혈압 그룹에 대한 제품 추천
hypertension_mask = df['Blood_Pressure_group'] == 'Hypertension'
if hypertension_mask.any():
    filtered_products = product[product['Hypertension_group'] == 1]
    if not filtered_products.empty:
        best_press_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_press = filtered_products.loc[best_press_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[hypertension_mask, 'Blood_Pressure'] = best_press
    else:
        df.loc[hypertension_mask, 'Blood_Pressure'] = '없음'  # 제품이 없는 경우 '없음'으로 설정
else:
    df.loc[hypertension_mask, 'Blood_Pressure'] = '없음'  # 고혈압 그룹이 없는 경우 '없음'으로 설정

# 빈혈 그룹에 대한 제품 추천
anemia_mask = df['Anemia_group'] == 'Suspected Anemia'
if anemia_mask.any():
    filtered_products = product[product['Anemia_group'] == 1]
    if not filtered_products.empty:
        best_ane_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_ane = filtered_products.loc[best_ane_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[anemia_mask, 'Anemia'] = best_ane
    else:
        df.loc[anemia_mask, 'Anemia'] = '없음'
else:
    df.loc[anemia_mask, 'Anemia'] = '없음'

# 간,신장 그룹에 대한 제품 추천
liver_kidney_mask = df['Liver_Kidney_group'] == 'Suspected liver/kidney disease'
if liver_kidney_mask.any():
    filtered_products = product[product['Liver_Kidney_group'] == 1]
    if not filtered_products.empty:
        best_riv_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_riv = filtered_products.loc[best_riv_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[liver_kidney_mask, 'Liver_Kidney'] = best_riv
    else:
        df.loc[liver_kidney_mask, 'Liver_Kidney'] = '없음'
else:
    df.loc[liver_kidney_mask, 'Liver_Kidney'] = '없음'

# 비만 그룹에 대한 제품 추천
obesity_mask = df['Obesity_group'] == 'Weight loss needed'
if obesity_mask.any():
    filtered_products = product[product['Obesity_group'] == 1]
    if not filtered_products.empty:
        best_obe_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_obe = filtered_products.loc[best_obe_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[obesity_mask, 'Obesity'] = best_obe
    else:
        df.loc[obesity_mask, 'Obesity'] = '없음'
else:
    df.loc[obesity_mask, 'Obesity'] = '없음'

# 음주,흡연 그룹에 대한 제품 추천
smoking_alcohol_mask = df['Smoking_Alcohol_group'].str.contains('Alcohol|Smoking and Alcohol|Smoking', case=False, na=False)
if smoking_alcohol_mask.any():
    filtered_products = product[product['Smoking_Alcohol_group'] == 1]
    if not filtered_products.empty:
        best_s_a_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_s_a = filtered_products.loc[best_s_a_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[smoking_alcohol_mask, 'Smoking_Alcohol'] = best_s_a
    else:
        df.loc[smoking_alcohol_mask, 'Smoking_Alcohol'] = '없음'
else:
    df.loc[smoking_alcohol_mask, 'Smoking_Alcohol'] = '없음'

# 시력 그룹에 대한 제품 추천
vision_mask = df['Vision_group'] == 'Suspected Vision Issue'
if vision_mask.any():
    filtered_products = product[product['Vision_group'] == 1]
    if not filtered_products.empty:
        best_vision_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_vision = filtered_products.loc[best_vision_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[vision_mask, 'Vision'] = best_vision
    else:
        df.loc[vision_mask, 'Vision'] = '없음'
else:
    df.loc[vision_mask, 'Vision'] = '없음'

# 청력 그룹에 대한 제품 추천
hearing_mask = df['Hearing_group'] == 'Suspected Hearing Issue'
if hearing_mask.any():
    filtered_products = product[product['Hearing_group'] == 1]
    if not filtered_products.empty:
        best_hearing_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_hearing = filtered_products.loc[best_hearing_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[hearing_mask, 'Hearing'] = best_hearing
    else:
        df.loc[hearing_mask, 'Hearing'] = '없음'
else:
    df.loc[hearing_mask, 'Hearing'] = '없음'

# 당뇨 그룹에 대한 제품 추천
diabetes_mask = df['Diabetes_group'] == 'Suspected Diabetes'
if diabetes_mask.any():
    filtered_products = product[product['Diabetes_group'] == 1]
    if not filtered_products.empty:
        best_diabetes_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_diabetes = filtered_products.loc[best_diabetes_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[diabetes_mask, 'Diabetes'] = best_diabetes
    else:
        df.loc[diabetes_mask, 'Diabetes'] = '없음'
else:
    df.loc[diabetes_mask, 'Diabetes'] = '없음'

# 심혈관 그룹에 대한 제품 추천
cardiovascular_mask = df['Cardiovascular_group'] == 'Suspected Cardiovascular'
if cardiovascular_mask.any():
    filtered_products = product[product['Cardiovascular_group'] == 1]
    if not filtered_products.empty:
        best_car_index = filtered_products['score'].idxmax()  # score가 최대인 행의 인덱스 찾기
        best_car = filtered_products.loc[best_car_index, 'product']  # 해당 인덱스의 제품 이름 가져오기
        df.loc[cardiovascular_mask, 'Cardiovascular'] = best_car
    else:
        df.loc[cardiovascular_mask, 'Cardiovascular'] = '없음'
else:
    df.loc[cardiovascular_mask, 'Cardiovascular'] = '없음'


In [116]:
# NaN이 아닌 값만 리스트로 만들어 'Non_NaN_List'라는 새 열에 추가
df['Non_NaN_List'] = df[['Blood_Pressure', 'Anemia', 'Liver_Kidney', 'Obesity', 
                         'Smoking_Alcohol', 'Vision', 'Hearing', 'Diabetes', 'Cardiovascular']].apply(
    lambda row: row.dropna().tolist(), axis=1
)


각 행별로 가진 질병의 위험도와 그에 따른 영양제를 NoN_NaN_List에 저장하였다.

In [117]:
df.head(6)

Unnamed: 0,성별코드,Blood_Pressure_group,Anemia_group,Liver_Kidney_group,Obesity_group,Smoking_Alcohol_group,Vision_group,Hearing_group,Diabetes_group,Cardiovascular_group,Blood_Pressure,Anemia,Liver_Kidney,Obesity,Smoking_Alcohol,Vision,Hearing,Diabetes,Cardiovascular,Non_NaN_List
0,1,Hypertension,Suspected Anemia,Suspected liver/kidney disease,Weight loss needed,Alcohol,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular,초임계 알티지 rTG 오메가3,면역엔아연,면역엔아연,리얼 아르기닌 파워업 6000,메가씨 비타민C 골드 7 플래티넘 6개월,,,리얼 칼슘 마그네슘 아연 비타민D,면역엔아연,"[초임계 알티지 rTG 오메가3, 면역엔아연, 면역엔아연, 리얼 아르기닌 파워업 6..."
1,1,Normal,Suspected Anemia,Suspected liver/kidney disease,Normal,Smoking and Alcohol,Normal,Normal,Suspected Diabetes,Normal,,면역엔아연,면역엔아연,,메가씨 비타민C 골드 7 플래티넘 6개월,,,리얼 칼슘 마그네슘 아연 비타민D,,"[면역엔아연, 면역엔아연, 메가씨 비타민C 골드 7 플래티넘 6개월, 리얼 칼슘 마..."
2,1,Hypertension,Normal,Suspected liver/kidney disease,Weight loss needed,Smoking and Alcohol,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular,초임계 알티지 rTG 오메가3,,면역엔아연,리얼 아르기닌 파워업 6000,메가씨 비타민C 골드 7 플래티넘 6개월,,,리얼 칼슘 마그네슘 아연 비타민D,면역엔아연,"[초임계 알티지 rTG 오메가3, 면역엔아연, 리얼 아르기닌 파워업 6000, 메가..."
3,1,Hypertension,Normal,Suspected liver/kidney disease,Weight loss needed,healthy,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular,초임계 알티지 rTG 오메가3,,면역엔아연,리얼 아르기닌 파워업 6000,,,,리얼 칼슘 마그네슘 아연 비타민D,면역엔아연,"[초임계 알티지 rTG 오메가3, 면역엔아연, 리얼 아르기닌 파워업 6000, 리얼..."
4,1,Normal,Suspected Anemia,Normal,Weight gain needed,Smoking and Alcohol,Vision risk,Normal,Normal,Normal,,면역엔아연,,,메가씨 비타민C 골드 7 플래티넘 6개월,,,,,"[면역엔아연, 메가씨 비타민C 골드 7 플래티넘 6개월]"
5,1,Hypertension,Suspected Anemia,Suspected liver/kidney disease,Weight gain needed,Alcohol,Vision risk,Normal,Normal,Normal,초임계 알티지 rTG 오메가3,면역엔아연,면역엔아연,,메가씨 비타민C 골드 7 플래티넘 6개월,,,,,"[초임계 알티지 rTG 오메가3, 면역엔아연, 면역엔아연, 메가씨 비타민C 골드 7..."


Non_NaN_List에서 중복요소를 제거하고 최종적으로 사용자에 따른 영양제가 추천된 열을 보여준다.

In [118]:
recommended_sup = df['Non_NaN_List']

In [119]:
recommended_sup = recommended_sup.apply(lambda x: set(x))

In [120]:
supplements_list = pd.DataFrame(recommended_sup)

In [121]:
supplements_list.rename(columns={'Non_NaN_List': 'needed supplements'}, inplace=True)


In [122]:
df['needed supplements'] = supplements_list

In [124]:
df.columns

Index(['성별코드', 'Blood_Pressure_group', 'Anemia_group', 'Liver_Kidney_group',
       'Obesity_group', 'Smoking_Alcohol_group', 'Vision_group',
       'Hearing_group', 'Diabetes_group', 'Cardiovascular_group',
       'Blood_Pressure', 'Anemia', 'Liver_Kidney', 'Obesity',
       'Smoking_Alcohol', 'Vision', 'Hearing', 'Diabetes', 'Cardiovascular',
       'Non_NaN_List', 'needed supplements'],
      dtype='object')

In [125]:
df.drop(columns=['Anemia', 'Liver_Kidney', 'Obesity',
       'Smoking_Alcohol', 'Vision', 'Hearing', 'Diabetes', 'Cardiovascular',
       'Non_NaN_List'],inplace=True)

In [129]:
df.drop(columns=['Blood_Pressure'],inplace=True)

In [136]:
# 예제: 성별코드가 1인 행 하나와 2인 행 하나 선택
row_gender_1 = df[df['성별코드'] == 1].iloc[0]
row_gender_2 = df[df['성별코드'] == 2].iloc[0]

# 두 행을 하나의 데이터프레임으로 결합
selected_rows = pd.DataFrame([row_gender_1, row_gender_2])


### 최종 영양제 추천

In [138]:
selected_rows

Unnamed: 0,성별코드,Blood_Pressure_group,Anemia_group,Liver_Kidney_group,Obesity_group,Smoking_Alcohol_group,Vision_group,Hearing_group,Diabetes_group,Cardiovascular_group,needed supplements
0,1,Hypertension,Suspected Anemia,Suspected liver/kidney disease,Weight loss needed,Alcohol,Vision risk,Normal,Suspected Diabetes,Suspected Cardiovascular,"{면역엔아연, 리얼 칼슘 마그네슘 아연 비타민D, 메가씨 비타민C 골드 7 플래티넘..."
51095,2,Normal,Suspected Anemia,Normal,Weight gain needed,Alcohol,Normal,Normal,Suspected Diabetes,Normal,"{면역엔아연, 리얼 칼슘 마그네슘 아연 비타민D, 메가씨 비타민C 골드 7 플래티넘..."
