In [63]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

goal_set = pickle.load(open('synthetic_dataset/goal_set.p', 'rb'))

In [64]:
train_data = goal_set['train']
train_data

[{'consult_id': 1059,
  'disease_tag': 'Central retinal artery or vein occlusion',
  'group_id': '7',
  'goal': {'request_slots': {'disease': 'UNK'},
   'explicit_inform_slots': {'Spots or clouds in vision': True},
   'implicit_inform_slots': {'Diminished vision': True,
    'Symptoms of eye': True,
    'Pain in eye': True}}},
 {'consult_id': 19510,
  'disease_tag': 'Degenerative disc disease',
  'group_id': '6',
  'goal': {'request_slots': {'disease': 'UNK'},
   'explicit_inform_slots': {'Shoulder pain': True},
   'implicit_inform_slots': {'Back pain': True,
    'Low back pain': True,
    'Neck pain': True,
    'Hip pain': True,
    'Ache all over': True}}},
 {'consult_id': 25630,
  'disease_tag': 'Diabetic retinopathy',
  'group_id': '4',
  'goal': {'request_slots': {'disease': 'UNK'},
   'explicit_inform_slots': {'Foreign body sensation in eye': True},
   'implicit_inform_slots': {}}},
 {'consult_id': 1467,
  'disease_tag': 'Chronic back pain',
  'group_id': '13',
  'goal': {'request

In [65]:
test_data = goal_set['test']

In [66]:
def merge_slots(explicit_slots, implicit_slots):
    merged_slots = explicit_slots.copy() if isinstance(explicit_slots, dict) else {}  # explicit_inform_slots 복사
    # implicit_inform_slots의 각 항목을 merged_slots에 추가
    for key, value in implicit_slots.items():
        # 이미 merged_slots에 있는 경우에는 무시하고 추가
        if key not in merged_slots:
            merged_slots[key] = value
    return merged_slots

def merge_slots_for_disease(data):
    merged_slots_list = []
    for item in train_data:
        disease_tag = item['disease_tag']
        explicit_slots = item['goal']['explicit_inform_slots']
        implicit_slots = item['goal']['implicit_inform_slots']
        merged_slots = merge_slots(explicit_slots, implicit_slots)
        merged_slots_list.append({'disease_tag': disease_tag, 'merged_slots': merged_slots})
    return merged_slots_list

# 데이터에 대해 함수를 적용하여 결과 생성
merged_data = merge_slots_for_disease(train_data)

In [71]:
symptom_counts = {}

# merged_data를 순회하면서 각 질병에 대해 등장한 증상들의 등장 횟수 계산
for data in merged_data:
    disease_tag = data['disease_tag']
    symptoms = data['merged_slots']
    if disease_tag not in symptom_counts:
        symptom_counts[disease_tag] = {}
    for symptom, count in symptoms.items():
        if symptom not in symptom_counts[disease_tag]:
            symptom_counts[disease_tag][symptom] = 0
        symptom_counts[disease_tag][symptom] += 1

sorted_symptom_counts = dict(sorted(symptom_counts.items()))

total_symptom_diseases = {}
total_symptom_diseases_count = {}

# 주어진 데이터를 반복하면서 각 증상이 어떤 질병들에 속하는지 기록
for disease, symptoms in sorted_symptom_counts.items():
    for symptom in symptoms:
        if symptom not in total_symptom_diseases:
            total_symptom_diseases[symptom] = []
            total_symptom_diseases_count[symptom] = 0
        total_symptom_diseases[symptom].append(disease)
        total_symptom_diseases_count[symptom] += 1
        
total_sorted_symptom_diseases_count = sorted(total_symptom_diseases_count.items(), key=lambda x: x[1], reverse=True)


In [72]:
def assign_weights(total_sorted_symptom_diseases_count):
    # 각 증상별 질병 개수 추출
    counts = [count for _, count in total_sorted_symptom_diseases_count]

    # 가중치 부여
    weights = pd.cut(counts, bins=[0, 1, 2, 3, float('inf')], labels=[100, 50, 30, 10])

    # 결과 반환
    weighted_counts = [(symptom, weight) for (symptom, _), weight in zip(total_sorted_symptom_diseases_count, weights)]
    return weighted_counts

weighted_counts = assign_weights(total_sorted_symptom_diseases_count)

weighted_counts

[('Pain in eye', 10),
 ('Skin rash', 10),
 ('Fatigue', 10),
 ('Problems with movement', 10),
 ('Leg pain', 10),
 ('Skin lesion', 10),
 ('Dizziness', 10),
 ('Headache', 10),
 ('Back pain', 10),
 ('Itching of skin', 10),
 ('Diminished vision', 10),
 ('Depressive or psychotic symptoms', 10),
 ('Itchiness of eye', 10),
 ('Symptoms of eye', 10),
 ('Sharp abdominal pain', 10),
 ('Ache all over', 10),
 ('Paresthesia', 10),
 ('Loss of sensation', 10),
 ('Vomiting', 10),
 ('Abnormal appearing skin', 10),
 ('Skin swelling', 10),
 ('Eye redness', 10),
 ('Sharp chest pain', 10),
 ('Nausea', 10),
 ('Abnormal involuntary movements', 10),
 ('Depression', 10),
 ('Excessive anger', 10),
 ('Wrist pain', 10),
 ('Anxiety and nervousness', 10),
 ('Neck pain', 10),
 ('Arm pain', 10),
 ('Ankle pain', 10),
 ('Knee lump or mass', 10),
 ('Swollen eye', 10),
 ('Cough', 10),
 ('Pain or soreness of breast', 10),
 ('Skin growth', 10),
 ('Allergic reaction', 10),
 ('Lacrimation', 10),
 ('Shortness of breath', 10),
 

In [4]:
df_train = pd.DataFrame(train_data)

# explicit_inform_slots와 implicit_inform_slots에서 얻은 질병들을 결합하여 리스트로 만듦
disease_list = []
for index, row in df_train.iterrows():
    explicit_diseases = list(row['goal']['explicit_inform_slots'].keys())
    implicit_diseases = list(row['goal']['implicit_inform_slots'].keys())
    combined_diseases = explicit_diseases + implicit_diseases
    disease_list.extend(combined_diseases)

# disease_list에서 중복 제거
disease_list = list(set(disease_list))

# disease_list를 disease_tag에 매핑하는 데이터프레임 생성
rows = []
for disease in disease_list:
    related_tags = df_train[df_train.apply(lambda x: disease in x['goal']['explicit_inform_slots'] or disease in x['goal']['implicit_inform_slots'], axis=1)]['disease_tag'].tolist()
    rows.append({'Disease': disease, 'Disease_Tag': related_tags})

# 데이터프레임 생성
df_mapping = pd.DataFrame(rows)

print(df_mapping)

                                   Disease  \
0                                   Coryza   
1                           Ringing in ear   
2                                 Headache   
3               Hip stiffness or tightness   
4          Hand or finger cramps or spasms   
..                                     ...   
261  Hand or finger stiffness or tightness   
262                 Elbow cramps or spasms   
263                                  Apnea   
264                   Painful menstruation   
265                     Wrist lump or mass   

                                           Disease_Tag  
0    [Erythema multiforme, Erythema multiforme, Ery...  
1    [Conductive hearing loss, Conductive hearing l...  
2    [Cerebral edema, Cerebral edema, Carbon monoxi...  
3    [Diabetes insipidus, Diabetes insipidus, Diabe...  
4    [Diabetic peripheral neuropathy, Diabetic peri...  
..                                                 ...  
261  [Carpal tunnel syndrome, Carpal tunnel synd

In [80]:
def calculate_symptom_prob(data, total_sorted_symptom_diseases_count):
    # 데이터프레임으로 변환
    df_train = pd.DataFrame(data)

    # explicit_inform_slots와 implicit_inform_slots에서 얻은 질병들을 결합하여 리스트로 만듦
    slots_list = []
    for index, row in df_train.iterrows():
        explicit_diseases = list(row['goal']['explicit_inform_slots'].keys())
        implicit_diseases = list(row['goal']['implicit_inform_slots'].keys())
        combined_slots = explicit_diseases + implicit_diseases
        slots_list.extend(combined_slots)

    # 각 항목이 등장하는 횟수를 세고 확률 계산
    total_slots = len(slots_list)
    slots_occurrences = pd.Series(slots_list).value_counts(normalize=True)

    # 질병과 증상에 대한 확률 분포를 DataFrame으로 만듦
    disease_symptom_prob = pd.DataFrame(index=df_train['disease_tag'].unique(), columns=slots_occurrences.index)

    # 질병과 증상에 대한 확률 계산하여 DataFrame에 채움
    for disease_tag in disease_symptom_prob.index:
        for symptom in disease_symptom_prob.columns:
            explicit_symptom = df_train[df_train['disease_tag'] == disease_tag]['goal'].apply(lambda x: symptom in x['explicit_inform_slots']).sum()
            implicit_symptom = df_train[df_train['disease_tag'] == disease_tag]['goal'].apply(lambda x: symptom in x['implicit_inform_slots']).sum()
            total_occurrences = len(df_train[df_train['disease_tag'] == disease_tag])
            occurrence_count = explicit_symptom + implicit_symptom
            disease_symptom_prob.loc[disease_tag, symptom] = occurrence_count / total_occurrences

    # 각 증상별 질병 수에 따라 가중치를 부여
    counts = [count for _, count in total_sorted_symptom_diseases_count]

    # 가중치 부여
    weights = pd.cut(counts, bins=[0, 2, 4, 8, 12, float('inf')], labels=[100, 50, 30, 20, 10])

    # 가중치를 적용하여 최종 확률 계산
    weighted_prob = disease_symptom_prob.copy()
    for (symptom, _), weight in zip(total_sorted_symptom_diseases_count, weights):
        if symptom in weighted_prob.columns:
            weighted_prob[symptom] *= float(weight)

    return weighted_prob

result = calculate_symptom_prob(train_data, total_sorted_symptom_diseases_count)
test_set = calculate_symptom_prob(test_data,total_sorted_symptom_diseases_count)


In [81]:
result.loc["Fibrocystic breast disease","Frequent menstruation"]

1.9455252918287937

In [83]:
common_columns = result.columns.intersection(test_set.columns)
train_common = result[common_columns]
test_common = test_set[common_columns]

In [86]:
X_train = train_common.values  
y_train = train_common.index  # 질병
X_test = test_common.values
y_test = test_common.index

In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
# 랜덤 포레스트 모델 생성
rf_model = RandomForestClassifier(n_estimators=30, random_state=42)  # 트리 개수는 100으로 설정하고, 난수 초기값을 고정합니다.

# 학습 데이터로 모델 학습
rf_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred_rf = rf_model.predict(X_test)

# 정확도 평가
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

# F1-score 계산
f1_score_rf = f1_score(y_test, y_pred_rf, average='weighted')
print("Random Forest F1-score:", f1_score_rf)

# AUC 계산
y_prob_rf = rf_model.predict_proba(X_test)
auc_rf = roc_auc_score(y_test, y_prob_rf, multi_class='ovr')
print("Random Forest AUC:", auc_rf)

Random Forest Accuracy: 0.9666666666666667
Random Forest F1-score: 0.9555555555555556
Random Forest AUC: 0.9993133583021224


In [88]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score

# 로지스틱 회귀 모델 생성
logistic_model = LogisticRegression()

# 데이터 학습
logistic_model.fit(X_train, y_train)

# 테스트 데이터 예측
y_pred_logistic = logistic_model.predict(X_test)

# 정확도 평가
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Accuracy:", accuracy_logistic)

# f1-score 계산
f1_score_logistic = f1_score(y_test, y_pred_logistic, average='weighted')
print("Logistic Regression F1-score:", f1_score_logistic)

# ROC curve 및 AUC 계산
y_prob_logistic = logistic_model.predict_proba(X_test)
auc_logistic = roc_auc_score(y_test, y_prob_logistic, multi_class='ovr')
print("Logistic Regression AUC:", auc_logistic)

Logistic Regression Accuracy: 0.9666666666666667
Logistic Regression F1-score: 0.9629629629629629
Logistic Regression AUC: 0.9995006242197254


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [89]:
from sklearn.neighbors import KNeighborsClassifier

# KNN 모델 생성
knn_model = KNeighborsClassifier()

# 학습 데이터로 모델 학습
knn_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred_knn = knn_model.predict(X_test)

# 정확도 평가
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", accuracy_knn)

# F1-score 계산
f1_score_knn = f1_score(y_test, y_pred_knn, average='weighted')
print("KNN F1-score:", f1_score_knn)

KNN Accuracy: 0.24444444444444444
KNN F1-score: 0.160036075036075


In [90]:
from sklearn.naive_bayes import MultinomialNB

# 나이브 베이즈 모델 생성
nb_model = MultinomialNB()

# 학습 데이터로 모델 학습
nb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred_nb = nb_model.predict(X_test)

# 정확도 평가
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", accuracy_nb)

# F1-score 계산
f1_score_nb = f1_score(y_test, y_pred_nb, average='weighted')
print("Naive Bayes F1-score:", f1_score_nb)

Naive Bayes Accuracy: 0.9666666666666667
Naive Bayes F1-score: 0.9611111111111111


In [91]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve

# SVM 모델 생성
svm_model = SVC(kernel='linear')

# 학습 데이터로 모델 학습
svm_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred_svm = svm_model.predict(X_test)

# 정확도 평가
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

# f1-score 계산
f1_score_svm = f1_score(y_test, y_pred_svm, average='weighted')
print("SVM F1-score:", f1_score_svm)

SVM Accuracy: 0.9666666666666667
SVM F1-score: 0.9629629629629629


In [19]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [92]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier

# 캣부스트 모델 생성
catboost_model = CatBoostClassifier()
catboost_model.fit(X_train, y_train.values)
y_pred_catboost = catboost_model.predict(X_test)

# 정확도 평가
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print("CatBoost Accuracy:", accuracy_catboost)

# F1-score 계산
f1_score_catboost = f1_score(y_test, y_pred_catboost, average='weighted')
print("CatBoost F1-score:", f1_score_catboost)

# AUC 계산
y_prob_catboost = catboost_model.predict_proba(X_test)
auc_catboost = roc_auc_score(y_test, y_prob_catboost, multi_class='ovr')
print("CatBoost AUC:", auc_catboost)

Learning rate set to 0.069983
0:	learn: 4.4913897	total: 111ms	remaining: 1m 50s
1:	learn: 4.4816698	total: 221ms	remaining: 1m 50s
2:	learn: 4.4717090	total: 335ms	remaining: 1m 51s
3:	learn: 4.4606164	total: 460ms	remaining: 1m 54s
4:	learn: 4.4493282	total: 586ms	remaining: 1m 56s
5:	learn: 4.4389115	total: 722ms	remaining: 1m 59s
6:	learn: 4.4323649	total: 839ms	remaining: 1m 59s
7:	learn: 4.4229813	total: 960ms	remaining: 1m 59s
8:	learn: 4.4139066	total: 1.09s	remaining: 1m 59s
9:	learn: 4.4031800	total: 1.21s	remaining: 2m
10:	learn: 4.3914502	total: 1.34s	remaining: 2m
11:	learn: 4.3795090	total: 1.48s	remaining: 2m 1s
12:	learn: 4.3709279	total: 1.64s	remaining: 2m 4s
13:	learn: 4.3621863	total: 1.77s	remaining: 2m 4s
14:	learn: 4.3542320	total: 1.89s	remaining: 2m 3s
15:	learn: 4.3442399	total: 2.01s	remaining: 2m 3s
16:	learn: 4.3358682	total: 2.13s	remaining: 2m 3s
17:	learn: 4.3258219	total: 2.29s	remaining: 2m 4s
18:	learn: 4.3207739	total: 2.41s	remaining: 2m 4s
19:	lear

160:	learn: 3.4460598	total: 20.8s	remaining: 1m 48s
161:	learn: 3.4435144	total: 20.9s	remaining: 1m 48s
162:	learn: 3.4409749	total: 21.1s	remaining: 1m 48s
163:	learn: 3.4376479	total: 21.2s	remaining: 1m 48s
164:	learn: 3.4347471	total: 21.3s	remaining: 1m 48s
165:	learn: 3.4318570	total: 21.5s	remaining: 1m 47s
166:	learn: 3.4294177	total: 21.6s	remaining: 1m 47s
167:	learn: 3.4269325	total: 21.7s	remaining: 1m 47s
168:	learn: 3.4221684	total: 21.8s	remaining: 1m 47s
169:	learn: 3.4193098	total: 21.9s	remaining: 1m 47s
170:	learn: 3.4155934	total: 22.1s	remaining: 1m 46s
171:	learn: 3.4127091	total: 22.2s	remaining: 1m 46s
172:	learn: 3.4102881	total: 22.3s	remaining: 1m 46s
173:	learn: 3.4078111	total: 22.4s	remaining: 1m 46s
174:	learn: 3.4029719	total: 22.6s	remaining: 1m 46s
175:	learn: 3.3996699	total: 22.7s	remaining: 1m 46s
176:	learn: 3.3971584	total: 22.9s	remaining: 1m 46s
177:	learn: 3.3939370	total: 23s	remaining: 1m 46s
178:	learn: 3.3865224	total: 23.1s	remaining: 1m

316:	learn: 2.9771492	total: 40.9s	remaining: 1m 28s
317:	learn: 2.9747563	total: 41s	remaining: 1m 27s
318:	learn: 2.9708631	total: 41.2s	remaining: 1m 27s
319:	learn: 2.9681188	total: 41.3s	remaining: 1m 27s
320:	learn: 2.9654761	total: 41.6s	remaining: 1m 27s
321:	learn: 2.9621264	total: 41.7s	remaining: 1m 27s
322:	learn: 2.9593470	total: 41.9s	remaining: 1m 27s
323:	learn: 2.9563022	total: 42.1s	remaining: 1m 27s
324:	learn: 2.9538976	total: 42.2s	remaining: 1m 27s
325:	learn: 2.9511264	total: 42.3s	remaining: 1m 27s
326:	learn: 2.9487050	total: 42.4s	remaining: 1m 27s
327:	learn: 2.9455290	total: 42.6s	remaining: 1m 27s
328:	learn: 2.9412321	total: 42.8s	remaining: 1m 27s
329:	learn: 2.9387796	total: 42.9s	remaining: 1m 27s
330:	learn: 2.9360260	total: 43s	remaining: 1m 26s
331:	learn: 2.9336285	total: 43.1s	remaining: 1m 26s
332:	learn: 2.9312212	total: 43.2s	remaining: 1m 26s
333:	learn: 2.9289231	total: 43.4s	remaining: 1m 26s
334:	learn: 2.9261499	total: 43.5s	remaining: 1m 2

473:	learn: 2.5303205	total: 1m 1s	remaining: 1m 8s
474:	learn: 2.5277689	total: 1m 1s	remaining: 1m 8s
475:	learn: 2.5255822	total: 1m 1s	remaining: 1m 8s
476:	learn: 2.5201695	total: 1m 1s	remaining: 1m 7s
477:	learn: 2.5173672	total: 1m 2s	remaining: 1m 7s
478:	learn: 2.5151676	total: 1m 2s	remaining: 1m 7s
479:	learn: 2.5109414	total: 1m 2s	remaining: 1m 7s
480:	learn: 2.5086932	total: 1m 2s	remaining: 1m 7s
481:	learn: 2.5063849	total: 1m 2s	remaining: 1m 7s
482:	learn: 2.5041494	total: 1m 2s	remaining: 1m 7s
483:	learn: 2.5019899	total: 1m 2s	remaining: 1m 6s
484:	learn: 2.4997633	total: 1m 2s	remaining: 1m 6s
485:	learn: 2.4949031	total: 1m 2s	remaining: 1m 6s
486:	learn: 2.4923806	total: 1m 3s	remaining: 1m 6s
487:	learn: 2.4857063	total: 1m 3s	remaining: 1m 6s
488:	learn: 2.4834677	total: 1m 3s	remaining: 1m 6s
489:	learn: 2.4812325	total: 1m 3s	remaining: 1m 5s
490:	learn: 2.4786248	total: 1m 3s	remaining: 1m 5s
491:	learn: 2.4739231	total: 1m 3s	remaining: 1m 5s
492:	learn: 

630:	learn: 2.0991748	total: 1m 21s	remaining: 47.6s
631:	learn: 2.0960862	total: 1m 21s	remaining: 47.4s
632:	learn: 2.0941121	total: 1m 21s	remaining: 47.3s
633:	learn: 2.0918862	total: 1m 21s	remaining: 47.1s
634:	learn: 2.0895776	total: 1m 21s	remaining: 47s
635:	learn: 2.0874131	total: 1m 21s	remaining: 46.9s
636:	learn: 2.0854559	total: 1m 22s	remaining: 46.7s
637:	learn: 2.0834060	total: 1m 22s	remaining: 46.6s
638:	learn: 2.0810289	total: 1m 22s	remaining: 46.5s
639:	learn: 2.0789934	total: 1m 22s	remaining: 46.4s
640:	learn: 2.0770891	total: 1m 22s	remaining: 46.2s
641:	learn: 2.0747444	total: 1m 22s	remaining: 46.1s
642:	learn: 2.0727084	total: 1m 22s	remaining: 46s
643:	learn: 2.0703828	total: 1m 22s	remaining: 45.8s
644:	learn: 2.0680336	total: 1m 23s	remaining: 45.7s
645:	learn: 2.0657587	total: 1m 23s	remaining: 45.6s
646:	learn: 2.0637897	total: 1m 23s	remaining: 45.4s
647:	learn: 2.0617123	total: 1m 23s	remaining: 45.3s
648:	learn: 2.0594779	total: 1m 23s	remaining: 45.

786:	learn: 1.7341712	total: 1m 40s	remaining: 27.2s
787:	learn: 1.7323924	total: 1m 40s	remaining: 27.1s
788:	learn: 1.7303754	total: 1m 40s	remaining: 27s
789:	learn: 1.7285803	total: 1m 41s	remaining: 26.9s
790:	learn: 1.7268260	total: 1m 41s	remaining: 26.7s
791:	learn: 1.7250737	total: 1m 41s	remaining: 26.6s
792:	learn: 1.7232583	total: 1m 41s	remaining: 26.5s
793:	learn: 1.7198919	total: 1m 41s	remaining: 26.3s
794:	learn: 1.7166181	total: 1m 41s	remaining: 26.2s
795:	learn: 1.7142978	total: 1m 41s	remaining: 26.1s
796:	learn: 1.7124438	total: 1m 41s	remaining: 25.9s
797:	learn: 1.7100390	total: 1m 41s	remaining: 25.8s
798:	learn: 1.7080108	total: 1m 42s	remaining: 25.7s
799:	learn: 1.7056920	total: 1m 42s	remaining: 25.5s
800:	learn: 1.7038305	total: 1m 42s	remaining: 25.4s
801:	learn: 1.7012858	total: 1m 42s	remaining: 25.3s
802:	learn: 1.6995610	total: 1m 42s	remaining: 25.2s
803:	learn: 1.6973740	total: 1m 42s	remaining: 25s
804:	learn: 1.6954881	total: 1m 42s	remaining: 24.

942:	learn: 1.4064848	total: 1m 58s	remaining: 7.17s
943:	learn: 1.4046464	total: 1m 58s	remaining: 7.04s
944:	learn: 1.4021976	total: 1m 58s	remaining: 6.92s
945:	learn: 1.4005017	total: 1m 58s	remaining: 6.79s
946:	learn: 1.3979703	total: 1m 59s	remaining: 6.67s
947:	learn: 1.3962412	total: 1m 59s	remaining: 6.54s
948:	learn: 1.3939393	total: 1m 59s	remaining: 6.41s
949:	learn: 1.3917990	total: 1m 59s	remaining: 6.29s
950:	learn: 1.3897474	total: 1m 59s	remaining: 6.16s
951:	learn: 1.3857056	total: 1m 59s	remaining: 6.03s
952:	learn: 1.3842195	total: 1m 59s	remaining: 5.91s
953:	learn: 1.3820359	total: 1m 59s	remaining: 5.78s
954:	learn: 1.3802081	total: 2m	remaining: 5.66s
955:	learn: 1.3787297	total: 2m	remaining: 5.53s
956:	learn: 1.3767537	total: 2m	remaining: 5.4s
957:	learn: 1.3751904	total: 2m	remaining: 5.28s
958:	learn: 1.3726605	total: 2m	remaining: 5.15s
959:	learn: 1.3709982	total: 2m	remaining: 5.02s
960:	learn: 1.3682934	total: 2m	remaining: 4.9s
961:	learn: 1.3667325	t

In [111]:
def diagnose_disease(symptom, weighted_prob, threshold=100):
    # 가장 높은 확률 값을 가지는 질병 선택
    max_prob_disease = weighted_prob.idxmax(axis=0).loc[symptom]
    max_prob_value = weighted_prob.loc[max_prob_disease, symptom]
    
    # 초기 confidence 값 설정
    confidence = {disease: 0 for disease in weighted_prob.index}
    
    # 초기 질병 설정
    current_disease = max_prob_disease

    # 진단에 사용된 증상들을 저장할 리스트
    diagnosed_symptoms = []
    
    while True:
        # 질병에 대한 confidence 값 누적
        confidence[current_disease] += max_prob_value

        # confidence가 threshold 이상인 질병 확인
        candidate_diseases = {disease: conf for disease, conf in confidence.items() if conf >= threshold}

        # threshold 이상인 질병이 존재하는 경우
        if candidate_diseases:
            # confidence가 threshold 이상인 질병 중 가장 높은 값을 가지는 질병 선택
            diagnosed_disease = max(candidate_diseases, key=candidate_diseases.get)
            return diagnosed_disease, confidence[diagnosed_disease], diagnosed_symptoms
        
        # threshold 이상인 질병이 없는 경우
        else:
            # 질병과 관련된 증상 중 가장 높은 증상 확인
            max_symptom = weighted_prob.loc[current_disease].idxmax()
        
            # 증상을 가지고 있는지 물어보고 증상이 있으면 해당 질병으로 이동
            if max_symptom == symptom:
                return current_disease, confidence[current_disease], diagnosed_symptoms
            else:
                # 해당 질병과 증상에 대한 확률값 가져오기
                max_prob_value = weighted_prob.loc[current_disease, max_symptom]
                diagnosed_symptoms.append(max_symptom)

                # 질의한 증상을 제외한 다른 증상들 중에서 질의할 수 있도록 설정
                available_symptoms = weighted_prob.loc[current_disease].index.difference([symptom])
                max_symptom = weighted_prob.loc[current_disease, available_symptoms].idxmax()

                # 다음 증상을 질의
                symptom = max_symptom

weighted_prob = result.astype(float)
# 초기 증상 입력
initial_symptom = 'Skin swelling'
diagnosed_disease, confidence, diagnosed_symptoms = diagnose_disease(initial_symptom, weighted_prob)

print(f"진단된 질병: {diagnosed_disease}, Confidence: {confidence}")
print("진단에 사용된 증상들:")
for symptom in diagnosed_symptoms:
    print(symptom)


진단된 질병: Cat scratch disease, Confidence: 90.11494252873564
진단에 사용된 증상들:
Neck swelling


In [98]:
given_symptoms = ['Diminished vision']  # 주어진 증상

# 각 질병에 대한 중요도(증상 확률 값 합산)
disease_scores = result[given_symptoms].sum(axis=1)

# 중요도가 0이 아닌 질병만 선택하여 정렬
non_zero_disease_scores = disease_scores[disease_scores > 0]

# 중요도를 기준으로 질병들을 정렬하여 순위를 매김
ranked_diseases = non_zero_disease_scores.sort_values(ascending=False)

# 결과 출력
print("질병 순위:")
for rank, (disease, score) in enumerate(ranked_diseases.items(), start=1):
    print(f"{rank}. {disease}: {score}")

질병 순위:
1. Central retinal artery or vein occlusion: 17.36111111111111
2. Corneal disorder: 15.09433962264151
3. Diabetic retinopathy: 15.017543859649123
4. Aphakia: 14.921875
5. Acute glaucoma: 13.846153846153847
6. Endophthalmitis: 10.606060606060606
7. Cerebral edema: 9.003984063745019
8. Cyst of the eyelid: 8.841201716738198
9. Conjunctivitis due to allergy: 7.039999999999999
10. Chalazion: 6.1946902654867255
11. Ectropion: 5.735294117647058
12. Corneal abrasion: 5.503355704697986
