In [1]:
import pandas as pd
import pickle
%config Completer.use_jedi = False

goal_set = pickle.load(open('synthetic_dataset/goal_set.p', 'rb'))
slot_set = pickle.load(open('synthetic_dataset/slot_set.p', 'rb'))


In [2]:
lengths = {key: len(value) for key, value in goal_set.items()}
lengths

{'train': 24000, 'test': 6000, 'validate': 0}

In [3]:
train_data = goal_set['train']
test_data = goal_set['test']
train_data

[{'consult_id': 1059,
  'disease_tag': 'Central retinal artery or vein occlusion',
  'group_id': '7',
  'goal': {'request_slots': {'disease': 'UNK'},
   'explicit_inform_slots': {'Spots or clouds in vision': True},
   'implicit_inform_slots': {'Diminished vision': True,
    'Symptoms of eye': True,
    'Pain in eye': True}}},
 {'consult_id': 19510,
  'disease_tag': 'Degenerative disc disease',
  'group_id': '6',
  'goal': {'request_slots': {'disease': 'UNK'},
   'explicit_inform_slots': {'Shoulder pain': True},
   'implicit_inform_slots': {'Back pain': True,
    'Low back pain': True,
    'Neck pain': True,
    'Hip pain': True,
    'Ache all over': True}}},
 {'consult_id': 25630,
  'disease_tag': 'Diabetic retinopathy',
  'group_id': '4',
  'goal': {'request_slots': {'disease': 'UNK'},
   'explicit_inform_slots': {'Foreign body sensation in eye': True},
   'implicit_inform_slots': {}}},
 {'consult_id': 1467,
  'disease_tag': 'Chronic back pain',
  'group_id': '13',
  'goal': {'request

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# 데이터 매핑
X_train = [list(item['goal']['explicit_inform_slots'].keys())[0] for item in train_data]
Y_train = [list((item['goal']['implicit_inform_slots'].keys())) for item in train_data]

X_train_expanded = []
Y_train_expanded = []

for x, ys in zip(X_train, Y_train):
    for y in ys:
        X_train_expanded.append(x)
        Y_train_expanded.append(y)

# TF-IDF 변환
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_expanded)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, Y_train_expanded)

test_input = ['Spots or clouds in vision']
test_input_vectorized = vectorizer.transform(test_input)
predicted_output = nb_classifier.predict(test_input_vectorized)

print("입력 'a'일 때 예측된 레이블:", predicted_output)

# 순위
predicted_probabilities = nb_classifier.predict_proba(test_input_vectorized)


# # 예측된 확률을 내림차순으로 정렬
sorted_indices = predicted_probabilities.argsort()[0][::-1]


print("입력 'Spots or clouds in vision'에 대한 상위 5개")
for i in range(5):
    class_label = nb_classifier.classes_[sorted_indices[i]]
    probability = predicted_probabilities[0, sorted_indices[i]]
    print(f"{i+1}등 - {class_label}: {probability}")

입력 'a'일 때 예측된 레이블: ['Diminished vision']
입력 'Spots or clouds in vision'에 대한 상위 5개
1등 - Diminished vision: 0.3002679951997312
2등 - Symptoms of eye: 0.16742204448894427
3등 - Pain in eye: 0.13072107047882742
4등 - Eye redness: 0.09230921387434955
5등 - Itchiness of eye: 0.07004288628508877


In [5]:
from sklearn.metrics import accuracy_score

# 테스트 데이터 준비
X_test = [list(item['goal']['explicit_inform_slots'].keys())[0] for item in test_data]
Y_test = [list((item['goal']['implicit_inform_slots'].keys())) for item in test_data]

X_test_expanded = []
Y_test_expanded = []

for x, ys in zip(X_test, Y_test):
    for y in ys:
        X_test_expanded.append(x)
        Y_test_expanded.append(y)

# TF-IDF 변환
X_test_vectorized = vectorizer.transform(X_test_expanded)

# 모델을 사용하여 테스트 데이터에 대한 예측 수행
predicted_test_output = nb_classifier.predict(X_test_vectorized)

# 정확도 계산
accuracy = accuracy_score(Y_test_expanded, predicted_test_output)
print("모델 정확도:", accuracy)

모델 정확도: 0.13138075313807532


In [228]:
X_test = [list(item['goal']['explicit_inform_slots'].keys())[0] for item in test_data]
Y_test = [list(item['goal']['implicit_inform_slots'].keys()) for item in test_data]

X_test_dummy = pd.get_dummies(X_test)
X_test_dummy.reindex(columns=X_train_dummy.columns, fill_value=False)
X_test_dummy
# Y_test_transform = mlb.transform(Y_test)

# predictions = model.predict(X_test_dummy)
# accuracy = accuracy_score(Y_test_transform, predictions)
# print("Accuracy:", accuracy)

Unnamed: 0,Abnormal appearing skin,Abnormal involuntary movements,Abnormal movement of eyelid,Absence of menstruation,Abusing alcohol,Ache all over,Acne or pimples,Allergic reaction,Ankle pain,Ankle stiffness or tightness,...,Warts,Weakness,Weight gain,White discharge from eye,Wrinkles on skin,Wrist lump or mass,Wrist pain,Wrist stiffness or tightness,Wrist swelling,Wrist weakness
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [229]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import pandas as pd

X_train = [list(item['goal']['explicit_inform_slots'].keys())[0] for item in train_data]
Y_train = [list((item['goal']['implicit_inform_slots'].keys())) for item in train_data]

X_train_expanded = []
Y_train_expanded = []

for x, ys in zip(X_train, Y_train):
    for y in ys:
        X_train_expanded.append(x)
        Y_train_expanded.append(y)


X_train_dummy = pd.get_dummies(X_train_expanded)
Y_train_tranform = mlb.fit_transform(Y_train_expanded)
Y_train

[['Diminished vision', 'Symptoms of eye', 'Pain in eye'],
 ['Back pain', 'Low back pain', 'Neck pain', 'Hip pain', 'Ache all over'],
 [],
 ['Back pain', 'Side pain'],
 ['Pain in eye',
  'Shoulder cramps or spasms',
  'Facial pain',
  'Ankle pain',
  'Pain during pregnancy',
  'Joint stiffness or tightness',
  'Pain or soreness of breast',
  'Knee lump or mass',
  'Excessive urination at night'],
 ['Skin dryness, peeling, scaliness, or roughness'],
 [],
 ['Shoulder cramps or spasms',
  'Ankle pain',
  'Wrist pain',
  'Excessive anger',
  'Joint stiffness or tightness',
  'Pain or soreness of breast',
  'Knee lump or mass',
  'Fatigue',
  'Excessive urination at night'],
 ['Sweating', 'Abnormal involuntary movements', 'Fatigue', 'Cough'],
 ['Pain in eye',
  'Shoulder cramps or spasms',
  'Facial pain',
  'Ankle pain',
  'Pain during pregnancy',
  'Excessive anger',
  'Joint stiffness or tightness',
  'Knee lump or mass',
  'Fatigue',
  'Excessive urination at night'],
 ['Shortness of bre

In [128]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_dummy, Y_train_tranform)

In [135]:
X_test = [list(item['goal']['explicit_inform_slots'].keys())[0] for item in test_data]
Y_test = [list((item['goal']['implicit_inform_slots'].keys())) for item in test_data]

X_test_expanded = []
Y_test_expanded = []

for x, ys in zip(X_test, Y_test):
    for y in ys:
        X_test_expanded.append(x)
        Y_test_expanded.append(y)


X_test_dummy = X_train = pd.get_dummies(X_test_expanded)
X_test_dummy = X_test_dummy.reindex(columns=X_train_dummy.columns, fill_value=0)
Y_test_transform = mlb.fit_transform(Y_test_expanded)

In [132]:
predictions = model.predict(X_test_dummy)

In [136]:
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score

# 성능 평가: 여러 성능 지표를 사용해 모델 평가
precision = precision_score(Y_test_transform, predictions, average='samples')
recall = recall_score(Y_test_transform, predictions, average='samples')
f1 = f1_score(Y_test_transform, predictions, average='samples')
hammingloss = hamming_loss(Y_test_transform, predictions)

print(f"Precision: {precision}\nRecall: {recall}\nF1 Score: {f1}\nHamming Loss: {hammingloss}")


Precision: 0.734631414253236
Recall: 0.5549288258960391
F1 Score: 0.6176208238193042
Hamming Loss: 0.13449758609591245


In [218]:
new_input = pd.DataFrame({'explicit_inform_slots': ['Spots or clouds in vision']})
new_input
X_test = pd.get_dummies(new_input)

X_test = X_test.reindex(columns=X_train.columns, fill_value=False)
X_test['Spots or clouds in vision'] = True
probabilities = model.predict_proba(X_test)

# 각 레이블에 대한 "있음" 확률을 모으기 위한 배열 초기화
positive_probabilities = []

# 각 레이블에 대한 "있음" 확률을 추출
for prob in probabilities:
    positive_probabilities.append(prob[0][1])  # 레이블이 "있음"일 확률

# "있음" 확률이 높은 상위 5개 인덱스 찾기
top_indices = np.argsort(positive_probabilities)[-5:][::-1]

print(top_indices)
# 상위 5개 인덱스에 해당하는 암묵적 증상 이름 조회
top_symptoms = [X_train.columns[i] for i in top_indices]

print("상위 5개 암묵적 증상:", top_symptoms)

print("상위 5개 암묵적 증상:", top_symptoms)


[28  0 37 32 38]
상위 5개 암묵적 증상: ['Blindness', 'Abnormal appearing skin', 'Cough', 'Burning abdominal pain', 'Cramps and spasms']
상위 5개 암묵적 증상: ['Blindness', 'Abnormal appearing skin', 'Cough', 'Burning abdominal pain', 'Cramps and spasms']


In [23]:
symptom_relations = {}

for _, row in train_df.iterrows():
    explicit_symptoms = row['goal']['explicit_inform_slots'].keys()
    implicit_symptoms = row['goal']['implicit_inform_slots'].keys()

    for e_symptom in explicit_symptoms:
        if e_symptom not in symptom_relations:
            symptom_relations[e_symptom] = {}
        for i_symptom in implicit_symptoms:
            if i_symptom in symptom_relations[e_symptom]:
                symptom_relations[e_symptom][i_symptom] += 1
            else:
                symptom_relations[e_symptom][i_symptom] = 1


for e_symptom, i_symptoms in symptom_relations.items():
    sorted_i_symptoms = sorted(i_symptoms.items(), key=lambda x: x[1], reverse=True)
    total_occurrences = sum(i_symptoms.values())
    if(e_symptom == ''Cat
    print(f"\n명시적 증상: {e_symptom}, 총 발생 횟수: {total_occurrences}")
    for i_symptom, count in sorted_i_symptoms:
        ratio = count / total_occurrences
        print(f"  암묵적 증상: {i_symptom}, 발생 횟수: {count}, 비율: {ratio:.2f}")


명시적 증상: Spots or clouds in vision, 총 발생 횟수: 315
  암묵적 증상: Diminished vision, 발생 횟수: 90, 비율: 0.29
  암묵적 증상: Pain in eye, 발생 횟수: 52, 비율: 0.17
  암묵적 증상: Symptoms of eye, 발생 횟수: 38, 비율: 0.12
  암묵적 증상: Eye redness, 발생 횟수: 23, 비율: 0.07
  암묵적 증상: Itchiness of eye, 발생 횟수: 21, 비율: 0.07
  암묵적 증상: Lacrimation, 발생 횟수: 19, 비율: 0.06
  암묵적 증상: Foreign body sensation in eye, 발생 횟수: 15, 비율: 0.05
  암묵적 증상: Abnormal movement of eyelid, 발생 횟수: 12, 비율: 0.04
  암묵적 증상: Blindness, 발생 횟수: 10, 비율: 0.03
  암묵적 증상: Eye burns or stings, 발생 횟수: 7, 비율: 0.02
  암묵적 증상: Swollen eye, 발생 횟수: 6, 비율: 0.02
  암묵적 증상: Eyelid lesion or rash, 발생 횟수: 5, 비율: 0.02
  암묵적 증상: White discharge from eye, 발생 횟수: 3, 비율: 0.01
  암묵적 증상: Bleeding from eye, 발생 횟수: 3, 비율: 0.01
  암묵적 증상: Irregular heartbeat, 발생 횟수: 2, 비율: 0.01
  암묵적 증상: Mass on eyelid, 발생 횟수: 2, 비율: 0.01
  암묵적 증상: Feeling ill, 발생 횟수: 2, 비율: 0.01
  암묵적 증상: Decreased appetite, 발생 횟수: 1, 비율: 0.00
  암묵적 증상: Intermenstrual bleeding, 발생 횟수: 1, 비율: 0.00
  암묵적 증상: Eye deviation, 발생 횟수

In [69]:
all_symptoms = []
for _, row in train_df.iterrows():
    explicit_symptoms = row['goal']['explicit_inform_slots'].keys()
    all_symptoms.extend(explicit_symptoms)

unique_symptoms1 = pd.Series(all_symptoms).unique()
unique_count = len(unique_symptoms)
unique_count

264

In [70]:
all_symptoms = []
for _, row in train_df.iterrows():
    explicit_symptoms = row['goal']['implicit_inform_slots'].keys()
    all_symptoms.extend(explicit_symptoms)

unique_symptoms2 = pd.Series(all_symptoms).unique()
unique_count = len(unique_symptoms)
unique_count

264

In [71]:
set1 = set(unique_symptoms1)
set2 = set(unique_symptoms2)
duplicates = set1.intersection(set2)
print(duplicates)

{'Penis pain', 'Elbow weakness', 'Painful urination', 'Leg pain', 'Back weakness', 'Skin lesion', 'Skin irritation', 'Foot or toe lump or mass', 'Irregular heartbeat', 'Vomiting', 'Temper problems', 'Knee pain', 'Symptoms of eye', 'Sneezing', 'Allergic reaction', 'Skin moles', 'Hostile behavior', 'Itching of skin', 'Difficulty speaking', 'Feeling ill', 'Muscle stiffness or tightness', 'Sweating', 'Shoulder swelling', 'Pus in sputum', 'Skin dryness, peeling, scaliness, or roughness', 'Kidney mass', 'Pain during pregnancy', 'Difficulty breathing', 'Depression', 'Increased heart rate', 'Excessive appetite', 'Sinus congestion', 'Facial pain', 'Abnormal movement of eyelid', 'Irritable infant', 'Infrequent menstruation', 'Eye strain', 'Painful sinuses', 'Back pain', 'Foot or toe swelling', 'Scanty menstrual flow', 'Skin rash', 'Foot or toe pain', 'Pain in eye', 'Muscle weakness', 'Foreign body sensation in eye', 'Hand or finger swelling', 'Unusual color or odor to urine', 'Mass on eyelid', '

In [31]:
implicit_symptoms_counts = []

for _, row in train_df.iterrows():
    implicit_symptoms = row['goal']['implicit_inform_slots'].keys()
    implicit_symptoms_counts.append(len(implicit_symptoms))

print(max(implicit_symptoms_counts))
# 5개 이하인 'implicit_symptoms'를 가진 객체의 수 계산
count_5_or_less = sum(1 for count in implicit_symptoms_counts if count <= 5)

# 전체 객체 수
total_count = len(implicit_symptoms_counts)

# 5개 이하인 객체의 비율 계산
percentage_5_or_less = (count_5_or_less / total_count) * 100

percentage_5_or_less



11


93.50416666666666