# Import Libraries

In [2]:
import os
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report

import data_params as input_data_params

# Set Parameters

In [3]:
# Disease whose results are analysed; read from the data_params module.
DISEASE_PREFIX = input_data_params.disease
# Membership test: the former `== 'dmd' or 'hd' or 'oi'` always evaluated to a
# truthy value (the string 'hd'), so it never rejected an unsupported prefix.
assert DISEASE_PREFIX in ('dmd', 'hd', 'oi'), f'Unsupported disease prefix: {DISEASE_PREFIX!r}'

# Dataset variants to analyse; each prefix selects its own output folder and result files.
DATASET_PREFIXES = ['prev', 'restr']

# Embedding method tag used in folder and file names (presumably edge2vec, per the hint messages below).
embedding_method = 'e2v'

# Set to True to analyse the runs stored in the '..._seeded' output folders.
seeded_emb = False

if seeded_emb:
    fixed_emb = '_seeded'                          # folder/file name suffix
    title_seeded = ' with fixed node embeddings'   # appended to plot titles
else:
    fixed_emb = ''
    title_seeded = ''

print(f'{DISEASE_PREFIX} {DATASET_PREFIXES}')

hd ['prev', 'restr']


# Get Result Paths

In [4]:
# Results live in <parent of the current working directory>/output/<disease>.
curr_working_dir = os.path.dirname(os.getcwd())
curr_output_dir = os.path.join(curr_working_dir, 'output', DISEASE_PREFIX)

# All four dicts are keyed by dataset prefix.
run_names_per_dataset = {}      # names of the run folders
run_folders_per_dataset = {}    # full paths of the run folders
pred_folders_per_dataset = {}   # full paths of each run's 'pred' subfolder

dataset_output_dirs = {}        # output folder of the dataset/embedding combination

for dataset_prefix in DATASET_PREFIXES:
    dataset_output_dir = os.path.join(curr_output_dir, f'{dataset_prefix}_{embedding_method}{fixed_emb}')

    if not os.path.isdir(dataset_output_dir):
        # Stop here with the actionable hint; previously the hint was printed and
        # the os.listdir call below failed anyway with a less helpful error.
        raise FileNotFoundError(
            'First, run the edge2vec embedding script. Then, run this script. '
            f'Missing output folder: {dataset_output_dir}'
        )

    print(f'Output folder for dataset {dataset_prefix} exists and will be loaded: {dataset_output_dir}')

    # os.listdir returns entries in arbitrary order; sort so that the run order
    # (and therefore every "run N" label derived from it) is reproducible.
    run_folders_list = sorted(
        item for item in os.listdir(dataset_output_dir)
        if 'run' in item and os.path.isdir(os.path.join(dataset_output_dir, item))
    )

    run_names_per_dataset[dataset_prefix] = run_folders_list

    print(f'For dataset {dataset_prefix}, a total of {len(run_folders_list)} runs will be included in the analysis.')

    run_folders_paths = []
    pred_folders_paths = []
    for run_folder in run_folders_list:
        run_path = os.path.join(dataset_output_dir, run_folder)
        run_folders_paths.append(run_path)
        pred_run_path = os.path.join(run_path, 'pred')
        pred_folders_paths.append(pred_run_path)
        print(pred_run_path)

    run_folders_per_dataset[dataset_prefix] = run_folders_paths
    pred_folders_per_dataset[dataset_prefix] = pred_folders_paths

    dataset_output_dirs[dataset_prefix] = dataset_output_dir

Output folder for dataset prev exists and will be loaded: c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v
For dataset prev, a total of 10 runs will be included in the analysis.
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_001\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_002\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_003\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_004\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_005\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_006\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_007\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_008\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_009\pred
c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v\run_010\pred
Outp

In [5]:
# Display the output folder resolved for each dataset prefix as a quick sanity check.
dataset_output_dirs

{'prev': 'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\hd\\prev_e2v',
 'restr': 'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\hd\\restr_e2v'}

In [6]:
# Long-format score records per dataset: one dict per (run, curve name, iteration).
# The "auc_loss" variant additionally holds the log10-transformed loss values.
auc_scores_all_runs_per_dataset = {}
auc_loss_scores_all_runs_per_dataset = {}

for dataset_prefix in DATASET_PREFIXES:
    run_folders_list = run_names_per_dataset[dataset_prefix]
    pred_folders_paths = pred_folders_per_dataset[dataset_prefix]

    auc_scores_all_runs = []
    auc_loss_scores_all_runs = []

    for run_name, pred_folder in zip(run_folders_list, pred_folders_paths):
        scores_path = f'{pred_folder}/{dataset_prefix}_{DISEASE_PREFIX}_performance_scores_{embedding_method}.pkl'
        with open(scores_path, 'rb') as f:
            loaded_info = pickle.load(f)

        # AUC curves: the same record objects go into both collections.
        auc_records = [
            {'run': run_name, 'name': key, 'iteration': index, 'score': auc_score}
            for key in ('AUC Train', 'AUC Validation', 'AUC Test')
            for index, auc_score in enumerate(loaded_info[key])
        ]
        auc_scores_all_runs.extend(auc_records)
        auc_loss_scores_all_runs.extend(auc_records)

        # Loss curve: stored as log10(loss), after the AUC records of the same run.
        auc_loss_scores_all_runs.extend(
            {'run': run_name, 'name': 'Cross-Entropy Loss', 'iteration': index,
             'score': float(np.log10(loss_score))}
            for index, loss_score in enumerate(loaded_info['Loss'])
        )

    auc_scores_all_runs_per_dataset[dataset_prefix] = auc_scores_all_runs
    auc_loss_scores_all_runs_per_dataset[dataset_prefix] = auc_loss_scores_all_runs

# Plot ROC Curves, AUC-ROC Scores and F1 Scores for Each Model

## ROC Curves

In [7]:
# Output root for this disease; the figure cells further down save their plots here.
curr_working_dir = os.path.dirname(os.getcwd())
curr_output_dir = os.path.join(curr_working_dir, 'output', DISEASE_PREFIX)

# One record per run (final AUC-ROC, F1) or per ROC point; converted to DataFrames in later cells.
final_test_auc_roc_scores_all_runs_all_models = []
roc_curve_all_runs_all_models = []
f1_scores_all_runs_all_models = []

for dataset_prefix in DATASET_PREFIXES:
    model_name = f'{dataset_prefix}_{embedding_method}{fixed_emb}'
    curr_dataset_output_dir = os.path.join(curr_output_dir, model_name)

    if not os.path.exists(curr_dataset_output_dir):
        print('First, run the edge2vec embedding script. Then, run this script.')
        continue

    print(f'Output folder for dataset {dataset_prefix} exists and will be loaded: {curr_dataset_output_dir}')

    # Reuse the 'pred' folders discovered in the "Get Result Paths" cell instead of
    # scanning the directory a second time, so every cell analyses the same runs.
    curr_pred_folders_paths = pred_folders_per_dataset[dataset_prefix]

    print(f'A total of {len(curr_pred_folders_paths)} runs will be included in the analysis.')

    for pred_folder in curr_pred_folders_paths:
        # NOTE: pickle.load can execute arbitrary code; only load result files produced by this project.
        with open(f'{pred_folder}/{dataset_prefix}_{DISEASE_PREFIX}_performance_scores_{embedding_method}.pkl', 'rb') as f:
            loaded_info = pickle.load(f)

        # Final AUC-ROC score of this run.
        final_test_auc_roc_scores_all_runs_all_models.append(
            {'Model': model_name, 'ROC AUC Score': float(loaded_info['ROC AUC Score'])}
        )

        # One record per (FPR, TPR) point of this run's ROC curve.
        roc_curve_all_runs_all_models.extend(
            {'Model': model_name, 'ROC FPR': fpr, 'ROC TPR': tpr}
            for fpr, tpr in zip(loaded_info['ROC FPR'], loaded_info['ROC TPR'])
        )

        # F1 score of this run.
        f1_score = loaded_info['F1 Score']
        f1_scores_all_runs_all_models.append({'Model': model_name, 'F1 Score': float(f1_score)})

        print(f'F1-Score in the test set of dataset {dataset_prefix} and method {embedding_method}:', f1_score)
        print(classification_report(loaded_info['True Labels'], loaded_info['Predicted Labels']))

Output folder for dataset prev exists and will be loaded: c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v
A total of 10 runs will be included in the analysis.
F1-Score in the test set of dataset prev and method e2v: 0.8947817754159771
              precision    recall  f1-score   support

           0       0.97      0.88      0.92     40428
           1       0.84      0.95      0.89     26952

    accuracy                           0.91     67380
   macro avg       0.90      0.92      0.91     67380
weighted avg       0.92      0.91      0.91     67380

F1-Score in the test set of dataset prev and method e2v: 0.895870456550698
              precision    recall  f1-score   support

           0       0.97      0.88      0.92     40428
           1       0.84      0.96      0.90     26952

    accuracy                           0.91     67380
   macro avg       0.91      0.92      0.91     67380
weighted avg       0.92      0.91      0.91     67380

F1-Score in the t

In [8]:
# Turn the collected ROC point records into a DataFrame (columns: Model, ROC FPR, ROC TPR)
# and display it; the name is rebound because the plotting cell below reads it.
roc_curve_all_runs_all_models = pd.DataFrame(roc_curve_all_runs_all_models)
roc_curve_all_runs_all_models

Unnamed: 0,Model,ROC FPR,ROC TPR
0,prev_e2v,0.000000,0.000000
1,prev_e2v,0.000124,0.116652
2,prev_e2v,0.000124,0.116800
3,prev_e2v,0.000148,0.116800
4,prev_e2v,0.000148,0.116949
...,...,...,...
82975,restr_e2v,0.996544,0.999853
82976,restr_e2v,0.996544,0.999926
82977,restr_e2v,0.998529,0.999926
82978,restr_e2v,0.998529,1.000000


In [9]:
# Scatter every (FPR, TPR) point of every run, coloured by model, and save the plot
# in the disease output folder. The figure is cleared after saving.
roc_title = f'ROC TPR/ROC FPR Distribution over Each Run for Each Model on {DISEASE_PREFIX.upper()}{title_seeded}'
roc_figure_path = f'{curr_output_dir}/{DISEASE_PREFIX}_roc_curves.png'

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=roc_curve_all_runs_all_models, x="ROC FPR", y="ROC TPR", hue="Model", s=1, ax=ax)
# Labels are set after the seaborn call so they replace the column-name defaults.
ax.set(title=roc_title, xlabel='False Positive Rate', ylabel='True Positive Rate')
# Points are drawn with s=1, so enlarge the legend markers to keep them visible.
ax.legend(markerscale=10)

fig.savefig(roc_figure_path, bbox_inches='tight')
fig.clear()

<Figure size 800x600 with 0 Axes>

## F1 Scores

In [10]:
# Turn the per-run F1 records into a DataFrame (columns: Model, F1 Score) and display it;
# the name is rebound because the plotting cell below reads it.
f1_scores_all_runs_all_models = pd.DataFrame(f1_scores_all_runs_all_models)
f1_scores_all_runs_all_models

Unnamed: 0,Model,F1 Score
0,prev_e2v,0.894782
1,prev_e2v,0.89587
2,prev_e2v,0.897419
3,prev_e2v,0.895215
4,prev_e2v,0.895351
5,prev_e2v,0.890663
6,prev_e2v,0.895404
7,prev_e2v,0.896704
8,prev_e2v,0.897821
9,prev_e2v,0.904595


In [11]:
# Bar chart of the F1 score per model (error bars: standard deviation over the runs),
# saved in the disease output folder. The figure is cleared after saving.
f1_title = f'F1 Scores Overview for Each Model on {DISEASE_PREFIX.upper()}{title_seeded}'
f1_figure_path = f'{curr_output_dir}/{DISEASE_PREFIX}_f1_scores.png'

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(f1_scores_all_runs_all_models, x="Model", y="F1 Score", errorbar="sd", color='cornflowerblue', ax=ax)
# Annotate the bars of the first bar container with their heights.
ax.bar_label(ax.containers[0], fontsize=10, padding=5)
ax.set(title=f1_title, xlabel='Model Variant', ylabel='F1 Score')

fig.savefig(f1_figure_path, bbox_inches='tight')
fig.clear()

<Figure size 800x600 with 0 Axes>

## Final AUC-ROC Scores

In [12]:
# Turn the per-run final AUC-ROC records into a DataFrame (columns: Model, ROC AUC Score)
# and display it; the name is rebound because the plotting cell below reads it.
final_test_auc_roc_scores_all_runs_all_models = pd.DataFrame(final_test_auc_roc_scores_all_runs_all_models)
final_test_auc_roc_scores_all_runs_all_models

Unnamed: 0,Model,ROC AUC Score
0,prev_e2v,0.9769
1,prev_e2v,0.979159
2,prev_e2v,0.978937
3,prev_e2v,0.979793
4,prev_e2v,0.976854
5,prev_e2v,0.977307
6,prev_e2v,0.97775
7,prev_e2v,0.97931
8,prev_e2v,0.978315
9,prev_e2v,0.981156


In [13]:
# Bar chart of the final AUC-ROC score per model (error bars: standard deviation over
# the runs), saved in the disease output folder. The figure is cleared after saving.
auc_title = f'AUC-ROC Scores Overview for Each Model on {DISEASE_PREFIX.upper()}{title_seeded}'
auc_figure_path = f'{curr_output_dir}/{DISEASE_PREFIX}_auc_roc_scores.png'

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(final_test_auc_roc_scores_all_runs_all_models, x="Model", y="ROC AUC Score", errorbar="sd", ax=ax)
# Annotate the bars of the first bar container with their heights.
ax.bar_label(ax.containers[0], fontsize=10, padding=10)
# Zoom the y-axis to the 0.85-1 range.
ax.set_ylim(bottom=0.85, top=1)
ax.set(title=auc_title, xlabel='Model Variant', ylabel='AUC-ROC Score')

fig.savefig(auc_figure_path, bbox_inches='tight')
fig.clear()

<Figure size 800x600 with 0 Axes>

# Plot Training Curve

In [14]:
# For every dataset, save two training-curve figures in its output folder:
# one with the AUC-ROC curves only and one that also contains log10(loss).
for dataset_prefix in DATASET_PREFIXES:
    auc_scores_all_runs = pd.DataFrame(auc_scores_all_runs_per_dataset[dataset_prefix])
    auc_loss_scores_all_runs = pd.DataFrame(auc_loss_scores_all_runs_per_dataset[dataset_prefix])

    curve_title = f'Training curve on dataset {dataset_prefix.upper()} {DISEASE_PREFIX.upper()}{title_seeded}'
    curve_path_stem = f'{dataset_output_dirs[dataset_prefix]}/{dataset_prefix}_{DISEASE_PREFIX}_training_curve'

    # Figure 1: AUC-ROC per iteration, one line per curve name.
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.lineplot(data=auc_scores_all_runs, x='iteration', y='score', hue='name', ax=ax)
    ax.set_ylim(bottom=0.8, top=1)
    ax.set(title=curve_title, xlabel='Iteration', ylabel='AUC-ROC Score')

    fig.savefig(f'{curve_path_stem}.png', bbox_inches='tight')
    fig.clear()

    # Figure 2: same curves plus the log10 loss; only the upper y-limit is fixed.
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.lineplot(data=auc_loss_scores_all_runs, x='iteration', y='score', hue='name', ax=ax)
    ax.set_ylim(top=1)
    ax.set(title=curve_title, xlabel='Iteration', ylabel='AUC-ROC/log10(Loss)')

    fig.savefig(f'{curve_path_stem}_with_loss.png', bbox_inches='tight')
    fig.clear()

<Figure size 800x800 with 0 Axes>

<Figure size 800x800 with 0 Axes>

<Figure size 800x800 with 0 Axes>

<Figure size 800x800 with 0 Axes>

# Similarity between top scoring symptom-drug pairs

In [15]:
def visualize_similarity_matrix(similarity_matrix_df, mean_overlap_ratio, median_overlap_ratio, dataset_prefix):
    """Save the run-by-run overlap matrix as an annotated heatmap.

    Args:
        similarity_matrix_df: DataFrame of overlap percentages between runs.
        mean_overlap_ratio: Mean overlap ratio, shown in the title rounded to 2 decimals.
        median_overlap_ratio: Median overlap ratio, shown in the title rounded to 2 decimals.
        dataset_prefix: Dataset prefix; selects the output folder and is part of the file name.

    The figure is written to '<dataset output dir>/<dataset>_<disease>_overlap_between_runs.png'
    and cleared afterwards. Nothing is returned.
    """
    mean_label = round(mean_overlap_ratio, 2)
    median_label = round(median_overlap_ratio, 2)
    heatmap_title = f'Overlap ratio (Mean: {mean_label}, Median: {median_label}) between list of predicted symptom-drug pairs per run on dataset {dataset_prefix.upper()} {DISEASE_PREFIX.upper()}{title_seeded}'
    heatmap_path = f'{dataset_output_dirs[dataset_prefix]}/{dataset_prefix}_{DISEASE_PREFIX}_overlap_between_runs.png'

    fig, ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(similarity_matrix_df, annot=True, fmt='.1f', linewidths=0.5, ax=ax, cmap='RdYlGn')
    # Pin the colour scale to the full 0-100 % range so figures are comparable.
    ax.collections[0].set_clim(0, 100)
    ax.set_title(heatmap_title)

    fig.savefig(heatmap_path, bbox_inches='tight')
    fig.clear()

def get_pred_similarity_matrix(drug_symptom_pairs_per_run, dataset_prefix):
    """Compute, print and plot the pairwise overlap between the pair lists of all runs.

    Pairs are compared order-insensitively (each pair is normalised to a sorted tuple).
    The value in column 'run i', row 'run j' is the percentage of run i's pairs that
    also occur in run j. The denominator is the length of run i's own pair list, so
    the function no longer depends on a notebook-level `total_drug_symptom_pairs`
    variable; when all runs list the same number of pairs the result is unchanged.

    Args:
        drug_symptom_pairs_per_run: List with one list of (symptom, drug) pairs per run.
        dataset_prefix: Dataset prefix, forwarded to visualize_similarity_matrix.

    Prints the matrix plus the mean and median of the off-diagonal ratios, then
    passes them to visualize_similarity_matrix. Nothing is returned.
    """
    # Normalise every run once instead of rebuilding both sets for each matrix cell.
    pair_sets_per_run = [
        {tuple(sorted(pair)) for pair in pairs} for pairs in drug_symptom_pairs_per_run
    ]

    similarity_matrix = {}
    ratios_non_diagonals = []
    for index1, (pairs1, pair_set1) in enumerate(zip(drug_symptom_pairs_per_run, pair_sets_per_run)):
        nr_pairs = len(pairs1)
        similarities = {}
        for index2, pair_set2 in enumerate(pair_sets_per_run):
            # A run without any pairs has no overlap by definition (avoids division by zero).
            ratio_overlap = len(pair_set1 & pair_set2) / nr_pairs * 100 if nr_pairs else 0.0

            similarities[f'run {index2+1}'] = ratio_overlap

            # The diagonal compares a run with itself and would inflate the summary statistics.
            if index1 != index2:
                ratios_non_diagonals.append(ratio_overlap)

        similarity_matrix[f'run {index1+1}'] = similarities

    similarity_matrix_df = pd.DataFrame(similarity_matrix)
    print('Similarity matrix:\n', similarity_matrix_df)

    mean_overlap_ratio = np.mean(ratios_non_diagonals)
    median_overlap_ratio = np.median(ratios_non_diagonals)
    print('Overlap ratio mean:', mean_overlap_ratio)
    print('Overlap ratio median:', median_overlap_ratio)

    visualize_similarity_matrix(similarity_matrix_df, mean_overlap_ratio, median_overlap_ratio, dataset_prefix)

def get_overlap_all_runs(drug_symptom_pairs_per_run, dataset_prefix):
    """Find the symptom-drug pairs present in every run, print them and pickle them.

    Args:
        drug_symptom_pairs_per_run: List with one list of (symptom, drug) pairs per run.
        dataset_prefix: Dataset prefix; selects the output folder and is part of the file name.

    The resulting set is written to
    '<dataset output dir>/symptom_drug_pair_overlapping_all_runs_<disease>_<dataset>_<method><suffix>.pkl'.
    Nothing is returned.
    """
    if drug_symptom_pairs_per_run:
        overlapping_pairs_all_runs = set.intersection(
            *(set(pairs) for pairs in drug_symptom_pairs_per_run)
        )
    else:
        # Without any run there is nothing to intersect; the previous loop left the
        # variable unassigned in this case and failed with UnboundLocalError.
        overlapping_pairs_all_runs = set()

    print(f'For dataset {dataset_prefix}, there are {len(overlapping_pairs_all_runs)} symptom-drug pairs that are found in the top list of drug candidates in {len(drug_symptom_pairs_per_run)} runs: \n {overlapping_pairs_all_runs}')

    with open(f'{dataset_output_dirs[dataset_prefix]}/symptom_drug_pair_overlapping_all_runs_{DISEASE_PREFIX}_{dataset_prefix}_{embedding_method}{fixed_emb}.pkl', 'wb') as f:
        pickle.dump(overlapping_pairs_all_runs, f)

def get_overlap_threshold_runs(drug_symptom_pairs_per_run, dataset_prefix, threshold):
    """Find the symptom-drug pairs that occur often enough across runs, print and pickle them.

    A pair is kept when its total number of occurrences over all runs is at least
    int(threshold * number of runs). If a pair appears at most once per run, this is
    the number of runs that contain it.

    Args:
        drug_symptom_pairs_per_run: List with one list of (symptom, drug) pairs per run.
        dataset_prefix: Dataset prefix; selects the output folder and is part of the file name.
        threshold: Fraction of the runs, e.g. 0.5 for "at least half of the runs".

    The resulting set is written to
    '<dataset output dir>/symptom_drug_pair_overlapping_<min runs>_runs_<disease>_<dataset>_<method><suffix>.pkl'.
    Nothing is returned.
    """
    total_runs = len(drug_symptom_pairs_per_run)
    # int() truncates, so e.g. 0.35 * 10 runs requires 3 occurrences.
    min_nr_runs = int(threshold * total_runs)

    # Single counting pass; yields the same counts as comparing every pair with every
    # pair of every run, without the quadratic cost.
    pair_occurrences = Counter(
        pair for pairs in drug_symptom_pairs_per_run for pair in pairs
    )
    same_drug_symptom_pairs_thresholded = {
        pair for pair, nr_occurrences in pair_occurrences.items() if nr_occurrences >= min_nr_runs
    }

    print(f'There are {len(same_drug_symptom_pairs_thresholded)} symptom-drug pairs that are found in the top list of drug candidates in at least {min_nr_runs} of the {total_runs} runs: \n {same_drug_symptom_pairs_thresholded}')

    with open(f'{dataset_output_dirs[dataset_prefix]}/symptom_drug_pair_overlapping_{min_nr_runs}_runs_{DISEASE_PREFIX}_{dataset_prefix}_{embedding_method}{fixed_emb}.pkl', 'wb') as f:
        pickle.dump(same_drug_symptom_pairs_thresholded, f)

# For every dataset: load the predicted candidates of each run, flatten them into
# (symptom, candidate) pairs and run the three overlap analyses defined above.
for dataset_prefix in DATASET_PREFIXES:
    pred_folders_paths = pred_folders_per_dataset[dataset_prefix]
    drug_symptom_pairs_per_run = []

    for pred_path in pred_folders_paths:
        candidates_path = f'{pred_path}/{dataset_prefix}_{DISEASE_PREFIX}_candidates_per_symptom_{embedding_method}.pkl'
        with open(candidates_path, 'rb') as f:
            loaded_list = pickle.load(f)

        # One pair per candidate listed for a symptom row.
        drug_symptom_pairs = [
            (row['Symptom'], candidate)
            for _, row in loaded_list.iterrows()
            for candidate in row['Candidates']
        ]

        # Size of the most recently loaded run's pair list, kept as a notebook-level variable.
        total_drug_symptom_pairs = len(drug_symptom_pairs)

        drug_symptom_pairs_per_run.append(drug_symptom_pairs)

    get_pred_similarity_matrix(drug_symptom_pairs_per_run, dataset_prefix)

    get_overlap_all_runs(drug_symptom_pairs_per_run, dataset_prefix)

    # Fractions of the runs in which a pair has to appear.
    for threshold in (0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
        get_overlap_threshold_runs(drug_symptom_pairs_per_run, dataset_prefix, threshold)

Similarity matrix:
              run 1       run 2       run 3       run 4       run 5  \
run 1   100.000000   29.885057   39.080460   36.781609   29.885057   
run 2    29.885057  100.000000   35.632184   27.586207   20.689655   
run 3    39.080460   35.632184  100.000000   25.287356   29.885057   
run 4    36.781609   27.586207   25.287356  100.000000   28.735632   
run 5    29.885057   20.689655   29.885057   28.735632  100.000000   
run 6    31.034483   26.436782   29.310345   20.114943   18.965517   
run 7    10.919540   17.241379   17.816092   12.643678   10.344828   
run 8    32.183908   32.758621   33.908046   35.632184   21.264368   
run 9    26.436782   26.436782   40.804598   17.816092   17.241379   
run 10   19.540230   18.965517   18.390805   18.965517   13.218391   

             run 6       run 7       run 8       run 9      run 10  
run 1    31.034483   10.919540   32.183908   26.436782   19.540230  
run 2    26.436782   17.241379   32.758621   26.436782   18.965517  
ru

<Figure size 800x800 with 0 Axes>

<Figure size 800x800 with 0 Axes>

# Check Overlap Between Models

In [16]:
curr_working_dir = os.path.dirname(os.getcwd())
curr_output_dir = os.path.join(curr_working_dir, 'output', DISEASE_PREFIX)

# One collection of (symptom id, drug id) pairs per dataset whose results could be loaded.
overlapping_pairs = []

for dataset_prefix in DATASET_PREFIXES:
    # Node table of this dataset; used to translate node ids into readable labels.
    nodes = pd.read_csv(f'{curr_output_dir}/{dataset_prefix}_{DISEASE_PREFIX}_indexed_nodes.csv')
    nodes.drop('index_id', axis=1, inplace=True)
    nodes['semantic'] = nodes['semantic'].astype('category')

    # NOTE(review): drug_semantic is not read anywhere in this cell; kept in case other cells rely on it.
    if dataset_prefix == 'prev':
        drug_semantic = 'DRUG'
    else:
        drug_semantic = 'drug'

    dataset_output_dir = os.path.join(curr_output_dir, f'{dataset_prefix}_{embedding_method}{fixed_emb}')
    if not os.path.exists(dataset_output_dir):
        print('First, run the edge2vec embedding script. Then, run this script.')
        continue

    print(f'Output folder for dataset {dataset_prefix} exists and will be loaded: {dataset_output_dir}')

    overlap_path_stem = f'{dataset_output_dir}/symptom_drug_pair_overlapping_all_runs_{DISEASE_PREFIX}_{dataset_prefix}_{embedding_method}{fixed_emb}'

    # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
    with open(f'{overlap_path_stem}.pkl', 'rb') as f:
        loaded_list = pickle.load(f)
    overlapping_pairs.append(loaded_list)

    # Resolve the ids of every pair to labels; the first matching node row is used.
    pair_dict_list = []
    for pair in loaded_list:
        symptom_id, drug_id = pair
        symptom_name = nodes.loc[nodes['id'] == symptom_id, 'label'].iloc[0]
        drug_name = nodes.loc[nodes['id'] == drug_id, 'label'].iloc[0]
        pair_dict_list.append({'Drug': drug_name, 'Symptom ID': symptom_id, 'Symptom': symptom_name})

    overlapping_all_runs_df = pd.DataFrame(pair_dict_list)

    print('Drug-symptom pairs overlapping all runs:\n', overlapping_all_runs_df)

    overlapping_all_runs_df.to_csv(f'{overlap_path_stem}.csv', index=False)

# Pairs found in all runs of every loaded dataset. Intersecting over the whole list
# (instead of indexing elements 0 and 1) also covers more than two datasets and no
# longer raises IndexError when a dataset folder was skipped above.
if len(overlapping_pairs) >= 2:
    dataset1_emb_overlap = set.intersection(*(set(pairs) for pairs in overlapping_pairs))
else:
    print('Fewer than two datasets were loaded; the overlap between models cannot be computed.')
    dataset1_emb_overlap = set()

# Labels are looked up in `nodes`, which at this point holds the node table of the
# last dataset in DATASET_PREFIXES.
for pair in dataset1_emb_overlap:
    symptom_id, drug_id = pair
    symptom_name = nodes.loc[nodes['id'] == symptom_id, 'label'].iloc[0]
    drug_name = nodes.loc[nodes['id'] == drug_id, 'label'].iloc[0]
    print(drug_name, 'treats', symptom_name)

Output folder for dataset prev exists and will be loaded: c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\prev_e2v
Drug-symptom pairs overlapping all runs:
         Drug  Symptom ID  Symptom
0  cerulenin  HP:0002171  Gliosis
Output folder for dataset restr exists and will be loaded: c:\Users\rosa-\OneDrive\Documents\GitHub\XAI-FO\output\hd\restr_e2v
Drug-symptom pairs overlapping all runs:
                      Drug  Symptom ID           Symptom
0  adenosine triphosphate  HP:0002591        Polyphagia
1  adenosine triphosphate  HP:0001288  Gait disturbance
2  adenosine triphosphate  HP:0001824       Weight loss
