In [29]:
import os
import pickle

import pandas as pd

Collect, for each disease-dataset combination, the paths of the pickle files (one per run) that store the drug-symptom predictions.

In [30]:
DISEASE_PREFIXES = ['dmd', 'hd', 'oi']
DATASET_PREFIXES = ['prev', 'restr']
EMB_METHOD = 'e2v'

# The project root is taken to be the parent of the current working directory,
# so this notebook has to be run from a direct subfolder of the repository.
# Computed once: it does not depend on the disease or dataset being processed.
curr_working_dir = os.path.dirname(os.getcwd())

# results_dict[disease_prefix][dataset_prefix] -> list of
# (run_folder_name, prediction_pickle_path) tuples, one per run folder found.
results_dict = {}

for disease_prefix in DISEASE_PREFIXES:
    results_dict[disease_prefix] = {}
    curr_output_dir = os.path.join(curr_working_dir, 'output', disease_prefix)

    for dataset_prefix in DATASET_PREFIXES:
        dataset_output_dir = os.path.join(curr_output_dir, f'{dataset_prefix}_{EMB_METHOD}')

        # os.listdir() returns entries in arbitrary order; sort them so the
        # order of the runs (and of the rows written to CSV later on) is the
        # same on every platform and on every execution.
        run_folders_list = sorted(
            item
            for item in os.listdir(dataset_output_dir)
            if 'run' in item and os.path.isdir(os.path.join(dataset_output_dir, item))
        )

        # Every run stores its predictions under <run>/pred/ with the same file name.
        pred_file_name = f'{dataset_prefix}_{disease_prefix}_candidates_per_symptom_{EMB_METHOD}.pkl'
        pred_files_paths = [
            (run_folder, os.path.join(dataset_output_dir, run_folder, 'pred', pred_file_name))
            for run_folder in run_folders_list
        ]

        results_dict[disease_prefix][dataset_prefix] = pred_files_paths

results_dict

{'dmd': {'prev': [('run_001',
    'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\dmd\\prev_e2v\\run_001\\pred\\prev_dmd_candidates_per_symptom_e2v.pkl'),
   ('run_002',
    'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\dmd\\prev_e2v\\run_002\\pred\\prev_dmd_candidates_per_symptom_e2v.pkl'),
   ('run_003',
    'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\dmd\\prev_e2v\\run_003\\pred\\prev_dmd_candidates_per_symptom_e2v.pkl'),
   ('run_004',
    'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\dmd\\prev_e2v\\run_004\\pred\\prev_dmd_candidates_per_symptom_e2v.pkl'),
   ('run_005',
    'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\dmd\\prev_e2v\\run_005\\pred\\prev_dmd_candidates_per_symptom_e2v.pkl'),
   ('run_006',
    'c:\\Users\\rosa-\\OneDrive\\Documents\\GitHub\\XAI-FO\\output\\dmd\\prev_e2v\\run_006\\pred\\prev_dmd_candidates_per_symptom_e2v.pkl'),
   ('run_007',
    'c:\\Users\\rosa-\\OneDrive\\D

Define a helper that loads the indexed nodes, which are used to retrieve the names of symptoms and drug candidates.

In [33]:
def load_nodes(disease_prefix, dataset_prefix):
    """Read the indexed-nodes CSV of one disease-dataset combination.

    The file is looked up at
    ``<parent of cwd>/output/<disease_prefix>/<dataset_prefix>_<disease_prefix>_indexed_nodes.csv``.

    Args:
        disease_prefix: Name of the disease subfolder inside ``output``; also
            part of the CSV file name.
        dataset_prefix: Dataset variant; first part of the CSV file name.

    Returns:
        A pandas DataFrame with the CSV contents, without the ``index_id`` column.
    """
    project_root = os.path.dirname(os.getcwd())
    disease_output_dir = os.path.join(project_root, 'output', disease_prefix)

    csv_path = f'{disease_output_dir}/{dataset_prefix}_{disease_prefix}_indexed_nodes.csv'
    indexed_nodes = pd.read_csv(csv_path)

    # The 'index_id' column is dropped before the table is handed back.
    return indexed_nodes.drop(columns='index_id')

Save the drug-symptom pairs from all runs to CSV files for each disease-dataset combination.

In [32]:
# For every disease-dataset combination: read the prediction pickle of each
# run, expand it into one row per (symptom, drug candidate) pair, attach the
# human-readable labels and write all runs together to a single CSV file.
for disease_prefix in DISEASE_PREFIXES:

    for dataset_prefix in DATASET_PREFIXES:

        nodes = load_nodes(disease_prefix, dataset_prefix)

        # Build the id -> label lookup once per dataset instead of scanning the
        # whole nodes table twice for every candidate. keep='first' mirrors the
        # previous `.iloc[0]` choice when an id occurs more than once.
        # An id that is absent from the nodes table raises KeyError.
        label_by_id = (
            nodes.drop_duplicates(subset='id', keep='first')
            .set_index('id')['label']
            .to_dict()
        )

        drug_symptom_pairs = []

        for pred_run, pred_path in results_dict[disease_prefix][dataset_prefix]:
            # NOTE: pickle.load can execute arbitrary code. Only load files
            # that were produced by this project's own pipeline.
            with open(pred_path, 'rb') as f:
                predictions = pickle.load(f)

            # The file handle is no longer needed once the object is loaded.
            # `predictions` is used as a DataFrame with a 'Symptom' column
            # and a 'Candidates' column holding an iterable of drug ids.
            for symptom_id, candidates in zip(predictions['Symptom'], predictions['Candidates']):
                for candidate in candidates:
                    drug_symptom_pairs.append({
                        'Run': pred_run,
                        'Drug': label_by_id[candidate],
                        'Drug ID': candidate,
                        'Symptom ID': symptom_id,
                        'Symptom': label_by_id[symptom_id],
                    })

        # Explicit columns keep the header row in the CSV even when no pair was found.
        drug_symptom_pairs_df = pd.DataFrame(
            drug_symptom_pairs,
            columns=['Run', 'Drug', 'Drug ID', 'Symptom ID', 'Symptom'],
        )
        drug_symptom_pairs_df.to_csv(f'predicted_drug_symptom_pairs_{disease_prefix}_{dataset_prefix}.csv', index=False)