# Disease to Drug Path Extraction

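This notebook extracts several kinds of paths between diseases and drugs from the PrimeKG knowledge graph: direct disease–drug relations, and indirect paths that run through proteins, phenotypes, or related diseases.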

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import time
from tqdm.notebook import tqdm

## Load Data

In [None]:
# File paths
# Absolute, machine-specific locations of the PrimeKG node and edge tables;
# adjust these when running the notebook on another machine.
NODE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/nodes.csv"
EDGE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/edges.csv"

# Load nodes and edges
# Both tables are read fully into memory with pandas' default CSV options.
# Later cells rely on the node columns `node_index`, `node_type`, `node_name`
# and the edge columns `x_index`, `y_index`, `relation`.
print("Loading node data...")
df_node = pd.read_csv(NODE_FILE)

print("Loading edge data...")
df_edges = pd.read_csv(EDGE_FILE)

# Row counts serve as a quick sanity check that both files were read.
print(f"Loaded {len(df_node)} nodes and {len(df_edges)} edges")

## Create Node Type Maps

In [None]:
# Per-node lookups keyed by node_index: one for the node type, one for the
# display name.
print("Creating node mappings...")
node_type_map = {
    idx: kind for idx, kind in zip(df_node['node_index'], df_node['node_type'])
}
node_name_map = {
    idx: label for idx, label in zip(df_node['node_index'], df_node['node_name'])
}

# Index sets per node type, so edge endpoints can be tested for membership
# quickly when the edge table is filtered.
disease_nodes = set(df_node.loc[df_node['node_type'].eq('disease'), 'node_index'])
drug_nodes = set(df_node.loc[df_node['node_type'].eq('drug'), 'node_index'])
gene_protein_nodes = set(
    df_node.loc[df_node['node_type'].eq('gene/protein'), 'node_index']
)
phenotype_nodes = set(
    df_node.loc[df_node['node_type'].eq('effect/phenotype'), 'node_index']
)

# Report how many nodes of each type were found.
print(f"Disease nodes: {len(disease_nodes)}")
print(f"Drug nodes: {len(drug_nodes)}")
print(f"Gene/protein nodes: {len(gene_protein_nodes)}")
print(f"Phenotype nodes: {len(phenotype_nodes)}")

## Filter Edges by Type

Pre-filter the edge table by endpoint node type and relation, so the path searches below only scan the edges they need.

In [None]:
print("Pre-filtering edges by type...")
start_time = time.time()

# Relations that connect a disease and a drug directly.
direct_relation_names = ['indication', 'contraindication', 'off-label use']

# Every membership test below is a full pass over the edge table, and most of
# them are needed by more than one filter (e.g. "x is a disease" is used four
# times).  Compute each mask exactly once and combine them afterwards.
x_is_disease = df_edges['x_index'].isin(disease_nodes)
x_is_drug = df_edges['x_index'].isin(drug_nodes)
x_is_protein = df_edges['x_index'].isin(gene_protein_nodes)
x_is_phenotype = df_edges['x_index'].isin(phenotype_nodes)
y_is_disease = df_edges['y_index'].isin(disease_nodes)
y_is_drug = df_edges['y_index'].isin(drug_nodes)
y_is_protein = df_edges['y_index'].isin(gene_protein_nodes)
y_is_phenotype = df_edges['y_index'].isin(phenotype_nodes)
is_direct_relation = df_edges['relation'].isin(direct_relation_names)

# Direct disease-drug edges (disease stored as x, drug stored as y)
disease_drug_direct = df_edges[x_is_disease & y_is_drug & is_direct_relation]

# The same relations stored in the opposite orientation (drug as x)
drug_disease_direct = df_edges[x_is_drug & y_is_disease & is_direct_relation]

# Disease-protein edges
disease_protein = df_edges[
    x_is_disease & y_is_protein & (df_edges['relation'] == 'disease_protein')
]

# Protein-drug edges
protein_drug = df_edges[
    x_is_protein & y_is_drug & (df_edges['relation'] == 'drug_protein')
]

# Disease-phenotype edges
disease_phenotype = df_edges[
    x_is_disease
    & y_is_phenotype
    & (df_edges['relation'] == 'disease_phenotype_positive')
]

# Phenotype-protein edges
phenotype_protein = df_edges[
    x_is_phenotype & y_is_protein & (df_edges['relation'] == 'phenotype_protein')
]

# Disease-disease edges
disease_disease = df_edges[
    x_is_disease & y_is_disease & (df_edges['relation'] == 'disease_disease')
]

# The masks are as long as the edge table; release them once the filtered
# frames exist so they do not linger in the notebook namespace.
del (
    x_is_disease, x_is_drug, x_is_protein, x_is_phenotype,
    y_is_disease, y_is_drug, y_is_protein, y_is_phenotype,
    is_direct_relation,
)

print(f"Edge filtering completed in {time.time() - start_time:.2f} seconds")
print(f"Disease-drug direct edges: {len(disease_drug_direct)}")
print(f"Drug-disease direct edges: {len(drug_disease_direct)}")
print(f"Disease-protein edges: {len(disease_protein)}")
print(f"Protein-drug edges: {len(protein_drug)}")
print(f"Disease-phenotype edges: {len(disease_phenotype)}")
print(f"Phenotype-protein edges: {len(phenotype_protein)}")
print(f"Disease-disease edges: {len(disease_disease)}")

## Create Lookup Dictionaries

Build fast lookup dictionaries for path construction

In [None]:
print("Creating lookup dictionaries...")
start_time = time.time()

# The loops below walk plain column lists in lockstep instead of calling
# DataFrame.iterrows(): iterrows() builds a Series object for every row, which
# is very slow on large edge tables.  The resulting dictionaries hold the same
# entries in the same order.

# protein index -> drugs linked to that protein
protein_to_drugs = defaultdict(list)
for protein, drug in zip(
    protein_drug['x_index'].tolist(), protein_drug['y_index'].tolist()
):
    protein_to_drugs[protein].append(drug)

# phenotype index -> proteins linked to that phenotype
phenotype_to_proteins = defaultdict(list)
for phenotype, protein in zip(
    phenotype_protein['x_index'].tolist(), phenotype_protein['y_index'].tolist()
):
    phenotype_to_proteins[phenotype].append(protein)

# disease index -> (drug index, relation) pairs from disease->drug edges
disease_to_drugs = defaultdict(list)
for disease, drug, relation in zip(
    disease_drug_direct['x_index'].tolist(),
    disease_drug_direct['y_index'].tolist(),
    disease_drug_direct['relation'].tolist(),
):
    disease_to_drugs[disease].append((drug, relation))

# Edges stored as drug->disease go into the same dictionary, still keyed by
# the disease; the '_reverse' suffix on the relation records the orientation.
for drug, disease, relation in zip(
    drug_disease_direct['x_index'].tolist(),
    drug_disease_direct['y_index'].tolist(),
    drug_disease_direct['relation'].tolist(),
):
    disease_to_drugs[disease].append((drug, relation + '_reverse'))

print(f"Lookup dictionaries created in {time.time() - start_time:.2f} seconds")

## Find Direct Disease-Drug Relations

In [None]:
def _direct_relation_records(edges, disease_col, drug_col, path_type, node_names):
    """Turn one direct-edge table into a list of relation records."""
    # Plain column lists are zipped instead of using DataFrame.iterrows(),
    # which allocates a Series per row.
    return [
        {
            'disease_index': disease,
            'disease_name': node_names[disease],
            'drug_index': drug,
            'drug_name': node_names[drug],
            'relation': relation,
            'path_type': path_type,
        }
        for disease, drug, relation in zip(
            edges[disease_col].tolist(),
            edges[drug_col].tolist(),
            edges['relation'].tolist(),
        )
    ]


def find_direct_disease_drug_relations(disease_drug_edges=None,
                                       drug_disease_edges=None,
                                       node_names=None):
    """Collect every direct disease-drug relation as a list of records.

    Args:
        disease_drug_edges: DataFrame with columns ``x_index`` (disease),
            ``y_index`` (drug) and ``relation``.  Defaults to the notebook's
            ``disease_drug_direct`` table.
        drug_disease_edges: DataFrame with columns ``x_index`` (drug),
            ``y_index`` (disease) and ``relation``.  Defaults to the
            notebook's ``drug_disease_direct`` table.
        node_names: Mapping from node index to node name.  Defaults to the
            notebook's ``node_name_map``.

    Returns:
        A list of dicts with keys ``disease_index``, ``disease_name``,
        ``drug_index``, ``drug_name``, ``relation`` and ``path_type``.
        Records from ``disease_drug_edges`` come first with ``path_type``
        ``'direct'``; records from ``drug_disease_edges`` follow with
        ``path_type`` ``'direct_reverse'``.

    Raises:
        KeyError: If an edge endpoint is missing from ``node_names``.
    """
    # Fall back to the notebook-level tables when called without arguments.
    if disease_drug_edges is None:
        disease_drug_edges = disease_drug_direct
    if drug_disease_edges is None:
        drug_disease_edges = drug_disease_direct
    if node_names is None:
        node_names = node_name_map

    print("Finding direct disease-drug relations...")
    start_time = time.time()

    # Process disease→drug edges
    direct_relations = _direct_relation_records(
        disease_drug_edges, 'x_index', 'y_index', 'direct', node_names
    )

    # Process drug→disease edges (the disease is the y endpoint here)
    direct_relations.extend(
        _direct_relation_records(
            drug_disease_edges, 'y_index', 'x_index', 'direct_reverse', node_names
        )
    )

    print(f"Found {len(direct_relations)} direct relations in {time.time() - start_time:.2f} seconds")
    return direct_relations

# Run the extraction when the cell executes; the resulting list is written to
# CSV and counted in the export and summary cells further down.
direct_relations = find_direct_disease_drug_relations()

## Find Disease-Protein-Drug Paths

In [None]:
def find_disease_protein_drug_paths(disease_protein_edges=None,
                                    drugs_by_protein=None,
                                    node_names=None,
                                    *,
                                    chunk_size=10000,
                                    show_progress=True):
    """Enumerate two-hop disease -> protein -> drug paths.

    For every disease-protein edge, one record is produced per drug listed
    for that protein.

    Args:
        disease_protein_edges: DataFrame with columns ``x_index`` (disease)
            and ``y_index`` (protein).  Defaults to the notebook's
            ``disease_protein`` table.
        drugs_by_protein: Mapping from protein index to an iterable of drug
            indices.  Defaults to the notebook's ``protein_to_drugs``.
        node_names: Mapping from node index to node name.  Defaults to the
            notebook's ``node_name_map``.
        chunk_size: Number of edges handled per progress-bar step.
        show_progress: When true, wrap the chunk loop in ``tqdm``.  Pass
            ``False`` to run without a progress bar.

    Returns:
        A list of dicts with keys ``disease_index``, ``disease_name``,
        ``protein_index``, ``protein_name``, ``drug_index``, ``drug_name``
        and ``path_type`` (always ``'disease-protein-drug'``), in edge order.

    Raises:
        KeyError: If a node on an emitted path is missing from ``node_names``.
    """
    # Fall back to the notebook-level tables when called without arguments.
    if disease_protein_edges is None:
        disease_protein_edges = disease_protein
    if drugs_by_protein is None:
        drugs_by_protein = protein_to_drugs
    if node_names is None:
        node_names = node_name_map

    print("Finding disease-protein-drug paths...")
    start_time = time.time()
    indirect_paths = []

    # Chunks exist only to give the progress bar a coarse step size.
    chunk_starts = range(0, len(disease_protein_edges), chunk_size)
    if show_progress:
        chunk_starts = tqdm(chunk_starts)

    for chunk_start in chunk_starts:
        # iloc clips at the end of the frame, so the last chunk may be short.
        chunk = disease_protein_edges.iloc[chunk_start:chunk_start + chunk_size]

        # Zip plain column lists rather than iterrows(), which builds a
        # Series for every row.
        for disease, protein in zip(chunk['x_index'].tolist(),
                                    chunk['y_index'].tolist()):
            # .get() keeps a defaultdict from growing on missing proteins.
            for drug in drugs_by_protein.get(protein, ()):
                indirect_paths.append({
                    'disease_index': disease,
                    'disease_name': node_names[disease],
                    'protein_index': protein,
                    'protein_name': node_names[protein],
                    'drug_index': drug,
                    'drug_name': node_names[drug],
                    'path_type': 'disease-protein-drug'
                })

    print(f"Found {len(indirect_paths)} disease-protein-drug paths in {time.time() - start_time:.2f} seconds")
    return indirect_paths

# Run the extraction when the cell executes; the resulting list is written to
# CSV and counted in the export and summary cells further down.
disease_protein_drug_paths = find_disease_protein_drug_paths()

## Find Disease-Phenotype-Protein-Drug Paths

In [None]:
def find_disease_phenotype_protein_drug_paths(disease_phenotype_edges=None,
                                              proteins_by_phenotype=None,
                                              drugs_by_protein=None,
                                              node_names=None,
                                              *,
                                              chunk_size=1000,
                                              max_paths=1000000,
                                              show_progress=True):
    """Enumerate three-hop disease -> phenotype -> protein -> drug paths.

    For every disease-phenotype edge, one record is produced per
    (protein, drug) combination reachable from that phenotype.

    Args:
        disease_phenotype_edges: DataFrame with columns ``x_index`` (disease)
            and ``y_index`` (phenotype).  Defaults to the notebook's
            ``disease_phenotype`` table.
        proteins_by_phenotype: Mapping from phenotype index to an iterable of
            protein indices.  Defaults to the notebook's
            ``phenotype_to_proteins``.
        drugs_by_protein: Mapping from protein index to an iterable of drug
            indices.  Defaults to the notebook's ``protein_to_drugs``.
        node_names: Mapping from node index to node name.  Defaults to the
            notebook's ``node_name_map``.
        chunk_size: Number of edges handled per progress-bar step.
        max_paths: Stop and return as soon as this many paths have been
            collected (a warning is printed).  ``None`` disables the cap.
        show_progress: When true, wrap the chunk loop in ``tqdm``.  Pass
            ``False`` to run without a progress bar.

    Returns:
        A list of dicts with keys ``disease_index``, ``disease_name``,
        ``phenotype_index``, ``phenotype_name``, ``protein_index``,
        ``protein_name``, ``drug_index``, ``drug_name`` and ``path_type``
        (always ``'disease-phenotype-protein-drug'``).  When the cap is hit
        the list is truncated to the first ``max_paths`` paths found.

    Raises:
        KeyError: If a node on an emitted path is missing from ``node_names``.
    """
    # Fall back to the notebook-level tables when called without arguments.
    if disease_phenotype_edges is None:
        disease_phenotype_edges = disease_phenotype
    if proteins_by_phenotype is None:
        proteins_by_phenotype = phenotype_to_proteins
    if drugs_by_protein is None:
        drugs_by_protein = protein_to_drugs
    if node_names is None:
        node_names = node_name_map

    print("Finding disease-phenotype-protein-drug paths...")
    start_time = time.time()
    complex_paths = []

    # Chunks exist only to give the progress bar a coarse step size.
    chunk_starts = range(0, len(disease_phenotype_edges), chunk_size)
    if show_progress:
        chunk_starts = tqdm(chunk_starts)

    for chunk_start in chunk_starts:
        # iloc clips at the end of the frame, so the last chunk may be short.
        chunk = disease_phenotype_edges.iloc[chunk_start:chunk_start + chunk_size]

        # Zip plain column lists rather than iterrows(), which builds a
        # Series for every row.
        for disease, phenotype in zip(chunk['x_index'].tolist(),
                                      chunk['y_index'].tolist()):
            # .get() keeps a defaultdict from growing on missing keys.
            for protein in proteins_by_phenotype.get(phenotype, ()):
                for drug in drugs_by_protein.get(protein, ()):
                    complex_paths.append({
                        'disease_index': disease,
                        'disease_name': node_names[disease],
                        'phenotype_index': phenotype,
                        'phenotype_name': node_names[phenotype],
                        'protein_index': protein,
                        'protein_name': node_names[protein],
                        'drug_index': drug,
                        'drug_name': node_names[drug],
                        'path_type': 'disease-phenotype-protein-drug'
                    })

                    # The three-way fan-out can explode; bail out once the
                    # result reaches the configured size.
                    if max_paths is not None and len(complex_paths) >= max_paths:
                        print(f"Warning: Reached {max_paths:,} paths, stopping early")
                        return complex_paths

    print(f"Found {len(complex_paths)} disease-phenotype-protein-drug paths in {time.time() - start_time:.2f} seconds")
    return complex_paths

# Run the extraction when the cell executes; the resulting list is written to
# CSV and counted in the export and summary cells further down.
disease_phenotype_paths = find_disease_phenotype_protein_drug_paths()

## Find Disease-Disease-Drug Paths

In [None]:
def find_disease_disease_drug_paths(disease_disease_edges=None,
                                    drugs_by_disease=None,
                                    node_names=None,
                                    *,
                                    chunk_size=5000,
                                    show_progress=True):
    """Enumerate disease -> related disease -> drug paths.

    For every disease-disease edge (d1, d2), one record is produced per drug
    directly related to d2.

    Args:
        disease_disease_edges: DataFrame with columns ``x_index`` (d1) and
            ``y_index`` (d2).  Defaults to the notebook's ``disease_disease``
            table.
        drugs_by_disease: Mapping from disease index to an iterable of
            ``(drug_index, relation)`` pairs.  A relation ending in
            ``'_reverse'`` marks a drug->disease edge.  Defaults to the
            notebook's ``disease_to_drugs``.
        node_names: Mapping from node index to node name.  Defaults to the
            notebook's ``node_name_map``.
        chunk_size: Number of edges handled per progress-bar step.
        show_progress: When true, wrap the chunk loop in ``tqdm``.  Pass
            ``False`` to run without a progress bar.

    Returns:
        A list of dicts with keys ``disease1_index``, ``disease1_name``,
        ``disease2_index``, ``disease2_name``, ``drug_index``, ``drug_name``,
        ``final_relation`` (with any ``'_reverse'`` suffix removed) and
        ``path_type`` (``'disease-disease-drug'`` or
        ``'disease-disease-drug-reverse'``).

    Raises:
        KeyError: If a node on an emitted path is missing from ``node_names``.
    """
    # Fall back to the notebook-level tables when called without arguments.
    if disease_disease_edges is None:
        disease_disease_edges = disease_disease
    if drugs_by_disease is None:
        drugs_by_disease = disease_to_drugs
    if node_names is None:
        node_names = node_name_map

    print("Finding disease-disease-drug paths...")
    start_time = time.time()
    paths = []
    reverse_suffix = '_reverse'

    # Chunks exist only to give the progress bar a coarse step size.
    chunk_starts = range(0, len(disease_disease_edges), chunk_size)
    if show_progress:
        chunk_starts = tqdm(chunk_starts)

    for chunk_start in chunk_starts:
        # iloc clips at the end of the frame, so the last chunk may be short.
        chunk = disease_disease_edges.iloc[chunk_start:chunk_start + chunk_size]

        # Zip plain column lists rather than iterrows(), which builds a
        # Series for every row.
        for d1, d2 in zip(chunk['x_index'].tolist(), chunk['y_index'].tolist()):
            # .get() keeps a defaultdict from growing on missing diseases.
            for drug, relation in drugs_by_disease.get(d2, ()):
                path_type = 'disease-disease-drug'
                # Only a trailing marker counts; strip exactly that suffix so
                # a relation name is never altered anywhere else.
                if relation.endswith(reverse_suffix):
                    relation = relation[:-len(reverse_suffix)]
                    path_type = 'disease-disease-drug-reverse'

                paths.append({
                    'disease1_index': d1,
                    'disease1_name': node_names[d1],
                    'disease2_index': d2,
                    'disease2_name': node_names[d2],
                    'drug_index': drug,
                    'drug_name': node_names[drug],
                    'final_relation': relation,
                    'path_type': path_type
                })

    print(f"Found {len(paths)} disease-disease-drug paths in {time.time() - start_time:.2f} seconds")
    return paths

# Run the extraction when the cell executes; the resulting list is written to
# CSV and counted in the export and summary cells further down.
disease_disease_paths = find_disease_disease_drug_paths()

## Save Results to CSV

In [None]:
print("Saving results to CSV files...")

# One CSV per path family, written to the current working directory; the
# DataFrame row index is left out of every file.
for path_records, csv_name in (
    (direct_relations, 'direct_disease_drug_relations.csv'),
    (disease_protein_drug_paths, 'disease_protein_drug_paths.csv'),
    (disease_phenotype_paths, 'disease_phenotype_protein_drug_paths.csv'),
    (disease_disease_paths, 'disease_disease_drug_paths.csv'),
):
    pd.DataFrame(path_records).to_csv(csv_name, index=False)

print("CSV files saved successfully")

## Summary Statistics

In [None]:
# Report the number of records in each path list produced above, followed by
# their combined total.
print("Summary Statistics:")
print(f"Direct disease-drug relations: {len(direct_relations)}")
print(f"Disease-protein-drug paths: {len(disease_protein_drug_paths)}")
print(f"Disease-phenotype-protein-drug paths: {len(disease_phenotype_paths)}")
print(f"Disease-disease-drug paths: {len(disease_disease_paths)}")
# The total is a plain sum of the four list lengths; no de-duplication across
# path families is performed.
print(f"Total paths found: {len(direct_relations) + len(disease_protein_drug_paths) + len(disease_phenotype_paths) + len(disease_disease_paths)}")