# Benchmark analysis

This notebook compares the different benchmark according to the two KG.

# Imports

In [1]:
import os
import math
import json
from collections import Counter
import hetnetpy.hetnet

import pandas as pd
import numpy as np
from tqdm import tqdm

import networkx as nx
from networkx import DiGraph, MultiDiGraph, connected_components

# Generate graph

In [2]:
KG_DATA_PATH = '../data/kg/splits/'

MOA_NET = os.path.join(KG_DATA_PATH, 'MoA-net')

MOA_NET_10K = os.path.join(MOA_NET, '10k')

PROT_MOA_NET = os.path.join(KG_DATA_PATH, 'MoA-net-protclass')

In [3]:
def create_graph_from_df(
    graph_df
) -> DiGraph:
    """Create fully connected graph from dataframe."""
    graph = DiGraph()

    for sub_name, obj_name, relation in graph_df.values:
        # Store edge in the graph
        graph.add_edge(
            sub_name,
            obj_name,
            polarity=relation,
        )

    print(f"Report on the number of relations: {dict(Counter(graph_df.edge_type))}")

    connected_components_subgraph = [
        component
        for component in sorted(
            connected_components(
                graph.to_undirected()
            ),
            key=len,
            reverse=True
        )
    ]

    final_subgraph = graph.subgraph(connected_components_subgraph[0])

    return final_subgraph

For versions of MoA-net which were cut down by the automatic KG trimming feature:

In [4]:
def create_graph_from_nxobj(nx_graph_obj, node_mapping_file, edge_mapping_file):
    graph = DiGraph()

    old_graph = nx.read_graphml(nx_graph_obj,
                    node_type=int,
                    edge_key_type=int,
                    force_multigraph=True)
    
    for u, v, data in old_graph.edges(data=True):
        graph.add_edge(
            node_mapping_file[u],
            node_mapping_file[v],
            polarity=edge_mapping_file[data['type']],
        )

    connected_components_subgraph = [
        component
        for component in sorted(
            connected_components(
                graph.to_undirected()
            ),
            key=len,
            reverse=True
        )
    ]

    final_subgraph = graph.subgraph(connected_components_subgraph[0])

    return final_subgraph

In [5]:
dataset_pairs = []

for graph_type in tqdm([MOA_NET, PROT_MOA_NET]):

    graph_df = pd.read_csv(
        os.path.join(graph_type, 'kg_with_train_smpls.tsv'),
        sep='\t',
        usecols=['source', 'target', 'edge_type']
    )
    graph = create_graph_from_df(graph_df)

    gold_standard = pd.read_csv(
        os.path.join(graph_type, 'test.tsv'),
        sep='\t',
        usecols=['source', 'target']
    )
    gold_standard['pairs'] = gold_standard['source'] + '_' + gold_standard['target']
    dataset_pairs.append([
        os.path.basename(graph_type),
        graph,
        gold_standard['pairs'].tolist()
    ])


  0%|          | 0/2 [00:00<?, ?it/s]

Report on the number of relations: {'interacts': 86786, 'induces': 986, 'downregulates': 2205, 'participates': 4325, 'upregulates': 1631}


 50%|█████     | 1/2 [00:00<00:00,  1.21it/s]

Report on the number of relations: {'interacts': 50664, 'downregulates': 2202, 'upregulates': 1624, 'participates': 4325, 'induces': 986}


100%|██████████| 2/2 [00:01<00:00,  1.40it/s]


In [6]:
# read the entity mapper in
entity_mapping = json.load(open(os.path.join(MOA_NET, 'MARS/vocab/entity_vocab.json')))
entity_mapping = {int(v): k for k, v in entity_mapping.items()}
relation_mapping = json.load(open(os.path.join(MOA_NET, 'MARS/vocab/relation_vocab.json')))
relation_mapping = {int(v): k for k, v in relation_mapping.items()}

In [7]:
graph = create_graph_from_nxobj(os.path.join(MOA_NET_10K, 'nx_graph.graphml'),
                                 entity_mapping,
                                 relation_mapping)

In [8]:
test_10k = np.load(os.path.join(MOA_NET_10K, 'test-10k.npy'))
val_10k = np.load(os.path.join(MOA_NET_10K, 'dev-10k.npy'))

Drop those from the test set which are no longer connected:

In [9]:
gold_standard = {'source': [], 'target': []}
test_10k_df = {'source': [], 'target': [], 'relation': []}

for source, rel, target in test_10k:
    e1 = entity_mapping[source]
    e2 = entity_mapping[target]
    r = relation_mapping[rel]

    if e1 in graph and e2 in graph and nx.has_path(graph, e1, e2) and nx.shortest_path_length(graph, e1, e2) <= 4:
        gold_standard['source'].append(e1)
        test_10k_df['source'].append(e1)
        gold_standard['target'].append(e2)
        test_10k_df['target'].append(e2)
        test_10k_df['relation'].append(r)

gold_standard = pd.DataFrame(gold_standard)
test_10k_df = pd.DataFrame(test_10k_df)
print(len(gold_standard))
print(len(test_10k_df))

100
100


In [10]:
validation_set = {'source': [], 'target': [], 'relation': []}

for source, rel, target in val_10k:
    e1 = entity_mapping[source]
    e2 = entity_mapping[target]
    r = relation_mapping[rel]

    if e1 in graph and e2 in graph and nx.has_path(graph, e1, e2) and nx.shortest_path_length(graph, e1, e2) <= 4:
        validation_set['source'].append(e1)
        validation_set['target'].append(e2)
        validation_set['relation'].append(r)

validation_set = pd.DataFrame(validation_set)
print(len(validation_set))

90


In [11]:
gold_standard['pairs'] = gold_standard['source'] + '_' + gold_standard['target']
dataset_pairs.append([
    os.path.basename(MOA_NET) + '-10k',
    graph,
    gold_standard['pairs'].tolist()
    ])

# Value by change for each KG

What is the chance of getting our gold standard pair from all possible pair combinations.

In [12]:
val_by_chance_dict = {}

for graph_type, graph, gold_standard in tqdm(dataset_pairs):
    drugs = set()
    bps = set()

    for drug_bp_pair in gold_standard:
        drug, bp = drug_bp_pair.split('_')
        drugs.add(drug)
        bps.add(bp)

    total = len(drugs) * len(bps)
    prob = len(gold_standard) / total
    val_by_chance_dict[graph_type] = round(prob * 100, 3)

val_by_chance_dict 

100%|██████████| 3/3 [00:00<00:00, 2714.76it/s]


{'MoA-net': 3.381, 'MoA-net-protclass': 3.381, 'MoA-net-10k': 2.875}

# Helper Functions

In [13]:
score_actual = {}
kg_dfs = {}

In [14]:
def khop(
    nodeA: str, 
    nodeB: str, 
    graph: nx.Graph, 
    total: bool
) -> tuple:
    
    """Find nodes within the distance limit """
    
    khop_A = {u for u in graph.neighbors(nodeA)}
    khop_B = {u for u in graph.neighbors(nodeB)}
    
    if total:
        return list(khop_A | khop_B), khop_A, khop_B
    else:
        return list(khop_A & khop_B), khop_A, khop_B

In [15]:
def get_dict_df(
    bps, 
    drugs, 
    undirected_kg_graph, 
    di_kg_graph,
    similarity_type,
    similarity_name
):

    t = []
    
    for bp in bps:
        
        cn = []
        
        # for each disease, find the similarity score with for each drug and append to list
        for drug in drugs:
                        
            shared_nodes, nodeA_neighbor, nodeB_neighbor = khop(
                nodeA=drug,
                nodeB=bp,
                graph=undirected_kg_graph, 
                total=False,
            )
            
            if similarity_type == 'cn':
                similarity = len(shared_nodes)
            
            elif similarity_type == 'sp':
                # try to see if path is between two nodes
                try:
                    similarity = len(nx.shortest_path(di_kg_graph, source=drug, target=bp))
                except nx.NetworkXNoPath:
                    similarity = 1000

            cn.append(similarity)
        
        if not similarity_type == 'sp':
            index = np.where(cn == np.amax(cn))
        else:
            index = np.where(cn == np.amin(cn))

        # if list is full of 0's (i.e sum == 0), then there are no shared neighbors 
        if np.sum(cn) == 0:
            continue 
        
        for val in index:
            for j in val:
                t.append(
                    {
                        'source': list(drugs)[j], 
                        'target': bp, 
                        similarity_name: cn[j]
                    }
                )

    return pd.DataFrame(t)

In [16]:
def get_precision(
    gold_standard_pairs: list, 
    predicted: list,
)-> tuple: 
    
    total = len(predicted)
    pos = 0
    
    for pair in predicted:
        if pair in gold_standard_pairs:
            pos += 1
    
    return round(((pos/total) * 100), 3), pos, total


# Different benchmark methods

In [17]:
sim_scores = {
    'cn': 'Common Neighbors',
    'sp': 'Shortest Path'
}

In [18]:
score_df = []

In [19]:
for graph_type, graph, gold_standard in dataset_pairs:
    drugs = set()
    bps = set()

    for drug_bp_pair in gold_standard:
        drug, bp = drug_bp_pair.split('_')
        if drug in graph.nodes():
            drugs.add(drug)
        
        if bp in graph.nodes():
            bps.add(bp)

    undirected_kg_graph = graph.to_undirected()

    for algo in tqdm(sim_scores, desc=f'Calculating scores for algorithms - {graph_type}'):
        algo_name = sim_scores[algo]
        
        full_df = get_dict_df(
            bps=list(bps),
            drugs=list(drugs), 
            undirected_kg_graph=undirected_kg_graph,
            di_kg_graph=graph,
            similarity_type=algo,
            similarity_name=algo_name
        )

        if full_df.empty:
            print(f'No results for {algo_name}')
            continue

            
        full_df['pair'] = full_df['source'] + '_' + full_df['target']
                
        precision, pos, total = get_precision(
            gold_standard_pairs=gold_standard,
            predicted=list(full_df['pair'].unique()),
        )

        score_df.append({
            'graph_type': graph_type,
            'algo_name': algo_name,
            'precision': f'{precision}%',
            'val_by_chance': f'{val_by_chance_dict[graph_type]}',
            '# pairs': f'{pos}/{total}',
        })

Calculating scores for algorithms - MoA-net: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
Calculating scores for algorithms - MoA-net-protclass: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
Calculating scores for algorithms - MoA-net-10k: 100%|██████████| 2/2 [00:00<00:00,  5.03it/s]


In [20]:
scores = pd.DataFrame(score_df)
scores

Unnamed: 0,graph_type,algo_name,precision,val_by_chance,# pairs
0,MoA-net,Common Neighbors,19.903%,3.381,41/206
1,MoA-net,Shortest Path,3.546%,3.381,35/987
2,MoA-net-protclass,Common Neighbors,19.903%,3.381,41/206
3,MoA-net-protclass,Shortest Path,3.546%,3.381,35/987
4,MoA-net-10k,Common Neighbors,29.286%,2.875,41/140
5,MoA-net-10k,Shortest Path,10.479%,2.875,35/334


## Metrics based on Metapaths

First, we need the KG with the test BPs:

In [21]:
from hetnetpy.abbreviation import metaedges_from_metapath
from hetnetpy.pathtools import DWPC, paths_between
from itertools import chain

In [22]:
def get_paths(
    graph,
    source,
    target,
    rule,
):
    try:
        paths = paths_between(
            graph,
            source=('Compound', source),
            target=('Biological Process', target),
            metapath=tuple(rule),
            duplicates=False,
        )
    except Exception as e:
        print(e)
        paths = []        
    
    return paths

In [23]:
kg_triples = nx.to_pandas_edgelist(graph)

# TODO: Get DWPC code to work
- add types to kg again
- change edge types back
- make metagraph and all again.

In [24]:
def map_relation(rel):
    if rel == 'GpBP':
        return 'participates'
    if rel == 'CuG':
        return 'upregulates'
    if rel == 'CdG':
        return 'downregulates'
    if rel == 'GiG':
        return 'interacts'
    if rel == 'CtBP':
        return 'induces'
    else:
        return None

In [25]:
def map_entity(entity):
    if 'ncbigene' in entity:
        return 'Gene'
    if 'pubchem.compound' in entity:
        return 'Compound'
    if 'GO:' in entity:
        return 'Biological Process'

In [26]:
kg_triples['source_node_type'] = kg_triples['source'].apply(lambda x: map_entity(x))
kg_triples['target_node_type'] = kg_triples['target'].apply(lambda x: map_entity(x))
kg_triples['edge_type'] = kg_triples['polarity'].apply(lambda x: map_relation(x))

In [27]:
kg_triples.dropna(inplace=True, ignore_index=True)
kg_triples = kg_triples[['source', 'source_node_type', 'target', 'target_node_type', 'edge_type']]
kg_triples

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,ncbigene:5602,Gene,GO:0007165,Biological Process,participates
1,ncbigene:5602,Gene,ncbigene:1847,Gene,interacts
2,ncbigene:5602,Gene,ncbigene:1822,Gene,interacts
3,ncbigene:3798,Gene,ncbigene:5602,Gene,interacts
4,ncbigene:3798,Gene,GO:0016192,Biological Process,participates
...,...,...,...,...,...
19544,ncbigene:162514,Gene,ncbigene:4626,Gene,interacts
19545,pubchem.compound:54676537,Compound,ncbigene:79001,Gene,downregulates
19546,pubchem.compound:54678486,Compound,ncbigene:79001,Gene,downregulates
19547,ncbigene:9554,Gene,ncbigene:662,Gene,interacts


In [28]:
test_10k_df['source_node_type'] = test_10k_df['source'].apply(lambda x: map_entity(x))
test_10k_df['target_node_type'] = test_10k_df['target'].apply(lambda x: map_entity(x))
test_10k_df['edge_type'] = test_10k_df['relation'].apply(lambda x: map_relation(x))

In [29]:
test_10k_df.dropna(inplace=True, ignore_index=True)
test_10k_df = test_10k_df[['source', 'source_node_type', 'target', 'target_node_type', 'edge_type']]
test_10k_df

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:31703,Compound,GO:0051276,Biological Process,induces
1,pubchem.compound:4826,Compound,GO:0042311,Biological Process,induces
2,pubchem.compound:3957,Compound,GO:0034776,Biological Process,induces
3,pubchem.compound:4636,Compound,GO:0042310,Biological Process,induces
4,pubchem.compound:6167,Compound,GO:0006913,Biological Process,induces
...,...,...,...,...,...
95,pubchem.compound:5282136,Compound,GO:0001696,Biological Process,induces
96,pubchem.compound:444795,Compound,GO:0007155,Biological Process,induces
97,pubchem.compound:3476,Compound,GO:0006629,Biological Process,induces
98,pubchem.compound:1548887,Compound,GO:0008283,Biological Process,induces


In [30]:
validation_set['source_node_type'] = validation_set['source'].apply(lambda x: map_entity(x))
validation_set['target_node_type'] = validation_set['target'].apply(lambda x: map_entity(x))
validation_set['edge_type'] = validation_set['relation'].apply(lambda x: map_relation(x))

In [31]:
validation_set.dropna(inplace=True, ignore_index=True)
validation_set = validation_set[['source', 'source_node_type', 'target', 'target_node_type', 'edge_type']]
validation_set

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:10607,Compound,GO:0007010,Biological Process,induces
1,pubchem.compound:3547,Compound,GO:0007165,Biological Process,induces
2,pubchem.compound:1548943,Compound,GO:0043065,Biological Process,induces
3,pubchem.compound:2265,Compound,GO:0007049,Biological Process,induces
4,pubchem.compound:1548887,Compound,GO:0006629,Biological Process,induces
...,...,...,...,...,...
85,pubchem.compound:16362,Compound,GO:0006914,Biological Process,induces
86,pubchem.compound:3117,Compound,GO:0050877,Biological Process,induces
87,pubchem.compound:5833,Compound,GO:0008219,Biological Process,induces
88,pubchem.compound:444795,Compound,GO:0002376,Biological Process,induces


In [32]:
kg_triples.to_csv(os.path.join(MOA_NET_10K, 'trimmed_kg.tsv'), sep='\t', index=False)
test_10k_df.to_csv(os.path.join(MOA_NET_10K, 'trimmed_kg_test.tsv'), sep='\t', index=False)
validation_set.to_csv(os.path.join(MOA_NET_10K, 'trimmed_kg_validation.tsv'), sep='\t', index=False)

In [33]:
with open(os.path.join(MOA_NET, 'CtoBP_metapaths.txt')) as f:
    CtoBP_metapaths = [line.strip() for line in f.readlines()]

Make the metagraph again:

In [34]:
kind_to_abbev = {
    
    # metanodes
    'Compound': 'C',
    'Gene': 'G',
    'Biological Process': 'BP',
    
    # metaedges
    'upregulates': 'u',
    'downregulates': 'd',
    'interacts': 'i',
    'participates': 'p',
    'induces': 't',
}

metaedge_tuples = [
    ('Compound', 'Gene', 'upregulates', 'forward'),
    ('Compound', 'Gene', 'downregulates', 'forward'),
    ('Gene', 'Gene', 'interacts', 'forward'),
    ('Gene', 'Biological Process', 'participates', 'forward'),
    ('Compound', 'Biological Process', 'induces', 'forward')
]

metagraph = hetnetpy.hetnet.MetaGraph.from_edge_tuples(metaedge_tuples, kind_to_abbev)
graph = hetnetpy.hetnet.Graph(metagraph)

In [35]:
seen = set()

for i, row in kg_triples.iterrows():
    if not row['source'] in seen:
        graph.add_node(kind=row['source_node_type'], identifier=row['source'])
        seen.add(row['source'])
    if not row['target'] in seen:
        graph.add_node(kind=row['target_node_type'], identifier=row['target'])
        seen.add(row['target'])

    src_id = row['source_node_type'], row['source']
    trgt_id = row['target_node_type'], row['target']

    graph.add_edge(source_id=src_id, target_id=trgt_id, kind=row['edge_type'], direction='forward')

In [36]:
hits_at_10 = []
hits_at_3 = []
hits_at_1 = []
mrr = []

all_bps = test_10k_df['target'].unique()

for i, row in tqdm(test_10k_df.iterrows(), total=test_10k_df.shape[0]):
    source = row['source']
    target = row['target']
    
    rank = {}
    
    for bp in all_bps:
        
        all_paths = []
    
        # All metapaths going from compound to biological process
        for mpath in CtoBP_metapaths:
                    
            paths = get_paths(
                graph=graph,
                source=source,
                target=bp,
                rule=mpath,
            )
            
            if not paths:
                continue
                
            all_paths.append(paths)
                    
        # flatten paths
        all_paths = list(chain(*all_paths))
        
        if not all_paths:
            continue
    
        score = DWPC(all_paths, damping_exponent=0.4)
                
        rank[bp] = score
        
    # Sort rank by score
    ranked_bp_predictions = [
        go
        for go, _ in sorted(rank.items(), key=lambda x: x[1], reverse=True)
    ]
            
    # Hits at 10
    if target in ranked_bp_predictions[:10]:
        hits_at_10.append(1) 
    else:
        hits_at_10.append(0)
    
    # Hits at 3
    if target in ranked_bp_predictions[:3]:
        hits_at_3.append(1)
    else:
        hits_at_3.append(0)
    
    # Hits at 1
    if target in ranked_bp_predictions[:1]:
        hits_at_1.append(1)
    else:
        hits_at_1.append(0)
        
    # MRR
    if target in ranked_bp_predictions:
        print(f"{ranked_bp_predictions.index(target)} / {len(ranked_bp_predictions)}")
        mrr.append(
            1.0 / (ranked_bp_predictions.index(target) + 1)
        )
    else:
        mrr.append(0)
        
    # print(f'Hits at 10: {sum(hits_at_10) / len(hits_at_10)}')
    # print(f'Hits at 3: {sum(hits_at_3) / len(hits_at_3)}')
    # print(f'Hits at 1: {sum(hits_at_1) / len(hits_at_1)}')
    # print(f'MRR: {sum(mrr) / len(mrr)}')

        
print(f'Hits at 10: {sum(hits_at_10) / len(hits_at_10)}')
print(f'Hits at 3: {sum(hits_at_3) / len(hits_at_3)}')
print(f'Hits at 1: {sum(hits_at_1) / len(hits_at_1)}')
print(f'MRR: {sum(mrr) / len(mrr)}')

100%|██████████| 100/100 [00:00<00:00, 594.94it/s]














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































# Metrics for permuted graph

In [37]:
permuted_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'splits', 'moa_net_full_permuted.tsv'),
    sep='\t',
    usecols=['source', 'target', 'edge_type']
)
permuted_graph = create_graph_from_df(permuted_df)

gold_standard = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'splits', 'test.tsv'),
    sep='\t',
    usecols=['source', 'target']
)
gold_standard['pairs'] = gold_standard['source'] + '_' + gold_standard['target']

FileNotFoundError: [Errno 2] No such file or directory: '../data/kg/splits/splits/moa_net_full_permuted.tsv'

In [None]:
drugs = set()
bps = set()

for drug_bp_pair in gold_standard['pairs'].tolist():
    drug, bp = drug_bp_pair.split('_')
    if drug in permuted_graph.nodes():
        drugs.add(drug)
    
    if bp in permuted_graph.nodes():
        bps.add(bp)

undirected_kg_permuted_graph = permuted_graph.to_undirected()

for algo in tqdm(sim_scores, desc=f'Calculating scores for algorithms - permuted'):
    algo_name = sim_scores[algo]
    
    full_df = get_dict_df(
        bps=list(bps),
        drugs=list(drugs), 
        undirected_kg_graph=undirected_kg_permuted_graph,
        di_kg_graph=permuted_graph,
        similarity_type=algo,
        similarity_name=algo_name
    )

    if full_df.empty:
        print(f'No results for {algo_name}')
        continue

        
    full_df['pair'] = full_df['source'] + '_' + full_df['target']
            
    precision, pos, total = get_precision(
        gold_standard_pairs=gold_standard,
        predicted=list(full_df['pair'].unique()),
    )

    print({
        'graph_type': 'permuted',
        'algo_name': algo_name,
        'precision': precision,
        'val_by_chance': 3.379,
        '# pairs': f'{pos}/{total}',
    })