# Benchmark analysis

This notebook compares the different benchmark according to the two KG.

# Imports

In [1]:
import os
import math
from collections import Counter

import pandas as pd
import numpy as np
from tqdm import tqdm

import networkx as nx
from networkx import DiGraph, connected_components

# Generate graph

In [2]:
KG_DATA_PATH = '../data/kg/splits/'

MOA_NET = os.path.join(KG_DATA_PATH, 'MoA-net')

PROT_MOA_NET = os.path.join(KG_DATA_PATH, 'MoA-net-protclass')

In [3]:
def create_graph_from_df(
    graph_df
) -> DiGraph:
    """Create fully connected graph from dataframe."""
    graph = DiGraph()

    for sub_name, obj_name, relation in graph_df.values:
        # Store edge in the graph
        graph.add_edge(
            sub_name,
            obj_name,
            polarity=relation,
        )

    print(f"Report on the number of relations: {dict(Counter(graph_df.edge_type))}")

    connected_components_subgraph = [
        component
        for component in sorted(
            connected_components(
                graph.to_undirected()
            ),
            key=len,
            reverse=True
        )
    ]

    final_subgraph = graph.subgraph(connected_components_subgraph[0])

    return final_subgraph

In [4]:
dataset_pairs = []

for graph_type in tqdm([MOA_NET, PROT_MOA_NET]):

    graph_df = pd.read_csv(
        os.path.join(graph_type, 'kg_with_train_smpls.tsv'),
        sep='\t',
        usecols=['source', 'target', 'edge_type']
    )
    graph = create_graph_from_df(graph_df)

    gold_standard = pd.read_csv(
        os.path.join(graph_type, 'test.tsv'),
        sep='\t',
        usecols=['source', 'target']
    )
    gold_standard['pairs'] = gold_standard['source'] + '_' + gold_standard['target']
    dataset_pairs.append([
        os.path.basename(graph_type),
        graph,
        gold_standard['pairs'].tolist()
    ])


  0%|          | 0/2 [00:00<?, ?it/s]

Report on the number of relations: {'interacts': 86786, 'induces': 961, 'upregulates': 1631, 'participates': 4325, 'downregulates': 2205}


 50%|█████     | 1/2 [00:00<00:00,  1.07it/s]

Report on the number of relations: {'interacts': 78617, 'participates': 5027, 'upregulates': 1949, 'downregulates': 2486, 'induces': 1002}


100%|██████████| 2/2 [00:01<00:00,  1.17it/s]


# Value by change for each KG

What is the change of getting our gold standard pair from all possible pair combinations.

In [5]:
val_by_chance_dict = {}

for graph_type, graph, gold_standard in tqdm(dataset_pairs):
    drugs = set()
    bps = set()

    for drug_bp_pair in gold_standard:
        drug, bp = drug_bp_pair.split('_')
        drugs.add(drug)
        bps.add(bp)

    total = len(drugs) * len(bps)
    prob = len(gold_standard) / total
    val_by_chance_dict[graph_type] = round(prob * 100, 3)

val_by_chance_dict 

100%|██████████| 2/2 [00:00<00:00, 3511.35it/s]


{'MoA-net': 3.176, 'MoA-net-protclass': 3.267}

# Helper Functions

In [6]:
score_actual = {}
kg_dfs = {}

In [7]:
def khop(
    nodeA: str, 
    nodeB: str, 
    graph: nx.Graph, 
    total: bool
) -> tuple:
    
    """Find nodes within the distance limit """
    
    khop_A = {u for u in graph.neighbors(nodeA)}
    khop_B = {u for u in graph.neighbors(nodeB)}
    
    if total:
        return list(khop_A | khop_B), khop_A, khop_B
    else:
        return list(khop_A & khop_B), khop_A, khop_B

In [8]:
def get_dict_df(
    bps, 
    drugs, 
    undirected_kg_graph, 
    di_kg_graph,
    similarity_type,
    similarity_name
):

    t = []
    
    for bp in bps:
        
        cn = []
        
        # for each disease, find the similarity score with for each drug and append to list
        for drug in drugs:
                        
            shared_nodes, nodeA_neighbor, nodeB_neighbor = khop(
                nodeA=drug,
                nodeB=bp,
                graph=undirected_kg_graph, 
                total=False,
            )
            
            if similarity_type == 'cn':
                similarity = len(shared_nodes)
            
            elif similarity_type == 'sp':
                # try to see if path is between two nodes
                try:
                    similarity = len(nx.shortest_path(di_kg_graph, source=drug, target=bp))
                except nx.NetworkXNoPath:
                    similarity = 1000

            cn.append(similarity)
        
        if not similarity_type == 'sp':
            index = np.where(cn == np.amax(cn))
        else:
            index = np.where(cn == np.amin(cn))

        # if list is full of 0's (i.e sum == 0), then there are no shared neighbors 
        if np.sum(cn) == 0:
            continue 
        
        for val in index:
            for j in val:
                t.append(
                    {
                        'source': list(drugs)[j], 
                        'target': bp, 
                        similarity_name: cn[j]
                    }
                )

    return pd.DataFrame(t)

In [9]:
def get_precision(
    gold_standard_pairs: list, 
    predicted: list,
)-> tuple: 
    
    total = len(predicted)
    pos = 0
    
    for pair in predicted:
        if pair in gold_standard_pairs:
            pos += 1
    
    return round(((pos/total) * 100), 3), pos, total


# Different benchmark methods

In [10]:
sim_scores = {
    'cn': 'Common Neighbors',
    'sp': 'Shortest Path'
}

In [11]:
score_df = []

In [12]:
for graph_type, graph, gold_standard in dataset_pairs:
    drugs = set()
    bps = set()

    for drug_bp_pair in gold_standard:
        drug, bp = drug_bp_pair.split('_')
        if drug in graph.nodes():
            drugs.add(drug)
        
        if bp in graph.nodes():
            bps.add(bp)

    undirected_kg_graph = graph.to_undirected()

    for algo in tqdm(sim_scores, desc=f'Calculating scores for algorithms - {graph_type}'):
        algo_name = sim_scores[algo]
        
        full_df = get_dict_df(
            bps=list(bps),
            drugs=list(drugs), 
            undirected_kg_graph=undirected_kg_graph,
            di_kg_graph=graph,
            similarity_type=algo,
            similarity_name=algo_name
        )

        if full_df.empty:
            print(f'No results for {algo_name}')
            continue

            
        full_df['pair'] = full_df['source'] + '_' + full_df['target']
                
        precision, pos, total = get_precision(
            gold_standard_pairs=gold_standard,
            predicted=list(full_df['pair'].unique()),
        )

        score_df.append({
            'graph_type': graph_type,
            'algo_name': algo_name,
            'precision': precision,
            'val_by_chance': val_by_chance_dict[graph_type],
            '# pairs': f'{pos}/{total}',
        })

Calculating scores for algorithms - MoA-net: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
Calculating scores for algorithms - MoA-net-protclass: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]


In [13]:
scores = pd.DataFrame(score_df)
scores

Unnamed: 0,graph_type,algo_name,precision,val_by_chance,# pairs
0,MoA-net,Common Neighbors,18.957,3.176,40/211
1,MoA-net,Shortest Path,3.483,3.176,35/1005
2,MoA-net-protclass,Common Neighbors,17.89,3.267,39/218
3,MoA-net-protclass,Shortest Path,3.565,3.267,37/1038


# Metrics for permuted graph

In [14]:
permuted_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'splits', 'moa_net_full_permuted.tsv'),
    sep='\t',
    usecols=['source', 'target', 'edge_type']
)
permuted_graph = create_graph_from_df(permuted_df)

gold_standard = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'splits', 'test.tsv'),
    sep='\t',
    usecols=['source', 'target']
)
gold_standard['pairs'] = gold_standard['source'] + '_' + gold_standard['target']

Report on the number of relations: {'interacts': 86786, 'upregulates': 1626, 'downregulates': 2200, 'induces': 1111, 'participates': 4351}


In [15]:
drugs = set()
bps = set()

for drug_bp_pair in gold_standard['pairs'].tolist():
    drug, bp = drug_bp_pair.split('_')
    if drug in permuted_graph.nodes():
        drugs.add(drug)
    
    if bp in permuted_graph.nodes():
        bps.add(bp)

undirected_kg_permuted_graph = permuted_graph.to_undirected()

for algo in tqdm(sim_scores, desc=f'Calculating scores for algorithms - permuted'):
    algo_name = sim_scores[algo]
    
    full_df = get_dict_df(
        bps=list(bps),
        drugs=list(drugs), 
        undirected_kg_graph=undirected_kg_permuted_graph,
        di_kg_graph=permuted_graph,
        similarity_type=algo,
        similarity_name=algo_name
    )

    if full_df.empty:
        print(f'No results for {algo_name}')
        continue

        
    full_df['pair'] = full_df['source'] + '_' + full_df['target']
            
    precision, pos, total = get_precision(
        gold_standard_pairs=gold_standard,
        predicted=list(full_df['pair'].unique()),
    )

    print({
        'graph_type': 'permuted',
        'algo_name': algo_name,
        'precision': precision,
        'val_by_chance': 3.379,
        '# pairs': f'{pos}/{total}',
    })

Calculating scores for algorithms - permuted:   0%|          | 0/2 [00:00<?, ?it/s]

{'graph_type': 'permuted', 'algo_name': 'Common Neighbors', 'precision': 0.0, 'val_by_chance': 3.379, '# pairs': '0/106'}


Calculating scores for algorithms - permuted: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]

{'graph_type': 'permuted', 'algo_name': 'Shortest Path', 'precision': 0.0, 'val_by_chance': 3.379, '# pairs': '0/1321'}



