# Metagraph and Metapath Generation

Here, we use the same syntax as [HetNetPy](https://github.com/hetio/hetnetpy/tree/e1ff1f8296b766dd5a63e5870a8a1a7d82427a80) to create a metagraph and extract metapaths from the KG.

Metagraph creation is based on [this notebook](https://github.com/dhimmel/integrate/blob/93feba1765fbcd76fd79e22f25121f5399629148/integrate.ipynb).

In [1]:
import pandas as pd
import seaborn as sns
import os
import os.path as osp
import numpy as np

import hetnetpy.hetnet
import hetnetpy.readwrite
import hetnetpy.stats

from hetnetpy.abbreviation import metaedges_from_metapath
from hetnetpy.pathtools import DWPC, paths_between

import networkx as nx
from networkx import DiGraph, MultiDiGraph, connected_components

import re
import json
from tqdm import tqdm
from collections import Counter

from itertools import chain

%matplotlib inline

In [2]:
# Root directory that holds every knowledge-graph split used in this notebook.
KG_DATA_PATH = '../data/kg/splits/'

# Top-level folders of the two KG variants.
MOA_NET = os.path.join(KG_DATA_PATH, 'MoA-net')
PROT_MOA_NET = os.path.join(KG_DATA_PATH, 'MoA-net-protclass')

# The '10k' sub-folder of MoA-net.
MOA_NET_10K = os.path.join(MOA_NET, '10k')

Let's load in MoA-net-10k, the version of MoA-net which was cut down by the automatic KG trimming feature:

In [3]:
def create_graph_from_nxobj(nx_graph_obj, node_mapping_file, edge_mapping_file):
    """Load a GraphML KG and return its largest connected component.

    The GraphML file is read as a multigraph with integer node ids and edge
    keys, then rebuilt as a ``DiGraph`` whose nodes are the mapped node names
    and whose edges carry a ``polarity`` attribute holding the mapped edge
    type.

    :param nx_graph_obj: path (or file object) handed to ``nx.read_graphml``.
    :param node_mapping_file: despite the name, this is indexed like a mapping;
        it translates the node ids found in the GraphML file.
    :param edge_mapping_file: despite the name, this is indexed like a mapping;
        it translates each edge's ``type`` attribute.
    :return: the subgraph of the rebuilt ``DiGraph`` induced by its largest
        connected component, where connectivity ignores edge direction.
    :raises IndexError: if the rebuilt graph has no nodes at all.
    """
    raw_graph = nx.read_graphml(
        nx_graph_obj,
        node_type=int,
        edge_key_type=int,
        force_multigraph=True,
    )

    # A DiGraph keeps a single edge per ordered node pair, so parallel edges
    # of the multigraph collapse and the last one seen sets the polarity.
    graph = DiGraph()
    for head, tail, attrs in raw_graph.edges(data=True):
        graph.add_edge(
            node_mapping_file[head],
            node_mapping_file[tail],
            polarity=edge_mapping_file[attrs['type']],
        )

    # Components are computed on the undirected view and ordered largest first.
    components_by_size = sorted(
        connected_components(graph.to_undirected()),
        key=len,
        reverse=True,
    )
    largest_component = components_by_size[0]

    return graph.subgraph(largest_component)

In [4]:
# Read the MARS vocabularies and invert them: the JSON files map a name to an
# id, while the lookups below need id (as int) -> name.
# The files are opened in `with` blocks so the handles are closed right away
# instead of being left to the garbage collector.
with open(os.path.join(MOA_NET, 'MARS/vocab/entity_vocab.json')) as entity_vocab_file:
    entity_mapping = json.load(entity_vocab_file)
entity_mapping = {int(v): k for k, v in entity_mapping.items()}

with open(os.path.join(MOA_NET, 'MARS/vocab/relation_vocab.json')) as relation_vocab_file:
    relation_mapping = json.load(relation_vocab_file)
relation_mapping = {int(v): k for k, v in relation_mapping.items()}

In [5]:
# Build the working KG from the GraphML file in the 10k split, translating
# node ids and edge types through the vocabularies loaded above.
kg = create_graph_from_nxobj(
    nx_graph_obj=os.path.join(MOA_NET_10K, 'nx_graph.graphml'),
    node_mapping_file=entity_mapping,
    edge_mapping_file=relation_mapping,
)

In [6]:
# Load the test and dev arrays of the 10k split; the cells below unpack each
# row as (source id, relation id, target id).
test_10k, val_10k = (
    np.load(os.path.join(MOA_NET_10K, split_file))
    for split_file in ('test-10k.npy', 'dev-10k.npy')
)

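Drop the test-set pairs whose source or target is missing from the KG, or whose target can no longer be reached from the source by a directed path of at most 4 edges: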

In [7]:
# Keep only the test triples that can still be evaluated on `kg`: both
# endpoints must be present and the target must be reachable from the source
# by following at most 4 directed edges.
test_10k_df = {'source': [], 'target': [], 'relation': []}

for source, rel, target in test_10k:
    e1 = entity_mapping[source]
    e2 = entity_mapping[target]
    r = relation_mapping[rel]

    if e1 not in kg or e2 not in kg:
        continue

    # One breadth-first search cut off at 4 hops answers both "is there a
    # path?" and "is it short enough?". Calling nx.has_path and
    # nx.shortest_path_length separately searches the graph twice.
    if e2 not in nx.single_source_shortest_path_length(kg, e1, cutoff=4):
        continue

    test_10k_df['source'].append(e1)
    test_10k_df['target'].append(e2)
    test_10k_df['relation'].append(r)

test_10k_df = pd.DataFrame(test_10k_df)
# The gold standard is the same list of pairs without the relation column.
gold_standard = test_10k_df[['source', 'target']].copy()
print(len(gold_standard))
print(len(test_10k_df))

100
100


In [8]:
# Apply the same filter to the validation triples: both endpoints must be in
# `kg` and the target must be reachable from the source within 4 directed
# edges.
validation_set = {'source': [], 'target': [], 'relation': []}

for source, rel, target in val_10k:
    e1 = entity_mapping[source]
    e2 = entity_mapping[target]
    r = relation_mapping[rel]

    if e1 not in kg or e2 not in kg:
        continue

    # One breadth-first search cut off at 4 hops replaces the separate
    # nx.has_path / nx.shortest_path_length calls, which each search the graph.
    if e2 not in nx.single_source_shortest_path_length(kg, e1, cutoff=4):
        continue

    validation_set['source'].append(e1)
    validation_set['target'].append(e2)
    validation_set['relation'].append(r)

validation_set = pd.DataFrame(validation_set)
print(len(validation_set))

90


### Define and Instantiate Metagraph:

In [9]:
# Abbreviation of every node kind and edge kind declared in the metagraph.
kind_to_abbev = {
    # metanodes
    'Compound': 'C',
    'Gene': 'G',
    'Biological Process': 'BP',
    # metaedges
    'CuG': 'u',
    'CdG': 'd',
    'GiG': 'i',
    'GpBP': 'p',
    'CtBP': 't',
}

# Each metaedge is (source kind, target kind, edge kind, direction); every
# edge in this metagraph is declared as 'forward'.
metaedge_tuples = [
    (source_kind, target_kind, edge_kind, 'forward')
    for source_kind, target_kind, edge_kind in (
        ('Compound', 'Gene', 'CuG'),
        ('Compound', 'Gene', 'CdG'),
        ('Gene', 'Gene', 'GiG'),
        ('Gene', 'Biological Process', 'GpBP'),
        ('Compound', 'Biological Process', 'CtBP'),
    )
]

# Build the schema first, then an empty graph that follows it.
metagraph = hetnetpy.hetnet.MetaGraph.from_edge_tuples(metaedge_tuples, kind_to_abbev)
graph = hetnetpy.hetnet.Graph(metagraph)

In [10]:
def map2type(node_name):
    """Return the metanode kind encoded in a node identifier.

    The identifier is searched for known substrings, in this order:
    ``'ncbigene'`` -> ``'Gene'``, ``'GO:'`` -> ``'Biological Process'``,
    ``'pubchem'`` -> ``'Compound'``. The first substring found wins.

    :param node_name: node identifier string.
    :return: the matching kind, or ``None`` when no substring is found.
    """
    markers = (
        ('ncbigene', 'Gene'),
        ('GO:', 'Biological Process'),
        ('pubchem', 'Compound'),
    )
    for marker, kind in markers:
        if marker in node_name:
            return kind
    return None

In [11]:
# Copy the KG into the hetnet graph, registering each node once and
# remembering every Biological Process node as a prediction candidate.
seen = set()
all_bps = set()

for u, v, data in kg.edges(data=True):
    polarity = data['polarity']

    # Edge types containing an underscore are left out of the hetnet
    # (NOTE(review): presumably these are inverse relations - confirm).
    if '_' in polarity:
        continue

    endpoint_ids = []
    for node in (u, v):
        node_kind = map2type(node)
        if node not in seen:
            graph.add_node(kind=node_kind, identifier=node)
            seen.add(node)
            if node_kind == 'Biological Process':
                all_bps.add(node)
        # Nodes are referenced by a (kind, identifier) pair when adding edges.
        endpoint_ids.append((node_kind, node))

    src_id, trgt_id = endpoint_ids

    graph.add_edge(source_id=src_id, target_id=trgt_id, kind=polarity, direction='forward')

## Metapath Extraction

Here, we have functions which can extract metapaths:

Extract all metapaths up to a certain length:

Extract metapaths from a given source node type (and, optionally, to a target node type):

In [12]:
# Enumerate every metapath of the metagraph that starts at a Compound metanode
# and ends at a Biological Process metanode. No length limit is passed here,
# so the library default applies.
tgt_metapaths = metagraph.extract_metapaths(source='Compound', target='Biological Process')
print(tgt_metapaths)

[Ct>BP, Cd>Gp>BP, Cu>Gp>BP, Ct>BP<tCt>BP, Ct>BP<pGp>BP, Cd>G<dCt>BP, Cd>G<uCt>BP, Cd>G<iGp>BP, Cd>Gi>Gp>BP, Cu>G<dCt>BP, Cu>G<uCt>BP, Cu>G<iGp>BP, Cu>Gi>Gp>BP, Ct>BP<tCd>Gp>BP, Ct>BP<tCu>Gp>BP, Ct>BP<pG<dCt>BP, Ct>BP<pG<uCt>BP, Ct>BP<pG<iGp>BP, Ct>BP<pGi>Gp>BP, Cd>Gp>BP<tCt>BP, Cd>Gp>BP<pGp>BP, Cd>G<dCd>Gp>BP, Cd>G<dCu>Gp>BP, Cd>G<uCd>Gp>BP, Cd>G<uCu>Gp>BP, Cd>G<iG<dCt>BP, Cd>G<iG<uCt>BP, Cd>G<iG<iGp>BP, Cd>G<iGi>Gp>BP, Cd>Gi>G<dCt>BP, Cd>Gi>G<uCt>BP, Cd>Gi>G<iGp>BP, Cd>Gi>Gi>Gp>BP, Cu>Gp>BP<tCt>BP, Cu>Gp>BP<pGp>BP, Cu>G<dCd>Gp>BP, Cu>G<dCu>Gp>BP, Cu>G<uCd>Gp>BP, Cu>G<uCu>Gp>BP, Cu>G<iG<dCt>BP, Cu>G<iG<uCt>BP, Cu>G<iG<iGp>BP, Cu>G<iGi>Gp>BP, Cu>Gi>G<dCt>BP, Cu>Gi>G<uCt>BP, Cu>Gi>G<iGp>BP, Cu>Gi>Gi>Gp>BP]


We do not want any metapaths with a compound in the middle, and we also exclude the direct `Ct>BP` edge:

In [13]:
# Keep a metapath only if it is not the direct 'Ct>BP' edge and its textual
# form contains exactly one stand-alone 'C' token. Splitting the repr on every
# character that is not an upper-case letter leaves the metanode
# abbreviations as tokens.
CtoBP_metapaths = [
    metapath
    for metapath in tgt_metapaths
    if repr(metapath) != 'Ct>BP'
    and Counter(re.split('>|<|[^A-Z]', repr(metapath)))['C'] == 1
]

How many metapaths are left after pruning (first number), out of how many originally (second number)?

In [14]:
# First line: metapaths kept after pruning; second line: metapaths before pruning.
print(len(CtoBP_metapaths), len(tgt_metapaths), sep='\n')

19
47


## Metrics based on Metapaths

First, we need the KG with the test BPs:

In [15]:
def get_paths(graph, source, target, rule):
    """Collect the paths between a compound and a biological process.

    Best-effort wrapper around ``paths_between``: any exception raised by the
    lookup is printed and an empty list is returned instead.

    :param graph: graph object passed straight to ``paths_between``.
    :param source: identifier of the source node; looked up with kind 'Compound'.
    :param target: identifier of the target node; looked up with kind
        'Biological Process'.
    :param rule: metapath to follow; converted to a tuple before the call.
    :return: whatever ``paths_between`` returns, or ``[]`` on any exception.
    """
    source_id = ('Compound', source)
    target_id = ('Biological Process', target)

    try:
        return paths_between(
            graph,
            source=source_id,
            target=target_id,
            metapath=tuple(rule),
            duplicates=False,
        )
    except Exception as error:
        print(error)
        return []

In [17]:
# One entry per test pair: 1/0 for each hits@k list, reciprocal rank for mrr.
hits_at_10 = []
hits_at_3 = []
hits_at_1 = []
mrr = []

for _, row in tqdm(test_10k_df.iterrows(), total=len(test_10k_df)):

    source, target = row['source'], row['target']

    # Score every candidate biological process that has at least one path
    # from the source compound along any of the retained metapaths.
    # NOTE(review): all_bps is a set, and sorted() below is stable, so
    # equal scores are ordered by set iteration order - confirm that ties do
    # not make the reported metrics vary between runs.
    rank = {}

    for bp in all_bps:

        per_rule_paths = (
            get_paths(graph=graph, source=source, target=bp, rule=rule)
            for rule in CtoBP_metapaths
        )

        # Drop empty results and flatten the rest into a single list of paths.
        all_paths = list(
            chain.from_iterable(found for found in per_rule_paths if found)
        )

        if not all_paths:
            continue

        rank[bp] = DWPC(all_paths, damping_exponent=0.4)

    # Candidates ordered from highest to lowest score.
    ranked_bp_predictions = sorted(rank, key=rank.get, reverse=True)

    # Hits at 10 / 3 / 1
    for k, hits in ((10, hits_at_10), (3, hits_at_3), (1, hits_at_1)):
        hits.append(int(target in ranked_bp_predictions[:k]))

    # MRR: reciprocal of the 1-based rank, or 0 when the target was not scored.
    if target in ranked_bp_predictions:
        position = ranked_bp_predictions.index(target)
        print(f"{position} / {len(ranked_bp_predictions)}")
        mrr.append(1.0 / (position + 1))
    else:
        mrr.append(0)


print(f'Hits at 10: {sum(hits_at_10) / len(hits_at_10)}')
print(f'Hits at 3: {sum(hits_at_3) / len(hits_at_3)}')
print(f'Hits at 1: {sum(hits_at_1) / len(hits_at_1)}')
print(f'MRR: {sum(mrr) / len(mrr)}')

  1%|          | 1/100 [00:37<1:01:04, 37.02s/it]

20 / 79


  2%|▏         | 2/100 [00:38<26:12, 16.05s/it]  

1 / 42


  3%|▎         | 3/100 [00:41<16:11, 10.01s/it]

1 / 58


  4%|▍         | 4/100 [00:45<12:10,  7.61s/it]

1 / 59


  5%|▌         | 5/100 [00:54<12:56,  8.17s/it]

5 / 73


  6%|▌         | 6/100 [01:00<11:50,  7.56s/it]

0 / 77


  7%|▋         | 7/100 [01:01<08:32,  5.51s/it]

0 / 9


  8%|▊         | 8/100 [01:07<08:23,  5.48s/it]

0 / 70


  9%|▉         | 9/100 [01:14<08:54,  5.88s/it]

0 / 65


 10%|█         | 10/100 [01:25<11:23,  7.60s/it]

5 / 75


 11%|█         | 11/100 [01:28<09:03,  6.11s/it]

1 / 57


 12%|█▏        | 12/100 [01:29<06:50,  4.66s/it]

0 / 8


 13%|█▎        | 13/100 [01:39<08:50,  6.10s/it]

6 / 75


 14%|█▍        | 14/100 [02:12<20:45, 14.48s/it]

14 / 75


 15%|█▌        | 15/100 [02:19<17:11, 12.14s/it]

3 / 70


 16%|█▌        | 16/100 [02:28<15:30, 11.07s/it]

19 / 73


 17%|█▋        | 17/100 [02:46<18:20, 13.25s/it]

4 / 80


 18%|█▊        | 18/100 [02:54<15:52, 11.61s/it]

15 / 74


 19%|█▉        | 19/100 [03:02<14:07, 10.46s/it]

1 / 74


 20%|██        | 20/100 [03:12<13:50, 10.39s/it]

3 / 75


 21%|██        | 21/100 [03:18<12:11,  9.26s/it]

8 / 70


 22%|██▏       | 22/100 [03:23<10:18,  7.93s/it]

8 / 72


 23%|██▎       | 23/100 [03:39<13:10, 10.27s/it]

1 / 75


 24%|██▍       | 24/100 [03:41<10:00,  7.90s/it]

1 / 66


 25%|██▌       | 25/100 [03:52<10:59,  8.79s/it]

16 / 78


 26%|██▌       | 26/100 [04:07<12:51, 10.43s/it]

0 / 73


 27%|██▋       | 27/100 [04:12<10:43,  8.81s/it]

7 / 70


 28%|██▊       | 28/100 [04:13<07:51,  6.55s/it]

0 / 7


 29%|██▉       | 29/100 [04:14<05:50,  4.93s/it]

1 / 9


 30%|███       | 30/100 [04:15<04:29,  3.85s/it]

0 / 4


 31%|███       | 31/100 [04:18<03:53,  3.38s/it]

0 / 51


 32%|███▏      | 32/100 [04:32<07:39,  6.76s/it]

19 / 79


 33%|███▎      | 33/100 [04:34<05:45,  5.16s/it]

0 / 20


 34%|███▍      | 34/100 [04:37<04:55,  4.48s/it]

6 / 63


 35%|███▌      | 35/100 [04:41<04:44,  4.37s/it]

2 / 72


 36%|███▌      | 36/100 [06:03<29:33, 27.72s/it]

3 / 78


 37%|███▋      | 37/100 [06:16<24:33, 23.39s/it]

0 / 71


 38%|███▊      | 38/100 [06:27<20:12, 19.55s/it]

1 / 73


 39%|███▉      | 39/100 [06:32<15:36, 15.34s/it]

0 / 64


 40%|████      | 40/100 [06:40<12:55, 12.92s/it]

0 / 72


 41%|████      | 41/100 [07:28<23:03, 23.45s/it]

2 / 74


 42%|████▏     | 42/100 [08:03<26:10, 27.07s/it]

12 / 78


 43%|████▎     | 43/100 [08:18<22:16, 23.44s/it]

13 / 79


 44%|████▍     | 44/100 [08:19<15:38, 16.75s/it]

0 / 7


 45%|████▌     | 45/100 [08:30<13:40, 14.92s/it]

21 / 78


 46%|████▌     | 46/100 [08:33<10:14, 11.37s/it]

9 / 60


 47%|████▋     | 47/100 [09:54<28:22, 32.13s/it]

0 / 78


 48%|████▊     | 48/100 [09:55<19:50, 22.90s/it]

0 / 3


 49%|████▉     | 49/100 [09:56<13:56, 16.39s/it]

0 / 4


 50%|█████     | 50/100 [09:57<09:53, 11.88s/it]

1 / 8


 51%|█████     | 51/100 [10:12<10:18, 12.63s/it]

19 / 77


 52%|█████▏    | 52/100 [10:13<07:23,  9.25s/it]

0 / 39


 53%|█████▎    | 53/100 [10:28<08:27, 10.81s/it]

1 / 77


 54%|█████▍    | 54/100 [10:29<06:06,  7.96s/it]

0 / 7


 55%|█████▌    | 55/100 [10:33<05:06,  6.82s/it]

6 / 62


 56%|█████▌    | 56/100 [10:34<03:46,  5.15s/it]

0 / 16


 57%|█████▋    | 57/100 [10:44<04:33,  6.37s/it]

2 / 72


 58%|█████▊    | 58/100 [11:07<08:08, 11.62s/it]

1 / 78


 59%|█████▉    | 59/100 [11:23<08:42, 12.73s/it]

4 / 75


 60%|██████    | 60/100 [11:41<09:33, 14.33s/it]

14 / 77


 61%|██████    | 61/100 [11:48<07:51, 12.09s/it]

1 / 72


 62%|██████▏   | 62/100 [12:00<07:46, 12.28s/it]

12 / 75


 63%|██████▎   | 63/100 [12:04<05:54,  9.59s/it]

9 / 64


 64%|██████▍   | 64/100 [12:05<04:14,  7.06s/it]

0 / 7


 65%|██████▌   | 65/100 [12:06<03:06,  5.33s/it]

0 / 16


 66%|██████▌   | 66/100 [12:08<02:30,  4.42s/it]

0 / 52


 67%|██████▋   | 67/100 [12:10<02:01,  3.68s/it]

11 / 55


 68%|██████▊   | 68/100 [12:13<01:50,  3.44s/it]

0 / 57


 69%|██████▉   | 69/100 [12:23<02:49,  5.45s/it]

1 / 75


 70%|███████   | 70/100 [12:25<02:06,  4.21s/it]

0 / 5


 71%|███████   | 71/100 [13:47<13:18, 27.53s/it]

17 / 78


 72%|███████▏  | 72/100 [13:55<10:05, 21.63s/it]

20 / 74


 73%|███████▎  | 73/100 [14:01<07:42, 17.12s/it]

1 / 73


 74%|███████▍  | 74/100 [14:07<05:55, 13.67s/it]

1 / 70


 75%|███████▌  | 75/100 [14:16<05:08, 12.36s/it]

0 / 73


 76%|███████▌  | 76/100 [14:17<03:37,  9.07s/it]

0 / 46


 77%|███████▋  | 77/100 [14:27<03:32,  9.22s/it]

6 / 73


 78%|███████▊  | 78/100 [14:35<03:12,  8.75s/it]

4 / 71


 79%|███████▉  | 79/100 [15:09<05:44, 16.39s/it]

32 / 75


 80%|████████  | 80/100 [15:56<08:33, 25.66s/it]

22 / 74


 81%|████████  | 81/100 [15:58<05:48, 18.36s/it]

0 / 16


 82%|████████▏ | 82/100 [15:59<03:58, 13.27s/it]

0 / 42


 83%|████████▎ | 83/100 [16:12<03:46, 13.35s/it]

0 / 73


 84%|████████▍ | 84/100 [16:30<03:55, 14.72s/it]

19 / 77


 85%|████████▌ | 85/100 [16:32<02:39, 10.65s/it]

0 / 12


 86%|████████▌ | 86/100 [16:33<01:49,  7.84s/it]

0 / 15


 87%|████████▋ | 87/100 [16:42<01:47,  8.29s/it]

5 / 78


 88%|████████▊ | 88/100 [16:45<01:18,  6.57s/it]

0 / 51


 89%|████████▉ | 89/100 [16:57<01:32,  8.40s/it]

31 / 75


 90%|█████████ | 90/100 [17:12<01:43, 10.33s/it]

4 / 79


 91%|█████████ | 91/100 [17:15<01:12,  8.02s/it]

0 / 53


 92%|█████████▏| 92/100 [17:18<00:53,  6.63s/it]

0 / 61


 93%|█████████▎| 93/100 [17:54<01:47, 15.30s/it]

40 / 78


 94%|█████████▍| 94/100 [18:02<01:19, 13.17s/it]

0 / 67


 95%|█████████▌| 95/100 [18:10<00:58, 11.77s/it]

27 / 70


 96%|█████████▌| 96/100 [18:12<00:34,  8.64s/it]

0 / 7


 97%|█████████▋| 97/100 [18:47<00:49, 16.60s/it]

4 / 78


 98%|█████████▊| 98/100 [18:51<00:25, 12.81s/it]

21 / 62


 99%|█████████▉| 99/100 [19:00<00:11, 11.79s/it]

7 / 73


100%|██████████| 100/100 [19:24<00:00, 11.64s/it]

8 / 78
Hits at 10: 0.78
Hits at 3: 0.56
Hits at 1: 0.37
MRR: 0.5077486538860714



