# Graph Permutation via XSwap

Here, we use the XSwap algorithm as implemented in [Zietz et al.](https://academic.oup.com/gigascience/article/doi/10.1093/gigascience/giae001/7602464)

This is to create an ablation of MoA-net in which we can test how much of the performance is attributable to node degree bias.

Code adapted from https://github.com/drug2ways/drug2ways/blob/master/src/drug2ways/permute.py

In [1]:
import os
from tqdm import tqdm
from collections import Counter
import numpy as np
import pandas as pd
import re
import json

from xswap.permute import permute_edge_list
from xswap.preprocessing import load_str_edges, map_str_edges

import hetnetpy.hetnet
import hetnetpy.readwrite
import hetnetpy.stats
from hetnetpy.abbreviation import metaedges_from_metapath


# Load graph

In [2]:
# Root folder holding the knowledge-graph splits (relative to this notebook).
KG_DATA_PATH = '../data/kg/splits'

# Input: the original MoA-net split folder.
MoA_net = os.path.join(KG_DATA_PATH, 'MoA-net')

# Output: folder that receives every file derived from the permuted KG.
PERMUTED_KG_PATH  = os.path.join(KG_DATA_PATH, 'MoA-net-permuted')
os.makedirs(PERMUTED_KG_PATH, exist_ok=True)

In [3]:
# Load the (source, target, edge_type) columns of the MoA-net KG and keep a single
# edge per ordered (source, target) pair.
kg_columns = ['source', 'target', 'edge_type']
network_df = pd.read_csv(
    os.path.join(MoA_net, 'kg_with_train_smpls.tsv'),
    sep='\t',
    usecols=kg_columns,
).drop_duplicates(subset=['source', 'target'])
network_df.head(2)

Unnamed: 0,source,target,edge_type
0,ncbigene:5602,ncbigene:2354,interacts
1,ncbigene:8021,ncbigene:6430,interacts


In [4]:
# Tally how many edges of each relation type the loaded KG contains.
relations = network_df.edge_type
Counter(relations)

Counter({'interacts': 86786,
         'participates': 4325,
         'downregulates': 2201,
         'upregulates': 1625,
         'induces': 986})

### Split data based on edge types:

In [5]:
# One sub-frame per edge class; up- and down-regulation are grouped together
# because both are compound -> gene edges.
edge_kind = network_df['edge_type']

protein_df = network_df[edge_kind == 'interacts']
cmp_bp_df = network_df[edge_kind == 'induces']
gene_bp_df = network_df[edge_kind == 'participates']
drug_gene_df = network_df[edge_kind.isin({'downregulates', 'upregulates'})]

protein_df.shape, cmp_bp_df.shape, gene_bp_df.shape, drug_gene_df.shape

((86786, 3), (986, 3), (4325, 3), (3826, 3))

In [6]:
# Folder that stores one edge-list file per edge class.
EDGE_TYPES = os.path.join(PERMUTED_KG_PATH, 'edge_classes')
os.makedirs(EDGE_TYPES, exist_ok=True)

In [7]:
# Write each edge class as a header-less, index-less TSV (source, target, edge_type).
edge_class_frames = (
    ('kg_protein', protein_df),
    ('kg_cmp_bp', cmp_bp_df),
    ('kg_gene_bp', gene_bp_df),
    ('kg_drug_gene', drug_gene_df),
)
for file_stem, edges_df in edge_class_frames:
    edges_df.to_csv(os.path.join(EDGE_TYPES, f'{file_stem}.tsv'), sep='\t', index=False, header=False)

# Generate permuted network

In [8]:
# Permute every edge-class subgraph separately with xswap and collect the results.
final_data = []

for subgraph in tqdm(['kg_protein', 'kg_cmp_bp', 'kg_gene_bp', 'kg_drug_gene']):
    subgraph_path = os.path.join(EDGE_TYPES, f'{subgraph}.tsv')

    # Keep the per-subgraph table under a loop-local name so the full KG loaded
    # earlier as `network_df` is not overwritten; otherwise re-running the cells that
    # split `network_df` after this loop would operate on the last subgraph only.
    subgraph_df = pd.read_csv(
        subgraph_path,
        sep='\t',
        names=['source', 'target', 'edge_type']
    )

    # Read the same file as a string edge list for xswap
    edge_list = load_str_edges(
        subgraph_path,
        node_delim='\t',
    )

    # The permutation works on integer edges, so map node names to integers
    # and keep the name -> integer mapping to translate back afterwards
    edge_list_integers, node_mapping, _ = map_str_edges(edge_list, bipartite=False)

    # Seed NumPy's global RNG so the label shuffle below is repeatable.
    # NOTE(review): whether permute_edge_list draws from NumPy's RNG or from its own
    # seed argument is not visible here -- confirm against the xswap documentation.
    np.random.seed(123)
    permuted_edges, stats = permute_edge_list(edge_list_integers)
    print(stats)

    # Reverse mapping dictionary: integer id -> original node name
    id_to_node = {
        node_id: node_name
        for node_name, node_id in node_mapping.items()
    }

    # Shuffle this subgraph's relation labels; this only changes anything for
    # subgraphs that mix several edge types.
    permuted_rels = np.random.permutation(subgraph_df.edge_type)

    # Build the records as a list and name the columns explicitly so the frame has
    # the expected columns even if no edge is produced.
    permuted_kg = pd.DataFrame(
        [
            {
                'source': id_to_node[source],
                'target': id_to_node[target],
                'relation': permuted_rels[index],
            }
            for index, (source, target) in enumerate(permuted_edges)
            # Ignore surplus edges, e.g. a header line that xswap read as an edge
            if index < len(permuted_rels)
        ],
        columns=['source', 'target', 'relation'],
    )
    final_data.append(permuted_kg)


100%|██████████| 4/4 [00:01<00:00,  3.28it/s]

{'swap_attempts': 867860, 'same_edge': 10, 'self_loop': 810, 'duplicate': 114331, 'undir_duplicate': 29425, 'excluded': 0}
{'swap_attempts': 9860, 'same_edge': 14, 'self_loop': 0, 'duplicate': 5404, 'undir_duplicate': 0, 'excluded': 0}
{'swap_attempts': 43250, 'same_edge': 7, 'self_loop': 0, 'duplicate': 5836, 'undir_duplicate': 0, 'excluded': 0}
{'swap_attempts': 38260, 'same_edge': 8, 'self_loop': 0, 'duplicate': 6972, 'undir_duplicate': 0, 'excluded': 0}





### Put all the permuted edges back into one KG:

In [9]:
# Stack the per-edge-class frames into one KG. Every frame is numbered from 0, so
# renumber the rows to keep index labels unique in the combined frame.
permuted_kg = pd.concat(final_data, ignore_index=True)
# Use the same column name as the original KG file.
permuted_kg = permuted_kg.rename(columns={'relation': 'edge_type'})
permuted_kg.head(2)

Unnamed: 0,source,target,edge_type
0,ncbigene:5602,ncbigene:10905,interacts
1,ncbigene:8021,ncbigene:4352,interacts


In [10]:
def map_to_nodetype(id):
    """Return the node type implied by a node identifier.

    The identifier is searched for a known marker substring, in this order:
    'ncbigene:' -> 'Gene', 'GO:' -> 'Biological Process', 'pubchem' -> 'Compound'.

    :param id: the node identifier string
    :raises ValueError: if none of the markers occurs in the identifier
    """
    markers = (
        ('ncbigene:', 'Gene'),
        ('GO:', 'Biological Process'),
        ('pubchem', 'Compound'),
    )
    for marker, node_type in markers:
        if marker in id:
            return node_type
    raise ValueError(f'Unknown node type for {id}')

In [11]:
# Annotate both endpoints of every permuted edge with their node type.
for endpoint in ('source', 'target'):
    permuted_kg[f'{endpoint}_node_type'] = list(map(map_to_nodetype, permuted_kg[endpoint]))

In [12]:
# Save the full permuted KG (including the 'induces' edges) under the same file name as the original KG.
permuted_kg.to_csv(os.path.join(PERMUTED_KG_PATH, 'kg_with_train_smpls.tsv'), sep='\t', index=False)

## Metapath Generation + MARS Preparation

Now, we can skip to the metapath generation and preparation for MARS input.

First, we take the same validation and test sets from MoA-net, but we get our training triples from the permuted KG:

In [13]:
# Folder that receives the MARS input files.
MARS_DIR = os.path.join(PERMUTED_KG_PATH, 'MARS')
os.makedirs(MARS_DIR, exist_ok=True)

In [14]:
# The permuted 'induces' edges serve as the training triples.
training_triples = permuted_kg.loc[permuted_kg['edge_type'] == 'induces']

training_triples.to_csv(os.path.join(PERMUTED_KG_PATH, 'train.tsv'), sep='\t', index=False)

In [15]:
# Everything except the 'induces' edges forms the background KG.
kg = permuted_kg.loc[permuted_kg['edge_type'] != 'induces']

kg.to_csv(os.path.join(PERMUTED_KG_PATH, 'kg_no_cmp_bp.tsv'), sep='\t', index=False)

In [16]:
# Sanity check: the 'induces' triples and the remaining edges together account for every permuted edge.
len(training_triples) + len(kg) == len(permuted_kg)

True

In [17]:
# Read the validation and test splits from the original MoA-net folder ...
dev = pd.read_csv(os.path.join(MoA_net, 'dev.tsv'), sep='\t')
test = pd.read_csv(os.path.join(MoA_net, 'test.tsv'), sep='\t')

# ... and write them unchanged next to the permuted KG.
dev.to_csv(os.path.join(PERMUTED_KG_PATH, 'dev.tsv'), sep='\t', index=False)
test.to_csv(os.path.join(PERMUTED_KG_PATH, 'test.tsv'), sep='\t', index=False)

### Metapath Generation:

In [18]:
# Abbreviation for every node type (metanode) and edge type (metaedge);
# passed to hetnetpy together with the metaedge tuples below.
kind_to_abbev = {
    
    # metanodes
    'Compound': 'C',
    'Gene': 'G',
    'Biological Process': 'BP',
    
    # metaedges
    'upregulates': 'u',
    'downregulates': 'd',
    'interacts': 'i',
    'participates': 'p',
    'induces': 't',
}

# One tuple per edge type: (source node type, target node type, edge type, direction).
metaedge_tuples = [
    ('Compound', 'Gene', 'upregulates', 'forward'),
    ('Compound', 'Gene', 'downregulates', 'forward'),
    ('Gene', 'Gene', 'interacts', 'forward'),
    ('Gene', 'Biological Process', 'participates', 'forward'),
    ('Compound', 'Biological Process', 'induces', 'forward')
]

# Build the metagraph and an empty graph instance that follows it.
metagraph = hetnetpy.hetnet.MetaGraph.from_edge_tuples(metaedge_tuples, kind_to_abbev)
graph = hetnetpy.hetnet.Graph(metagraph)

In [19]:
# Show every fully duplicated row of the background KG (keep=False lists all copies);
# an empty frame means there are no duplicated edges.
kg[kg.duplicated(keep=False)]

Unnamed: 0,source,target,edge_type,source_node_type,target_node_type


In [20]:
# Populate the hetnetpy graph from the background KG: register each node the first
# time its identifier shows up, then add the edge between the two endpoints.
seen = set()

edge_columns = zip(
    kg['source'], kg['target'], kg['edge_type'],
    kg['source_node_type'], kg['target_node_type'],
)
for source, target, edge_type, source_kind, target_kind in edge_columns:
    src_id = source_kind, source
    trgt_id = target_kind, target

    for node_kind, node_identifier in (src_id, trgt_id):
        if node_identifier not in seen:
            graph.add_node(kind=node_kind, identifier=node_identifier)
            seen.add(node_identifier)

    graph.add_edge(source_id=src_id, target_id=trgt_id, kind=edge_type, direction='forward')

In [21]:
# Enumerate the metapaths from Compound to Biological Process that the metagraph allows.
tgt_metapaths = metagraph.extract_metapaths(source='Compound', target='Biological Process')
print(tgt_metapaths)

[Ct>BP, Cd>Gp>BP, Cu>Gp>BP, Ct>BP<tCt>BP, Ct>BP<pGp>BP, Cd>G<dCt>BP, Cd>G<uCt>BP, Cd>G<iGp>BP, Cd>Gi>Gp>BP, Cu>G<dCt>BP, Cu>G<uCt>BP, Cu>G<iGp>BP, Cu>Gi>Gp>BP, Ct>BP<tCd>Gp>BP, Ct>BP<tCu>Gp>BP, Ct>BP<pG<dCt>BP, Ct>BP<pG<uCt>BP, Ct>BP<pG<iGp>BP, Ct>BP<pGi>Gp>BP, Cd>Gp>BP<tCt>BP, Cd>Gp>BP<pGp>BP, Cd>G<dCd>Gp>BP, Cd>G<dCu>Gp>BP, Cd>G<uCd>Gp>BP, Cd>G<uCu>Gp>BP, Cd>G<iG<dCt>BP, Cd>G<iG<uCt>BP, Cd>G<iG<iGp>BP, Cd>G<iGi>Gp>BP, Cd>Gi>G<dCt>BP, Cd>Gi>G<uCt>BP, Cd>Gi>G<iGp>BP, Cd>Gi>Gi>Gp>BP, Cu>Gp>BP<tCt>BP, Cu>Gp>BP<pGp>BP, Cu>G<dCd>Gp>BP, Cu>G<dCu>Gp>BP, Cu>G<uCd>Gp>BP, Cu>G<uCu>Gp>BP, Cu>G<iG<dCt>BP, Cu>G<iG<uCt>BP, Cu>G<iG<iGp>BP, Cu>G<iGi>Gp>BP, Cu>Gi>G<dCt>BP, Cu>Gi>G<uCt>BP, Cu>Gi>G<iGp>BP, Cu>Gi>Gi>Gp>BP]


Get rid of the metapaths that contain inverse edges, the direct `Ct>BP` edge, and any metapath with a compound in the middle:

In [22]:
# Keep only forward-only metapaths that are longer than the direct Ct>BP edge and
# mention a compound exactly once.
CtoBP_metapaths = []

for metapath in tgt_metapaths:
    abbrev = repr(metapath)
    if '<' in abbrev:  # get rid of those with inverse edges
        continue
    # Splitting on every non-uppercase character leaves the metanode abbreviations.
    compound_count = Counter(re.split('>|<|[^A-Z]', abbrev))['C']
    if abbrev != 'Ct>BP' and compound_count == 1:
        CtoBP_metapaths.append(metapath)

How many metapaths did this prune?

In [23]:
# Number of metapaths kept after pruning, followed by the number originally extracted.
print(len(CtoBP_metapaths))
print(len(tgt_metapaths))

6
47


### MARS Input Preparation:

In [24]:
# Confidence value attached to every rule generated below.
prob = 0.5

In [25]:
def get_forward_rule_body(mpath, conf, metagraph):
    """Builds the MARS rule body for a metapath.

    The result is a list ``[conf, head, edge_1, ..., edge_n]``. The head joins the
    first and last metanode abbreviation of the metapath with ``'t'``. Each edge is
    the standardized metaedge abbreviation, prefixed with ``'_'`` when the
    corresponding directional abbreviation does not contain ``'>'``.

        :param mpath: the metapath abbreviation as a string, in hetnetpy format
        :param conf: the confidence value to use for the rule
        :param metagraph: the metagraph instance, as a hetnetpy object
        :return: the rule body as a list
    """
    standardized = metaedges_from_metapath(mpath, standardize_by=metagraph)
    directed = metaedges_from_metapath(mpath)

    # Splitting on every non-uppercase character leaves the metanode abbreviations.
    tokens = re.split('>|<|[^A-Z]', mpath)
    head = tokens[0] + 't' + tokens[-1]

    edges = [
        std_edge if '>' in dir_edge else '_' + std_edge
        for std_edge, dir_edge in zip(standardized, directed)
    ]
    return [conf, head] + edges

In [26]:
def get_reverse_rule_body(forward_body):
    """Gets the REVERSE rule body for a given metapath in the format that MARS requires

    The confidence is kept, the rule head gets a leading ``'_'``, and the metaedges
    are visited in reverse order with their direction flipped: a metaedge that starts
    with the inverse marker ``'_'`` loses it, any other metaedge gains it.

    Only a *leading* underscore counts as the inverse marker, so abbreviations that
    contain an underscore elsewhere are left intact.

        :param forward_body: the forward rule body as a list
            ``[conf, head, edge_1, ..., edge_n]``
        :return: the reverse rule body as a new list
    """
    rev_body = [forward_body[0], '_' + forward_body[1]]
    for metaedge in reversed(forward_body[2:]):
        if metaedge.startswith('_'):
            # Already an inverse edge: drop just the marker
            rev_body.append(metaedge[1:])
        else:
            rev_body.append('_' + metaedge)
    return rev_body

In [27]:
# One forward rule per retained metapath, all filed under the 'CtBP' rule head.
rules = {
    'CtBP': [
        get_forward_rule_body(str(mpath), prob, metagraph)
        for mpath in CtoBP_metapaths
    ],
}
print(rules)

{'CtBP': [[0.5, 'CtBP', 'CdG', 'GpBP'], [0.5, 'CtBP', 'CuG', 'GpBP'], [0.5, 'CtBP', 'CdG', 'GiG', 'GpBP'], [0.5, 'CtBP', 'CuG', 'GiG', 'GpBP'], [0.5, 'CtBP', 'CdG', 'GiG', 'GiG', 'GpBP'], [0.5, 'CtBP', 'CuG', 'GiG', 'GiG', 'GpBP']]}


Write those rules to the MARS folder:

In [28]:
# The rules are stored as JSON even though the file is named rules.txt.
with open(os.path.join(MARS_DIR, 'rules.txt'), 'w') as f:
    json.dump(rules, f)

Now, let's get the input triples in a format consistent with MARS's input files:

In [29]:
def convert_to_mars(df):
    """Converts the KG into a format that MARS can use

    Every row gets a ``metaedge`` label made of the abbreviations of its source node
    type, edge type and target node type (looked up in ``kind_to_abbev``).

    The input frame is left untouched; the label column is added to a copy.

        :param df: a DataFrame with the columns ``source``, ``target``,
            ``edge_type``, ``source_node_type`` and ``target_node_type``
        :return: a new DataFrame with the columns ``source``, ``metaedge``, ``target``
        :raises KeyError: if a node type or edge type has no entry in ``kind_to_abbev``
    """
    # Walk the three columns in parallel instead of materialising a Series per row.
    metaedges = [
        kind_to_abbev[source_kind] + kind_to_abbev[edge_kind] + kind_to_abbev[target_kind]
        for source_kind, edge_kind, target_kind in zip(
            df['source_node_type'], df['edge_type'], df['target_node_type']
        )
    ]
    triples = df[['source', 'target']].copy()
    triples['metaedge'] = metaedges
    return triples[['source', 'metaedge', 'target']]

In [30]:
# Re-read the split files written earlier and reduce each to (source, metaedge, target) triples.
train = convert_to_mars(pd.read_csv(os.path.join(PERMUTED_KG_PATH, 'train.tsv'), sep='\t'))
test = convert_to_mars(pd.read_csv(os.path.join(PERMUTED_KG_PATH, 'test.tsv'), sep='\t'))
val = convert_to_mars(pd.read_csv(os.path.join(PERMUTED_KG_PATH, 'dev.tsv'), sep='\t'))

# The graph triples come from the full permuted KG file, which still contains the training edges.
kg_triples = convert_to_mars(pd.read_csv(os.path.join(PERMUTED_KG_PATH, 'kg_with_train_smpls.tsv'), sep='\t'))

Get the inverse triples:

In [31]:
# Mirror every graph triple: swap source and target and mark the metaedge with a
# leading '_' to denote the inverse relation.
inverse_triples = kg_triples[['target', 'metaedge', 'source']].copy()
inverse_triples['metaedge'] = ['_' + metaedge for metaedge in inverse_triples['metaedge']]
inverse_triples.columns = ['target', 'inv_metaedge', 'source']

# Optional filter (currently disabled): drop the inverse of the target relation, _CtBP
#inverse_triples = inverse_triples.loc[inverse_triples['inv_metaedge'] != '_CtBP']

Write to MARS folder:

In [32]:
# Header-less, tab-separated triples: the forward graph and its inverse edges.
kg_triples.to_csv(os.path.join(MARS_DIR, 'graph_triples.txt'), sep='\t', index=False, header=False)
inverse_triples.to_csv(os.path.join(MARS_DIR, 'graph_inverses.txt'), sep='\t', index=False, header=False)

In [33]:
# Same header-less format for the three splits; the validation split is written as dev.txt.
train.to_csv(os.path.join(MARS_DIR, 'train.txt'), sep='\t', index=False, header=False)
test.to_csv(os.path.join(MARS_DIR, 'test.txt'), sep='\t', index=False, header=False)
val.to_csv(os.path.join(MARS_DIR, 'dev.txt'), sep='\t', index=False, header=False)

Finally, we need dictionaries to map the entities and relations to unique IDs.

In [34]:
# Entity vocabulary: ids 0 and 1 are reserved, every other entity gets the next free
# id in order of first appearance (graph first, then train, test, val; sources
# before targets within each frame).
entities = {"PAD": 0, "UNK": 1}

next_id = 2
for triples in (kg_triples, train, test, val):
    for column in ('source', 'target'):
        for entity in triples[column].unique():
            if entity in entities:
                continue
            entities[entity] = next_id
            next_id += 1

In [35]:
# Relation vocabulary: ids 0-3 are reserved for the special tokens.
relations = {"PAD": 0, "DUMMY_START_RELATION": 1, "NO_OP": 2, "UNK": 3}

# include inverse edges within the KG:
metaedges = set(kg_triples['metaedge']) | (set(inverse_triples['inv_metaedge']))

# Assign ids in sorted order: iterating the set directly yields an order that can
# change between interpreter runs (string hashing is randomized), which would make
# the written vocabulary differ from run to run.
next_id = 4
for rel in sorted(metaedges):
    relations[rel] = next_id
    next_id += 1

In [36]:
# Write both vocabularies (name -> integer id) as JSON files into the MARS vocab folder.
VOCAB_DIR = os.path.join(MARS_DIR, 'vocab')
os.makedirs(VOCAB_DIR, exist_ok=True)

with open(os.path.join(VOCAB_DIR, 'entity_vocab.json'), 'w') as f:
    json.dump(entities, f)

with open(os.path.join(VOCAB_DIR, 'relation_vocab.json'), 'w') as f:
    json.dump(relations, f)

Let's also write the meta-edge and meta-node mappings to json files:

In [37]:
# Invert the abbreviation table: abbreviation -> full node-type / edge-type name.
meta_mapping = {v: k for k, v in kind_to_abbev.items()}

In [38]:
# Store the abbreviation -> name mapping alongside the vocabularies.
with open(os.path.join(VOCAB_DIR, 'meta_mapping.json'), 'w') as f:
    json.dump(meta_mapping, f)

In [39]:
# Display the abbreviation -> name mapping for reference.
meta_mapping

{'C': 'Compound',
 'G': 'Gene',
 'BP': 'Biological Process',
 'u': 'upregulates',
 'd': 'downregulates',
 'i': 'interacts',
 'p': 'participates',
 't': 'induces'}