# Graph Permutation via XSwap

Here, we use the XSwap algorithm as implemented in [Zietz et al.](https://academic.oup.com/gigascience/article/doi/10.1093/gigascience/giae001/7602464)

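The permuted graph serves as an ablation of MoA-net: by comparing model performance on the original and the permuted KG, we can test how much of the performance is attributable to node degree bias rather than to the actual edges.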

Code adapted from https://github.com/drug2ways/drug2ways/blob/master/src/drug2ways/permute.py

In [1]:
import os
from tqdm import tqdm
from collections import Counter

import numpy as np
import pandas as pd
from xswap.permute import permute_edge_list
from xswap.preprocessing import load_str_edges, map_str_edges

# Load graph

In [2]:
# Root folder of the KG splits; both the input KG and the permuted output live below it.
KG_DATA_PATH = '../data/kg/splits'

# MoA_net: folder with the original KG; PERMUTED_KG_PATH: folder for the permuted KG.
MoA_net, PERMUTED_KG_PATH = (
    os.path.join(KG_DATA_PATH, folder)
    for folder in ('MoA-net', 'MoA-net-permuted')
)

# Create the output folder up front (no error if it already exists).
os.makedirs(name=PERMUTED_KG_PATH, exist_ok=True)

In [3]:
# Load only the three edge columns and keep the first edge seen for every
# (source, target) pair, so each node pair appears at most once.
network_df = (
    pd.read_csv(
        os.path.join(MoA_net, 'kg_with_train_smpls.tsv'),
        sep='\t',
        usecols=['source', 'target', 'edge_type'],
    )
    .drop_duplicates(subset=['source', 'target'])
)
network_df.head(2)

Unnamed: 0,source,target,edge_type
0,ncbigene:5898,ncbigene:5601,interacts
1,ncbigene:51308,ncbigene:347468,interacts


In [4]:
# Tally how many edges carry each edge type.
relations = network_df['edge_type']
Counter(relations)

Counter({'interacts': 86786,
         'participates': 4325,
         'downregulates': 2200,
         'upregulates': 1626,
         'induces': 961})

### Split data based on edge types:

In [5]:
# One frame per edge class; the two regulation edge types are grouped together.
protein_df = network_df.loc[network_df['edge_type'].eq('interacts')]
cmp_bp_df = network_df.loc[network_df['edge_type'].eq('induces')]
gene_bp_df = network_df.loc[network_df['edge_type'].eq('participates')]
drug_gene_df = network_df.loc[
    network_df['edge_type'].isin({'downregulates', 'upregulates'})
]

# Show the (rows, columns) shape of every edge-class frame.
tuple(
    edge_class_df.shape
    for edge_class_df in (protein_df, cmp_bp_df, gene_bp_df, drug_gene_df)
)

((86786, 3), (961, 3), (4325, 3), (3826, 3))

In [6]:
# Folder that holds one edge-list file per edge class.
EDGE_TYPES = os.path.join(PERMUTED_KG_PATH, 'edge_classes')
os.makedirs(name=EDGE_TYPES, exist_ok=True)

In [7]:
# Write every edge-class frame as a headerless, index-free TSV edge list.
edge_class_files = {
    'kg_protein.tsv': protein_df,
    'kg_cmp_bp.tsv': cmp_bp_df,
    'kg_gene_bp.tsv': gene_bp_df,
    'kg_drug_gene.tsv': drug_gene_df,
}
for edge_class_file, edge_class_df in edge_class_files.items():
    edge_class_df.to_csv(
        os.path.join(EDGE_TYPES, edge_class_file),
        sep='\t',
        index=False,
        header=False,
    )

# Generate permuted network

In [8]:
def _permute_subgraph(subgraph):
    """Permute one edge-class subgraph with XSwap and shuffle its edge types.

    Reads ``<EDGE_TYPES>/<subgraph>.tsv`` (headerless, tab-separated
    source / target / edge_type), permutes the edges, and assigns the
    subgraph's edge-type labels to the permuted edges in random order.

    Everything is kept local to this function so that the notebook-level
    ``network_df`` and ``relations`` are not overwritten when this cell runs.

    Parameters
    ----------
    subgraph : str
        File stem of the edge-class file inside ``EDGE_TYPES``.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target`` and ``relation``; one row per
        permuted edge.
    """
    edge_file = os.path.join(EDGE_TYPES, f'{subgraph}.tsv')

    # Edge-type labels of this subgraph, in file order.
    subgraph_df = pd.read_csv(
        edge_file,
        sep='\t',
        names=['source', 'target', 'edge_type'],
    )

    # Read the same file as a string edge list for xswap.
    edge_list = load_str_edges(edge_file, node_delim='\t')

    # xswap permutes integer node ids, so map the node names to integers
    # and keep the mapping to translate the result back afterwards.
    edge_list_integers, node_to_int, _ = map_str_edges(edge_list, bipartite=False)

    # Seeds NumPy's global RNG, which drives the label shuffle further down.
    # NOTE(review): permute_edge_list appears to take its own `seed` argument
    # rather than using NumPy's RNG -- confirm against the xswap docs.
    np.random.seed(123)
    permuted_edges, stats = permute_edge_list(edge_list_integers)
    print(stats)

    # Reverse mapping: integer id -> original node name.
    int_to_node = {
        node_int: node_name
        for node_name, node_int in node_to_int.items()
    }

    # Randomly reassign this subgraph's edge types to the permuted edges.
    permuted_rels = np.random.permutation(subgraph_df.edge_type)

    # The edge files are written without a header, so both sequences are
    # expected to have the same length; the guard only prevents indexing past
    # the available labels if they ever disagree.
    records = [
        {
            'source': int_to_node[source],
            'target': int_to_node[target],
            'relation': permuted_rels[index],
        }
        for index, (source, target) in enumerate(permuted_edges)
        if index < len(permuted_rels)
    ]
    # Explicit columns keep the schema stable even for an empty subgraph.
    return pd.DataFrame(records, columns=['source', 'target', 'relation'])


final_data = [
    _permute_subgraph(subgraph)
    for subgraph in tqdm(['kg_protein', 'kg_cmp_bp', 'kg_gene_bp', 'kg_drug_gene'])
]


100%|██████████| 4/4 [00:01<00:00,  3.22it/s]

{'swap_attempts': 867860, 'same_edge': 10, 'self_loop': 808, 'duplicate': 114930, 'undir_duplicate': 29216, 'excluded': 0}
{'swap_attempts': 9610, 'same_edge': 11, 'self_loop': 0, 'duplicate': 5352, 'undir_duplicate': 0, 'excluded': 0}
{'swap_attempts': 43250, 'same_edge': 7, 'self_loop': 0, 'duplicate': 5872, 'undir_duplicate': 0, 'excluded': 0}
{'swap_attempts': 38260, 'same_edge': 8, 'self_loop': 0, 'duplicate': 7046, 'undir_duplicate': 0, 'excluded': 0}





### Put all the permuted edges back into one KG:

In [9]:
# Stack the per-edge-class frames into a single KG.
# ignore_index=True gives the combined frame one unique 0..n-1 index instead
# of repeating each sub-frame's own 0..k labels, which would make label-based
# lookups ambiguous.
permuted_kg = (
    pd.concat(final_data, ignore_index=True)
    .rename(columns={'relation': 'edge_type'})
)
permuted_kg.head(2)

Unnamed: 0,source,target,edge_type
0,ncbigene:5898,ncbigene:408,interacts
1,ncbigene:51308,ncbigene:64326,interacts


In [10]:
def map_to_nodetype(id):
    """Return the node type that corresponds to a node identifier.

    The identifier is checked for the markers ``'ncbigene:'``, ``'GO:'`` and
    ``'pubchem'`` in that order; the first marker contained in it decides the
    type.

    Parameters
    ----------
    id : str
        Node identifier, e.g. ``'ncbigene:5898'``.

    Returns
    -------
    str
        ``'Gene'``, ``'Biological Process'`` or ``'Compound'``.

    Raises
    ------
    ValueError
        If the identifier contains none of the known markers.
    """
    marker_to_type = (
        ('ncbigene:', 'Gene'),
        ('GO:', 'Biological Process'),
        ('pubchem', 'Compound'),
    )
    for marker, node_type in marker_to_type:
        if marker in id:
            return node_type
    raise ValueError(f'Unknown node type for {id}')

In [11]:
# Annotate both endpoints of every edge with the node type derived from its id.
permuted_kg['source_node_type'] = list(map(map_to_nodetype, permuted_kg['source']))
permuted_kg['target_node_type'] = list(map(map_to_nodetype, permuted_kg['target']))

In [12]:
# Save the complete permuted KG as a tab-separated file (header kept, index dropped).
permuted_kg.to_csv(
    os.path.join(PERMUTED_KG_PATH, 'moa_net_full_permuted.tsv'),
    sep='\t',
    index=False,
)