# Graph Permutation notebook

Code adapted from https://github.com/drug2ways/drug2ways/blob/master/src/drug2ways/permute.py

In [1]:
import os
from tqdm import tqdm
from collections import Counter

import numpy as np
import pandas as pd
from xswap.permute import permute_edge_list
from xswap.preprocessing import load_str_edges, map_str_edges

# Load graph

In [2]:
# Base directory of the knowledge-graph data; the cells below read from and
# write to its 'splits' subfolder.
KG_DATA_PATH = '../data/kg'

In [3]:
# Load only the columns that describe an edge, then keep a single row per
# (source, target) pair (pandas keeps the first occurrence by default).
network_df = (
    pd.read_csv(
        os.path.join(KG_DATA_PATH, 'splits', 'kg_with_train_smpls.tsv'),
        sep='\t',
        usecols=['source', 'target', 'edge_type'],
    )
    .drop_duplicates(subset=['source', 'target'])
)
network_df.head(2)

Unnamed: 0,source,target,edge_type
0,ncbigene:10325,ncbigene:57521,interacts
1,ncbigene:7048,ncbigene:7021,interacts


In [4]:
# Number of edges per edge type in the de-duplicated graph.
relations = network_df['edge_type']
Counter(relations)

Counter({'interacts': 86786,
         'participates': 4325,
         'upregulates': 1626,
         'downregulates': 2200,
         'induces': 1002})

# Split data based on edge types

The graph is split into three subgraphs by edge type: `participates` (gene-bp), `induces` (cmp-bp), and all other edge types (ppi).

In [5]:
# One boolean mask per special edge type; every remaining edge goes to protein_df.
is_induces = network_df['edge_type'] == 'induces'
is_participates = network_df['edge_type'] == 'participates'

protein_df = network_df[~(is_participates | is_induces)]
cmp_bp_df = network_df[is_induces]
gene_bp_df = network_df[is_participates]

protein_df.shape, cmp_bp_df.shape, gene_bp_df.shape

((90612, 3), (1002, 3), (4325, 3))

In [6]:
# Write each subgraph as a tab-separated file without header or index,
# in the same order as before: protein, cmp-bp, gene-bp.
splits_dir = os.path.join(KG_DATA_PATH, 'splits')
for file_name, frame in (
    ('kg_protein.tsv', protein_df),
    ('kg_cmp_bp.tsv', cmp_bp_df),
    ('kg_gene_bp.tsv', gene_bp_df),
):
    frame.to_csv(os.path.join(splits_dir, file_name), sep='\t', index=False, header=False)

# Generate permuted network

In [7]:
# Permute every subgraph on its own and collect one DataFrame of permuted
# edges (columns: source, target, relation) per subgraph in `final_data`.
final_data = []

for subgraph in tqdm(['kg_protein', 'kg_cmp_bp', 'kg_gene_bp']):
    subgraph_path = os.path.join(KG_DATA_PATH, 'splits', f'{subgraph}.tsv')

    # The split files were written without a header row, so the column names
    # are supplied here. Loop-local names are used on purpose: rebinding
    # `network_df` / `relations` here would silently change what the earlier
    # cells (edge-type counts, split and write) do when they are re-run.
    subgraph_df = pd.read_csv(
        subgraph_path,
        sep='\t',
        names=['source', 'target', 'edge_type'],
    )

    # Read the same file again as a list of string edges for xswap
    edge_list = load_str_edges(subgraph_path, node_delim='\t')

    # Map node names to integers and keep the mapping to translate back later
    edge_list_integers, node_mapping, _ = map_str_edges(edge_list, bipartite=False)

    # Seeds NumPy's global RNG, which drives the relation shuffle below.
    # NOTE(review): permute_edge_list is called with its defaults; if it takes
    # its own seed argument, this call does not control it -- confirm in xswap.
    np.random.seed(123)
    permuted_edges, stats = permute_edge_list(edge_list_integers)
    print(stats)

    # Integer id -> original node name
    id_to_node = {node_id: node for node, node_id in node_mapping.items()}

    # Shuffle this subgraph's edge-type labels
    permuted_rels = np.random.permutation(subgraph_df['edge_type'])

    permuted_kg = pd.DataFrame(
        {
            'source': id_to_node[source],
            'target': id_to_node[target],
            'relation': permuted_rels[index],
        }
        for index, (source, target) in enumerate(permuted_edges)
        # Guard: never index past the labels read by pandas (e.g. if xswap
        # parsed an extra line, such as a header, as an edge).
        if index < len(permuted_rels)
    )
    final_data.append(permuted_kg)


100%|██████████| 3/3 [00:01<00:00,  2.25it/s]

{'swap_attempts': 906120, 'same_edge': 8, 'self_loop': 812, 'duplicate': 113009, 'undir_duplicate': 28782, 'excluded': 0}
{'swap_attempts': 10020, 'same_edge': 10, 'self_loop': 0, 'duplicate': 5623, 'undir_duplicate': 0, 'excluded': 0}
{'swap_attempts': 43250, 'same_edge': 7, 'self_loop': 0, 'duplicate': 5940, 'undir_duplicate': 0, 'excluded': 0}





In [8]:
# Stack the per-subgraph frames and rename 'relation' to 'edge_type', the
# column name used by the input graph.
permuted_kg = pd.concat(final_data).rename(columns={'relation': 'edge_type'})
permuted_kg.head(2)

Unnamed: 0,source,target,edge_type
0,ncbigene:10325,ncbigene:3075,interacts
1,ncbigene:7048,ncbigene:2782,interacts


In [9]:
# Save the combined permuted graph as a tab-separated file without the index.
permuted_kg.to_csv(
    os.path.join(KG_DATA_PATH, 'splits', 'moa_net_full_permuted.tsv'),
    sep='\t',
    index=False,
)