# Notebook to extract base networks from raw data.

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Relative path of the directory the filtered KG files are written to at the end of the notebook.
KG_DIR = '../data/kg'

# Load raw kg

Loading the KGs from the [drug2ways](https://github.com/drug2ways/results/tree/master) resource:

First, the [OpenBioLink KG](https://github.com/OpenBioLink/OpenBioLink):

In [3]:
# Read the filtered OpenBioLink edge list (tab-separated) directly from the RPath repository.
openbiolink_df = pd.read_csv(
    'https://raw.githubusercontent.com/enveda/RPath/master/data/kg/openbiolink_filtered_kg.tsv',
    sep='\t',
)

# Preview the first two edges.
openbiolink_df.head(n=2)

Unnamed: 0,source,target,polarity
0,ncbigene:3308,ncbigene:6622,-1
1,ncbigene:4804,ncbigene:2885,1


... and the Custom KG, or "In-House" KG, described in the [drug2ways paper](https://journals.plos.org/ploscompbiol/article/file?id=10.1371/journal.pcbi.1008464&type=printable).

In [4]:
# Read the filtered custom ("in-house") edge list (tab-separated) directly from the RPath repository.
custom_df = pd.read_csv(
    'https://raw.githubusercontent.com/enveda/RPath/master/data/kg/custom_filtered_kg.tsv',
    sep='\t',
)

# Preview the first two edges.
custom_df.head(n=2)

Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:4943,ncbigene:2563,1


Let's investigate the columns here:

In [5]:
# List the distinct values found in the polarity column.
custom_df.loc[:, 'polarity'].unique()

array([ 1, -1])

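Polarity only takes the values 1 and -1, so it appears to encode the sign of each relation (e.g. activation vs. inhibition) rather than its direction; the direction of an edge is given by the `source` → `target` columns.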

In [6]:
# Display the first 50 edges for a broader look at the node identifiers.
custom_df.iloc[:50]

Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:4943,ncbigene:2563,1
2,pubchem.compound:4943,ncbigene:2555,1
3,pubchem.compound:4943,ncbigene:2560,1
4,pubchem.compound:4943,ncbigene:2554,1
5,pubchem.compound:4943,ncbigene:6326,-1
6,pubchem.compound:4943,ncbigene:2566,1
7,ncbigene:2562,ncbigene:10399,1
8,ncbigene:147,ncbigene:7225,1
9,ncbigene:147,ncbigene:2767,1


While I can only see genes/proteins and drugs, the original paper says that this KG also contains phenotypes and diseases/indications.

# Subsetting only drug-protein and protein-protein edges

These KGs use `pubchem.compound` IDs for drugs and `ncbigene` IDs for proteins.

In [7]:
# Classify each edge endpoint by its CURIE prefix.
# `str.startswith` is used instead of `str.contains` because `contains` treats its
# argument as a regular expression ('.' would match any character) and matches
# anywhere in the identifier; an anchored literal prefix is what is intended.
# `na=False` makes a missing identifier count as "no match" instead of producing a NaN mask.
openbiolink_source_is_drug = openbiolink_df['source'].str.startswith('pubchem.compound:', na=False)
openbiolink_source_is_gene = openbiolink_df['source'].str.startswith('ncbigene:', na=False)
openbiolink_target_is_gene = openbiolink_df['target'].str.startswith('ncbigene:', na=False)

# drug-protein
openbiolink_drug_edges = openbiolink_df[openbiolink_source_is_drug & openbiolink_target_is_gene]

# protein-protein
openbiolink_pp_edges = openbiolink_df[openbiolink_source_is_gene & openbiolink_target_is_gene]

# Show the size of both subsets.
openbiolink_drug_edges.shape, openbiolink_pp_edges.shape

((3401, 3), (38040, 3))

In [8]:
# Classify each edge endpoint by its CURIE prefix.
# `str.startswith` is used instead of `str.contains` because `contains` treats its
# argument as a regular expression ('.' would match any character) and matches
# anywhere in the identifier; an anchored literal prefix is what is intended.
# `na=False` makes a missing identifier count as "no match" instead of producing a NaN mask.
custom_source_is_drug = custom_df['source'].str.startswith('pubchem.compound:', na=False)
custom_source_is_gene = custom_df['source'].str.startswith('ncbigene:', na=False)
custom_target_is_gene = custom_df['target'].str.startswith('ncbigene:', na=False)

# drug-protein
custom_drug_edges = custom_df[custom_source_is_drug & custom_target_is_gene]

# protein-protein
custom_pp_edges = custom_df[custom_source_is_gene & custom_target_is_gene]

# Show the size of both subsets.
custom_drug_edges.shape, custom_pp_edges.shape

((743, 3), (50657, 3))

# Saving the subset graph

In [9]:
# Concatenate the drug-protein and protein-protein edges for each respective KG.
openbiolink_filtered = pd.concat([openbiolink_drug_edges, openbiolink_pp_edges])
# The custom KG must include its protein-protein edges too; they were previously
# left out, so the saved custom graph contained only drug-protein edges.
custom_filtered = pd.concat([custom_drug_edges, custom_pp_edges])

In [10]:
# Write each filtered KG to its own tab-separated file, without the row index.
kg_outputs = {
    f'{KG_DIR}/openbiolink_filtered.tsv': openbiolink_filtered,
    f'{KG_DIR}/customkg_filtered.tsv': custom_filtered,
}
for out_path, kg_edges in kg_outputs.items():
    kg_edges.to_csv(out_path, sep='\t', index=False)