# Notebook to extract base networks from raw data.

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Output directory (relative to this notebook) where the subset KG files are written at the end.
KG_DIR = '../data/kg'

# Load raw kg

Load the KGs from the [drug2ways](https://github.com/drug2ways/results/tree/master) resource. The filtered TSV files are read from the RPath GitHub repository.

In [3]:
# Read the filtered OpenBioLink KG (tab-separated edge list) straight from GitHub,
# then preview the first two rows.
openbiolink_df = pd.read_csv(
    'https://raw.githubusercontent.com/enveda/RPath/master/data/kg/openbiolink_filtered_kg.tsv',
    sep='\t',
)
openbiolink_df.head(n=2)

Unnamed: 0,source,target,polarity
0,ncbigene:3308,ncbigene:6622,-1
1,ncbigene:4804,ncbigene:2885,1


In [4]:
# Read the filtered custom KG (tab-separated edge list) straight from GitHub,
# then preview the first two rows.
custom_df = pd.read_csv(
    'https://raw.githubusercontent.com/enveda/RPath/master/data/kg/custom_filtered_kg.tsv',
    sep='\t',
)
custom_df.head(n=2)

Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:4943,ncbigene:2563,1


# Subsetting only drug-protein and protein-protein edges

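OpenBioLink uses PUBCHEM.COMPOUND ids for drugs and NCBIGENE ids for proteins. The filtered custom KG loaded above uses the same `pubchem.compound` and `ncbigene` prefixes (rather than DrugBank and HGNC ids), so the same prefix filters are applied to both KGs.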

In [5]:
# Split the OpenBioLink KG by the identifier namespace of each edge's endpoints.
# `regex=False` makes the namespace strings literal substrings: with the default
# regex matching, the '.' in 'pubchem.compound' would match ANY character.

# Drug -> protein edges: pubchem.compound source, ncbigene target.
openbiolink_drug_edges = openbiolink_df[
    openbiolink_df['source'].str.contains('pubchem.compound', regex=False)
    & openbiolink_df['target'].str.contains('ncbigene', regex=False)
]

# Protein -> protein edges: ncbigene on both ends.
openbiolink_pp_edges = openbiolink_df[
    openbiolink_df['source'].str.contains('ncbigene', regex=False)
    & openbiolink_df['target'].str.contains('ncbigene', regex=False)
]

# Display the (rows, columns) of both subsets as a sanity check.
openbiolink_drug_edges.shape, openbiolink_pp_edges.shape

((3401, 3), (38040, 3))

In [6]:
# Split the custom KG by the identifier namespace of each edge's endpoints.
# `regex=False` makes the namespace strings literal substrings: with the default
# regex matching, the '.' in 'pubchem.compound' would match ANY character.

# Drug -> protein edges: pubchem.compound source, ncbigene target.
custom_drug_edges = custom_df[
    custom_df['source'].str.contains('pubchem.compound', regex=False)
    & custom_df['target'].str.contains('ncbigene', regex=False)
]

# Protein -> protein edges: ncbigene on both ends.
custom_pp_edges = custom_df[
    custom_df['source'].str.contains('ncbigene', regex=False)
    & custom_df['target'].str.contains('ncbigene', regex=False)
]

# Display the (rows, columns) of both subsets as a sanity check.
custom_drug_edges.shape, custom_pp_edges.shape

((743, 3), (50657, 3))

# Saving the subset graph

In [7]:
# For each KG, stack the drug-protein edges on top of the protein-protein edges
# (row-wise concatenation; the original row labels are carried over).
openbiolink_filtered = pd.concat(
    objs=[openbiolink_drug_edges, openbiolink_pp_edges],
    axis=0,
)
custom_filtered = pd.concat(
    objs=[custom_drug_edges, custom_pp_edges],
    axis=0,
)

In [8]:
# Write both subset graphs to KG_DIR as tab-separated files, without the row index.
openbiolink_filtered.to_csv(
    f'{KG_DIR}/openbiolink_filtered.tsv',
    sep='\t',
    index=False,
)
custom_filtered.to_csv(
    f'{KG_DIR}/customkg_filtered.tsv',
    sep='\t',
    index=False,
)