# Notebook to extract base networks from raw data.

In [1]:
import os

import pandas as pd
from tqdm import tqdm

In [2]:
# Directory the filtered edge lists are written to; the path is relative,
# so it resolves against the notebook's working directory.
KG_DIR = '../data/kg'

# Load raw KGs

Both knowledge graphs (OpenBioLink and the custom KG) are loaded from the [drug2ways](https://github.com/drug2ways/results/tree/master) results repository.

In [3]:
# OpenBioLink edge list, a tab-separated file fetched from the drug2ways
# results repository. `read_table` uses '\t' as its default separator.
openbiolink_df = pd.read_table(
    'https://raw.githubusercontent.com/drug2ways/results/master/networks/data/openbiolink_network.tsv'
)
# Preview the first two edges.
openbiolink_df.head(2)

Unnamed: 0,source,target,relation
0,PUBCHEM.COMPOUND:1,NCBIGENE:3162,1
1,PUBCHEM.COMPOUND:1,NCBIGENE:3308,1


In [4]:
# Custom KG edge list, a tab-separated file fetched from the drug2ways
# results repository. `read_table` uses '\t' as its default separator.
custom_df = pd.read_table(
    'https://raw.githubusercontent.com/drug2ways/results/master/networks/data/custom_network.tsv'
)
# Preview the first two edges.
custom_df.head(2)

Unnamed: 0,source,target,relation,source_database
0,drugbank:DB00818,HGNC:4083,1,drugbank
1,drugbank:DB00139,HGNC:408,-1,drugbank


# Subsetting only drug-protein and protein-protein edges

OpenBioLink identifies drugs with `PUBCHEM.COMPOUND` ids and proteins with `NCBIGENE` ids, while the custom KG uses `drugbank` ids for compounds and `HGNC` ids for genes.

In [5]:
# Classify each edge endpoint by the identifier namespace it contains.
# `regex=False` makes the match literal: by default `str.contains` treats the
# pattern as a regular expression, in which the '.' of 'PUBCHEM.COMPOUND'
# would match any character rather than a literal dot.
openbiolink_source_is_drug = openbiolink_df['source'].str.contains('PUBCHEM.COMPOUND', regex=False)
openbiolink_source_is_gene = openbiolink_df['source'].str.contains('NCBIGENE', regex=False)
openbiolink_target_is_gene = openbiolink_df['target'].str.contains('NCBIGENE', regex=False)

# Drug -> protein edges: compound source, gene target.
openbiolink_drug_edges = openbiolink_df[openbiolink_source_is_drug & openbiolink_target_is_gene]

# Protein -> protein edges: gene source, gene target.
openbiolink_pp_edges = openbiolink_df[openbiolink_source_is_gene & openbiolink_target_is_gene]

# Display how many edges of each kind were kept.
openbiolink_drug_edges.shape, openbiolink_pp_edges.shape

((32761, 3), (39154, 3))

In [6]:
# Classify each edge endpoint of the custom KG by the identifier namespace
# it contains ('drugbank' for compounds, 'HGNC' for genes).
custom_source_is_drug = custom_df['source'].str.contains('drugbank')
custom_source_is_gene = custom_df['source'].str.contains('HGNC')
custom_target_is_gene = custom_df['target'].str.contains('HGNC')

# Drug -> protein edges: drugbank source, HGNC target.
custom_drug_edges = custom_df.loc[custom_source_is_drug & custom_target_is_gene]

# Protein -> protein edges: HGNC source, HGNC target.
custom_pp_edges = custom_df.loc[custom_source_is_gene & custom_target_is_gene]

# Display how many edges of each kind were kept.
custom_drug_edges.shape, custom_pp_edges.shape

((4673, 4), (50764, 4))

# Saving the subset graphs

In [7]:
# For each KG, stack the drug-protein edges on top of the protein-protein
# edges to form the filtered network (row-wise concatenation).
openbiolink_filtered = pd.concat(
    objs=[openbiolink_drug_edges, openbiolink_pp_edges],
    axis=0,
)
custom_filtered = pd.concat(
    objs=[custom_drug_edges, custom_pp_edges],
    axis=0,
)

In [8]:
# Make sure the output directory exists first: `to_csv` does not create
# missing parent directories, so writing would fail on a fresh checkout
# where KG_DIR has not been created yet.
os.makedirs(KG_DIR, exist_ok=True)

# Write both filtered edge lists as tab-separated files, omitting the
# DataFrame index so only the edge columns are stored.
openbiolink_filtered.to_csv(f'{KG_DIR}/openbiolink_filtered.tsv', sep='\t', index=False)
custom_filtered.to_csv(f'{KG_DIR}/customkg_filtered.tsv', sep='\t', index=False)