# Train - Test - Validation Splits

In [1]:
import pandas as pd
import os.path as osp
import os
import networkx as nx
from copy import deepcopy

## Load in the final KGs

In [2]:
KG_DIR = '../data/kg'

In [3]:
kg = pd.read_csv(osp.join(KG_DIR, 'final_kg.tsv'), sep='\t')
kg.drop_duplicates(inplace=True)

kg_protclass = pd.read_csv(osp.join(KG_DIR, 'final_kg_subclassed.tsv'), sep='\t')
kg_protclass.drop_duplicates(inplace=True)

In [4]:
drug_bp_pairs = kg.loc[kg['edge_type'] == 'induces']

drug_bp_pairs_protclass = kg_protclass.loc[kg_protclass['edge_type'] == 'induces']

In [5]:
print(f"There are {len(drug_bp_pairs)} positive drug-BP pairs in the final KG")

print(f"There are {len(drug_bp_pairs_protclass)} positive drug-BP pairs in the protein-classed KG")

There are 1622 positive drug-BP pairs in the final KG
There are 1622 positive drug-BP pairs in the protein-classed KG
There are 457 positive drug-BP pairs in the small KG
There are 291 positive drug-BP pairs in the xsmall KG


## Load in the DrugMechDB pairs that go in the test set

In [6]:
dm_db_pairs = pd.read_csv(osp.join(KG_DIR, 'drugmechdb_triples.tsv'), sep='\t')
dm_db_pairs.drop_duplicates(inplace=True)

print(f"{len(dm_db_pairs)} additional drug-BP pairs come from DrugMechDB, constituting {len(dm_db_pairs)/len(drug_bp_pairs)*100}%")

48 additional drug-BP pairs come from DrugMechDB, constituting 2.9593094944512948%


## Specify which pairs are able to be matched:

MARS / MARS can match pairs as long as they are 4 or less hops apart (our hyperparameter setting) and not connected via an inverse _CtBP edge.

In [9]:
G = nx.DiGraph()

for i, row in kg.iterrows():
    if row['edge_type'] == 'induces':
        continue
    src_id = row['source']
    trgt_id = row['target']
    if src_id not in G.nodes:
        G.add_node(src_id, type=row['source_node_type'])
    if trgt_id not in G.nodes:
        G.add_node(trgt_id, type=row['target_node_type'])
    G.add_edge(src_id, trgt_id, type=row['edge_type'])

Gp = nx.DiGraph()

for i, row in kg_protclass.iterrows():
    if row['edge_type'] == 'induces':
        continue
    src_id = row['source']
    trgt_id = row['target']
    if src_id not in Gp.nodes:
        Gp.add_node(src_id, type=row['source_node_type'])
    if trgt_id not in Gp.nodes:
        Gp.add_node(trgt_id, type=row['target_node_type'])
    Gp.add_edge(src_id, trgt_id, type=row['edge_type'])

Check the unmatched pairs for the regular, full KG:

In [10]:
unmatched_pairs = set()
unmatched_dm_db_pairs = set()

path_lengths = dict()

for i, row in drug_bp_pairs.iterrows():
    if not nx.has_path(G, row['source'], row['target']):
        unmatched_pairs.add(i)
    elif 1 > nx.shortest_path_length(G, row['source'], row['target']) > 4:
        unmatched_pairs.add(i)
    else:
        path_lengths[f"{(row['source'], row['target'])}"] = nx.shortest_path_length(G, row['source'], row['target'])

for i, row in dm_db_pairs.iterrows():
    if not nx.has_path(G, row['source'], row['target']):
        unmatched_dm_db_pairs.add(i)
    elif 1 > nx.shortest_path_length(G, row['source'], row['target']) > 4:
        unmatched_dm_db_pairs.add(i)

In [11]:
print(f"{len(unmatched_pairs)} drug-BP pairs in the final KG are not connected via 4 hops or less")
print(f"{len(unmatched_dm_db_pairs)} drug-BP pairs from DrugMechDB are not connected via 4 hops or less")

26 drug-BP pairs in the final KG are not connected via 4 hops or less
0 drug-BP pairs from DrugMechDB are not connected via 4 hops or less


Check the unmatched pairs for the protein-subclassed KG:

In [12]:
unmatched_pairs_p = set()
unmatched_dm_db_pairs_p = set()

path_lengths_p = dict()

for i, row in drug_bp_pairs_protclass.iterrows():
    if not nx.has_path(Gp, row['source'], row['target']):
        unmatched_pairs_p.add(i)
    elif 1 > nx.shortest_path_length(Gp, row['source'], row['target']) > 4:
        unmatched_pairs_p.add(i)
    else:
        path_lengths_p[f"{(row['source'], row['target'])}"] = nx.shortest_path_length(Gp, row['source'], row['target'])

for i, row in dm_db_pairs.iterrows():
    if not nx.has_path(Gp, row['source'], row['target']):
        unmatched_dm_db_pairs_p.add(i)
    elif 1 > nx.shortest_path_length(Gp, row['source'], row['target']) > 4:
        unmatched_dm_db_pairs_p.add(i)

In [13]:
print(f"{len(unmatched_pairs_p)} drug-BP pairs in the protein-classed KG are not connected via 4 hops or less")
print(f"{len(unmatched_dm_db_pairs_p)} drug-BP pairs from DrugMechDB are not connected via 4 hops or less")

26 drug-BP pairs in the protein-classed KG are not connected via 4 hops or less
0 drug-BP pairs from DrugMechDB are not connected via 4 hops or less


In [18]:
# Remove unmatched pairs from the final KG
drug_bp_pairs.drop(list(unmatched_pairs), inplace=True)
drug_bp_pairs.reset_index(drop=True, inplace=True)

# Remove unmatched pairs from the protein-classed KG
drug_bp_pairs_protclass.drop(list(unmatched_pairs_p), inplace=True)
drug_bp_pairs_protclass.reset_index(drop=True, inplace=True)

# Remove unmatched pairs from DrugMechDB
# make copies of it for multiple KG variants:
dm_db_pairs_protclass = deepcopy(dm_db_pairs)

dm_db_pairs.drop(list(unmatched_dm_db_pairs), inplace=True)
dm_db_pairs.reset_index(drop=True, inplace=True)

dm_db_pairs_protclass.drop(list(unmatched_dm_db_pairs_p), inplace=True)
dm_db_pairs_protclass.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_bp_pairs.drop(list(unmatched_pairs), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_bp_pairs_protclass.drop(list(unmatched_pairs_p), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_bp_pairs_small.drop(list(unmatched_pairs_small), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Add their categories (path lengths) in:


## Create the Splits

In the original PoLo example, they use different proportions for the splits, but let's go with something most similar to their Hetionet example, in which they do an approximate 60/20/20% split.

Note that the DrugMechDB examples need to be in the test set.

First, we'll exclude the subset of DrugMechDB examples that are in the test set.

Then, we'll split the remaining examples into train, validation, and test sets, with the test set accumulating to 20% with the DrugMechDB examples.

In [19]:
total_positives = len(drug_bp_pairs) + len(dm_db_pairs)

total_positives_pc = len(drug_bp_pairs_protclass) + len(dm_db_pairs_protclass)

So we need to get the following numbers from the KG positive examples for train, validation, and test sets:

In [20]:
proportions = round(0.6 * total_positives), round(0.2 * total_positives), round(0.2 * total_positives) - len(dm_db_pairs)
print(proportions)

proportions_pc = round(0.6 * total_positives_pc), round(0.2 * total_positives_pc), round(0.2 * total_positives_pc) - len(dm_db_pairs_protclass)
print(proportions_pc)

(986, 329, 281)
(986, 329, 281)
(295, 98, 63)
(189, 63, 39)


In [21]:
# write a function which separates the dataframe into train, val and test sets of defined sizes
def train_test_split(df, train_size, val_size, test_size):
    df = df.sample(frac=1, random_state=7).reset_index(drop=True)
    train = df[:train_size]
    val = df[train_size:train_size+val_size]
    test = df[train_size+val_size:train_size+val_size+test_size]
    return train, val, test

In [22]:
train, val, test = train_test_split(drug_bp_pairs, proportions[0], proportions[1], proportions[2])

train_pc, val_pc, test_pc = train_test_split(drug_bp_pairs_protclass, proportions_pc[0], proportions_pc[1], proportions_pc[2])

Check it did what we want:

In [23]:
len(train), len(val), len(test)

(986, 329, 281)

In [24]:
len(train_pc), len(val_pc), len(test_pc)

(986, 329, 281)

No overlap?

In [27]:
train_pairs = {(row['source'], row['target']) for i, row in train.iterrows()}
test_pairs = {(row['source'], row['target']) for i, row in test.iterrows()}
val_pairs = {(row['source'], row['target']) for i, row in val.iterrows()}

In [28]:
train_pairs & test_pairs

set()

In [29]:
train_pairs & val_pairs

set()

In [30]:
test_pairs & val_pairs

set()

Overlap in the protein-classed one?

In [31]:
train_pairs = {(row['source'], row['target']) for i, row in train_pc.iterrows()}
test_pairs = {(row['source'], row['target']) for i, row in test_pc.iterrows()}
val_pairs = {(row['source'], row['target']) for i, row in val_pc.iterrows()}

In [32]:
train_pairs & test_pairs

set()

In [33]:
train_pairs & val_pairs

set()

In [34]:
test_pairs & val_pairs

set()

Good, no overlap. Add the DrugMechDB examples to the test set:

In [43]:
test = pd.concat([test, dm_db_pairs]).sample(frac=1, random_state=7).reset_index(drop=True)

test_pc = pd.concat([test_pc, dm_db_pairs_protclass]).sample(frac=1, random_state=7).reset_index(drop=True)

In [44]:
print(len(test))
print(len(test_pc))

329
329
98
63


Take the test and validation sets out the KG:

In [45]:
kg = kg.loc[kg['edge_type'] != 'induces']
kg_mars = pd.concat([kg, train]).sample(frac=1, random_state=7).reset_index(drop=True)

kg_protclass = kg_protclass.loc[kg_protclass['edge_type'] != 'induces']
kg_mars_pc = pd.concat([kg_protclass, train_pc]).sample(frac=1, random_state=7).reset_index(drop=True)

In [46]:
len(kg_mars.loc[kg_mars['edge_type'] == 'induces']) == len(train)

True

In [47]:
len(kg_mars_pc.loc[kg_mars_pc['edge_type'] == 'induces']) == len(train_pc)

True

Write everything to files:

In [50]:
SPLITS_DIR = osp.join(KG_DIR, 'splits')
MOA_NET = osp.join(SPLITS_DIR, 'MoA-net')
MOA_NET_PROTCLASSED = osp.join(SPLITS_DIR, 'MoA-net-protclass')

# create the above directories if they do not exist:
for dir in [SPLITS_DIR, MOA_NET, MOA_NET_PROTCLASSED]:
    if not osp.exists(dir):
        os.makedirs(dir)

In [51]:
kg.to_csv(osp.join(MOA_NET, 'kg_no_cmp_bp.tsv'), sep='\t', index=False)
kg_mars.to_csv(osp.join(MOA_NET, 'kg_with_train_smpls.tsv'), sep='\t', index=False)

train.to_csv(osp.join(MOA_NET, 'train.tsv'), sep='\t', index=False)
val.to_csv(osp.join(MOA_NET, 'dev.tsv'), sep='\t', index=False)
test.to_csv(osp.join(MOA_NET, 'test.tsv'), sep='\t', index=False)

In [52]:
kg_protclass.to_csv(osp.join(MOA_NET_PROTCLASSED, 'kg_no_cmp_bp.tsv'), sep='\t', index=False)
kg_mars_pc.to_csv(osp.join(MOA_NET_PROTCLASSED, 'kg_with_train_smpls.tsv'), sep='\t', index=False)

train_pc.to_csv(osp.join(MOA_NET_PROTCLASSED, 'train.tsv'), sep='\t', index=False)
val_pc.to_csv(osp.join(MOA_NET_PROTCLASSED, 'dev.tsv'), sep='\t', index=False)
test_pc.to_csv(osp.join(MOA_NET_PROTCLASSED, 'test.tsv'), sep='\t', index=False)