# Train - Test - Validation Splits

In [1]:
import os
import os.path as osp

import pandas as pd

## Load in the final KG

In [2]:
KG_DIR = '../data/kg'

In [3]:
# Read the final KG and drop exact duplicate rows in a single chained expression
kg = pd.read_csv(osp.join(KG_DIR, 'final_kg.tsv'), sep='\t').drop_duplicates()

In [4]:
# The positive drug-BP examples are the KG edges whose edge_type is 'induces'
drug_bp_pairs = kg.loc[kg['edge_type'].eq('induces')]

In [5]:
print(f"There are {len(drug_bp_pairs)} positive drug-BP pairs in the final KG")

There are 1138 positive drug-BP pairs in the final KG


## Load in the DrugMechDB pairs that go in the test set

In [6]:
# Read the DrugMechDB drug-BP pairs and drop exact duplicate rows in a single chained expression
dm_db_pairs = pd.read_csv(osp.join(KG_DIR, 'test.tsv'), sep='\t').drop_duplicates()

In [7]:
print(f"{len(dm_db_pairs)} additional drug-BP pairs come from DrugMechDB, constituting {len(dm_db_pairs)/len(drug_bp_pairs)*100}%")

40 additional drug-BP pairs come from DrugMechDB, constituting 3.5149384885764503%


The original PoLo example uses different proportions for the splits, but let's go with something close to their Hetionet example, which uses an approximate 60/20/20% split.

Note that the DrugMechDB examples need to be in the test set.

First, we'll set aside the DrugMechDB examples, since all of them go into the test set.

Then, we'll split the remaining examples into train, validation, and test sets, so that the test set reaches 20% of all positives once the DrugMechDB examples are added.

In [8]:
total_positives = len(drug_bp_pairs) + len(dm_db_pairs)

So we need to get the following numbers from the KG positive examples for train, validation, and test sets:

In [9]:
# Number of rows to draw from the KG positives for (train, val, test):
# 60/20/20% of all positives, with the test share reduced by the DrugMechDB
# pairs that are added to the test set afterwards.
proportions = (
    round(0.6 * total_positives),
    round(0.2 * total_positives),
    round(0.2 * total_positives) - len(dm_db_pairs),
)
proportions

(707, 236, 196)

In [10]:
# Shuffle a dataframe and cut it into consecutive train, val and test slices of defined sizes
def train_test_split(df, train_size, val_size, test_size, random_state=7):
    """Shuffle ``df`` and split it into three non-overlapping row slices.

    The rows are shuffled once, then the first ``train_size`` rows become the
    training set, the next ``val_size`` rows the validation set and the next
    ``test_size`` rows the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. The input frame itself is not modified.
    train_size, val_size, test_size : int
        Number of rows requested for each split.
    random_state : int, default 7
        Seed passed to ``DataFrame.sample`` for the shuffle. The default keeps
        the shuffle used by existing calls.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``. Each slice keeps the positional index of the
        shuffled frame, so only ``train`` starts at index 0.

    Notes
    -----
    If the three sizes add up to more rows than ``df`` has, no error is
    raised: the later slices simply come back shorter than requested.
    """
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Cumulative cut points into the shuffled frame
    val_end = train_size + val_size
    test_end = val_end + test_size

    train = shuffled.iloc[:train_size]
    val = shuffled.iloc[train_size:val_end]
    test = shuffled.iloc[val_end:test_end]
    return train, val, test

In [11]:
# proportions holds (train, val, test) sizes in the order the function expects
train, val, test = train_test_split(drug_bp_pairs, *proportions)

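Check it did what we want. (The test slice has 195 rows rather than the requested 196: after rounding, the three requested sizes sum to 1139, but only 1138 KG pairs are available.)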

In [12]:
len(train), len(val), len(test)

(707, 236, 195)

No overlap?

In [13]:
# Collect each split's (source, target) pairs as a set so that overlaps can be
# checked by set intersection. Zipping the two columns yields the same pairs as
# looping over iterrows(), without building a Series object for every row.
train_pairs = set(zip(train['source'], train['target']))
test_pairs = set(zip(test['source'], test['target']))
val_pairs = set(zip(val['source'], val['target']))

In [14]:
train_pairs & test_pairs

set()

In [15]:
train_pairs & val_pairs

set()

In [16]:
test_pairs & val_pairs

set()

Good, no overlap. Add the drugmechDB examples to the test set:

In [17]:
# Append the DrugMechDB pairs to the KG-derived test rows, then reshuffle and renumber.
# NOTE(review): this cell reassigns `test` from itself, so running it twice appends
# dm_db_pairs twice. It also does not check dm_db_pairs against the train/val pairs;
# confirm they cannot overlap.
test = (
    pd.concat([test, dm_db_pairs])
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)

In [18]:
len(test)

235

Take the test and validation sets out of the KG:

In [19]:
# Strip every 'induces' edge from the KG, then build a second, shuffled copy
# that has only the training 'induces' pairs added back.
kg = kg.loc[kg['edge_type'].ne('induces')]
kg_polo = (
    pd.concat([kg, train])
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)

In [20]:
# Sanity check: kg_polo should hold exactly as many 'induces' edges as there are training pairs
len(kg_polo.loc[kg_polo['edge_type'] == 'induces']) == len(train)

True

Write everything to files:

In [21]:
SPLITS_DIR = osp.join(KG_DIR, 'splits')

In [22]:
# Make sure the output directory exists first: to_csv fails if the target
# directory is missing, which would break a run on a fresh checkout.
os.makedirs(SPLITS_DIR, exist_ok=True)

# The KG without any 'induces' edges, and the KG with the training pairs added back
kg.to_csv(osp.join(SPLITS_DIR, 'kg_no_cmp_bp.tsv'), sep='\t', index=False)
kg_polo.to_csv(osp.join(SPLITS_DIR, 'kg_with_train_smpls.tsv'), sep='\t', index=False)

# The positive-pair splits (the validation split is written as dev.tsv)
train.to_csv(osp.join(SPLITS_DIR, 'train.tsv'), sep='\t', index=False)
val.to_csv(osp.join(SPLITS_DIR, 'dev.tsv'), sep='\t', index=False)
test.to_csv(osp.join(SPLITS_DIR, 'test.tsv'), sep='\t', index=False)

For the PoLo files, let's get them into a format suitable for PoLo:

In [23]:
POLO_DIR = osp.join(SPLITS_DIR, 'PoLo')