In [1]:
import os
import pandas as pd
import networkx as nx
from collections import defaultdict, Counter
import random
import numpy as np
import json
from data_loader_cross_val.cv_data_loaders import CrossValidationDataLoader

random.seed(42)

In [2]:
# Basic config parser for testing purposes

class Config:
    def __init__(self, config_path="config.json"):
        with open(config_path, 'r') as f:
            config_data = json.load(f)
        self._set_attributes(config_data)

    def _set_attributes(self, config_data):
        for key, value in config_data.items():
            if isinstance(value, dict):
                # Recursively create Config instances for nested dictionaries
                setattr(self, key, Config.from_dict(value))
            else:
                setattr(self, key, value)

    @classmethod
    def from_dict(cls, data_dict):
        config_instance = cls.__new__(cls)  # Avoid calling __init__
        config_instance._set_attributes(data_dict)
        return config_instance

In [3]:
conf = Config('config/DrugCombDB_config.json')

In [4]:
data_loader = CrossValidationDataLoader(conf.data_loader.args.data_dir,
                         conf.data_loader.args.batch_size,
                         conf.data_loader.args.score,
                         conf.data_loader.args.n_hop,
                         conf.data_loader.args.n_memory,
                         shuffle=False,
                         num_folds = 5,
                         num_workers=2)

undirected graph
# proteins: 15970, # drugs: 764, # cells: 76
# protein-protein interactions: 217160, # drug-protein associations: 5290, # cell-protein associations: 27730
constructing neighbor set ...
constructing neighbor set ...


In [5]:
drug_combo_df = data_loader.drug_combination_df
drug_combo_df

Unnamed: 0,cell,drug1_db,drug2_db,synergistic
34669,3,267,67,0
21168,65,187,336,0
64069,24,289,13,1
24346,29,547,390,0
4874,5,66,680,1
...,...,...,...,...
49100,1,397,187,1
20609,11,96,336,1
21440,42,13,336,0
50057,21,106,96,1


In [6]:
def fold_indices(df, n_folds=5):
    df_copy = df.copy()
    df_copy['drug_combination'] = df_copy.apply(lambda row: tuple(sorted([row['drug1_db'], row['drug2_db']])), axis=1)
    
    f = defaultdict(list)
    for idx, combo in enumerate(df_copy['drug_combination'].unique()):
        row_idx = df.loc[df_copy['drug_combination'] == combo].index
        fold = idx % n_folds
        f[fold].extend(row_idx)
        
    return f

In [7]:
for fold_id in data_loader.get_fold_indices().keys():
    data_loader.set_folds(fold_id)

    train_loader = data_loader.get_train_loader()
    val_loader = data_loader.get_val_loader()
    test_loader = data_loader.get_test_loader()

    print(f"Fold {fold_id}:")
    print(f"  Training set: {len(train_loader.dataset)} samples")
    print(f"  Validation set: {len(val_loader.dataset)} samples")
    print(f"  Testing set: {len(test_loader.dataset)} samples")

Fold 0:
  Training set: 41659 samples
  Validation set: 13822 samples
  Testing set: 13955 samples
Fold 1:
  Training set: 41537 samples
  Validation set: 13955 samples
  Testing set: 13944 samples
Fold 2:
  Training set: 41482 samples
  Validation set: 13944 samples
  Testing set: 14010 samples
Fold 3:
  Training set: 41721 samples
  Validation set: 14010 samples
  Testing set: 13705 samples
Fold 4:
  Training set: 41909 samples
  Validation set: 13705 samples
  Testing set: 13822 samples


In [8]:
folds = fold_indices(drug_combo_df)

In [9]:
for key in folds.keys():
    print(folds[key][:10])

assert sum([len(x) for x in folds.values()]) == len(drug_combo_df)
print(drug_combo_df)

[34669, 34672, 34666, 34680, 34679, 34687, 34684, 34656, 34689, 34675]
[21168, 8955, 10099, 15332, 30047, 19078, 29655, 15102, 11726, 30785]
[64069, 10257, 64071, 9171, 64067, 64055, 13141, 20150, 19221, 30625]
[24346, 25418, 9762, 14869, 22842, 3394, 13572, 26392, 20417, 20732]
[4874, 20714, 5166, 28653, 20398, 24611, 13601, 26677, 19493, 11156]
       cell  drug1_db  drug2_db  synergistic
34669     3       267        67            0
21168    65       187       336            0
64069    24       289        13            1
24346    29       547       390            0
4874      5        66       680            1
...     ...       ...       ...          ...
49100     1       397       187            1
20609    11        96       336            1
21440    42        13       336            0
50057    21       106        96            1
5192     71       317       110            0

[69436 rows x 4 columns]


In [10]:
for rows in folds.values():
    subset = drug_combo_df.iloc[rows]
    synergy_dist = subset['synergistic'].value_counts()
    print(synergy_dist)
    print('Number of unique cell lines: ', subset['cell'].nunique())
    

synergistic
0    7550
1    6272
Name: count, dtype: int64
Number of unique cell lines:  72
synergistic
0    7657
1    6298
Name: count, dtype: int64
Number of unique cell lines:  72
synergistic
0    7569
1    6375
Name: count, dtype: int64
Number of unique cell lines:  70
synergistic
0    7644
1    6366
Name: count, dtype: int64
Number of unique cell lines:  71
synergistic
0    7393
1    6312
Name: count, dtype: int64
Number of unique cell lines:  72


In [11]:
cell_counts = drug_combo_df['cell'].value_counts()
print(cell_counts[cell_counts < 10])

cell
22    2
68    1
Name: count, dtype: int64
