In [1]:
import os
import pandas as pd
import networkx as nx
from collections import defaultdict, Counter
import random
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import json

from data_loader_cross_val.cv_data_loaders import CrossValidationDataLoader

In [2]:
# Basic config parser for testing purposes

class Config:
    """Attribute-style view over a JSON configuration.

    Every key of the loaded JSON object becomes an attribute of the instance.
    Values that are dictionaries are wrapped in nested ``Config`` instances,
    so ``conf.section.option`` works; all other values (including lists) are
    stored unchanged.
    """

    def __init__(self, config_path="config.json"):
        """Load ``config_path`` and expose its keys as attributes.

        Args:
            config_path: Path to a JSON file whose top-level value is an object.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # Pin the encoding: without it open() falls back to the platform
        # default, which can mis-decode non-ASCII values in a UTF-8 JSON file.
        with open(config_path, 'r', encoding='utf-8') as f:
            config_data = json.load(f)
        self._set_attributes(config_data)

    def _set_attributes(self, config_data):
        """Copy each key/value pair of ``config_data`` onto this instance."""
        for key, value in config_data.items():
            if isinstance(value, dict):
                # Recursively create Config instances for nested dictionaries
                setattr(self, key, Config.from_dict(value))
            else:
                setattr(self, key, value)

    @classmethod
    def from_dict(cls, data_dict):
        """Build an instance directly from a dictionary, without reading a file.

        Args:
            data_dict: Mapping whose items become attributes of the new instance.

        Returns:
            A new instance populated from ``data_dict``.
        """
        config_instance = cls.__new__(cls)  # Avoid calling __init__
        config_instance._set_attributes(data_dict)
        return config_instance

In [3]:
# Parse the DrugCombDB experiment configuration into an attribute-style object.
conf = Config(config_path='config/DrugCombDB_config.json')

In [4]:
# All positional arguments come from the `data_loader.args` section of the
# config; shuffling is turned off and the data is split into five folds.
loader_args = conf.data_loader.args
data_loader = CrossValidationDataLoader(
    loader_args.data_dir,
    loader_args.batch_size,
    loader_args.score,
    loader_args.n_hop,
    loader_args.n_memory,
    shuffle=False,
    num_folds=5,
    num_workers=2,
)

undirected graph
# proteins: 15970, # drugs: 764, # cells: 76
# protein-protein interactions: 217160, # drug-protein associations: 5290, # cell-protein associations: 27730
constructing neighbor set ...
constructing neighbor set ...


In [None]:
# Drug-combination table exposed by the data loader. Later cells read its
# 'drug1_db', 'drug2_db', 'synergistic' and 'cell' columns.
drug_combo_df = data_loader.drug_combination_df
# Bare last expression so the notebook renders the frame.
drug_combo_df

In [None]:
def fold_indices(df, n_folds=5):
    """Assign rows to folds so that each unordered drug pair stays in one fold.

    Rows are grouped by the sorted ``(drug1_db, drug2_db)`` pair, so A+B and
    B+A count as the same combination. Pairs are numbered in order of first
    appearance and dealt round-robin to the folds (``pair_number % n_folds``).

    Args:
        df: DataFrame with ``drug1_db`` and ``drug2_db`` columns. It is not
            modified.
        n_folds: Number of folds; must be at least 1.

    Returns:
        ``defaultdict(list)`` mapping fold number to the list of index
        *labels* of ``df`` (use ``df.loc`` to select them). Only folds that
        received at least one pair are present.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")

    # Single pass over the rows: bucket row labels by unordered drug pair.
    # A dict keeps insertion order, so pairs are numbered by first appearance.
    rows_by_combo = {}
    for row_label, drug_a, drug_b in zip(df.index, df['drug1_db'], df['drug2_db']):
        combo = tuple(sorted([drug_a, drug_b]))
        rows_by_combo.setdefault(combo, []).append(row_label)

    f = defaultdict(list)
    for idx, row_idx in enumerate(rows_by_combo.values()):
        fold = idx % n_folds
        f[fold].extend(row_idx)

    return f

In [5]:
# Activate each fold in turn and report how many samples land in each split.
fold_ids = data_loader.get_fold_indices().keys()
for fold_id in fold_ids:
    data_loader.set_folds(fold_id)

    # Build the three loaders for the currently selected fold.
    train_loader, val_loader, test_loader = (
        data_loader.get_train_loader(),
        data_loader.get_val_loader(),
        data_loader.get_test_loader(),
    )

    print(f"Fold {fold_id}:")
    print(f"  Training set: {len(train_loader.dataset)} samples")
    print(f"  Validation set: {len(val_loader.dataset)} samples")
    print(f"  Testing set: {len(test_loader.dataset)} samples")

Fold 0:
  Training set: 41659 samples
  Validation set: 13822 samples
  Testing set: 13955 samples
Fold 1:
  Training set: 41537 samples
  Validation set: 13955 samples
  Testing set: 13944 samples
Fold 2:
  Training set: 41482 samples
  Validation set: 13944 samples
  Testing set: 14010 samples
Fold 3:
  Training set: 41721 samples
  Validation set: 14010 samples
  Testing set: 13705 samples
Fold 4:
  Training set: 41909 samples
  Validation set: 13705 samples
  Testing set: 13822 samples


In [None]:
# Split the drug-combination table into folds (default number of folds).
folds = fold_indices(df=drug_combo_df)

In [None]:
# Peek at the first ten row indices assigned to each fold.
for fold_rows in folds.values():
    print(fold_rows[:10])

# The fold sizes must add up to the number of rows in the table.
assert sum(len(fold_rows) for fold_rows in folds.values()) == len(drug_combo_df)
print(drug_combo_df)

In [None]:
# Per fold: class balance of the 'synergistic' column and number of distinct cell lines.
for rows in folds.values():
    # `fold_indices` collects index *labels* (taken from `.index`), so select
    # by label. Positional `.iloc` would only agree with the labels when the
    # frame has a default 0..n-1 index.
    subset = drug_combo_df.loc[rows]
    synergy_dist = subset['synergistic'].value_counts()
    print(synergy_dist)
    print('Number of unique cell lines: ', subset['cell'].nunique())
    

In [None]:
# Count rows per cell line, then list the cell lines with fewer than 10 rows.
cell_counts = drug_combo_df['cell'].value_counts()
print(cell_counts.loc[cell_counts.lt(10)])