In [1]:
from src.data.dataloader import SLAPDataset
from src.util.configuration import get_config
from src.util.definitions import DATA_ROOT, CONFIG_ROOT

from src.util.io import index_from_file

In [2]:
# resolve the YAML file under CONFIG_ROOT, then load it into `config`
config_file = CONFIG_ROOT / "config.yaml"
config = get_config(config_file)


In [3]:
# build the dataset: tunable options come from `config`, the rest is fixed here
dataset_options = dict(
    name=config["data_name"],
    raw_dir=DATA_ROOT,
    reaction=config["reaction"],
    smiles_columns=("SMILES", ),
    label_column="targets",
    graph_type=config["graph_type"],
    rdkit_features=config["rdkit_features"],
    ecfp6=True,
    featurizers=config["featurizers"],
)
data = SLAPDataset(**dataset_options)

# copy the feature sizes reported by the dataset back into the config
for size_key in ("atom_feature_size", "bond_feature_size", "global_feature_size"):
    config[size_key] = getattr(data, size_key)

# define split index files
# One dict per cross-validation fold, mapping split name -> CSV index file.
# The directory, fold count and split names are named once here instead of
# being repeated in every entry.
SPLIT_DIR = DATA_ROOT / "LCMS_split_763records"
N_FOLDS = 5
SPLIT_NAMES = ("train", "val", "test_0D", "test_1D", "test_2D")

split_files = [
    {split: SPLIT_DIR / f"fold{fold_id}_{split}.csv" for split in SPLIT_NAMES}
    for fold_id in range(N_FOLDS)
]


  return (1 + x**(-c))**(-d)


Done saving data into cached files.


In [4]:
# display the labels held by the dataset (the whole list is echoed as the cell output)
data.labels

[1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,


In [5]:
# Materialise the splits of every cross-validation fold.
# Previously each iteration overwrote `data_splitted`, so all but the last
# fold were built and then thrown away; they are now kept in `fold_splits`
# (one dict per fold, same order as `split_files`).
fold_splits = []
for fold in split_files:

    # load indices from file
    idx = {k: index_from_file(v) for k, v in fold.items()}

    # instantiate Dataset splits instead of DataLoaders
    fold_splits.append(
        {k: [data[sample_idx] for sample_idx in v] for k, v in idx.items()}
    )

# the cells below inspect a single fold through `data_splitted`;
# keep it bound to the last fold, exactly as the old loop left it
data_splitted = fold_splits[-1]
        
        

In [6]:
# validation split of the fold currently held in `data_splitted` (the last fold iterated above)
val = data_splitted["val"]

In [7]:
# echo the whole validation split; the output below starts with a list of per-sample tuples
val

[(Graph(num_nodes={'atom': 30, 'bond': 76},
        num_edges={('atom', 'starts', 'bond'): 76, ('bond', 'leads_to', 'atom'): 76, ('bond', 'leads_to', 'bond'): 142, ('bond', 'starts_at', 'atom'): 76},
        metagraph=[('atom', 'bond', 'starts'), ('bond', 'atom', 'leads_to'), ('bond', 'atom', 'starts_at'), ('bond', 'bond', 'leads_to')]),
  [0.5984106997893455,
   0.05286935483606901,
   0.15185102670881218,
   0.28212289191591505,
   0.31811914407410613,
   0.1232438571452347,
   0.30988831349624524,
   0.7255094826427458,
   0.6319388183852803,
   0.9471142784858387,
   0.3633180690891452,
   0.49293744141676615,
   0.4002016987634654,
   0.4881498227939689,
   0.6891775988077399,
   2.1752345554537253e-08,
   6.893568743986756e-07,
   0.208748781183144,
   0.1415781946468176,
   0.6100510112355793,
   0.8489495578328278,
   5.856109102466708e-17,
   1.0887143007578768e-06,
   0.9267776169971761,
   0.612947919254031,
   0.1480024234700374,
   0.6162195994094224,
   0.3580235802675854

In [10]:
# check the container type of the third element of the first validation sample
type(val[0][2])

list

In [11]:
# training split of the fold currently held in `data_splitted`
train = data_splitted["train"]

In [12]:
# transpose the list of per-sample 4-tuples into one list per component
train_graphs, train_global_features, train_fingerprints, train_labels = (
    list(component) for component in zip(*train)
)

In [16]:
# peek at the first five training graphs only
train_graphs[:5]

[Graph(num_nodes={'atom': 28, 'bond': 72},
       num_edges={('atom', 'starts', 'bond'): 72, ('bond', 'leads_to', 'atom'): 72, ('bond', 'leads_to', 'bond'): 144, ('bond', 'starts_at', 'atom'): 72},
       metagraph=[('atom', 'bond', 'starts'), ('bond', 'atom', 'leads_to'), ('bond', 'atom', 'starts_at'), ('bond', 'bond', 'leads_to')]),
 Graph(num_nodes={'atom': 35, 'bond': 86},
       num_edges={('atom', 'starts', 'bond'): 86, ('bond', 'leads_to', 'atom'): 86, ('bond', 'leads_to', 'bond'): 154, ('bond', 'starts_at', 'atom'): 86},
       metagraph=[('atom', 'bond', 'starts'), ('bond', 'atom', 'leads_to'), ('bond', 'atom', 'starts_at'), ('bond', 'bond', 'leads_to')]),
 Graph(num_nodes={'atom': 33, 'bond': 82},
       num_edges={('atom', 'starts', 'bond'): 82, ('bond', 'leads_to', 'atom'): 82, ('bond', 'leads_to', 'bond'): 164, ('bond', 'starts_at', 'atom'): 82},
       metagraph=[('atom', 'bond', 'starts'), ('bond', 'atom', 'leads_to'), ('bond', 'atom', 'starts_at'), ('bond', 'bond', 'lea