In [4]:
# imports
from ogb.graphproppred import DglGraphPropPredDataset
from data_functions import create_smiles_files, handle_raw_data, get_processed_data, get_split_data
import numpy as np
import pandas as pd
import warnings
from igraph import *
import sys
# sys.path.append('../models')
# from model_functions import get_split_data

In [2]:
# silence pandas' PerformanceWarning so it does not clutter the cell outputs below
warnings.simplefilter("ignore", pd.errors.PerformanceWarning)

### Graph Data

In [5]:
# load the ogbg-molhiv graph dataset (graphs are converted into DGL objects)
# on the first run this downloads and processes the data and creates the
# "ogbg-molhiv dataset" folder in the "data" folder
dataset = DglGraphPropPredDataset(
    name="ogbg-molhiv",
    root="ogbg-molhiv dataset/",
)

Downloading http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip


Downloaded 0.00 GB: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


Extracting ogbg-molhiv dataset/hiv.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 41127/41127 [00:01<00:00, 24075.77it/s]


Converting graphs into DGL objects...


100%|██████████| 41127/41127 [00:45<00:00, 901.46it/s] 


Saving...


In [6]:
# report how many items (molecules) the loaded dataset contains
print(f"the data is comprised of {len(dataset)} molecules")

the data is comprised of 41127 molecules


In [7]:
# default scaffold splitting: fetch the predefined index sets that come with the dataset
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]
# report each split's share of the full dataset as a rounded percentage
print(f"default scaffold splitting is done {round(len(train_idx)/len(dataset)*100)}% train | {round(len(valid_idx)/len(dataset)*100)}% valid | {round(len(test_idx)/len(dataset)*100)}% test")

default scaffold splitting is done 80% train | 10% valid | 10% test


#### Label Distribution

In [11]:
# label distribution over the whole dataset:
# element [1] of every dataset item is added up to count the positive labels
count = 0
for sample_idx in range(len(dataset)):
    count = count + dataset[sample_idx][1]
print(f"{int(count)} positive labels in {len(dataset)} data points ({round(int(count)*100/len(dataset),1)}%)")

1443 positive labels in 41127 data points (3.5%)


In [12]:
# across splits: training
# walk over the training indices directly and add up element [1] (the label) of each item
count = 0
for sample_idx in split_idx["train"]:
    count = count + dataset[sample_idx][1]

print(f"{int(count)} positive labels in {len(dataset[split_idx['train']])} training data points ({round(int(count)*100/len(dataset[split_idx['train']]),1)}%)")

1232 positive labels in 32901 training data points (3.7%)


In [13]:
# across splits: validation
# add up element [1] (the label) of every item referenced by the validation indices
count = 0
for i in range(len(dataset[split_idx["valid"]])):
    count += dataset[split_idx["valid"][i]][1]

# the message names the validation split (it previously said "training" by copy-paste)
print(f"{int(count)} positive labels in {len(dataset[split_idx['valid']])} validation data points ({round(int(count)*100/len(dataset[split_idx['valid']]), 1)}%)")

81 positive labels in 4113 training data points (2.0%)


In [14]:
# across splits: test
# add up element [1] (the label) of every item referenced by the test indices
count = 0
for i in range(len(dataset[split_idx["test"]])):
    count += dataset[split_idx["test"][i]][1]

# the message names the test split (it previously said "training" by copy-paste)
print(f"{int(count)} positive labels in {len(dataset[split_idx['test']])} test data points ({round(int(count)*100/len(dataset[split_idx['test']]), 1)}%)")

130 positive labels in 4113 training data points (3.2%)


### Featurization

The ogbg-molhiv dataset comes with a SMILES string representation for each molecule.

Based on the SMILES representation, descriptor calculators such as PaDEL-Descriptor can calculate descriptors and fingerprints as features.

"The molecular descriptor is the final result of a logic and mathematical procedure which transforms chemical information encoded within a symbolic representation of a molecule into a useful number or the result of some standardized experiment."

Descriptors and fingerprints represent human feature engineering.

This way, in addition to the original graph data, there is a tabular counterpart that featurizes each underlying molecule based on its SMILES string.

In [15]:
# prepare SMILES input for PaDEL-descriptor by creating a folder with .smi files
# if run for the 1st time, creates "ogbg-molhiv SMILES" folder in "data" folder
# NOTE(review): create_smiles_files is a project helper from data_functions; it is called
# without arguments, so its input/output locations are presumably fixed inside the helper — verify there
create_smiles_files()

The resulting folder is the input for the PaDEL-Descriptor program.

Due to problems with PaDEL-Descriptor's Python wrapper (padelpy), the calculation was done in a single run in the PaDEL-Descriptor program itself with the following settings, resulting in ```1D_2D_PubChemFP_SubFP_full_raw.csv```.

- 1D & 2D:          CHECK
- 3D:               -
- Fingerprints:     CHECK (PubChemFingerprinter + SubstructureFingerprinter)
- Remove salt:      CHECK
- Detect arom:      CHECK
- Stand. tautomers: -
- Stand. nitro:     CHECK
- Retain 3D coord.: -
- Convert to 3D:    No

Also, max. running time was set to 30,000 ms. 

The resulting .csv file should be saved as "featurized data/1D_2D_PubChemFP_SubFP_no-order_30s.csv", i.e. inside a newly created "featurized data" folder in the "data" folder.

The file can also be downloaded here: https://drive.google.com/file/d/1fRXMgHEO-bHSyZhjeGmduNjQin-VQI_J/view?usp=sharing

### Preprocessing of Featurized Data

In [28]:
# get the data: pass the PaDEL-Descriptor output file to the project helper
# per the cell output, the helper also saves its result to 'raw data/1D_2D_PubChemFP_SubFP_raw.csv'
# NOTE(review): what handle_raw_data changes in the table is not visible here — see data_functions
df_raw = handle_raw_data("featurized data/1D_2D_PubChemFP_SubFP_no-order_30s.csv")

saving file ...
saved to 'raw data/1D_2D_PubChemFP_SubFP_raw.csv'


In [29]:
# load raw featurized data and show its (rows, columns) shape
# NOTE(review): this re-reads the PaDEL-Descriptor output file and overwrites df_raw; it does not
# read the 'raw data/1D_2D_PubChemFP_SubFP_raw.csv' file that handle_raw_data reported saving —
# confirm which file is intended here
df_raw = pd.read_csv("featurized data/1D_2D_PubChemFP_SubFP_no-order_30s.csv")
df_raw.shape

(41127, 2633)

The featurization process yielded 2632 descriptors (2633 columns including the label).

Now, we can apply basic preprocessing steps.

1. remove features with missing values and low variance (<0.05)
2. remove features with high correlations (r>0.95) with other features
3. standardization to mean value of 0 and variance of 1


In [30]:
# get processed data: run the project preprocessing helper on the saved raw feature file
# per the cell output, the helper also saves its result to
# 'preprocessed data/1D_2D_PubChemFP_SubFP_preprocessed.csv'
# NOTE(review): the preprocessing steps themselves live in data_functions and are not visible here
df_prepro = get_processed_data("raw data/1D_2D_PubChemFP_SubFP_raw.csv")

preprocessing resulted in 233 standardized features
saving file ...
saved to 'preprocessed data/1D_2D_PubChemFP_SubFP_preprocessed.csv'


In [31]:
# load preprocessed data from the saved .csv (overwrites df_prepro) and show its (rows, columns) shape
df_prepro = pd.read_csv("preprocessed data/1D_2D_PubChemFP_SubFP_preprocessed.csv")
df_prepro.shape

(41127, 234)

Now, we are only left with 233 features (234 including label).

In [3]:
# split data: hand the preprocessed .csv to the project helper and unpack the six
# returned objects (x/y pairs for train, valid and test)
x_train, y_train, x_valid, y_valid, x_test, y_test = get_split_data(
    "preprocessed data/1D_2D_PubChemFP_SubFP_preprocessed.csv"
)