## Train SparseChem on Chembl_mini 
Output to `experiments/SparseChem`

In [2]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:90% !important; }</style>"))
%load_ext autoreload
%autoreload 2

In [3]:
# Copyright (c) 2020 KU Leuven
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
import sparsechem as sc
import scipy.io
import scipy.sparse
import numpy as np
import pandas as pd
import torch
import argparse
import os
import sys
import os.path
import time
import json
import functools
from datetime import datetime
import pprint
import csv
#from apex import amp
from contextlib import redirect_stdout
from sparsechem import Nothing
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.tensorboard import SummaryWriter
from pytorch_memlab import MemReporter
import multiprocessing
from pynvml import *

pp = pprint.PrettyPrinter(indent=4)
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')
torch.set_printoptions( linewidth=132)

if torch.cuda.is_available():
    nvmlInit()

multiprocessing.set_start_method('fork', force=True)

parser = argparse.ArgumentParser(description="Training a multi-task model.")
parser.add_argument("--x", help="Descriptor file (matrix market, .npy or .npz)", type=str, default=None)
parser.add_argument("--y_class", "--y", "--y_classification", help="Activity file (matrix market, .npy or .npz)", type=str, default=None)
parser.add_argument("--y_regr", "--y_regression", help="Activity file (matrix market, .npy or .npz)", type=str, default=None)
parser.add_argument("--y_censor", help="Censor mask for regression (matrix market, .npy or .npz)", type=str, default=None)
parser.add_argument("--weights_class", "--task_weights", "--weights_classification", help="CSV file with columns task_id, training_weight, aggregation_weight, task_type (for classification tasks)", type=str, default=None)
parser.add_argument("--weights_regr", "--weights_regression", help="CSV file with columns task_id, training_weight, censored_weight, aggregation_weight, aggregation_weight, task_type (for regression tasks)", type=str, default=None)
parser.add_argument("--censored_loss", help="Whether censored loss is used for training (default 1)", type=int, default=1)
parser.add_argument("--folding", help="Folding file (npy)", type=str, required=True)
parser.add_argument("--fold_va", help="Validation fold number", type=int, default=0)
parser.add_argument("--fold_te", help="Test fold number (removed from dataset)", type=int, default=None)
parser.add_argument("--batch_ratio", help="Batch ratio", type=float, default=0.02)
parser.add_argument("--internal_batch_max", help="Maximum size of the internal batch", type=int, default=None)
parser.add_argument("--normalize_loss", help="Normalization constant to divide the loss (default uses batch size)", type=float, default=None)
parser.add_argument("--normalize_regression", help="Set this to 1 if the regression tasks should be normalized", type=int, default=0)
parser.add_argument("--normalize_regr_va", help="Set this to 1 if the regression tasks in validation fold should be normalized together with training folds", type=int, default=0)
parser.add_argument("--inverse_normalization", help="Set this to 1 if the regression tasks in validation fold should be inverse normalized at validation time", type=int, default=0)
parser.add_argument("--hidden_sizes", nargs="+", help="Hidden sizes of trunk", default=[], type=int, required=True)
parser.add_argument("--last_hidden_sizes", nargs="+", help="Hidden sizes in the head (if specified , class and reg heads have this dimension)", default=None, type=int)
#parser.add_argument("--middle_dropout", help="Dropout for layers before the last", type=float, default=0.0)
#parser.add_argument("--last_dropout", help="Last dropout", type=float, default=0.2)
parser.add_argument("--weight_decay", help="Weight decay", type=float, default=0.0)
parser.add_argument("--last_non_linearity", help="Last layer non-linearity (depecrated)", type=str, default="relu", choices=["relu", "tanh"])
parser.add_argument("--middle_non_linearity", "--non_linearity", help="Before last layer non-linearity", type=str, default="relu", choices=["relu", "tanh"])
parser.add_argument("--input_transform", help="Transformation to apply to inputs", type=str, default="none", choices=["binarize", "none", "tanh", "log1p"])
parser.add_argument("--lr", help="Learning rate", type=float, default=1e-3)
parser.add_argument("--lr_alpha", help="Learning rate decay multiplier", type=float, default=0.3)
parser.add_argument("--lr_steps", nargs="+", help="Learning rate decay steps", type=int, default=[10])
parser.add_argument("--input_size_freq", help="Number of high importance features", type=int, default=None)
parser.add_argument("--fold_inputs", help="Fold input to a fixed set (default no folding)", type=int, default=None)
parser.add_argument("--epochs", help="Number of epochs", type=int, default=20)
parser.add_argument("--pi_zero", help="Reference class ratio to be used for calibrated aucpr", type=float, default=0.1)
parser.add_argument("--min_samples_class", help="Minimum number samples in each class and in each fold for AUC calculation (only used if aggregation_weight is not provided in --weights_class)", type=int, default=5)
parser.add_argument("--min_samples_auc", help="Obsolete: use 'min_samples_class'", type=int, default=None)
parser.add_argument("--min_samples_regr", help="Minimum number of uncensored samples in each fold for regression metric calculation (only used if aggregation_weight is not provided in --weights_regr)", type=int, default=10)
parser.add_argument("--dev", help="Device to use", type=str, default="cuda:0")
parser.add_argument("--run_name", help="Run name for results", type=str, default=None)
parser.add_argument("--output_dir", help="Output directory, including boards (default 'models')", type=str, default="models")
parser.add_argument("--prefix", help="Prefix for run name (default 'run')", type=str, default='run')
parser.add_argument("--verbose", help="Verbosity level: 2 = full; 1 = no progress; 0 = no output", type=int, default=2, choices=[0, 1, 2])
parser.add_argument("--save_model",        help="Set this to 0 if the model should not be saved", type=int, default=1)
parser.add_argument("--save_board",        help="Set this to 0 if the TensorBoard should not be saved", type=int, default=1)
parser.add_argument("--profile",           help="Set this to 1 to output memory profile information", type=int, default=0)
parser.add_argument("--mixed_precision",   help="Set this to 1 to run in mixed precision mode (vs single precision)", type=int, default=0)
parser.add_argument("--eval_train",        help="Set this to 1 to calculate AUCs for train data", type=int, default=0)
parser.add_argument("--enable_cat_fusion", help="Set this to 1 to enable catalogue fusion", type=int, default=0)
parser.add_argument("--eval_frequency",    help="The gap between AUC eval (in epochs), -1 means to do an eval at the end.", type=int, default=1)

#hybrid model features
parser.add_argument("--regression_weight", help="between 0 and 1 relative weight of regression loss vs classification loss", type=float, default=0.5)
parser.add_argument("--scaling_regularizer", help="L2 regularizer of the scaling layer, if inf scaling layer is switched off", type=float, default=np.inf)
parser.add_argument("--class_feature_size", help="Number of leftmost features used from the output of the trunk (default: use all)", type=int, default=-1)
parser.add_argument("--regression_feature_size", help="Number of rightmost features used from the output of the trunk (default: use all)", type=int, default=-1)
parser.add_argument("--last_hidden_sizes_reg", nargs="+", help="Hidden sizes in the regression head (overwritten by last_hidden_sizes)", default=None, type=int)
parser.add_argument("--last_hidden_sizes_class", nargs="+", help="Hidden sizes in the classification head (overwritten by last_hidden_sizes)", default=None, type=int)
parser.add_argument("--dropouts_reg"  , nargs="+", help="List of dropout values used in the regression head (needs one per last hidden in reg head, ignored if last_hidden_sizes_reg not specified)", default=[], type=float)
parser.add_argument("--dropouts_class", nargs="+", help="List of dropout values used in the classification head (needs one per last hidden in class head, ignored if no last_hidden_sizes_class not specified)", default=[], type=float)
parser.add_argument("--dropouts_trunk", nargs="+", help="List of dropout values used in the trunk", default=[], type=float)

dev = "gpu" 
rstr = datetime.now().strftime("%m%d_%H%M")
# data_dir="chembl23_data"
# data_dir="chembl23_run_01152022"
data_dir = "../MLDatasets/chembl23_mini"
output_dir = f"../experiments/mini-SparseChem/{rstr}"
print(output_dir)

rm_output=False

# rstr = "synthetic_data_model" ##random_str(12)
# rstr = "synthetic_data_model_03042022" ##random_str(12)

# output_dir = f"./models-{rstr}/"
# output_dir = f"./{data_dir}/models-{rstr}/"

# output dir kbardool/kusanagi/experiments/SparseChem/0116_0843



../experiments/mini-SparseChem/0410_1959


In [4]:
# dev = "gpu" 
# data_dir="chembl23_data"
# data_dir="chembl23_run_01152022"
# data_dir = "/home/kbardool/kusanagi/MLDatasets/chembl_23mini_synthetic"

# rm_output=False

# rstr = datetime.now().strftime("%m%d_%H%M")
# rstr = "synthetic_data_model" ##random_str(12)
# rstr = "synthetic_data_model_03042022" ##random_str(12)

# output_dir = f"./models-{rstr}/"
# output_dir = f"./{data_dir}/models-{rstr}/"

# output dir kbardool/kusanagi/experiments/SparseChem/0116_0843
# output_dir = f"/home/kbardool/kusanagi/experiments/SparseChem/{rstr}"
# print(output_dir)

### Two layer network as specified in `https://git.infra.melloddy.eu/wp2/sparsechem/-/blob/master/docs/main.md`

In [5]:
cmd = (
  f" --x              {data_dir}/chembl_23mini_x.npy " +
  f" --y_class        {data_dir}/chembl_23mini_y.npy " +
  f" --folding        {data_dir}/chembl_23mini_folds.npy " +

  f" --output_dir     {output_dir}" +
  F" --dev              cuda:0 "
  f" --fold_va              0 " +
  f" --fold_inputs      32000 " +
  f" --batch_ratio       0.01 " +
  f" --hidden_sizes     40 40 40" +
  f" --dropouts_trunk    0  0  0" +
  f" --weight_decay      1e-4 " +
  f" --epochs             100 " +
  f" --lr                1e-3 " +
  f" --lr_steps            10 " +
  f" --lr_alpha           0.3 " + 
  f" --prefix              sc " +
  f" --min_samples_class    1 "
)

In [7]:
# cmd = (
#   f" --x       /home/kbardool/kusanagi/MLDatasets/chembl_23mini_synthetic/chembl_23mini_x.npy " +
#   f" --y_class /home/kbardool/kusanagi/MLDatasets/chembl_23mini_synthetic/chembl_23mini_adashare_y_all_bin_sparse.npy " +
#   f" --folding /home/kbardool/kusanagi/MLDatasets/chembl_23mini_synthetic/chembl_23mini_folds.npy " +
#   f" --output_dir {output_dir}" +    
#   f" --fold_va           0 " +
#   f" --batch_ratio    0.02 " +
#   f" --hidden_sizes   25 25 25 25 25 25 " +
#   f" --dropouts_trunk  0  0  0  0  0  0 " +
#   f" --weight_decay   1e-4 " 
#   f" --epochs           40 " +
#   f" --lr             1e-3 " +
#   f" --lr_steps         10 " +
#   f" --lr_alpha        0.3" 
# )

# cmd = (
#   f" --x       {data_dir}/chembl_23mini_x.npy " +
#   f" --y_class {data_dir}/chembl_23mini_adashare_y_all_bin_sparse.npy " +
#   f" --folding {data_dir}/chembl_23mini_folds.npy " +
#   f" --output_dir {output_dir}" +    
#   f" --fold_va            0 " +
#   f" --batch_ratio     0.02 " +
#   f" --hidden_sizes   40 40 " +
#   f" --dropouts_trunk  0  0 " +
#   f" --weight_decay   1e-4 " +
#   f" --epochs           20 " +
#   f" --lr             1e-3 " +
#   f" --lr_steps         10 " +
#   f" --lr_alpha        0.3 " 
# )

#   f" --hidden_sizes   400 400 " +
#   f" --last_dropout   0.2 " +
#   f" --middle_dropout 0.2 " +
#   f" --x       ./{data_dir}/chembl_23_x.mtx " +
#   f" --y_class ./{data_dir}/chembl_23_y.mtx " +
#   f" --folding ./{data_dir}/folding_hier_0.6.npy " +

In [8]:
#### copied from SparseChemDev 

# cmd = (
#         f" --x       ./{data_dir}/chembl_23mini_x.npy" +
#         f" --y_class ./{data_dir}/chembl_23mini_y.npy" +
#         f" --folding ./{data_dir}/chembl_23mini_folds.npy" +
#         f" --hidden_sizes 20 30 40 " +  
#         f" --output_dir {output_dir}" +
#         f" --batch_ratio 0.1" +
#         f" --epochs 2" +
#         f" --lr 1e-3" +
#         f" --lr_steps 1" +
#         f" --dev {dev}" +
#         f" --verbose 1")
#         f" --input_size_freq  40"
#         f" --tail_hidden_size  10"

### Initializations 

In [9]:
args = parser.parse_args(cmd.split())


# %tb
# args = parser.parse_args()

def vprint(s=""):
    if args.verbose:
        print(s)

pp.pprint(vars(args))

{   'batch_ratio': 0.01,
    'censored_loss': 1,
    'class_feature_size': -1,
    'dev': 'cuda:0',
    'dropouts_class': [],
    'dropouts_reg': [],
    'dropouts_trunk': [0.0, 0.0, 0.0],
    'enable_cat_fusion': 0,
    'epochs': 100,
    'eval_frequency': 1,
    'eval_train': 0,
    'fold_inputs': 32000,
    'fold_te': None,
    'fold_va': 0,
    'folding': '../MLDatasets/chembl23_mini/chembl_23mini_folds.npy',
    'hidden_sizes': [40, 40, 40],
    'input_size_freq': None,
    'input_transform': 'none',
    'internal_batch_max': None,
    'inverse_normalization': 0,
    'last_hidden_sizes': None,
    'last_hidden_sizes_class': None,
    'last_hidden_sizes_reg': None,
    'last_non_linearity': 'relu',
    'lr': 0.001,
    'lr_alpha': 0.3,
    'lr_steps': [10],
    'middle_non_linearity': 'relu',
    'min_samples_auc': None,
    'min_samples_class': 1,
    'min_samples_regr': 10,
    'mixed_precision': 0,
    'normalize_loss': None,
    'normalize_regr_va': 0,
    'normalize_regression

In [10]:
if args.run_name is not None:
    name = args.run_name
else:
    name  = f"{args.prefix}"
    name += f"_{'.'.join([str(h) for h in args.hidden_sizes])}"
#     name += f"_do{'.'.join([str(d) for d in args.dropouts_trunk])}"
    name += f"_lr{args.lr}"
    name += f"_do{args.dropouts_trunk[0]}"
#     name += f"_wd{args.weight_decay}"
#     name += f"_hs{'.'.join([str(h) for h in args.hidden_sizes])}"
    
#     name += f"_lrsteps{'.'.join([str(s) for s in args.lr_steps])}_ep{args.epochs}"
#     name += f"_fva{args.fold_va}_fte{args.fold_te}"
    if args.mixed_precision == 1:
        name += f"_mixed_precision"
vprint(f"Run name is '{name}'.")

# if args.run_name is not None:
#     name = args.run_name
# else:
#     name  = f"sc_{args.prefix}_h{'.'.join([str(h) for h in args.hidden_sizes])}_ldo_r{'.'.join([str(d) for d in args.dropouts_reg])}_wd{args.weight_decay}"
#     name += f"_lr{args.lr}_lrsteps{'.'.join([str(s) for s in args.lr_steps])}_ep{args.epochs}"
#     name += f"_fva{args.fold_va}_fte{args.fold_te}"
#     if args.mixed_precision == 1:
#         name += f"_mixed_precision"
# vprint(f"Run name is '{name}'.")


Run name is 'sc_40.40.40_lr0.001_do0.0'.


### Assertions

In [11]:

if (args.last_hidden_sizes is not None) and ((args.last_hidden_sizes_class is not None) or (args.last_hidden_sizes_reg is not None)):
    raise ValueError("Head specific and general last_hidden_sizes argument were both specified!")
if (args.last_hidden_sizes is not None):
    args.last_hidden_sizes_class = args.last_hidden_sizes
    args.last_hidden_sizes_reg   = args.last_hidden_sizes

if args.last_hidden_sizes_reg is not None:
    assert len(args.last_hidden_sizes_reg) == len(args.dropouts_reg), "Number of hiddens and number of dropout values specified must be equal in the regression head!"
if args.last_hidden_sizes_class is not None:
    assert len(args.last_hidden_sizes_class) == len(args.dropouts_class), "Number of hiddens and number of dropout values specified must be equal in the classification head!"
if args.hidden_sizes is not None:
    assert len(args.hidden_sizes) == len(args.dropouts_trunk), "Number of hiddens and number of dropout values specified must be equal in the trunk!"

if args.class_feature_size == -1:
    args.class_feature_size = args.hidden_sizes[-1]
if args.regression_feature_size == -1:
    args.regression_feature_size = args.hidden_sizes[-1]

assert args.regression_feature_size <= args.hidden_sizes[-1], "Regression feature size cannot be larger than the trunk output"
assert args.class_feature_size <= args.hidden_sizes[-1], "Classification feature size cannot be larger than the trunk output"
assert args.regression_feature_size + args.class_feature_size >= args.hidden_sizes[-1], "Unused features in the trunk! Set regression_feature_size + class_feature_size >= trunk output!"
#if args.regression_feature_size != args.hidden_sizes[-1] or args.class_feature_size != args.hidden_sizes[-1]:
#    raise ValueError("Hidden spliting not implemented yet!")

assert args.input_size_freq is None, "Using tail compression not yet supported."

if (args.y_class is None) and (args.y_regr is None):
    raise ValueError("No label data specified, please add --y_class and/or --y_regr.")

### Summary writer

In [12]:
if args.profile == 1:
    assert (args.save_board==1), "Tensorboard should be enabled to be able to profile memory usage."
if args.save_board:
    tb_name = os.path.join(args.output_dir, "", name)
    writer  = SummaryWriter(tb_name)
else:
    writer = Nothing()

### Load datasets

In [13]:
ecfp     = sc.load_sparse(args.x)
y_class  = sc.load_sparse(args.y_class)
y_regr   = sc.load_sparse(args.y_regr)
y_censor = sc.load_sparse(args.y_censor)

if (y_regr is None) and (y_censor is not None):
    raise ValueError("y_censor provided please also provide --y_regr.")
if y_class is None:
    y_class = scipy.sparse.csr_matrix((ecfp.shape[0], 0))
if y_regr is None:
    y_regr  = scipy.sparse.csr_matrix((ecfp.shape[0], 0))
if y_censor is None:
    y_censor = scipy.sparse.csr_matrix(y_regr.shape)

folding = np.load(args.folding)
assert ecfp.shape[0] == folding.shape[0], "x and folding must have same number of rows"

## Loading task weights
tasks_class = sc.load_task_weights(args.weights_class, y=y_class, label="y_class")
tasks_regr  = sc.load_task_weights(args.weights_regr, y=y_regr, label="y_regr")

## Input transformation
ecfp = sc.fold_transform_inputs(ecfp, folding_size=args.fold_inputs, transform=args.input_transform)
print(f"count non zero:{ecfp[0].count_nonzero()}")
num_pos    = np.array((y_class == +1).sum(0)).flatten()
num_neg    = np.array((y_class == -1).sum(0)).flatten()
num_class  = np.array((y_class != 0).sum(0)).flatten()
if (num_class != num_pos + num_neg).any():
    raise ValueError("For classification all y values (--y_class/--y) must be 1 or -1.")

num_regr   = np.bincount(y_regr.indices, minlength=y_regr.shape[1])

assert args.min_samples_auc is None, "Parameter 'min_samples_auc' is obsolete. Use '--min_samples_class' that specifies how many samples a task needs per FOLD and per CLASS to be aggregated."

if tasks_class.aggregation_weight is None:
    ## using min_samples rule
    fold_pos, fold_neg = sc.class_fold_counts(y_class, folding)
    n = args.min_samples_class
    tasks_class.aggregation_weight = ((fold_pos >= n).all(0) & (fold_neg >= n)).all(0).astype(np.float64)

if tasks_regr.aggregation_weight is None:
    if y_censor.nnz == 0:
        y_regr2 = y_regr.copy()
        y_regr2.data[:] = 1
    else:
        ## only counting uncensored data
        y_regr2      = y_censor.copy()
        y_regr2.data = (y_regr2.data == 0).astype(np.int32)
    fold_regr, _ = sc.class_fold_counts(y_regr2, folding)
    del y_regr2
    tasks_regr.aggregation_weight = (fold_regr >= args.min_samples_regr).all(0).astype(np.float64)

vprint(f"Input dimension: {ecfp.shape[1]}")
vprint(f"#samples:        {ecfp.shape[0]}")
vprint(f"#classification tasks:  {y_class.shape[1]}")
vprint(f"#regression tasks:      {y_regr.shape[1]}")
vprint(f"Using {(tasks_class.aggregation_weight > 0).sum()} classification tasks for calculating aggregated metrics (AUCROC, F1_max, etc).")
vprint(f"Using {(tasks_regr.aggregation_weight > 0).sum()} regression tasks for calculating metrics (RMSE, Rsquared, correlation).")


count non zero:80
Input dimension: 32000
#samples:        18388
#classification tasks:  100
#regression tasks:      0
Using 20 classification tasks for calculating aggregated metrics (AUCROC, F1_max, etc).
Using 0 regression tasks for calculating metrics (RMSE, Rsquared, correlation).


In [14]:
if args.fold_te is not None and args.fold_te >= 0:
    ## removing test data
    assert args.fold_te != args.fold_va, "fold_va and fold_te must not be equal."
    keep    = folding != args.fold_te
    ecfp    = ecfp[keep]
    y_class = y_class[keep]
    y_regr  = y_regr[keep]
    y_censor = y_censor[keep]
    folding = folding[keep]

normalize_inv = None
if args.normalize_regression == 1 and args.normalize_regr_va == 1:
   y_regr, mean_save, var_save = sc.normalize_regr(y_regr)
fold_va = args.fold_va
idx_tr  = np.where(folding != fold_va)[0]
idx_va  = np.where(folding == fold_va)[0]

y_class_tr = y_class[idx_tr]
y_class_va = y_class[idx_va]
y_regr_tr  = y_regr[idx_tr]
y_regr_va  = y_regr[idx_va]
y_censor_tr = y_censor[idx_tr]
y_censor_va = y_censor[idx_va]

if args.normalize_regression == 1 and args.normalize_regr_va == 0:
   y_regr_tr, mean_save, var_save = sc.normalize_regr(y_regr_tr) 
   if args.inverse_normalization == 1:
      normalize_inv = {}
      normalize_inv["mean"] = mean_save
      normalize_inv["var"]  = var_save
num_pos_va  = np.array((y_class_va == +1).sum(0)).flatten()
num_neg_va  = np.array((y_class_va == -1).sum(0)).flatten()
num_regr_va = np.bincount(y_regr_va.indices, minlength=y_regr.shape[1])
pos_rate = num_pos_va/(num_pos_va+num_neg_va)
pos_rate_ref = args.pi_zero
pos_rate = np.clip(pos_rate, 0, 0.99)
cal_fact_aucpr = pos_rate*(1-pos_rate_ref)/(pos_rate_ref*(1-pos_rate))

  pos_rate = num_pos_va/(num_pos_va+num_neg_va)


In [15]:
vprint(f"Input dimension   : {ecfp.shape[1]}")
vprint(f"Input dimension   : {ecfp.shape[1]}")
vprint(f"Training dataset  : {ecfp[idx_tr].shape}")
vprint(f"Validation dataset: {ecfp[idx_va].shape}")
vprint()
vprint(f"#classification tasks:  {y_class.shape[1]}")
vprint(f"#regression tasks    :      {y_regr.shape[1]}")
vprint(f"Using {(tasks_class.aggregation_weight > 0).sum():3d} classification tasks for calculating aggregated metrics (AUCROC, F1_max, etc).")
vprint(f"Using {(tasks_regr.aggregation_weight > 0).sum():3d} regression tasks for calculating metrics (RMSE, Rsquared, correlation).")

Input dimension   : 32000
Input dimension   : 32000
Training dataset  : (14633, 32000)
Validation dataset: (3755, 32000)

#classification tasks:  100
#regression tasks    :      0
Using  20 classification tasks for calculating aggregated metrics (AUCROC, F1_max, etc).
Using   0 regression tasks for calculating metrics (RMSE, Rsquared, correlation).


In [16]:
num_int_batches = 1
batch_size = 128
# batch_size  = int(np.ceil(args.batch_ratio * idx_tr.shape[0]))

print(f"orig batch size:   {batch_size}")
print(f"orig num int batches:   {num_int_batches}")

if args.internal_batch_max is not None:
    if args.internal_batch_max < batch_size:
        num_int_batches = int(np.ceil(batch_size / args.internal_batch_max))
        batch_size      = int(np.ceil(batch_size / num_int_batches))
print(f"batch size:   {batch_size}")
print(f"num_int_batches:   {num_int_batches}")

orig batch size:   128
orig num int batches:   1
batch size:   128
num_int_batches:   1


In [17]:
# #import ipdb; ipdb.set_trace()
# batch_size  = int(np.ceil(args.batch_ratio * idx_tr.shape[0]))
# num_int_batches = 1

# if args.internal_batch_max is not None:
#     if args.internal_batch_max < batch_size:
#         num_int_batches = int(np.ceil(batch_size / args.internal_batch_max))
#         batch_size      = int(np.ceil(batch_size / num_int_batches))
# vprint(f"#internal batch size:   {batch_size}")

In [18]:
tasks_cat_id_list = None
select_cat_ids = None
if tasks_class.cat_id is not None:
    tasks_cat_id_list = [[x,i] for i,x in enumerate(tasks_class.cat_id) if str(x) != 'nan']
    tasks_cat_ids = [i for i,x in enumerate(tasks_class.cat_id) if str(x) != 'nan']
    select_cat_ids = np.array(tasks_cat_ids)
    cat_id_size = len(tasks_cat_id_list)
else:
    cat_id_size = 0

### Dataloaders

In [19]:
dataset_tr = sc.ClassRegrSparseDataset(x=ecfp[idx_tr], y_class=y_class_tr, y_regr=y_regr_tr, y_censor=y_censor_tr, y_cat_columns=select_cat_ids)
dataset_va = sc.ClassRegrSparseDataset(x=ecfp[idx_va], y_class=y_class_va, y_regr=y_regr_va, y_censor=y_censor_va, y_cat_columns=select_cat_ids)

loader_tr = DataLoader(dataset_tr, batch_size=batch_size, num_workers = 8, pin_memory=True, collate_fn=dataset_tr.collate, shuffle=True)
loader_va = DataLoader(dataset_va, batch_size=batch_size, num_workers = 4, pin_memory=True, collate_fn=dataset_va.collate, shuffle=False)

args.input_size  = dataset_tr.input_size
args.output_size = dataset_tr.output_size

args.class_output_size = dataset_tr.class_output_size
args.regr_output_size  = dataset_tr.regr_output_size
args.cat_id_size = cat_id_size

dev  = torch.device(args.dev)
net  = sc.SparseFFN(args).to(dev)
loss_class = torch.nn.BCEWithLogitsLoss(reduction="none")
loss_regr  = sc.censored_mse_loss
if not args.censored_loss:
    loss_regr = functools.partial(loss_regr, censored_enabled=False)

tasks_class.training_weight = tasks_class.training_weight.to(dev)
tasks_regr.training_weight  = tasks_regr.training_weight.to(dev)
tasks_regr.censored_weight  = tasks_regr.censored_weight.to(dev)



### Network

In [20]:
vprint("Network:")
vprint(net)
reporter = None
h = None

Network:
SparseFFN(
  (net): Sequential(
    (0): SparseInputNet(
      (net_freq): SparseLinear(in_features=32000, out_features=40, bias=True)
    )
    (1): MiddleNet(
      (net): Sequential(
        (layer_0): Sequential(
          (0): ReLU()
          (1): Dropout(p=0.0, inplace=False)
          (2): Linear(in_features=40, out_features=40, bias=True)
        )
        (layer_1): Sequential(
          (0): ReLU()
          (1): Dropout(p=0.0, inplace=False)
          (2): Linear(in_features=40, out_features=40, bias=True)
        )
      )
    )
  )
  (classLast): LastNet(
    (net): Sequential(
      (initial_layer): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=40, out_features=100, bias=True)
      )
    )
  )
  (regrLast): Sequential(
    (0): LastNet(
      (net): Sequential(
        (initial_layer): Sequential(
          (0): Tanh()
          (1): Dropout(p=0.0, inplace=False)
          (2): Linear(in_features=40, 

### setup memory profiling reporter

In [21]:
if args.profile == 1:
   torch_gpu_id = torch.cuda.current_device()
   if "CUDA_VISIBLE_DEVICES" in os.environ:
      ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
      nvml_gpu_id = ids[torch_gpu_id] # remap
   else:
      nvml_gpu_id = torch_gpu_id
   h = nvmlDeviceGetHandleByIndex(nvml_gpu_id)

if args.profile == 1:
   #####   output saving   #####
   if not os.path.exists(args.output_dir):
       os.makedirs(args.output_dir)

   reporter = MemReporter(net)

   with open(f"{args.output_dir}/memprofile.txt", "w+") as profile_file:
        with redirect_stdout(profile_file):
             profile_file.write(f"\nInitial model detailed report:\n\n")
             reporter.report()

###  Optimizer, Scheduler, GradScaler

In [22]:
optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
scheduler = MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_alpha)
scaler = torch.cuda.amp.GradScaler()
num_prints = 0

In [23]:
print(optimizer)
# args.eval_train = 0 
# args.epochs     = 5
print(f"dev              :    {dev}")
print(f"args.lr          :    {args.lr}")
print(f"args.weight_decay:    {args.weight_decay}")
print(f"args.lr_steps    :    {args.lr_steps}")
print(f"args.lr_steps    :    {args.lr_steps}")
print(f"num_int_batches  :    {num_int_batches}")
print(f"batch_size       :    {batch_size}")
print(f"EPOCHS           :    {args.epochs}")
print(f"scaler           :    {scaler}")
print(f"args.normalize_loss    :    {args.normalize_loss}")
print(f"loss_class       :    {loss_class}")
print(f"mixed precision  :    {args.mixed_precision}")
print(args.eval_train)
current_epoch = 0
 

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 0.001
    lr: 0.001
    weight_decay: 0.0001
)
dev              :    cuda:0
args.lr          :    0.001
args.weight_decay:    0.0001
args.lr_steps    :    [10]
args.lr_steps    :    [10]
num_int_batches  :    1
batch_size       :    128
EPOCHS           :    100
scaler           :    <torch.cuda.amp.grad_scaler.GradScaler object at 0x2b73210736d0>
args.normalize_loss    :    None
loss_class       :    BCEWithLogitsLoss()
mixed precision  :    0
0


##  Training Loop

In [24]:
end_epoch = current_epoch + args.epochs

for epoch in range(current_epoch, end_epoch, 1):
    t0 = time.time()
    sc.train_class_regr(
        net, optimizer,
        loader          = loader_tr,
        loss_class      = loss_class,
        loss_regr       = loss_regr,
        dev             = dev,
        weights_class   = tasks_class.training_weight * (1-args.regression_weight) * 2,
        weights_regr    = tasks_regr.training_weight * args.regression_weight * 2,
        censored_weight = tasks_regr.censored_weight,
        normalize_loss  = args.normalize_loss,
        num_int_batches = num_int_batches,
        progress        = False,
        writer = writer,
        epoch = epoch,
        args = args,
        scaler = scaler,
        nvml_handle = h)

    if args.profile == 1:
       with open(f"{args.output_dir}/memprofile.txt", "a+") as profile_file:
            profile_file.write(f"\nAfter epoch {epoch} model detailed report:\n\n")
            with redirect_stdout(profile_file):
                 reporter.report()

    t1 = time.time()
    eval_round = (args.eval_frequency > 0) and ((epoch + 1) % args.eval_frequency == 0)
    last_round = epoch == args.epochs - 1

    if eval_round or last_round:

        results_va = sc.evaluate_class_regr(net, loader_va, 
                                            loss_class, 
                                            loss_regr, 
                                            tasks_class=tasks_class, 
                                            tasks_regr=tasks_regr, 
                                            dev=dev, 
                                            progress = False, 
                                            normalize_inv=normalize_inv, 
                                            cal_fact_aucpr=cal_fact_aucpr)
        
        for key, val in results_va["classification_agg"].items():
            writer.add_scalar("val_metrics:aggregated/"+key, val, epoch*batch_size)
#             writer.add_scalar(key+"/va", val, epoch)            
#         for key, val in results_va["regression_agg"].items():
#             writer.add_scalar(key+"/va", val, epoch)

        if args.eval_train:
            results_tr = sc.evaluate_class_regr(net, loader_tr, loss_class, loss_regr, tasks_class=tasks_class, tasks_regr=tasks_regr, dev=dev, progress = args.verbose >= 2)
            for key, val in results_tr["classification_agg"].items():
                writer.add_scalar("trn_metrics:aggregated/"+key, val, epoch *batch_size)                
#                 writer.add_scalar(key+"/tr", val, epoch)
#             for key, val in results_tr["regression_agg"].items():
#                 writer.add_scalar(key+"/tr", val, epoch)
        else:
            results_tr = None

        if args.verbose:
            ## printing a new header every 20 lines
            header = num_prints % 20 == 0
            num_prints += 1
            sc.print_metrics_cr(epoch, t1 - t0, results_tr, results_va, header)

    scheduler.step()

#print("DEBUG data for hidden spliting")
#print (f"Classification mask: Sum = {net.classmask.sum()}\t Uniques: {np.unique(net.classmask)}")
#print (f"Regression mask:     Sum = {net.regmask.sum()}\t Uniques: {np.unique(net.regmask)}")
#print (f"overlap: {(net.regmask * net.classmask).sum()}")

Epoch  |      logl   bceloss  avg prec   auc roc    auc pr aucpr_cal    f1_max |      rmse  rsquared  corrcoef | tr_time 
0      |   0.08678   0.24621   0.88100   0.83707   0.87880   0.77458   0.88615 |       nan       nan       nan |    1.6 
1      |   0.07841   0.25434   0.91099   0.87245   0.89624   0.79673   0.92320 |       nan       nan       nan |    1.5 
2      |   0.07423   0.22931   0.91612   0.88930   0.90144   0.80599   0.92193 |       nan       nan       nan |    1.1 
3      |   0.07755   0.25578   0.92877   0.86402   0.92531   0.80459   0.93024 |       nan       nan       nan |    1.0 
4      |   0.07954   0.25474   0.93618   0.88056   0.93324   0.82355   0.93585 |       nan       nan       nan |    1.0 
5      |   0.08637   0.29669   0.92689   0.86190   0.92340   0.80521   0.92895 |       nan       nan       nan |    1.0 
6      |   0.08779   0.32310   0.90733   0.89057   0.89193   0.82575   0.91613 |       nan       nan       nan |    1.0 
7      |   0.09774   0.34855   

64     |   0.12107   0.45638   0.89492   0.86951   0.88270   0.74319   0.89815 |       nan       nan       nan |    1.0 
65     |   0.12115   0.45452   0.89490   0.86936   0.88270   0.74699   0.89800 |       nan       nan       nan |    0.9 
66     |   0.12020   0.45389   0.89625   0.87237   0.88417   0.75242   0.89872 |       nan       nan       nan |    0.9 
67     |   0.12303   0.44464   0.90776   0.87469   0.89243   0.76183   0.90633 |       nan       nan       nan |    1.0 
68     |   0.11959   0.43532   0.93644   0.88348   0.93360   0.80713   0.92341 |       nan       nan       nan |    1.0 
69     |   0.12561   0.48717   0.89600   0.86559   0.88464   0.75252   0.89549 |       nan       nan       nan |    1.5 
70     |   0.14155   0.58338   0.88125   0.84610   0.86754   0.72236   0.89207 |       nan       nan       nan |    1.0 
71     |   0.13307   0.49568   0.90106   0.88409   0.88948   0.76588   0.90057 |       nan       nan       nan |    0.9 
72     |   0.13368   0.52828   0

## Post Training 

In [25]:
writer.close()
vprint()
if args.profile == 1:
   multiplexer = sc.create_multiplexer(tb_name)
#   sc.export_scalars(multiplexer, '.', "GPUmem", "testcsv.csv")
   data = sc.extract_scalars(multiplexer, '.', "GPUmem")
   vprint(f"Peak GPU memory used: {sc.return_max_val(data)}MB")
vprint("Saving performance metrics (AUCs) and model.")

#####   model saving   #####
if not os.path.exists(args.output_dir):
   os.makedirs(args.output_dir)

model_file = f"{args.output_dir}/{name}.pt"
out_file   = f"{args.output_dir}/{name}.json"

if args.save_model:
   torch.save(net.state_dict(), model_file)
   vprint(f"Saved model weights into '{model_file}'.")

results_va["classification"]["num_pos"] = num_pos_va
results_va["classification"]["num_neg"] = num_neg_va
results_va["regression"]["num_samples"] = num_regr_va

if results_tr is not None:
    results_tr["classification"]["num_pos"] = num_pos - num_pos_va
    results_tr["classification"]["num_neg"] = num_neg - num_neg_va
    results_tr["regression"]["num_samples"] = num_regr - num_regr_va

stats=None
if args.normalize_regression == 1 :
   stats={}
   stats["mean"] = mean_save
   stats["var"]  = np.array(var_save)[0]
sc.save_results(out_file, args, validation=results_va, training=results_tr, stats=stats)

vprint(f"Saved config and results into '{out_file}'.\nYou can load the results by:\n  import sparsechem as sc\n  res = sc.load_results('{out_file}')")


Saving performance metrics (AUCs) and model.
Saved model weights into '../experiments/mini-SparseChem/0410_1959/sc_40.40.40_lr0.001_do0.0.pt'.
Saved config and results into '../experiments/mini-SparseChem/0410_1959/sc_40.40.40_lr0.001_do0.0.json'.
You can load the results by:
  import sparsechem as sc
  res = sc.load_results('../experiments/mini-SparseChem/0410_1959/sc_40.40.40_lr0.001_do0.0.json')


In [26]:
results_va['classification'][0:20]

Unnamed: 0_level_0,roc_auc_score,auc_pr,avg_prec_score,f1_max,p_f1_max,kappa,kappa_max,p_kappa_max,bceloss,auc_pr_cal,num_pos,num_neg
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.972634,0.995563,0.99555,0.982759,0.701485,0.774872,0.857835,0.701485,0.17252,0.868535,115,17
1,,,,,,,,,,,31,0
2,,,,,,,,,,,17,0
3,,,,,,,,,,,54,0
4,,,,,,,,,,,7,0
5,1.0,1.0,1.0,1.0,0.980237,1.0,1.0,0.980237,0.024701,1.0,18,1
6,0.988506,0.999828,0.999828,0.997543,0.974935,0.797642,0.797642,0.974935,0.032356,0.974026,203,3
7,0.75,0.711111,0.755556,0.75,0.967762,0.0,0.461538,0.967762,1.698649,0.444742,3,4
8,0.9,0.996024,0.996063,0.990991,0.995515,0.0,0.658683,0.995515,0.175056,0.833314,55,2
9,1.0,1.0,1.0,1.0,0.998376,0.0,1.0,0.998376,0.024788,1.0,76,2


In [24]:
pp.pprint(results_va)

{   'classification':       roc_auc_score    auc_pr  avg_prec_score  f1_max  p_f1_max     kappa  \
task                                                                        
0          0.993095  0.998994        0.998963  0.9869  0.912304  0.786753   
1               NaN       NaN             NaN     NaN       NaN       NaN   
2               NaN       NaN             NaN     NaN       NaN       NaN   
3               NaN       NaN             NaN     NaN       NaN       NaN   
4               NaN       NaN             NaN     NaN       NaN       NaN   
...             ...       ...             ...     ...       ...       ...   
95              NaN       NaN             NaN     NaN       NaN       NaN   
96              NaN       NaN             NaN     NaN       NaN       NaN   
97              NaN       NaN             NaN     NaN       NaN       NaN   
98         1.000000  1.000000        1.000000  1.0000  0.969434  0.545455   
99              NaN       NaN             NaN     NaN 

## Results 

### Results of run on synthetic data 
    Run name is 'sc_run_
    h40.40_
    ldo_r_wd0.0001_
    lr0.001_
    lrsteps10_
    ep20_
    fva0_fteNone'

In [None]:
19     |  0.64258  0.64258  0.86670  0.86844   0.51818  0.79627 |       nan       nan       nan |    1.1 

### Results of run on synthetic data 
    Run name is 'sc_run_
    h400.400_
    ldo_r_wd0.0001_
    lr0.001_
    lrsteps10_
    ep20_
    fva: 0 fte: None'.

In [None]:
Epoch  |     logl  bceloss   aucroc    aucpr aucpr_cal   f1_max |      rmse  rsquared  corrcoef | tr_time 

0      |  0.34269  0.45776  0.80664  0.72646   0.48133  0.74813 |       nan       nan       nan |   13.6 
1      |  0.31554  0.42048  0.83178  0.75306   0.51796  0.76370 |       nan       nan       nan |   12.9 
2      |  0.31272  0.41163  0.84092  0.76037   0.53157  0.76873 |       nan       nan       nan |   12.8 
3      |  0.31439  0.41333  0.84125  0.76524   0.53492  0.77030 |       nan       nan       nan |   12.9 
4      |  0.31480  0.41191  0.84390  0.76670   0.53815  0.77059 |       nan       nan       nan |   12.8 
5      |  0.31809  0.41840  0.84434  0.76798   0.53912  0.77124 |       nan       nan       nan |   12.9 
6      |  0.32237  0.42103  0.84506  0.76737   0.53627  0.77180 |       nan       nan       nan |   12.9 
7      |  0.32203  0.42055  0.84465  0.76600   0.53617  0.77124 |       nan       nan       nan |   13.1 
8      |  0.32931  0.43144  0.84398  0.76671   0.53747  0.77013 |       nan       nan       nan |   13.1 
9      |  0.33162  0.42992  0.84499  0.76793   0.53587  0.77157 |       nan       nan       nan |   13.1 
10     |  0.32617  0.42194  0.84909  0.77293   0.54663  0.77418 |       nan       nan       nan |   13.0 
11     |  0.32782  0.42469  0.84891  0.77210   0.54406  0.77411 |       nan       nan       nan |   12.9 
12     |  0.33080  0.42809  0.84897  0.77252   0.54445  0.77391 |       nan       nan       nan |   12.9 
13     |  0.33451  0.43235  0.84840  0.77160   0.54306  0.77383 |       nan       nan       nan |   13.1 
14     |  0.33882  0.43660  0.84832  0.77141   0.54322  0.77295 |       nan       nan       nan |   13.1 
15     |  0.34108  0.43973  0.84803  0.77172   0.54287  0.77311 |       nan       nan       nan |   13.1 
16     |  0.34506  0.44437  0.84782  0.77086   0.54190  0.77230 |       nan       nan       nan |   13.0 
17     |  0.34866  0.44951  0.84697  0.77017   0.54043  0.77185 |       nan       nan       nan |   13.0 
18     |  0.35135  0.45143  0.84740  0.77087   0.54130  0.77260 |       nan       nan       nan |   13.0 
19     |  0.35432  0.45495  0.84742  0.77097   0.54075  0.77233 |       nan       nan       nan |   13.0 

### SparseChem/0116_0843/

In [None]:
                                                                                                                       

Saving performance metrics (AUCs) and model.
Saved model weights into '/home/kbardool/kusanagi/experiments/SparseChem/0116_0843/sc_run_h25.25.25.25.25.25_ldo_r_wd0.0001_lr0.001_lrsteps10_ep40_fva0_fteNone.pt'.
Saved config and results into '/home/kbardool/kusanagi/experiments/SparseChem/0116_0843/sc_run_h25.25.25.25.25.25_ldo_r_wd0.0001_lr0.001_lrsteps10_ep40_fva0_fteNone.json'.
You can load the results by:
  import sparsechem as sc
  res = sc.load_results('/home/kbardool/kusanagi/experiments/SparseChem/0116_0843/sc_run_h25.25.25.25.25.25_ldo_r_wd0.0001_lr0.001_lrsteps10_ep40_fva0_fteNone.json')