In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%load_ext autoreload
%autoreload 2

In [None]:
# Report GPU utilisation (GPUtil) and PyTorch's memory counters for one device.
# Imports come first and only once; the original cell imported GPUtil twice and
# imported torch after the first call.
import torch
from GPUtil import showUtilization as gpu_usage
# from numba import cuda

# Device whose PyTorch memory counters are printed below.
# NOTE(review): the other cells of this notebook use "cuda:0" - confirm that
# "cuda:2" is the device that should be inspected here.
mem_dev = "cuda:2"

gpu_usage()

print(' Allocated : ', torch.cuda.memory_allocated(mem_dev) ) # current GPU memory occupied by tensors, in bytes, for the given device
print(' Reserved  : ', torch.cuda.memory_reserved(mem_dev) )  # current GPU memory managed by the caching allocator, in bytes (torch.cuda.memory_cached in previous PyTorch versions)
 

# cuda_device = 0 

# def free_gpu_cache(cuda_device):
#     print("Initial GPU Usage")    
#     gpu_usage()                             

#     print("GPU Usage after emptying the cache")
#     gpu_usage()
    
#     print("CUDA empty cache")
#     torch.cuda.empty_cache()

#     print("Close and reopen device")
#     cuda.select_device(cuda_device)
#     print("Close device")    
#     cuda.close()
#     print("Reopen device")    
#     cuda.select_device(cuda_device)

#     print("GPU Usage after closing and reopening")
#     gpu_usage()

# # gpu_usage()                       
# free_gpu_cache(1)
# # gpu_usage()

In [None]:
# Sanity check: print whether PyTorch reports CUDA as available.
print( torch.cuda.is_available())

In [None]:
import sys

# Make the directory two levels up importable. The entry is appended only when
# it is not already listed, so re-running the cell does not grow sys.path.
if not any(entry == '../..' for entry in sys.path):
    sys.path.append('../..')
print(sys.path)


In [None]:
import os
import shutil
import subprocess
import sparsechem as sc
import numpy as np
import string
import glob
import scipy.sparse
import pandas as pd
from urllib.request import urlretrieve
from examples.chembl.test_train import random_str
import torch 
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
from torchinfo import summary
from torch.utils.tensorboard import SummaryWriter
import pprint
import time
from sparsechem.utils import training_arguments, load_task_weights
from GPUtil import showUtilization as gpu_usage
pp = pprint.PrettyPrinter(indent=4)

In [None]:
def display_input_parms(args):
    """Print the attributes of `args` as an aligned name/value listing.

    Attributes whose name starts with a double underscore and attributes whose
    value is callable are skipped. Names are taken from `dir(args)`.
    """
    print("\n   Arguments passed :")
    print("   --------------------")
    visible_names = (attr for attr in dir(args) if not attr.startswith("__"))
    for attr in visible_names:
        value = getattr(args, attr)
        if callable(value):
            continue
        print("   {:30} {}".format(attr, value))
    print("\n")
 

In [None]:
# Run configuration: target device, data directory, and an output directory
# whose name carries a random suffix produced by random_str(12).
dev = "cuda:0"
data_dir="test_chembl23"
rstr = random_str(12)
output_dir = f"./{data_dir}/models-{rstr}/"
print(f"Call test_classification with dev: {dev} , data_dir: {data_dir} \n")  
print(f"Output dir : {output_dir}")

In [None]:
def vprint(s=""):
    """Print `s` only when the global `args.verbose` is truthy.

    Relies on the module-level `args` object being defined before the first call.
    """
    if not args.verbose:
        return
    print(s)

## Training Setup

In [None]:
# Build the SparseChem argument parser and parse an in-notebook command line.
parser = training_arguments()

# `dev`, `data_dir` and `output_dir` are defined by the run-configuration cell
# above. They are deliberately NOT redefined here: drawing a second random
# suffix would make the output directory printed by that cell differ from the
# one actually handed to the parser.
rm_output=False

# Arguments are kept as a list so that no value is ever re-split on whitespace.
cmd_args = [
    "--x",            f"./{data_dir}/chembl_23mini_x.npy",
    "--y_class",      f"./{data_dir}/chembl_23mini_y.npy",
    "--folding",      f"./{data_dir}/chembl_23mini_folds.npy",
    "--batch_ratio",  "0.001",
    "--output_dir",   f"{output_dir}",
    "--hidden_sizes", "20", "20",
    "--epochs",       "2",
    "--lr",           "1e-3",
    "--lr_steps",     "1",
    "--dev",          f"{dev}",
    "--verbose",      "1",
]

# `cmd` is kept as a printable string for anything that inspects it.
cmd = " ".join(cmd_args)

args = parser.parse_args(cmd_args)

In [None]:
# Print every parsed argument and its value.
display_input_parms(args)

In [None]:
##
## Use the run name given in the input args; when none was given, derive one
## from the prefix, layer sizes, dropout, weight decay, learning-rate schedule,
## epoch count and fold selection.
##
name = args.run_name
if name is None:
    name = "".join([
        f"sc_{args.prefix}_h{'.'.join(map(str, args.hidden_sizes))}_ldo{args.last_dropout:.1f}_wd{args.weight_decay}",
        f"_lr{args.lr}_lrsteps{'.'.join(map(str, args.lr_steps))}_ep{args.epochs}",
        f"_fva{args.fold_va}_fte{args.fold_te}",
    ])
vprint(f"\nRun name is '{name}'.")

In [None]:
##
## Set up the TensorBoard writer when args.save_board is set; otherwise use
## SparseChem's `Nothing` stand-in so later `writer.*` calls need no special-casing.
##
if args.save_board:
    tb_name = os.path.join(args.output_dir, "boards", name)
    vprint(f"\nargs.save_board is '{args.save_board}' - will be written to {tb_name}.")
    writer  = SummaryWriter(tb_name)
else:
    # The bare name `Nothing` is only imported much further down the notebook,
    # so on a fresh kernel it is undefined at this point. Reach it through the
    # already-imported `sc` package instead.
    writer = sc.Nothing()

# Tail compression (input_size_freq) is not handled by this walkthrough.
assert args.input_size_freq is None, "Using tail compression not yet supported."

### Load X (ecfp) and Y datasets

In [None]:
## Verify presence of Y label data
if (args.y_class is None) and (args.y_regr is None):
    raise ValueError("No label data specified, please add --y_class and/or --y_regr.")

## Load the input features and the label matrices.
## The `is None` checks below imply that sc.load_sparse() yields None when the
## corresponding argument was not supplied - TODO confirm against sparsechem.
ecfp     = sc.load_sparse(args.x)
y_class  = sc.load_sparse(args.y_class)
y_regr   = sc.load_sparse(args.y_regr)
y_censor = sc.load_sparse(args.y_censor)

vprint(f"ecfp shape                   : {ecfp.shape}")

# y_class may be absent (regression-only run). The f-string is evaluated even
# when verbose output is off, so the shape is only read once the matrix is
# known to exist.
if (y_class is not None):
    vprint(f"y_class shape                : {y_class.shape}")

if (y_regr is not None):
    vprint(f"y_regr shape         : {y_regr.shape}")

if (y_censor is not None):
    vprint(f"y_censor shape       : {y_censor.shape}")

if (y_regr is None) and (y_censor is not None):
    raise ValueError("y_censor provided please also provide --y_regr.")

## Replace every missing label matrix with an empty sparse matrix so the rest
## of the notebook can treat all three uniformly.
if y_class is None:
    y_class = scipy.sparse.csr_matrix((ecfp.shape[0], 0))
    vprint(f"Created y_class shape        : {y_class.shape}")

if y_regr is None:
    y_regr  = scipy.sparse.csr_matrix((ecfp.shape[0], 0))
    vprint(f"Created y_regr shape         : {y_regr.shape}")

if y_censor is None:
    y_censor = scipy.sparse.csr_matrix(y_regr.shape)
    vprint(f"Created y_censor shape       : {y_censor.shape}")

## Fold assignment: one entry per input row.
folding = np.load(args.folding)
assert ecfp.shape[0] == folding.shape[0], "x and folding must have same number of rows"
vprint(f"folding shape                : {folding.shape}")


vprint() 
vprint(f"x shape              : {ecfp.shape}")
vprint(f"y_class shape        : {y_class.shape}")
vprint(f"y_regr shape         : {y_regr.shape}")
vprint(f"y_censor shape       : {y_censor.shape}")

### Loading task weights

In [None]:
# Show the values of args.weights_class and args.weights_regr.
print(args.weights_class, args.weights_regr)

In [None]:
## Loading task weights
## One task-weight object per label matrix; later cells read their
## aggregation_weight / training_weight / censored_weight attributes.
tasks_class = sc.load_task_weights(args.weights_class, y=y_class, label="y_class")
tasks_regr  = sc.load_task_weights(args.weights_regr, y=y_regr, label="y_regr")

In [None]:
if tasks_class.aggregation_weight is None:
    ## No aggregation weights were loaded: derive them with the min_samples
    ## rule. A task gets weight 1.0 only when, in every fold, it has at least
    ## `n` positive and at least `n` negative labels; otherwise 0.0.
    ## (assumes class_fold_counts returns arrays indexed [fold, task] - TODO confirm)
    fold_pos, fold_neg = sc.class_fold_counts(y_class, folding)
    n = args.min_samples_class
    enough_pos = (fold_pos >= n).all(0)
    enough_neg = (fold_neg >= n).all(0)
    tasks_class.aggregation_weight = (enough_pos & enough_neg).astype(np.float64)

In [None]:
# tasks_class = sc.load_task_weights(args.weights_class, y=y_class, label="y_class")
# tasks_regr  = sc.load_task_weights(args.weights_regr , y=y_regr, label="y_regr")

# print("\n")
# if tasks_class.aggregation_weight is None:
#     '''
#     fold classes 
#     '''
#     ## using min_samples rule
#     fold_pos, fold_neg = sc.class_fold_counts(y_class, folding)
#     n = args.min_samples_class
#     tasks_class.aggregation_weight = ((fold_pos >= n).all(0) & (fold_neg >= n)).all(0).astype(np.float64)    
    
    
#     print(f" fold_pos shape: {fold_pos.shape}")
#     print(fold_pos[4])
#     print(f"\n fold_neg.shape  {fold_neg.shape}")
#     print(fold_neg[4])

#     print(f"\n min_smaples_class: {args.min_samples_class}")
    
#     print(f"\n (fold_pos >= n) {(fold_pos >=n).shape}")
#     print((fold_pos >= n)[4])
#     print(f"\n (fold_neg >= n)  {(fold_neg >=n).shape}")
#     print((fold_neg >= n)[4])
#     print()
#     print(f"\n (fold_pos >= n).all(0) shape:   {(fold_pos >= n).all(0).shape}")
#     print(f"\n  {(fold_pos >= n).all(0)}")
#     print()
#     print(f"\n (fold_neg >= n).all(0) shape:  {(fold_neg >= n).all(0).shape}")
#     print(f"\n  {(fold_neg >= n).all(0)}")    

#     a1 =( (fold_pos >= n).all(0) & (fold_neg >= n) ).all(0)
#     a2 =( (fold_pos >= n).all(0) & (fold_neg >= n).all(0))
#     a3 =( (fold_pos >= n) & (fold_neg >= n) ).all(0)
    
#     vprint(f"\n\n ( (fold_pos >= n).all(0) & (fold_neg >= n)).all(0): {a1.shape}  ")
#     vprint(f"{a1}" )
    
#     vprint(f"\n\n ( (fold_pos >= n).all(0) & (fold_neg >= n).all(0) ) : {a2.shape}  \n")
#     vprint(f"{a2}  \n")

    
#     vprint(f"\n\n [(fold_pos >= n) & (fold_neg >= n )].all(0): {a3.shape}  ")
#     vprint(f"{a3}" )
    
    
#     print((a1==a2).all(),  (a1==a3).all())
#     vprint(f"tasks_class.aggregation_weight : \n{tasks_class.aggregation_weight}  \n")

In [None]:
if tasks_regr.aggregation_weight is None:
    print("proceed")
    # Build a temporary indicator matrix whose stored entries mark the
    # regression values that count towards the per-fold sample totals.
    if y_censor.nnz != 0:
        ## censoring data present: only counting uncensored data (censor value 0)
        y_regr2      = y_censor.copy()
        y_regr2.data = (y_regr2.data == 0).astype(np.int32)
    else:
        ## no censoring data: every stored regression value counts once
        y_regr2 = y_regr.copy()
        y_regr2.data[:] = 1

    # Only the first array returned by class_fold_counts is used here.
    fold_regr, _ = sc.class_fold_counts(y_regr2, folding)
    print(' fold_regr:', fold_regr)
    del y_regr2

    # Weight 1.0 for tasks that reach min_samples_regr in every fold, else 0.0.
    enough_regr = fold_regr >= args.min_samples_regr
    tasks_regr.aggregation_weight = enough_regr.all(0).astype(np.float64)
    print(tasks_regr.aggregation_weight)

In [None]:
# Verbose dump of the classification training and aggregation weights.
vprint(f"tasks_class.training_weight: \n{tasks_class.training_weight} \n")
vprint(f"tasks_class.aggregation_weight : \n{tasks_class.aggregation_weight}  \n")

 

#### load_task_weights() : step by step:

Executed when a filename is passed

In [None]:
# df = pd.read_csv('/home/kevin/MLDatasets/chembl_23_mini/chembl_23mini_class_weights.csv')
# df.rename(columns={"weight": "training_weight"}, inplace=True)
# df.rename(columns={c + "s": c for c in ["task_id", "training_weight", "aggregation_weight", "task_type", "censored_weight"]}, inplace=True)
# assert "task_id" in df.columns, "task_id is missing in task info CVS file"
# assert "training_weight" in df.columns, "training_weight is missing in task info CSV file"
# df.sort_values("task_id", inplace=True)
# df

# cols = ["", "task_id", "training_weight", "aggregation_weight", "task_type", "censored_weight"]
# for col in df.columns:
#     print(' col: ',col)
#     assert col in cols, f"Unsupported colum '{col}' in task weight file. Supported columns: {cols}."

### Input folding & transformation

In [None]:
# Apply input folding and the input transform to the feature matrix, printing
# its repr before and after.
# NOTE: `ecfp` is rebound to the transformed matrix, so re-running this cell
# applies the transform to already-transformed data.
print(f"args.fold_inputs : {args.fold_inputs} \t\t  transform: {args.input_transform}\n")
print(repr(ecfp))
ecfp = sc.fold_transform_inputs(ecfp, folding_size=args.fold_inputs, transform=args.input_transform)##
print(repr(ecfp))

### Get the number of positive, negative, and total labels for each class

In [None]:
# Per-column label counts for the classification matrix: entries equal to +1,
# entries equal to -1, and all non-zero entries.
num_pos    = np.array((y_class == +1).sum(0)).flatten()
num_neg    = np.array((y_class == -1).sum(0)).flatten()
num_class  = np.array((y_class != 0).sum(0)).flatten()

# Any non-zero label that is neither +1 nor -1 makes the totals disagree.
if (num_class != num_pos + num_neg).any():
    raise ValueError("For classification all y values (--y_class/--y) must be 1 or -1.")

# Number of stored entries per regression column
# (assumes y_regr.indices holds column indices, i.e. CSR layout - TODO confirm).
num_regr   = np.bincount(y_regr.indices, minlength=y_regr.shape[1])

print(' num_regr          : ', num_regr)
print(' folding file      : ', folding)

In [None]:
# Verbose summary of the data dimensions and of how many tasks carry a
# positive aggregation weight.
vprint(f"Input dimension      : {ecfp.shape[1]}")
vprint(f"#samples             : {ecfp.shape[0]}")
vprint(f"#classification tasks: {y_class.shape[1]}")
vprint(f"#regression tasks    : {y_regr.shape[1]}")
vprint(f"Using {(tasks_class.aggregation_weight > 0).sum()} classification tasks for calculating aggregated metrics (AUCROC, F1_max, etc).")
vprint(f"Using {(tasks_regr.aggregation_weight > 0).sum()} regression tasks for calculating metrics (RMSE, Rsquared, correlation).")
vprint(f"args.fold_te         : {args.fold_te}")

In [None]:
# Drop every row that belongs to the test fold from the inputs, the labels and
# the fold vector. The matrices are rebound in place, so the removed rows are
# no longer available to later cells.
if args.fold_te is not None and args.fold_te >= 0:
    ## removing test data
    assert args.fold_te != args.fold_va, "fold_va and fold_te must not be equal."
    keep    = folding != args.fold_te
    ecfp    = ecfp[keep]
    y_class = y_class[keep]
    y_regr  = y_regr[keep]
    y_censor= y_censor[keep]
    folding = folding[keep]
else:
    # NOTE(review): this branch is also taken when fold_te is negative, in
    # which case the message below is inaccurate.
    print("args.fold_te is None")

### Identify training and validation data

In [None]:
# Split row indices by fold: rows in the validation fold form the validation
# set, all remaining rows form the training set.
fold_va = args.fold_va

idx_tr  = np.where(folding != fold_va)[0]
idx_va  = np.where(folding == fold_va)[0]

vprint(f"fold_va        : {fold_va}")
# The training indices were previously printed under the label "idx_va".
vprint(f"idx_tr         : {idx_tr}    Length: {len(idx_tr)}")
vprint(f"idx_va         : {idx_va}    Length: {len(idx_va)}")

In [None]:
# Slice each label matrix into its training rows and validation rows.
y_class_tr = y_class[idx_tr]
y_class_va = y_class[idx_va]
y_regr_tr  = y_regr[idx_tr]
y_regr_va  = y_regr[idx_va]
y_censor_tr = y_censor[idx_tr]
y_censor_va = y_censor[idx_va]

# Show the training matrices, then the validation matrices. The validation
# matrices were previously printed under "_tr" labels.
print('y_class_tr : ', repr(y_class_tr))
print('y_regr_tr  : ', repr(y_regr_tr))
print('y_censor_tr: ', repr(y_censor_tr))
print()
print('y_class_va : ', repr(y_class_va))
print('y_regr_va  : ', repr(y_regr_va))
print('y_censor_va: ', repr(y_censor_va))
      
      

In [None]:
# Per-column counts on the validation rows: +1 labels, -1 labels, and stored
# regression entries.
num_pos_va  = np.array((y_class_va == +1).sum(0)).flatten()
num_neg_va  = np.array((y_class_va == -1).sum(0)).flatten()
num_regr_va = np.bincount(y_regr_va.indices, minlength=y_regr.shape[1])

# Only the first 100 classification columns are printed.
print(f' num pos va : {num_pos_va[:100]}  sum: {num_pos_va.sum()}')
print(f' num neg va : {num_neg_va[:100]}  sum: {num_neg_va.sum()}')
print(f' num regr va: {num_regr_va}')

### Determine Batch Size

In [None]:
# This walkthrough overrides the parsed value: with no internal batch limit
# the splitting branch below is skipped and num_int_batches stays 1.
args.internal_batch_max = None

# Batch size is a fixed fraction (batch_ratio) of the training rows, rounded up.
batch_size  = int(np.ceil(args.batch_ratio * idx_tr.shape[0]))
num_int_batches = 1

print(f" internal_batch_max: {args.internal_batch_max}")
print(f" batch ratio:  {args.batch_ratio}") 
print(f" training file size (idx_tr.shape): {idx_tr.shape}")
print(f" batch size:  batch ratio * idx_tr.shape[0] : {args.batch_ratio * idx_tr.shape[0]}  -> {batch_size}")

# When a batch is larger than the internal maximum, split it into
# num_int_batches roughly equal internal batches.
if args.internal_batch_max is not None:
    if args.internal_batch_max < batch_size:
        print(f" batch size is ({batch_size}) larger than internal batch maximum size ({args.internal_batch_max})")
        print(f" batch_size / args.internal_batch_max : {batch_size / args.internal_batch_max}")

        num_int_batches = int(np.ceil(batch_size / args.internal_batch_max))
        print(f" num int_batches: {num_int_batches}")

        print(f" batch_size / num_int_batches : {batch_size / num_int_batches}")

        batch_size      = int(np.ceil(batch_size / num_int_batches))
        print(f" batch_size: {batch_size}")

vprint(f" batch size:            {batch_size}")
# This value is a count of internal batches, not a size; the label says so.
vprint(f" number of internal batches:   {num_int_batches}")


### Instantiate Data Loaders

In [None]:
# Display the (folded / transformed) input matrix.
ecfp

In [None]:
# Wrap the training rows and the validation rows, together with their labels
# and censoring matrices, in SparseChem datasets.
dataset_tr = sc.ClassRegrSparseDataset(x=ecfp[idx_tr], y_class=y_class_tr, y_regr=y_regr_tr, y_censor=y_censor_tr)
dataset_va = sc.ClassRegrSparseDataset(x=ecfp[idx_va], y_class=y_class_va, y_regr=y_regr_va, y_censor=y_censor_va)

print(dataset_tr)

In [None]:
# Data loaders: the training loader shuffles, the validation loader does not.
# Each uses its dataset's own collate function to assemble batches.
loader_tr = DataLoader(dataset_tr, batch_size=batch_size, num_workers = 1, pin_memory=True, collate_fn=dataset_tr.collate, shuffle=True)
loader_va = DataLoader(dataset_va, batch_size=batch_size, num_workers = 1, pin_memory=True, collate_fn=dataset_va.collate, shuffle=False)

# Record the dataset dimensions on `args`; the model is later constructed
# from `args`.
args.input_size  = dataset_tr.input_size
args.output_size = dataset_tr.output_size

args.class_output_size = dataset_tr.class_output_size
args.regr_output_size  = dataset_tr.regr_output_size

print(f" input size        : {dataset_tr.input_size}     output size: {dataset_tr.output_size}")
print(f" class output size : {dataset_tr.class_output_size}     regr_output_size : {dataset_tr.regr_output_size}")
print(f" Batch size        : {batch_size}")

In [None]:
# batch_enumerator  = enumerate(loader_tr)

#  a1 = [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  8,  9, 10, 11, 12, 13, 14]
#  a2 = [ 4, 41, 74, 33, 50,  0, 33, 38, 43, 48,  0, 28, 14, 54, 10, 85]

# b1 = np.array(a1)
# print(b1)

## Model Definition

### Display Input Params

In [None]:
# display_input_parms(args)

### Instantiate Model

In [None]:
# %load_ext autoreload
# %autoreload 2
# del net

In [None]:
# Resolve the device named on the command line.
dev  = torch.device(args.dev)
print(f" dev: {dev}")

# Build the network from the parsed arguments and place it on `dev`
# explicitly. Later cells move the task weights and the input batches to
# `dev`, so the model has to live there as well instead of depending on
# another cell to move it.
net  = sc.SparseFFN(args).to(dev)
print(f" Net: \n {net}")

### Summarize the defined model

    Summarize the given PyTorch model. Summarized information includes:
        1) Layer names,
        2) input/output shapes,
        3) kernel shape,
        4) # of parameters,
        5) # of operations (Mult-Adds)

    Args:
        model (nn.Module):
                PyTorch model to summarize. The model should be fully in either train()
                or eval() mode. If layers are not all in the same mode, running summary
                may have side effects on batchnorm or dropout statistics. If you
                encounter an issue with this, please open a GitHub issue.

    input_data (Sequence of Sizes or Tensors):
            Example input tensor of the model (dtypes inferred from model input).
            - OR -
            Shape of input data as a List/Tuple/torch.Size
            (dtypes must match model input, default is FloatTensors).
            You should NOT include batch size in the tuple.
            - OR -
            If input_data is not provided, no forward pass through the network is
            performed, and the provided model information is limited to layer names.
            Default: None

    batch_dim (int):
            Batch_dimension of input data. If batch_dim is None, the input data
            is assumed to contain the batch dimension.
            WARNING: in a future version, the default will change to None.
            Default: 0

    branching (bool):
            Whether to use the branching layout for the printed output.
            Default: True

    col_names (Iterable[str]):
            Specify which columns to show in the output. Currently supported:
            ("input_size", "output_size", "num_params", "kernel_size", "mult_adds")
            If input_data is not provided, only "num_params" is used.
            Default: ("output_size", "num_params")

    col_width (int):
            Width of each column.
            Default: 25

    depth (int):
            Number of nested layers to traverse (e.g. Sequentials).
            Default: 3

    device (torch.Device):
            Uses this torch device for model and input_data.
            If not specified, uses result of torch.cuda.is_available().
            Default: None

    dtypes (List[torch.dtype]):
            For multiple inputs, specify the size of both inputs, and
            also specify the types of each parameter here.
            Default: None

    verbose (int):
            0 (quiet): No output
            1 (default): Print model summary
            2 (verbose): Show weight and bias layers in full detail
            Default: 1

    *args, **kwargs:
            Other arguments used in `model.forward` function.

    Return:
        ModelStatistics object
                See torchsummary/model_statistics.py for more information.

### Display Model Summary

In [None]:
# summary(model, input_size=(batch_size, 1, 28, 28))

# Summarise the network with torchinfo. The input shape and the device are
# taken from the values this notebook already computed (batch_size,
# args.input_size, dev) rather than from hard-coded literals, so the summary
# follows the data and the device actually in use.
summary(net, input_size=(batch_size, args.input_size), 
        col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"],
        verbose=2,
        depth = 6,
        col_width=16,
        device=dev,
        row_settings=["depth","var_names"],);

### Loss Functions, Weights, and Optimizer

In [None]:
# functools is needed for partial() below and is not imported anywhere else
# in this notebook, so it is imported here.
import functools

# Classification loss is left un-reduced (one value per label) so that the
# caller can apply per-task weights before summing.
loss_class = torch.nn.BCEWithLogitsLoss(reduction="none")
loss_regr  = sc.censored_mse_loss

# With censored loss switched off, bind censored_enabled=False so that callers
# can keep using the same call signature.
if not args.censored_loss:
    loss_regr = functools.partial(loss_regr, censored_enabled=False)


In [None]:
# Show the selected loss callables and the active device.
print(repr(loss_regr))
print(repr(loss_class))
print(dev)

In [None]:
# Move the per-task weight tensors to the training device.
tasks_class.training_weight = tasks_class.training_weight.to(dev)
tasks_regr.training_weight  = tasks_regr.training_weight.to(dev)
tasks_regr.censored_weight  = tasks_regr.censored_weight.to(dev)

# Adam optimizer; the learning rate is multiplied by lr_alpha at each
# milestone listed in lr_steps.
optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
scheduler = MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_alpha)

# Counts printed metric rows; the training loop uses it to decide when to
# repeat the table header.
num_prints = 0

In [None]:
# Show the optimizer and the learning-rate schedule configuration.
print(optimizer)
print(f"LR steps: {args.lr_steps}   gamma: {args.lr_alpha}")
print(scheduler)

In [None]:
# Replace the metrics writer with SparseChem's `Nothing` stand-in.
# NOTE(review): this rebinds `writer` unconditionally, so a SummaryWriter
# created earlier when args.save_board is set is discarded without being closed.
from sparsechem import Nothing
# if args.save_board:
#     tb_name = os.path.join(args.output_dir, "boards", name)
#     vprint(f"\nargs.save_board is '{args.save_board}' - will be written to {tb_name}.")
#     writer  = SummaryWriter(tb_name)
# else:
writer = Nothing()

In [None]:
#     print("Initial GPU Usage")    
#     gpu_usage()         

## Training loop

In [None]:
# Override the epoch count that was parsed from the command line.
# NOTE: the optimizer's LR scheduler was created before this override.
args.epochs = 40
print(args.epochs)


In [None]:
# Inspect the classification training weights and the internal batch count
# that are passed to the training loop.
print(tasks_class.training_weight)
print(num_int_batches)

In [None]:
# torch.set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=None, profile=None, sci_mode=None)
# Widen tensor printing to 132 characters per line.
torch.set_printoptions( linewidth=132)

In [None]:
##
## Training loop
##
# When True, the loop stops right after the first call to
# sc.train_class_regr() - no evaluation and no scheduler step. That is exactly
# what the unconditional `break` in this cell used to do, but it left all the
# code after it unreachable. Set to False to run every epoch with evaluation.
SINGLE_PASS_ONLY = True

for epoch in range(args.epochs):
    t0 = time.time()
    sc.train_class_regr(
        net, optimizer,
        loader          = loader_tr,
        loss_class      = loss_class,
        loss_regr       = loss_regr,
        dev             = dev,
        weights_class   = tasks_class.training_weight,
        weights_regr    = tasks_regr.training_weight,
        censored_weight = tasks_regr.censored_weight,
        normalize_loss  = args.normalize_loss,
        num_int_batches = num_int_batches,
        progress        = args.verbose >= 2)

    t1 = time.time()

    # Evaluate every eval_frequency epochs and always after the final epoch.
    eval_round = (args.eval_frequency > 0) and ((epoch + 1) % args.eval_frequency == 0)
    last_round = epoch == args.epochs - 1
    if SINGLE_PASS_ONLY:
        break

    if eval_round or last_round:
        results_va = sc.evaluate_class_regr(net, loader_va, 
                                            loss_class, 
                                            loss_regr, 
                                            tasks_class=tasks_class, 
                                            tasks_regr=tasks_regr, 
                                            dev=dev, 
                                            progress = args.verbose >= 2)
        for key, val in results_va["classification_agg"].items():
            writer.add_scalar(key+"/va", val, epoch)
        for key, val in results_va["regression_agg"].items():
            writer.add_scalar(key+"/va", val, epoch)

        # Training-set metrics are optional.
        if args.eval_train:
            results_tr = sc.evaluate_class_regr(net, loader_tr, 
                                                loss_class, loss_regr, 
                                                tasks_class=tasks_class, tasks_regr=tasks_regr, 
                                                dev=dev, progress = args.verbose >= 2)
            for key, val in results_tr["classification_agg"].items():
                writer.add_scalar(key+"/tr", val, epoch)
            for key, val in results_tr["regression_agg"].items():
                writer.add_scalar(key+"/tr", val, epoch)
        else:
            results_tr = None

        if args.verbose:
            ## printing a new header every 20 lines
            header = num_prints % 20 == 0
            num_prints += 1
            sc.print_metrics_cr(epoch, t1 - t0, results_tr, results_va, header)

    scheduler.step()
    print('-'*100)
    print('-'*100)
    print('-'*100)    
# writer.close()
# vprint()
# vprint("Saving performance metrics (AUCs) and model.")
# writer.close()
# vprint()
# vprint("Saving performance metrics (AUCs) and model.")

### `train_class_regr()`

In [None]:
# Step-by-step walkthrough of train_class_regr(): bind the names that the
# function would receive as parameters to notebook-level variables.
# The commented lines are the parameters whose notebook variable already has
# the same name.
# def train_class_regr(net, optimizer, loader, loss_class, loss_regr, dev,
#                      weights_class, weights_regr, censored_weight,
#                      normalize_loss=None, num_int_batches=1, progress=True):
#     sc.train_class_regr(
#         net, 
#         optimizer,
loader          = loader_tr
#         loss_class      = loss_class,
#         loss_regr       = loss_regr,
#         dev             = dev,
weights_class   = tasks_class.training_weight
weights_regr    = tasks_regr.training_weight
censored_weight = tasks_regr.censored_weight
normalize_loss  = args.normalize_loss
# NOTE: this rebinds the num_int_batches value computed in the batch-size cell.
num_int_batches = 1
progress        = args.verbose >= 2

In [None]:
# Put the network in training mode and reset the internal-batch counter.
net.train()
int_count = 0

In [None]:
# from tqdm  import tqdm
# Inspect the loader type and the normalisation / internal-batch settings.
print(type(loader))
print(normalize_loss)
print(num_int_batches)

In [None]:
# Pull a single batch from the loader for the manual walkthrough.
b = next(iter(loader))

In [None]:
# List the batch entries: tensors are shown by shape, everything else by value.
print( b.keys())
for key, val in b.items():
    shown = val.shape if isinstance(val, torch.Tensor) else val
    print(f" {key:10s}:   {shown}")

print(type(y_class))
# First and second index rows of the classification label indices.
print(f"yc_ind: \n {b['yc_ind'][0]} \n\n {b['yc_ind'][1]}")

In [None]:
# Manual version of the first statements inside the training loop body.
i = 0 ;
# for b in tqdm(loader, leave=False, disable=(progress == False)):
# b = next(iter(loader))
print(i, b.keys())
i+=1
# Show tensor shapes and integer values; None entries and other types are skipped.
for key in b.keys():

    if b[key] is None:
        continue
    if isinstance(b[key], torch.Tensor):
        print(f" {key:10s}:   {b[key].shape}")
    elif isinstance(b[key], int):
        print(f" {key}:   {b[key]}")

# Gradients are cleared only at the start of a group of internal batches.
if int_count == 0:
    print("optimizer.zero_grad()")
    optimizer.zero_grad()

In [None]:
# Loss normaliser: the configured value, or, when none is configured, the
# batch size times the number of internal batches.
norm = normalize_loss
print(f"Normalize loss : {norm}")
if norm is None:
    norm = b["batch_size"] * num_int_batches
print(f"Normalize loss : {norm}")

    

In [None]:
# Forward pass and per-head losses for the sampled batch.
# NOTE(review): `batch_forward` is defined in a cell further down this
# notebook, so that cell has to be executed before this one.
fwd = batch_forward(net, b=b, 
                    input_size=loader.dataset.input_size, 
                    loss_class=loss_class, loss_regr=loss_regr, 
                    weights_class=weights_class, weights_regr=weights_regr, censored_weight=censored_weight, 
                    dev=dev)
# fwd = out

In [None]:
# Total loss is the sum of the classification and regression losses,
# divided by the normaliser computed above.
loss = fwd["yc_loss"] + fwd["yr_loss"]
loss_norm = loss / norm
print(f" loss: {loss}      loss_norm: {loss_norm}    {type(loss_norm)}")


In [None]:
# Back-propagate the normalised loss.
loss_norm.backward()

In [None]:
# One more internal batch has been accumulated.
int_count += 1
print(int_count, num_int_batches)

In [None]:
# Apply the optimizer step once all internal batches of the group are in.
if int_count == num_int_batches:
    print(" optimizer.step()")
    optimizer.step()
    int_count = 0

In [None]:
# Flush gradients left over from an incomplete group of internal batches.
if int_count > 0:
    ## process tail batch (should not happen)
    optimizer.step()

###  `batch_forward()`

In [None]:
# def batch_forward(net, b, input_size, loss_class, loss_regr, weights_class, weights_regr, censored_weight=[], dev="cpu"):


# fwd = batch_forward(net, b=b, 
#                     input_size=loader.dataset.input_size, 
#                     loss_class=loss_class, 
#                     loss_regr=loss_regr, 
#                     weights_class=weights_class, 
#                     weights_regr=weights_regr, 
#                     censored_weight=censored_weight, 
#                     dev=dev)

In [None]:
# Step-by-step walkthrough of batch_forward(): the only argument that needs a
# new notebook-level name is the input width of the dataset.
input_size = loader.dataset.input_size
#                     loss_class = loss_class
#                     loss_regr  = loss_regr
#                     weights_class=weights_class, 
#                     weights_regr=weights_regr, 
#                     censored_weight=censored_weight, 
#                     dev=dev)

In [None]:
# Show the input width and the first / last four columns of the batch's
# input index tensor.
print(f" input size : {input_size}")
print(b["x_ind"][:,:4])
print(b["x_ind"][:,-4:])

In [None]:
# Assemble the batch input as a sparse COO tensor of shape
# [batch_size, input_size] from its indices and values, then move it to `dev`.
X = torch.sparse_coo_tensor(
    b["x_ind"],
    b["x_data"],
    size = [b["batch_size"], input_size]).to(dev, non_blocking=True)

print(X)

In [None]:

# The network returns a pair: classification outputs and regression outputs.
yc_hat_all, yr_hat_all = net(X)


In [None]:
# Limit numpy and torch printing to 120 characters per line.
np.set_printoptions( linewidth=120)
torch.set_printoptions(linewidth=120)

In [None]:
# Inspect the output shapes and a 5 x 10 corner of the classification output.
print(yc_hat_all.shape)
print(yc_hat_all[:5,:10])
print(yr_hat_all.shape)

In [None]:
# Result dictionary: the raw network outputs plus zeroed losses and weight
# sums, which stay zero for any head the network does not have.
out = {
    "yc_hat_all": yc_hat_all,
    "yr_hat_all": yr_hat_all,
    "yc_loss":    0,
    "yr_loss":    0,
    "yc_weights": 0,
    "yr_weights": 0,
}

In [None]:
# Shape and first 20 entries of the classification label indices and values.
print(b["yc_ind"].shape, '\n', b["yc_ind"][:,:20])
print(b["yc_data"].shape, '\n', b["yc_data"][:20])

In [None]:
# Classification head: weighted loss over the labelled entries of the batch.
if net.class_output_size > 0:
    print(net.class_output_size)
    yc_ind  = b["yc_ind"].to(dev, non_blocking=True)
    # per-label weight, looked up by the second index row of yc_ind
    yc_w    = weights_class[yc_ind[1]]
    yc_data = b["yc_data"].to(dev, non_blocking=True)
    # pick the network outputs at the labelled positions only
    yc_hat  = yc_hat_all[yc_ind[0], yc_ind[1]]
    out["yc_ind"]  = yc_ind
    out["yc_data"] = yc_data
    out["yc_hat"]  = yc_hat
    out["yc_loss"] = (loss_class(yc_hat, yc_data) * yc_w).sum()
    out["yc_weights"] = yc_w.sum()

In [None]:
# Inspect the labels, the selected predictions, the weights, the per-label
# loss values and the summed weighted loss.
print(b["yc_data"].shape, '\n', b["yc_data"][:20])
print(yc_hat.shape, '\n', yc_hat[:20])
print(yc_w.shape, '\n' , yc_w[:20] )
print(loss_class(yc_hat, yc_data).shape, '\n', loss_class(yc_hat, yc_data)[:20])
print(out["yc_loss"].shape, '\n', out["yc_loss"])

In [None]:
# Display the size of the network's regression output.
net.regr_output_size

In [None]:
# Regression head: weighted (optionally censored) loss over the labelled
# entries of the batch.
if net.regr_output_size > 0:
    yr_ind  = b["yr_ind"].to(dev, non_blocking=True)
    # per-label weight, looked up by the second index row of yr_ind
    yr_w    = weights_regr[yr_ind[1]]
    yr_data = b["yr_data"].to(dev, non_blocking=True)
    # pick the network outputs at the labelled positions only
    yr_hat  = yr_hat_all[yr_ind[0], yr_ind[1]]

    # Censoring data is optional and may be None.
    out["ycen_data"] = b["ycen_data"]
    if out["ycen_data"] is not None:
        out["ycen_data"] = out["ycen_data"].to(dev, non_blocking=True)

        if len(censored_weight) > 0:
            ## updating weights of censored data
            ## (entries with censor value 0 keep their original weight)
            yrcen_w = yr_w * censored_weight[yr_ind[1]]
            yr_w    = torch.where(out["ycen_data"] == 0, yr_w, yrcen_w)

    out["yr_ind"]  = yr_ind
    out["yr_data"] = yr_data
    out["yr_hat"]  = yr_hat
    out["yr_loss"] = (loss_regr(input=yr_hat, target=yr_data, censor=out["ycen_data"]) * yr_w).sum()
    out["yr_weights"] = yr_w.sum()
else:
    print("net.regr_output_size is 0")

In [None]:

# i = torch.LongTensor((np.arange(10),np.arange(10)))
# v = torch.FloatTensor(np.random.normal(size=10))
# G = torch.sparse.FloatTensor(i,v,torch.Size((100,100)))
# x = torch.Tensor(np.random.normal(size=100))
# device = torch.device('cpu')
# x = x.to(device)
# torch.mm(G, x.unsqueeze(1))

### training functions 

In [None]:

def train_class_regr(net, optimizer, loader, loss_class, loss_regr, dev,
                     weights_class, weights_regr, censored_weight,
                     normalize_loss=None, num_int_batches=1, progress=True):
    """Train `net` for one pass over `loader`.

    Gradients are accumulated over `num_int_batches` consecutive batches
    before each optimizer step. Each batch loss (classification plus
    regression, as returned by batch_forward) is divided by `normalize_loss`,
    or by batch_size * num_int_batches when `normalize_loss` is None.

    Args:
        net: model; switched to training mode.
        optimizer: optimizer holding the parameters of `net`.
        loader: iterable of batch dicts; `loader.dataset.input_size` is read.
        loss_class, loss_regr: loss callables handed to batch_forward.
        dev: device handed to batch_forward.
        weights_class, weights_regr, censored_weight: per-task weights handed
            to batch_forward.
        normalize_loss: fixed loss divisor, or None for the default above.
        num_int_batches: number of batches accumulated per optimizer step.
        progress: the progress bar is disabled when this compares equal to False.

    NOTE(review): `tqdm` and `batch_forward` must be in scope when this is
    called; no import of tqdm is visible in this notebook - confirm.
    """
    net.train()

    pending = 0  # batches accumulated since the last optimizer step
    for b in tqdm(loader, leave=False, disable=(progress == False)):
        # start of a new accumulation group: clear stale gradients
        if pending == 0:
            optimizer.zero_grad()

        norm = b["batch_size"] * num_int_batches if normalize_loss is None else normalize_loss

        fwd = batch_forward(
            net,
            b=b,
            input_size=loader.dataset.input_size,
            loss_class=loss_class,
            loss_regr=loss_regr,
            weights_class=weights_class,
            weights_regr=weights_regr,
            censored_weight=censored_weight,
            dev=dev,
        )
        total_loss = fwd["yc_loss"] + fwd["yr_loss"]
        (total_loss / norm).backward()

        pending += 1
        if pending == num_int_batches:
            optimizer.step()
            pending = 0

    ## process tail batch (should not happen)
    if pending > 0:
        optimizer.step()


In [None]:
def batch_forward(net, b, input_size, loss_class, loss_regr, weights_class, weights_regr, censored_weight=None, dev="cpu"):
    """Run `net` on batch `b` and return its outputs together with weighted losses.

    Args:
        net: callable returning `(yc_hat_all, yr_hat_all)` for a sparse input;
            its `class_output_size` / `regr_output_size` attributes decide
            which loss branches run.
        b: batch dict. Reads "x_ind", "x_data", "batch_size", and, depending
            on the enabled branches, "yc_ind", "yc_data", "yr_ind", "yr_data"
            and "ycen_data" (which may be None).
        input_size: number of columns of the sparse input tensor.
        loss_class: called as `loss_class(yc_hat, yc_data)`.
        loss_regr: called as `loss_regr(input=..., target=..., censor=...)`.
        weights_class, weights_regr: indexable by the column indices
            `yc_ind[1]` / `yr_ind[1]`; give the weight of each labelled entry.
        censored_weight: optional extra weights, indexed by `yr_ind[1]`, that
            multiply the regression weight of entries whose censor value is
            non-zero. `None` or an empty sequence disables the adjustment.
        dev: device the batch tensors are moved to.

    Returns:
        dict with "yc_hat_all", "yr_hat_all", "yc_loss", "yr_loss",
        "yc_weights", "yr_weights" (the last four stay 0 when the matching
        branch is disabled), plus "yc_ind"/"yc_data"/"yc_hat" and
        "yr_ind"/"yr_data"/"yr_hat"/"ycen_data" for each enabled branch.
    """
    ## Build the sparse COO input from the batch's index / value arrays.
    X = torch.sparse_coo_tensor(
        b["x_ind"],
        b["x_data"],
        size = [b["batch_size"], input_size]).to(dev, non_blocking=True)

    yc_hat_all, yr_hat_all = net(X)

    out = {}
    out["yc_hat_all"] = yc_hat_all
    out["yr_hat_all"] = yr_hat_all
    out["yc_loss"]    = 0
    out["yr_loss"]    = 0
    out["yc_weights"] = 0
    out["yr_weights"] = 0

    if net.class_output_size > 0:
        yc_ind  = b["yc_ind"].to(dev, non_blocking=True)
        yc_w    = weights_class[yc_ind[1]]
        yc_data = b["yc_data"].to(dev, non_blocking=True)
        ## Pick the predictions at the labelled (row, column) positions only.
        yc_hat  = yc_hat_all[yc_ind[0], yc_ind[1]]
        out["yc_ind"]  = yc_ind
        out["yc_data"] = yc_data
        out["yc_hat"]  = yc_hat
        out["yc_loss"] = (loss_class(yc_hat, yc_data) * yc_w).sum()
        out["yc_weights"] = yc_w.sum()

    if net.regr_output_size > 0:
        yr_ind  = b["yr_ind"].to(dev, non_blocking=True)
        yr_w    = weights_regr[yr_ind[1]]
        yr_data = b["yr_data"].to(dev, non_blocking=True)
        yr_hat  = yr_hat_all[yr_ind[0], yr_ind[1]]

        out["ycen_data"] = b["ycen_data"]
        if out["ycen_data"] is not None:
            out["ycen_data"] = out["ycen_data"].to(dev, non_blocking=True)

            ## None (the default) and empty sequences both mean "no censored weights".
            if censored_weight is not None and len(censored_weight) > 0:
                ## Entries with a non-zero censor value get the scaled weight;
                ## uncensored entries keep their original weight.
                yrcen_w = yr_w * censored_weight[yr_ind[1]]
                yr_w    = torch.where(out["ycen_data"] == 0, yr_w, yrcen_w)

        out["yr_ind"]  = yr_ind
        out["yr_data"] = yr_data
        out["yr_hat"]  = yr_hat
        out["yr_loss"] = (loss_regr(input=yr_hat, target=yr_data, censor=out["ycen_data"]) * yr_w).sum()
        out["yr_weights"] = yr_w.sum()

    return out

## Model Saving 

In [None]:
#####   model saving   #####
# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(args.output_dir, exist_ok=True)

model_file = f"{args.output_dir}/{name}.pt"
out_file   = f"{args.output_dir}/{name}.json"

# Weights are written only when requested; the JSON results are always written.
if args.save_model:
    torch.save(net.state_dict(), model_file)
    vprint(f"Saved model weights into '{model_file}'.")

# Attach the validation label counts to the validation results.
results_va["classification"]["num_pos"] = num_pos_va
results_va["classification"]["num_neg"] = num_neg_va
results_va["regression"]["num_samples"] = num_regr_va

# Training counts are derived as (overall count - validation count).
if results_tr is not None:
    results_tr["classification"]["num_pos"] = num_pos - num_pos_va
    results_tr["classification"]["num_neg"] = num_neg - num_neg_va
    results_tr["regression"]["num_samples"] = num_regr - num_regr_va

sc.save_results(out_file, args, validation=results_va, training=results_tr)

vprint(f"Saved config and results into '{out_file}'.\nYou can load the results by:\n  import sparsechem as sc\n  res = sc.load_results('{out_file}')\n")


## Exploring Variables and Tensors

#### Create a pandas weights DataFrame

Create a dataframe of 100 rows consisting of random training weights and task types. 

In [None]:
    # Synthetic per-task table: 100 tasks, each with a random weight and a
    # randomly chosen task type.
    # np.clip(..., 0, 1) maps every negative normal draw to exactly 0 and every
    # draw above 1 to exactly 1.
    # NOTE(review): no RNG seed is set in this cell, so the values change per
    # run unless a seed is set elsewhere in the notebook.
    df = pd.DataFrame({
        "task_id":         np.arange(100),
        "training_weight": np.clip(np.random.randn(100), 0, 1),
        "task_type":       np.random.choice(["adme", "panel", "other"], size=100),
    })

In [None]:
df.head(5)

In [None]:
# glob.glob returns a list of matching paths, so both names below hold lists
# (possibly empty) rather than single file names.
conf_file  = glob.glob(f"{output_dir}/*.json")
model_file = glob.glob(f"{output_dir}/*.pt")

print(f"Conf file : {conf_file}")
# NOTE(review): the model-saving cell calls sc.load_results with one path, so
# select a single element (e.g. conf_file[0]) before re-enabling this line.
# results = sc.load_results(conf_file)

#### Compressed Sparse Row Formatted Matrix Manipulation

In [None]:
ecfp = np.load(f"./{data_dir}/chembl_23mini_x.npy", allow_pickle=True)


In [None]:
# Show the container type, then unwrap the single object held by the loaded
# array and take its CSR form (presumably a SciPy sparse matrix).
print('ecfp     : ', type(ecfp))

ecfp_item = ecfp.item()
ecfp_tocsr = ecfp_item.tocsr()
print('ecfp_item:  ', type(ecfp_item),
      '\necfp_tocsr: ', type(ecfp_tocsr))

# No stored entries in the difference means both objects hold the same values.
if not (ecfp_item - ecfp_tocsr).nnz:
    print(' Matricies are equal')

In [None]:
print(repr(ecfp_item))
print(repr(ecfp_tocsr))

### `y_class`

In [None]:
# allow_pickle=True unpickles the file contents, so only load trusted files.
y_class = np.load(f"./{data_dir}/chembl_23mini_y.npy", allow_pickle=True)
print('y_class      : ', type(y_class), y_class.shape)

# Unwrap the single object stored in the array, then take its CSR form.
y_class_item = y_class.item()
y_class_tocsr = y_class_item.tocsr()
print('y_class_item : ', type(y_class_item), y_class_item.shape,
      '\ny_class_tocsr: ', type(y_class_tocsr), y_class_tocsr.shape)

# No stored entries in the difference means both objects hold the same values.
if not (y_class_item - y_class_tocsr).nnz:
    print(' Matricies are equal')

print('Y_class: ', repr(y_class))
print('Y_class_item: ', repr(y_class_item))
print('Y_class_to_csr: ', repr(y_class_tocsr))

In [None]:
print((y_class_tocsr == +1).shape)
print((y_class_tocsr > 0.0).shape)

In [None]:
# Column-wise label counts: sum(0) reduces over rows, giving one count per
# column; flatten() returns the result as a 1-D array.
# NOTE: the later per-fold loop cell rebinds num_pos / num_neg to lists.
num_pos    = np.array((y_class_tocsr == +1).sum(0)).flatten()
num_neg    = np.array((y_class_tocsr == -1).sum(0)).flatten()
# Counts every non-zero entry, whatever its value.
num_class  = np.array((y_class_tocsr != 0).sum(0)).flatten()
# Only the first 100 entries are printed to keep the output short.
print(f' num pos  : {num_pos[:100]}')
print(f' num neg  : {num_neg[:100]}')
print(f' num class: {num_class[:100]}')

In [None]:
if (num_class != num_pos + num_neg).any():
    print("For classification all y values (--y_class/--y) must be 1 or -1.")

### `y_censor`

`y_censor` is a sparse matrix of shape row_count x class_count.

In [None]:
# allow_pickle=True unpickles the file contents, so only load trusted files.
y_censor = np.load(f"./{data_dir}/chembl_23mini_y_censored.npy", allow_pickle=True)
print('y_censor      : ', type(y_censor), y_censor.shape)

# Unwrap the single object stored in the array, then take its CSR form.
y_censor_item = y_censor.item()
y_censor_tocsr = y_censor_item.tocsr()
print('y_censor_item : ', type(y_censor_item), y_censor_item.shape,
      '\ny_censor_tocsr: ', type(y_censor_tocsr), y_censor_tocsr.shape)

# No stored entries in the difference means both objects hold the same values.
if not (y_censor_item - y_censor_tocsr).nnz:
    print(' Matricies are equal')

print('Y_censor: ', repr(y_censor))
print('Y_censor_item: ', repr(y_censor_item))
print('Y_censor_to_csr: ', repr(y_censor_tocsr))

In [None]:
# print(y_censor_tocsr)


###  `y_regr`

`y_regr` is a sparse matrix of shape row_count x class_count.

In [None]:
# allow_pickle=True unpickles the file contents, so only load trusted files.
# NOTE(review): this is the same file the `y_class` cell loads
# (chembl_23mini_y.npy) - confirm that reusing it as the regression target
# is intended.
y_regr = np.load(f"./{data_dir}/chembl_23mini_y.npy", allow_pickle=True)
print('y_regr      : ', type(y_regr), y_regr.shape)

# Unwrap the single object stored in the array, then take its CSR form.
y_regr_item = y_regr.item()
y_regr_tocsr = y_regr_item.tocsr()
print('y_regr_item : ', type(y_regr_item), y_regr_item.shape,
      '\ny_regr_tocsr: ', type(y_regr_tocsr), y_regr_tocsr.shape)

# No stored entries in the difference means both objects hold the same values.
if not (y_regr_item - y_regr_tocsr).nnz:
    print(' Matricies y_regr_item and y_regr_tocsr are equal')

print('Y_regr: ', repr(y_regr))
print('Y_regr_item: ', repr(y_regr_item))
print('Y_regr_to_csr: ', repr(y_regr_tocsr))

In [None]:
# print(y_regr_tocsr)


### Folding

In [None]:
folding = np.load(f"./{data_dir}/chembl_23mini_folds.npy")
print(repr(folding), folding.shape)
print(np.unique(folding))
print(np.bincount(folding))

In [None]:
y1= y_class_tocsr[folding == 4]
y1

In [None]:
# print(y_class_tocsr)

In [None]:
# print(y1)

In [None]:
# del n_pos, n_neg
n_pos =   np.array((y1 == +1).sum(0))
n_pos.shape
n_pos.flatten()
# n_neg =   np.array((y1 == -1).sum(0)).flatten()

In [None]:
# Per-fold label counts: one 1-D array of column-wise counts per fold.
# NOTE: this rebinds num_pos / num_neg, which an earlier cell defines as
# whole-dataset count arrays and which the model-saving cell subtracts from;
# re-run that earlier cell before using them as totals again.
folds = np.unique(folding)
num_pos = []
num_neg = []
for fold in folds:
    # Boolean mask over rows selects the samples assigned to this fold.
    yf = y_class_tocsr[folding == fold]
    num_pos.append( np.array((yf == +1).sum(0)).flatten() )
    num_neg.append( np.array((yf == -1).sum(0)).flatten() )
    print(f' Fold: {fold}')
    print(' ------------')
    # Show only this fold's counts; printing the whole accumulated list under
    # each per-fold header repeated every earlier fold on every iteration.
    print('num_pos: ', num_pos[-1])
    print('num_neg: ', num_neg[-1])

In [None]:
# Stack the per-fold count vectors into one (n_folds x n_columns) array.
# np.vstack replaces np.row_stack, a deprecated alias since NumPy 2.0.
num_pos_stack = np.vstack(num_pos)

# A bare expression is displayed only when it is the last statement of a
# cell, so the shape is printed explicitly.
print(num_pos_stack.shape)
num_pos_stack[:,:5]

### Input File Folding and Transformation

In [None]:
folding_size = 2
ecfp_folded = sc.fold_transform_inputs(ecfp_tocsr, folding_size=folding_size)


In [None]:
print(repr(ecfp_tocsr))
print(repr(ecfp_folded))

In [None]:
# Manual version of input folding: each column index is mapped to
# `index % folding_size`, so the matrix is rebuilt with folding_size columns.
# NOTE: this rebinds the notebook-global folding_size (set to 2 in the cell
# that calls sc.fold_transform_inputs).
folding_size = 73
# nonzero() returns the (row, column) index arrays of the non-zero entries.
idx = ecfp_tocsr.nonzero()
print('\n', idx[0], idx[0].shape,'\n', idx[1], idx[1].shape)

folded = idx[1] % folding_size
print(folded, len(folded))
# NOTE(review): pairing ecfp_tocsr.data with idx assumes the matrix stores no
# explicit zeros; otherwise data and the index arrays differ in length - confirm.
x = scipy.sparse.csr_matrix((ecfp_tocsr.data, (idx[0], folded)), shape=(ecfp_tocsr.shape[0], folding_size))
print(repr(x))

## Eliminate dups by adding them together.
# NOTE(review): SciPy documents that duplicate (row, col) entries are summed
# when a CSR matrix is built from coordinate triplets, so this call is
# presumably a no-op and both reprs should match - confirm.
x.sum_duplicates()
print(repr(x))

In [None]:
# print(x)