# Build NN model to predict TPSA from Pharmacophores

In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload  
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [2]:

# Models
import os
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import pickle
import itertools
from collections.abc import Iterator
from   datetime import datetime
from pprint import PrettyPrinter
import joblib

from utils import *
from utils_ml import model_selection

from multiprocessing import Pool, process

# from ydata_profiling import ProfileReport

import dask.dataframe as dd 
pp = PrettyPrinter(indent=4)
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')
pd.options.display.width = 170



In [None]:
# Environment configuration for the notebook session (W&B notebook name and
# CUDA settings), applied in one call.
os.environ.update(
    {
        "WANDB_NOTEBOOK_NAME": "Adashare_Train.ipynb",
        "CUDA_LAUNCH_BLOCKING": "1",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor


# utility class to help normalize labels such that they contain only values between 0 and n_classes-1. 
from sklearn.preprocessing import LabelEncoder, Normalizer, normalize, MaxAbsScaler,MinMaxScaler

#K-Folds cross-validator - Provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default).
# Each fold is then used once as a validation while the k - 1 remaining folds form the training set.
from sklearn.model_selection import train_test_split, KFold

# Evaluate metric(s) by cross-validation and also record fit/score times.
from sklearn.model_selection import cross_validate, cross_val_score 

# Generate cross-validated estimates for each input data point.
# The data is split according to the cv parameter. Each sample belongs to exactly one test set, 
# and its prediction is computed with an estimator fitted on the corresponding training set.

from sklearn.model_selection import cross_val_predict

# Exhaustive search over specified parameter values for an estimator.
# Randomized search on hyper parameters.

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

#Input checker utility for building a cross-validator.(?)

from sklearn.model_selection._split import check_cv

from sklearn.pipeline import Pipeline

# Make a scorer from a performance metric or loss function. This factory function wraps scoring 
# functions for use in GridSearchCV and cross_val_score. It takes a score function, such as accuracy_score, 
# mean_squared_error, adjusted_rand_score or average_precision_score and returns a callable that 
# scores an estimator’s output. The signature of the call is (estimator, X, y) where estimator
# is the model to be evaluated, X is the data and y is the ground truth labeling (or None in the
# case of unsupervised models).

from sklearn.metrics import make_scorer

#regression metrics

from sklearn.metrics import mean_absolute_error , mean_squared_error, r2_score

#classification metrics

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin

# Construct a new unfitted estimator with the same parameters. Clone does a deep copy of the model 
# in an estimator without actually copying attached data. It returns a new estimator with the same 
# parameters that has not been fitted on any data.

from sklearn.base import clone

In [None]:
import dask
import dask_ml.model_selection as dcv
from dask_ml.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV, IncrementalSearchCV, HyperbandSearchCV
from dask.distributed import Client
from dask.distributed import LocalCluster
import joblib
# from dask_cuda import LocalCUDACluster
# from sklearn.model_selection import GridSearchCV


In [None]:
def result_model_selection(results, name):
    """Tabulate the cross-validation outcome of a fitted parameter search.

    Parameters
    ----------
    results : object exposing a ``cv_results_`` mapping with the keys
        ``'params'``, ``'mean_test_score'``, ``'std_test_score'`` and
        ``'rank_test_score'``.
    name : label repeated in the ``model`` column, once per parameter set.

    Returns
    -------
    pandas.DataFrame with one row per evaluated parameter set and the columns
    ``model``, ``params``, ``mean score``, ``std score`` and ``rank``.
    """
    cv = results.cv_results_
    param_sets = cv['params']
    table = {
        'model': [name] * len(param_sets),
        'params': param_sets,
        'mean score': cv['mean_test_score'],
        'std score': cv['std_test_score'],
        'rank': cv['rank_test_score'],
    }
    return pd.DataFrame(table)

In [3]:
def make_cv_splits(
    n_folds: int = 5,
    data=None,
    random_state=None,
) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    """Yield ``(train, test)`` dask DataFrame pairs for k-fold cross-validation.

    The frame is cut into ``n_folds`` equally weighted random partitions with
    ``random_split(..., shuffle=True)``. Each partition serves once as the
    test set while the remaining ones are concatenated into the training set.

    Parameters
    ----------
    n_folds : number of folds; must be at least 2.
    data : dask DataFrame to split. When omitted, the module-level ``ddf`` is
        used, which preserves the behaviour of the original zero-argument call.
    random_state : forwarded to ``random_split`` so the partitioning can be
        made reproducible; ``None`` keeps the original unseeded behaviour.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2 (no training partition would remain).
    """
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")

    # Fall back to the notebook-global frame only when no frame was passed in.
    frame = ddf if data is None else data
    fractions = [1 / n_folds] * n_folds
    folds = frame.random_split(fractions, random_state=random_state, shuffle=True)
    for held_out, test in enumerate(folds):
        train = [fold for j, fold in enumerate(folds) if j != held_out]
        yield dd.concat(train), test

In [None]:
# Options dict for gzip-compressed output (not used in the visible part of this notebook).
gzip_compression_options = {"method": "gzip", 'compresslevel': 1,"mtime": 1}

# Load the four gzip-compressed metadata tables from the metadata directory.
metadata_path = "../cj-datasets/metadata"
plates = pd.read_csv(os.path.join(metadata_path,"plate.csv.gz"))
wells = pd.read_csv(os.path.join(metadata_path,"well.csv.gz"))
compound = pd.read_csv(os.path.join(metadata_path,"compound.csv.gz"))
orf = pd.read_csv(os.path.join(metadata_path,"orf.csv.gz"))


## gz, bz2, zip, tar, tar.gz, tar.bz2
# types = ['.gz', '.bz2','.zip', '.tar', '.tar.gz', '.tar.bz2']
# File-extension suffixes for compressed variants of the data files.
type_bz2 = 'bz2'
type_gzip = 'gz'


In [None]:
# Dataset prefix used to build file names; empty here (alternatives noted: 'Target-2', 'MOA').
prefix = '' ### Target-2' , 'MOA'

input_path ="./input/"
output_path ="./output_11102023/"

# File-name form of the prefix: lower-cased, with '-' replaced by '_'.
prefix_lc = prefix.lower().replace('-', '_')
 
# compoundMetadataInputFile   = f"{input_path}JUMP-{prefix}_compound_library.csv"

# Compound metadata / TPSA / pharmacophore files, named "<output_path><prefix_lc>_<name>".
compoundMetadataInputFile          = f"{output_path}{prefix_lc}_compound_metadata.csv"
compoundMetadataCleanFile          = f"{output_path}{prefix_lc}_compound_metadata_clean.csv"
compoundMetadataTPSAFile           = f"{output_path}{prefix_lc}_compound_metadata_tpsa.csv"
compoundMetadataTPSACleanFile      = f"{output_path}{prefix_lc}_compound_metadata_tpsa_clean.csv"
compoundTPSAFile                   = f"{output_path}{prefix_lc}_compound_TPSA.csv"
compoundTPSACleanFile              = f"{output_path}{prefix_lc}_compound_TPSA_clean.csv"
compoundPharmacophoreFile          = f"{output_path}{prefix_lc}_compound_pharmacophores_sparse.pkl"
compoundPharmacophoreCleanFile     = f"{output_path}{prefix_lc}_compound_pharmacophores_sparse_clean.pkl"
compoundPharmacophoreDenseZipFile  = f"{output_path}{prefix_lc}_compound_pharmacophores_dense.npz"
# NOTE(review): the names below are built as "<prefix_lc><name>" with no '_'
# separator, unlike the files above -- confirm this matches the files on disk.
CompoundExtendedMetadataFile        = f"{output_path}{prefix_lc}compound_extended_metadata.csv"

CompoundExtendedMetadata5SampleFile = f"{output_path}{prefix_lc}compound_extended_metadata_5samples.csv"
CompoundProfiles5SampleFileCSV      = f"{output_path}{prefix_lc}compound_profiles_5samples.csv"

CompoundExtendedMetadata2SampleFile = f"{output_path}{prefix_lc}compound_extended_metadata_2samples.csv"
CompoundProfiles2SampleFileCSV      = f"{output_path}{prefix_lc}compound_profiles_2samples.csv"

# CompoundProfiles2SampleFileParquet  = f"{output_path}{prefix_lc}compound_profiles_2samples.parquet"
# CompoundProfiles2SampleFileGZ       = f"{output_path}{prefix_lc}compound_profiles_2samples.gz"

CompoundExtendedMetadataSampleFile  = f"{output_path}{prefix_lc}compound_extended_metadata_samples.csv"
featureSelectionFile                = f"{output_path}{prefix_lc}_normalized_feature_select.csv.gz"

In [None]:
# Echo every configured file path so the resolved locations can be checked at a glance.
print(f" ")
print(f" compound Metadata Input File             : {compoundMetadataInputFile}")
print(f" compound Metadata Output File            : {compoundMetadataCleanFile}")
print(f" compound Metadata + TPSA File            : {compoundMetadataTPSAFile}")
print(f" compound Metadata + TPSA Cleaned File    : {compoundMetadataTPSACleanFile}\n")
print(f" compound TPSA File                       : {compoundTPSAFile}")
print(f" compound TPSA Clean File                 : {compoundTPSACleanFile}")
print()
print(f" compound Pharmacophore Sparse File       : {compoundPharmacophoreFile}")
print(f" compound Pharmacophore Sparse Clean File : {compoundPharmacophoreCleanFile}")
print(f" compound Pharmacophore Dense Zipped File : {compoundPharmacophoreDenseZipFile}")
print(f" ")
print(f" Compound Extended MetadataFile           : {CompoundExtendedMetadataFile }")
print(f" Compound Extended Metadata 5 SampleFile  : {CompoundExtendedMetadata5SampleFile }")
print(f" Compound Profiles 5 Sample File CSV      : {CompoundProfiles5SampleFileCSV }")
print()
print(f" Compound Extended Metadata 2 SampleFile  : {CompoundExtendedMetadata2SampleFile }")
print(f" Compound Profiles 2 Samples File CSV     : {CompoundProfiles2SampleFileCSV}")
# print(f" ")
# print(f" CompoundProfiles2SamplesFile Parquet        : {CompoundProfiles2SampleFileParquet }")
# print(f" CompoundProfiles2SamplesFile gz             : {CompoundProfiles2SampleFileGZ }")
print(f" ")
print(f" featureSelectionFile                     : {featureSelectionFile}")

# Data Load and Prep

### Create dask cluster and client 

In [None]:
# Start a local dask cluster with 2 workers x 2 threads each and connect a
# client to its scheduler address.
# cluster = LocalCluster()
# "Kevins_Cluster" is the first positional argument -- presumably the cluster name; confirm.
cluster = LocalCluster("Kevins_Cluster", n_workers=2, threads_per_worker=2)
# client = Client("tcp://127.0.0.1:37937")
client = Client(cluster.scheduler_address)
# client = Client(processes = False)

In [None]:
# Tear down the dask session. The client is closed first so it disconnects
# while its scheduler is still running; closing the cluster first leaves the
# client attached to a scheduler that no longer exists.
# cluster.workers
# cluster.scale(2)
client.close()
cluster.close()
# del cluster

In [None]:
# client
# cluster.name
# print(cluster)
# cluster.dashboard_link
# cluster.scheduler_address
cluster.scheduler_spec
# cluster.workers

In [None]:
# cluster.scheduler.stop()
# cluster.scheduler.close()

In [None]:
# client 
# client.status
# client.connection_args
# del client

In [None]:
# with open("./metadata/parquet_columns.pkl",'rb') as f:
#     ParquetColumns = pickle.load(f)

# for k,v in ParquetColumns.items():
#     print(f" {k:20s}   items: {len(v)}")

In [None]:
# type(ParquetColumns['Cells']['Cells_AreaShape_Area'])
# ParquetColumns['Cells']
# del ParquetColumns


### Read column metadata file

In [None]:
# Load the feature-selection column groups and print the size of each group.
# NOTE: pickle.load can execute arbitrary code -- only read trusted files.
with open("./metadata/feature_selection_columns.pkl", 'rb') as handle:
    x = pickle.load(handle)
for group_name in x:
    print(f" {group_name:20s}    {len(x[group_name])} ")

# Feature columns come from the 'selected' group; the single target column
# is Metadata_log10TPSA.
X_columns = list(x['selected'])  ## ["Metadata_JCP2022"]
y_columns = ["Metadata_log10TPSA"]  ## ["Metadata_JCP2022"]

# Metadata / target columns listed for removal from a feature frame.
x_columns_drop = [
    "Metadata_Source",
    "Metadata_Batch",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_TPSA",
    "Metadata_lnTPSA",
    "Metadata_log10TPSA",
]
# x_columns_drop.extend(["Metadata_JCP2022"])

# columns_read = ["Metadata_JCP2022", "Metadata_log10TPSA"]
# print(f" len(columns_read) : {len(columns_read)}")
print(f" len(x_columms)    : {len(X_columns)}")
print(f" len(y_columms)    : {len(y_columns)}")

# columns_read.extend(x['selected'])
# print(f" len(columns_read) : {len(columns_read)}")
# Read every feature and target column as float32.
x_columns_dtype = dict.fromkeys(X_columns, np.dtype('float32'))
y_columns_dtype = dict.fromkeys(y_columns, np.dtype('float32'))  ## "Metadata_log10TPSA":np.dtype('float64')}

### Read compound profiles

In [None]:
# Apply feature selection
# Profiles are read from the 2-sample CSV; the compressed-suffix variant is disabled.
profilesFile = CompoundProfiles2SampleFileCSV ## +'.'+ type_bz2
# Overrides the featureSelectionFile value assigned in the path-configuration cell.
featureSelectionFile = './output_11102023//normalized_feature_select.csv'

print(f" Profiles file       :  {profilesFile}")
print(f" Features select file:  {featureSelectionFile}")

In [None]:
# df_profiles = dd.read_csv(profilesFile, blocksize="100MB", usecols=columns_read)  ##, index_col = 'CASRN')

# df_profiles.info()
# df_profiles.head(6)
# del df_X
# del df_y

In [None]:
# df_X = dd.read_csv(profilesFile, blocksize="100MB", usecols=X_columns, dtype= x_columns_dtype)  ##, index_col = 'CASRN')

In [None]:
# df_X.info()
# df_X.head()
# df_X.shape

In [None]:
# df_y = dd.read_csv(profilesFile, blocksize="100MB", usecols=y_columns, dtype=y_columns_dtype)  ##, index_col = 'CASRN')

In [None]:
# df_y_array.info()
# df_y_array.head()
# df_y_array.shape

In [None]:
# df_X_array = df_X.to_dask_array(lengths = True)

# df_X_array = df_X_array.rechunk(chunks=(10000,-1))
# df_X_array.to_zarr('df_X_array.zarr' ) 

In [None]:
# df_y_array = df_y.to_dask_array(lengths = True)

# df_y_array = df_y_array.rechunk(chunks=(10000,-1))
# df_y_array.to_zarr('df_y_array.zarr' ) 

In [None]:
# df_X_array.to_hdf5('df_X_array.hdf5' , '/x')  
# df_y_array.to_hdf5('df_y_array.hdf5' , '/x')  

In [None]:
# del df_X, df_y, df_X_array, df_y_array

In [None]:
# df_y = df_profiles[y_columns].compute()

# df_X = df_profiles[list(x['selected'])] ## .drop(labels=x_columns_drop, axis =1)

### Read zarr files

In [None]:
df_X_array = dask.array.from_zarr('df_X_array.zarr' )

In [None]:
df_y_array = dask.array.from_zarr('df_y_array.zarr' )

# Neural Net

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
from torchinfo import summary
torch.set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=180, profile=None, sci_mode=None)
## Select the compute device
##----------------------------------------------
# Order devices by PCI bus id and make only device 0 visible.
os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID", "CUDA_VISIBLE_DEVICES": '0'})

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

device

In [None]:
@torch.no_grad()
def validation(val_steps=50):
    """Estimate the validation MSE from randomly sampled mini-batches.

    Draws ``val_steps`` batches of ``batch_size`` rows (with replacement) from
    the module-level ``val_X`` / ``val_y`` and averages ``F.mse_loss`` of the
    module-level ``model`` over them.

    The model is put into eval mode for the duration of the call and its
    previous mode is restored afterwards. ``torch.no_grad()`` does not stop
    BatchNorm layers from updating their running statistics, so evaluating in
    train mode would let validation batches alter the model state.

    Parameters
    ----------
    val_steps : number of mini-batches to average over; must be at least 1.

    Returns
    -------
    0-dim tensor holding the mean MSE over the sampled batches.

    Raises
    ------
    ValueError
        If ``val_steps`` is smaller than 1.
    """
    if val_steps < 1:
        raise ValueError(f"val_steps must be at least 1, got {val_steps}")

    was_training = model.training
    model.eval()
    try:
        total = 0.0
        for _ in range(val_steps):
            ix = torch.randint(0, val_X.shape[0], (batch_size,))
            Xv = torch.Tensor(val_X[ix]).to(device)
            Yv = torch.Tensor(val_y[ix]).to(device)
            total = total + F.mse_loss(model(Xv), Yv)
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)
    return total / val_steps

# import torchmetrics 
# evaluate the loss
@torch.no_grad()  # metrics only: no autograd graph is needed
def split_loss(split):
    """
    Print MSE, R2 and Pearson correlation of the model on one full data split.

    `split` selects the arrays to score: 'train', 'val' or 'test'; any other
    value raises KeyError. The whole split is pushed through the module-level
    `model` in a single forward pass. Nothing is returned.
    """
    from torchmetrics.regression import R2Score, PearsonCorrCoef

    datasets = {
        'train': (train_X, train_y),
        'val'  : (val_X  , val_y),
        'test' : (test_X , test_y),
    }
    features, targets = datasets[split]
    inputs = torch.Tensor(features).to(device)
    y_true = torch.Tensor(targets).to(device)
    y_pred = model(inputs)

    mse = F.mse_loss(y_pred, y_true)
    r2_metric = R2Score().to(device)
    pearson_metric = PearsonCorrCoef(num_outputs=1).to(device)
    r2 = r2_metric(y_pred, y_true)
    # Pearson is computed on the flattened predictions and targets.
    pearson_r = pearson_metric(y_pred.view(-1), y_true.view(-1))
    print(f"\n {split:5s} data:   MSE loss: {mse.item():10.4f}    R2 Score: {r2.item():.5f}     Pearson Coeff. {pearson_r:.4f}")
    
@torch.no_grad()
def calc_loss(x,y):
    """Print targets, predictions and the MSE of the model on one batch.

    Shows the first 20 rows of `y` and of the model output (each transposed),
    followed by the MSE over the whole batch. Returns None.
    """
    predictions = model(x)
    batch_mse = F.mse_loss(predictions, y)
    for shown in (y, predictions):
        print(shown[:20].T)
    print(f"Calculated loss:  {batch_mse.item():5e}")
    

In [None]:
# Discard a model left over from a previous run so a fresh one can be built.
# Guarded because this cell precedes the first model definition: on a fresh
# kernel `model` does not exist yet and an unconditional `del` raises NameError.
if "model" in globals():
    del model

In [None]:
# Fully connected regression network: n_input -> n_hidden_1 -> n_hidden_2 -> 1,
# with BatchNorm1d and Tanh after each hidden Linear layer.
#  nn.Linear(n_hidden_2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
# del model

n_input = 1032 # number of input features per sample
n_hidden_1 = 128 # width of the first hidden layer
n_hidden_2 = 64 # width of the second hidden layer
model = nn.Sequential(
    # hidden Linear layers carry no bias; each is followed directly by BatchNorm1d
    nn.Linear(n_input , n_hidden_1, bias=False), 
    nn.BatchNorm1d(n_hidden_1), 
    nn.Tanh(),
    nn.Linear(n_hidden_1, n_hidden_2, bias=False), 
    nn.BatchNorm1d(n_hidden_2), 
    nn.Tanh(),
    # single-value regression output
    nn.Linear(n_hidden_2, 1),
)
model.to(device)

In [None]:
# Inspect the output layer, then shrink its initial weights.
# print(type(model))
# print(type(model[-1]))
model[-1].__dict__
model[-1].weight.shape
model[-1].bias.shape
model[-1].weight
# parameter init
# Scale the output-layer weights in place by 0.1; done under no_grad so the
# in-place edit is not recorded by autograd.
with torch.no_grad():
    model[-1].weight *= 0.1 # last layer make less confident
model[-1].weight

In [None]:
# Materialise the parameters once: model.parameters() returns a one-shot
# generator, so a second loop over it would silently do nothing.
parameters = list(model.parameters())
ttl_nelements = 0
for p in parameters:
    print(f"Parm shape: {str(p.shape):35s}    # elements: {p.nelement():8d}    Required gradient calc: {p.requires_grad}")
    ttl_nelements += p.nelement()
print(ttl_nelements)

print(f"Total num of parameters: {sum(p.nelement() for p in parameters)}") # number of parameters in total
# Make sure every parameter takes part in gradient computation.
for p in parameters:
    p.requires_grad = True

# Layer-by-layer summary for a single input row of 1032 features.
summary(model, 
        input_size = (1,1032),
        verbose =2, 
        col_names = ["input_size", "output_size", "num_params", "params_percent",  "mult_adds","trainable"],
        col_width=16,
        row_settings=["var_names"],);

In [None]:
torch.manual_seed(42); # seed rng for reproducibility

In [None]:
start_step = 0
end_step = 200000


In [None]:
# Training hyper-parameters, loss histories and the optimizer.
# create a PyTorch optimizer
init_LR = 1e-3       # initial learning rate handed to AdamW
curr_LR = init_LR    # last learning rate reported by the training loop
step_size = 100000   # StepLR period, counted in scheduler.step() calls
start_step = 0       # first training step (inclusive)
end_step  = 200000   # last training step (exclusive)
batch_size = 64      # rows sampled per training / validation mini-batch
lossi = []           # log10 training loss, one entry per step
lossv = []           # log10 validation loss, one entry per 1000 steps

optimizer = torch.optim.AdamW(model.parameters(), lr=init_LR)

In [None]:
# Multiply the learning rate by 0.1 every `step_size` scheduler steps.
# `verbose` is deprecated in recent PyTorch releases and `last_epoch=-1` is
# the default, so neither is passed.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.1)

In [None]:
# Mini-batch training loop: one optimizer step per iteration on a random
# batch sampled with replacement from the training arrays.
model.train();
for i in range(start_step, end_step):
  
    # minibatch construct
    ix = torch.randint(0, train_X.shape[0], (batch_size,))
    Xb, Yb = torch.Tensor(train_X[ix]).to(device), torch.Tensor(train_y[ix]).to(device) # batch X,Y
    
    # forward pass
    logits = model(Xb)
    loss_mse = F.mse_loss(logits, Yb)

    # backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)
    loss_mse.backward()

    optimizer.step()

    # history stores log10 of the batch loss, one entry per step
    lossi.append(loss_mse.log10().item())
    
    # the scheduler is stepped once per batch, so step_size counts batches
    scheduler.step()

    # track stats
    # every 1000 steps: estimate the validation loss and log progress
    if i % 1000 == 0:
        val_loss_mse = validation()
        lossv.append(val_loss_mse.log10().item())
        print(f" {datetime.now().strftime('%X.%f')} | batch: {i:7d}/{end_step:7d} | training loss: {loss_mse.item():11.6f} | validation loss: {val_loss_mse.item():11.6f}") 
    
        # every 50000 steps (nested: 50000 is a multiple of 1000): report
        # learning-rate changes and metrics on the full splits
        if i % 50000 == 0:
            if curr_LR != optimizer.param_groups[0]['lr']:
                curr_LR = optimizer.param_groups[0]['lr']
                print(f" ===> learning rate adjusted to {curr_LR}")        
            # eval mode for the full-split metrics; train mode is restored below
            model.eval();
            # i//50000 is the number of completed 50k-step blocks
            print(f"**{i//50000} iterations**")
            split_loss('train')
            split_loss('val')
            split_loss('test')
            print()
            model.train();

## End of training loop
        
print(f" start_step : {start_step}     end_step: {end_step}    i: {i}")    

# Final metrics on all three splits, computed in eval mode.
model.eval();
split_loss('train')
split_loss('val')
split_loss('test')
model.train();



In [None]:
# start_step = end_step
# end_step += 50000

In [None]:
# Training-range summary; the third field is the number of steps in the range
# (it was previously labelled "end_step" a second time).
print(f" start_step : {start_step}     end_step: {end_step}     num steps: {end_step - start_step}   i: {i}      learning rate: {optimizer.param_groups[0]['lr']}")

In [None]:
# for g in optimizer.param_groups:
#     g['lr']=1e-4
#     print(g)

In [None]:
print(len(lossi), len(lossv));
# print(lossi[0], lossi[-1])
# lossi.pop()
# print(len(lossi), len(lossv));
# print(lossi[0], lossi[-1])
# Average the per-step training loss over windows of 1000 steps so it shares
# an x-axis with the validation loss, which is recorded every 1000 steps.
# A trailing partial window is dropped: reshaping to (-1, 1000) raises when
# len(lossi) is not a multiple of 1000 (e.g. after an interrupted run).
window = 1000
n_windows = len(lossi) // window
plt.plot(torch.tensor(lossi[:n_windows * window]).view(n_windows, window).mean(1));
plt.plot(torch.tensor(lossv).view(-1));


In [None]:
# Reference (definition of r2_score): https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics:~:text=The%20r2_score%20function%20computes,score%20of%200.0.

In [None]:
# put layers into eval mode (needed for batchnorm especially)
model.eval();
split_loss('train')
split_loss('val')
split_loss('test')
model.train();

**training using TPSA**

     train data:        MSE loss:   125.6541    R2 Score: 0.88529     Pearson Coeff. 0.9419

     val   data:        MSE loss:   290.9020    R2 Score: 0.67287     Pearson Coeff. 0.8236

     test  data:        MSE loss:   384.7767    R2 Score: 0.67663     Pearson Coeff. 0.8230
      
      

### MinMax TPSA 128 x 64 , no BatchNorm 

**MinMax TPSA 128 x 64 , no BatchNorm**

**50k iterations**

     train data:        MSE loss:     0.0001    R2 Score: 0.79670     Pearson Coeff. 0.8926

     val   data:        MSE loss:     0.0001    R2 Score: 0.68855     Pearson Coeff. 0.8333

     test  data:        MSE loss:     0.0002    R2 Score: 0.66196     Pearson Coeff. 0.8145


**100k iterations**

     train data:        MSE loss:     0.0001    R2 Score: 0.88567     Pearson Coeff. 0.9418

     val   data:        MSE loss:     0.0001    R2 Score: 0.70376     Pearson Coeff. 0.8402

     test  data:        MSE loss:     0.0002    R2 Score: 0.70878     Pearson Coeff. 0.8433
      
      
**150k iterations**

     train data:        MSE loss:     0.0001    R2 Score: 0.88960     Pearson Coeff. 0.9438

     val   data:        MSE loss:     0.0001    R2 Score: 0.70221     Pearson Coeff. 0.8394

     test  data:        MSE loss:     0.0002    R2 Score: 0.70636     Pearson Coeff. 0.8420
      
      
**200k iterations**

     train data:        MSE loss:     0.0001    R2 Score: 0.89114     Pearson Coeff. 0.9441

     val   data:        MSE loss:     0.0001    R2 Score: 0.70325     Pearson Coeff. 0.8394

     test  data:        MSE loss:     0.0002    R2 Score: 0.70603     Pearson Coeff. 0.8416
      

**250k iterations**      
      
     train data:        MSE loss:     0.0001    R2 Score: 0.89125     Pearson Coeff. 0.9440

     val   data:        MSE loss:     0.0001    R2 Score: 0.70360     Pearson Coeff. 0.8398

     test  data:        MSE loss:     0.0002    R2 Score: 0.70624     Pearson Coeff. 0.8417  
      

### MinMax TPSA 128 x 64 , With BatchNorm 

**MinMax TPSA 128 x 64 , with BatchNorm**

**50k iterations**

     train data:    MSE loss:     0.0001    R2 Score: 0.80766     Pearson Coeff. 0.8994

     val   data:    MSE loss:     0.0001    R2 Score: 0.66401     Pearson Coeff. 0.8181

     test  data:    MSE loss:     0.0002    R2 Score: 0.68471     Pearson Coeff. 0.8312


**100k iterations**


     train data:    MSE loss:     0.0001    R2 Score: 0.85963     Pearson Coeff. 0.9295

     val   data:    MSE loss:     0.0001    R2 Score: 0.68456     Pearson Coeff. 0.8308

     test  data:    MSE loss:     0.0002    R2 Score: 0.69034     Pearson Coeff. 0.8358      
      
      
**150k iterations**


     train data:    MSE loss:     0.0001    R2 Score: 0.87261     Pearson Coeff. 0.9345

     val   data:    MSE loss:     0.0001    R2 Score: 0.69643     Pearson Coeff. 0.8354

     test  data:    MSE loss:     0.0002    R2 Score: 0.69435     Pearson Coeff. 0.8362

      
**200k iterations**


     train data:    MSE loss:     0.0001    R2 Score: 0.87318     Pearson Coeff. 0.9345

     val   data:    MSE loss:     0.0001    R2 Score: 0.69366     Pearson Coeff. 0.8337

     test  data:    MSE loss:     0.0002    R2 Score: 0.69301     Pearson Coeff. 0.8363      
     

**250k iterations**      
      
     train data:    MSE loss:     0.0001    R2 Score: 0.87323     Pearson Coeff. 0.9346

     val   data:    MSE loss:     0.0001    R2 Score: 0.69520     Pearson Coeff. 0.8343

     test  data:    MSE loss:     0.0002    R2 Score: 0.69483     Pearson Coeff. 0.8368
      

In [None]:
from sklearn.metrics import get_scorer_names

# Print the name of every scikit-learn scorer that contains "error".
for scorer_name in filter(lambda s: "error" in s, get_scorer_names()):
    print(scorer_name)

## Evaluation on Test dataset

In [None]:
# Spot-check predictions against targets on one random batch of the TEST
# split (the batch was previously drawn from the training arrays).
model.eval()
ix = torch.randint(0, test_X.shape[0], (batch_size,))
Xb, Yb = torch.Tensor(test_X[ix]).to(device), torch.Tensor(test_y[ix]).to(device) # batch X,Y

print(ix)
# The forward pass itself must run under no_grad: a tensor that requires grad
# cannot be converted with .numpy().
with torch.no_grad():
    logits = model(Xb)

predicted, actual = logits.cpu().numpy(), Yb.cpu().numpy()
a, b = predicted.squeeze(), actual.squeeze()
print(a.shape, b.shape)
# One line per sample: prediction, then target.
for pred_row, target_row in zip(predicted, actual):
    print(f" {pred_row[0]:.5f}     {target_row[0]:.5f}")

# EDA - Generate Data Profile

In [None]:
# profile = ProfileReport(df_labels, title="Profiling Report")
# profile_report = df_labels.profile(html={"style": {"full_width": True}})
# profile.to_file("./output/example.html")

### profile widgets

In [None]:
# profile.to_widgets()

### profile notebook iframe

In [None]:
# profile.to_notebook_iframe()

## Create label array `all_y` from `df_labels['TPSA']`

In [None]:
# del tpsa_norm, tpsa_numpy,tpsa_numpy_1, normalizer

###  Normalize data 

In [None]:
# Compare three scikit-learn rescalings of the TPSA target column. Each result
# is kept in its own array (tpsa_norm, tpsa_maxabs, tpsa_minmax).
# tpsa_numpy =df_labels['log10TPSA'].to_numpy().reshape(-1,1)
# Target values as a single-column 2-D array, the layout the scalers are fitted on.
tpsa_numpy =df_labels['TPSA'].to_numpy().reshape(-1,1)


print(f" {tpsa_numpy.shape}  min:  {tpsa_numpy.min()},   max: {tpsa_numpy.max()},  mean: {tpsa_numpy.mean():.4f}, std dev: {tpsa_numpy.std():.4f}")
print("           ",tpsa_numpy[:10].T)

# normalize function 
# NOTE(review): sklearn's Normalizer rescales each row to unit norm; on a
# single-column array that presumably maps every non-zero value to 1.0 --
# confirm this variant is exploratory only.
print(f"\n Using normalize function")
print(f" --------------------------")
normalizer = Normalizer()
tpsa_norm  = normalizer.fit_transform(tpsa_numpy)
print(f" Parameters: {normalizer.get_params(deep=True)}")
# # all_y = normalizer.transform(tpsa_numpy)
print(f" {tpsa_norm.shape}  min:  {tpsa_norm.min()},   max: {tpsa_norm.max():.4f},  mean: {tpsa_norm.mean():.4f}, std dev: {tpsa_norm.std():.4f}")
print(f" First 10 elements : {tpsa_norm[:10].T}")      

print(f"\n Using MaxAbsScaler ")
print(f" --------------------------")
scaler  = MaxAbsScaler()
print(f" type of scaler: {type(scaler)}  ")
tpsa_maxabs  = scaler.fit_transform(tpsa_numpy)
print(f" Parameters: {scaler.get_params(deep=True)}")
# # all_y = normalizer.transform(tpsa_numpy)
print(f" {tpsa_maxabs.shape}  min:  {tpsa_maxabs.min()},   max: {tpsa_maxabs.max():.4f},  mean: {tpsa_maxabs.mean():.4f}, std dev: {tpsa_maxabs.std():.4f}")
print(f"  First 10 elements : {tpsa_maxabs[:10].T}")      

print(f"\n Using MinMaxScaler ")
print(f" --------------------------")
# `scaler` is rebound here, so after this cell it refers to the fitted
# MinMaxScaler with feature_range=(-1, 1).
scaler  = MinMaxScaler(feature_range=(-1,1))
print(f" type of scaler: {type(scaler)}")
print(f" Parameters: {scaler.get_params(deep=True)}")
tpsa_minmax  = scaler.fit_transform(tpsa_numpy)
print(f" {tpsa_minmax.shape}  min:  {tpsa_minmax.min()},   max: {tpsa_minmax.max():.4f},  mean: {tpsa_minmax.mean():.4f}, std dev: {tpsa_minmax.std():.4f}")
print(f" First 10 elements : {tpsa_minmax[:10].T}")      
      

In [None]:
# all_y = tpsa_numpy.copy()
all_y = tpsa_minmax.copy()

In [None]:
print(f" {all_y.shape} -   min:  {all_y.min()}       max: {all_y.max():.4f}      mean: {all_y.mean():.4f}      std dev: {all_y.std():.4f}")
print(f" First 10 elements : {all_y[:10].T}")


## Read fingerprint features file `all_X`

In [None]:
print(f" compound Pharmacophore Dense Zipped File : {compoundPharmacophoreDenseZipFile}")

In [None]:
# Load the dense pharmacophore matrix. np.load on an .npz archive returns an
# NpzFile that keeps the file open, so the array is read inside a `with`
# block to release the handle as soon as the data is in memory.
with np.load(compoundPharmacophoreDenseZipFile) as npz:
    all_X = npz['dense_mat']
print(type(all_X), all_X.shape, all_X.dtype)
print(all_X[:5, :25])