  # Apply encoder to morphological profiles to get latent space representations :

# Setup

In [1]:
# Load the autoreload extension and use mode 2 so edited modules are re-imported before each cell runs.
%load_ext autoreload  
%autoreload 2
from IPython.display import display, HTML, Image
from IPython.core.interactiveshell import InteractiveShell
# Widen the notebook container to 98% of the browser window.
display(HTML("<style>.container { width:98% !important; }</style>"))
# Echo every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import sys
import random
from typing import List, Tuple
from types import SimpleNamespace
import pprint
import logging
from datetime import datetime
# Put the project-local source directories at the front of the import path,
# skipping any entry that is already present, then show the resulting path.
for p in ('./src', '../pt-snnl', '../..'):
    if p in sys.path:
        continue
    print(f"insert {p}")
    sys.path.insert(0, p)
print(sys.path)

import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

import scipy
import scipy.stats as sps
import sklearn.metrics as skm
from scipy.spatial.distance import pdist, squareform, euclidean

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt  # for making figures
from torchinfo import summary

# Display options: 180-column tensor printing; every other torch print option is passed as None.
torch.set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=180, profile=None, sci_mode=None)
# NOTE(review): only torch's RNG is seeded here; `random` and numpy are imported above but not seeded.
torch.manual_seed(42);   # seed rng for reproducibility
pp = pprint.PrettyPrinter(indent=4)
pd.options.display.width = 132
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')

# Environment variables read by wandb and by the CUDA runtime.
os.environ["WANDB_NOTEBOOK_NAME"] = "AE-MAIN-SNNL.ipynb"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE(review): this is set after `import torch`; presumably still effective because CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

torch.set_num_threads(4)  ## <--- limit to ~ 2 CPUs
# Echo the thread count that is now in effect.
torch.get_num_threads()

insert ./src
insert ../pt-snnl
insert ../..
['../..', '../pt-snnl', './src', '/home/kevin/WSL-shared/cellpainting/cj-datasets', '/home/kevin/miniforge3/envs/cp311/lib/python311.zip', '/home/kevin/miniforge3/envs/cp311/lib/python3.11', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/lib-dynload', '', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages/huggingface_hub-0.20.3-py3.8.egg']


<torch._C.Generator at 0x7fa37f2d7ef0>

4

In [3]:
# from KevinsRoutines.utils.utils_general import list_namespace, save_to_pickle, load_from_pickle, get_device
import KevinsRoutines.utils as myutils
# import snnl.utils as utils
# from utils.utils_ptsnnl import display_cellpainting_batch, get_device
from utils.utils_cellpainting import label_counts, balance_datasets,save_checkpoint, load_checkpoint

from utils.utils_notebooks import plot_cls_metrics, compute_classification_metrics, run_model_on_test_data,\
                                train, validation, accuracy_fn, fit, build_model, define_datasets



In [4]:
# Print the table of visible CUDA devices and echo the currently selected device string.
myutils.get_device(verbose = True)

Dev Id   Device Name                    Total Memory                     InUse                            Free Memory 
   0     Quadro GV100                   34,069,872,640 B/ (31.73 GB)  	 325,058,560 B / (0.30 GB)  	 33,744,814,080 B / (31.43 GB)   *** CURRENT DEVICE *** 
   1     Quadro GV100                   34,069,872,640 B/ (31.73 GB)  	 325,058,560 B / (0.30 GB)  	 33,744,814,080 B / (31.43 GB)  
   2     NVIDIA TITAN Xp                12,774,539,264 B/ (11.90 GB)  	 157,417,472 B / (0.15 GB)  	 12,617,121,792 B / (11.75 GB)  

 Current CUDA Device is:  "cuda:0"  Device Name: Quadro GV100


'cuda:0'

In [5]:
# Configure logging for the notebook and record the start time and library versions of this run.
timestamp = datetime.now().strftime('%Y_%m_%d_%H:%M:%S')
logger = logging.getLogger(__name__)

# Verbosity is taken from the LOG_LEVEL environment variable (default INFO).
# getLevelName() returns an int only for recognised level names, so an
# unrecognised value falls back to INFO instead of making basicConfig raise.
logLevel = os.environ.get('LOG_LEVEL', 'INFO').upper()
if not isinstance(logging.getLevelName(logLevel), int):
    logLevel = 'INFO'

FORMAT = '%(asctime)s - %(name)s - %(levelname)s: - %(message)s'
# Previously the level was hard-coded to "INFO", so LOG_LEVEL was read but ignored.
logging.basicConfig(level=logLevel, format= FORMAT)
logger.info(f" Excution started : {timestamp} ")
logger.info(f" Pytorch version  : {torch.__version__}")
logger.info(f" Scipy version    : {scipy.__version__}  \t\t Numpy version : {np.__version__}")
logger.info(f" Pandas version   : {pd.__version__}  ")

2024-10-03 19:55:20,952 - __main__ - INFO: -  Excution started : 2024_10_03_19:55:20 
2024-10-03 19:55:20,953 - __main__ - INFO: -  Pytorch version  : 2.2.0
2024-10-03 19:55:20,954 - __main__ - INFO: -  Scipy version    : 1.11.4  		 Numpy version : 1.26.2
2024-10-03 19:55:20,955 - __main__ - INFO: -  Pandas version   : 2.2.0  


In [6]:
# Drop a `model` object left over from an earlier run of this notebook, if one exists.
try:
    del model
except NameError:
    # No model is bound yet (e.g. on a fresh kernel); nothing to remove.
    pass

In [7]:
# Switch the active CUDA device to index 1, then re-query it and keep the device string for the cells below.
myutils.set_device(1)
device  = myutils.get_device(verbose = True)
print(device)

 Switched to: "cuda:1"   Device Name: Quadro GV100                  


'cuda:1'

Dev Id   Device Name                    Total Memory                     InUse                            Free Memory 
   0     Quadro GV100                   34,069,872,640 B/ (31.73 GB)  	 325,058,560 B / (0.30 GB)  	 33,744,814,080 B / (31.43 GB)  
   1     Quadro GV100                   34,069,872,640 B/ (31.73 GB)  	 325,058,560 B / (0.30 GB)  	 33,744,814,080 B / (31.43 GB)   *** CURRENT DEVICE *** 
   2     NVIDIA TITAN Xp                12,774,539,264 B/ (11.90 GB)  	 157,417,472 B / (0.15 GB)  	 12,617,121,792 B / (11.75 GB)  

 Current CUDA Device is:  "cuda:1"  Device Name: Quadro GV100
cuda:1


# Args 

In [8]:
# --- Embedding / batching configuration ---
LATENT_DIM = 150
COMPOUNDS_PER_BATCH = 600

# --- Classifier architecture ---
MODEL_TYPE = 'batch_norm'
# MODEL_TYPE = 'single_layer'
# MODEL_TYPE = 'relu'
n_input = LATENT_DIM                            # the embedding dimensionality
n_hidden_1, n_hidden_2, n_hidden_3 = 512, 512, 128   # widths of the three hidden layers of the MLP

# --- Input file layout: metadata columns plus LATENT_DIM feature columns ---
METADATA_COLS = [
    f'Metadata_{suffix}'
    for suffix in ('Source', 'Batch', 'Plate', 'Well', 'JCP2022', 'Hash',
                   'Bin', 'TPSA', 'lnTPSA', 'log10TPSA', 'Permiation')
]
# METADATA_COLS += [f'Feature_{x:03d}' for x in range(LATENT_DIM)]
input_cols = LATENT_DIM + len(METADATA_COLS)
print(len(METADATA_COLS), input_cols, sep='\n')

# --- Locations of the embedding CSVs and of the classifier checkpoints ---
INPUT_PATH = "/home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/"
CKPT_PATH = "./saved_models/embedding_models"

11
161


In [9]:
# Identifier of this classifier training run. The value is pinned by hand;
# the commented lines are identifiers of earlier runs kept for reference.
# RUN_DATETIME = datetime.now().strftime('%Y%m%d_%H%M')
# RUN_DATETIME = '20240909_1801'
# RUN_DATETIME = '20240909_1800'
# RUN_DATETIME = '20240909_2100'
# RUN_DATETIME = '20240916_1830'
# RUN_DATETIME = '20240921_0700'
# RUN_DATETIME = '20240926_1900'
# RUN_DATETIME = '20240927_2345'
RUN_DATETIME = '20240909_2130'
print(RUN_DATETIME)

20240909_2130


In [10]:
# Select which autoencoder run produced the embeddings used as classifier input:
# run mode, autoencoder training timestamp, and checkpoint type.
# SNNL AUTOENCODERS 
AE_RUNMODE = "snnl"
# AE_DATETIME = "20240718_1956"
AE_DATETIME = "20240906_2201"     # Autoencoder training - SNNL, CPB = 600, Latent 150, WD = 0.001, SNN Factor 3
# AE_DATETIME = "20240917_2004"     # Autoencoder training - SNNL, CPB = 600, Latent 250, WD = 0.001, SNN Factor 3

## BASELINE AUTOENCODERS 
# AE_RUNMODE = 'base'
# AE_DATETIME = "20240917_2017"     # Autoencoder training - Baseline, CPB = 600, Latent 250, WD = 0.001 (SNN Factor 0)

# Checkpoint flavour of the autoencoder run: "BEST" or "LAST".
# AE_CKPTTYPE = "BEST"
AE_CKPTTYPE = "LAST"

In [11]:
# Checkpoint file-name template. The doubled braces leave a literal "{ep}" placeholder
# that is filled in later with the epoch number via CKPT_FILE.format(ep=...).
# NOTE(review): "embd600" and "512" are hard-coded; presumably they mirror COMPOUNDS_PER_BATCH
# and the hidden-layer width — confirm, and derive them from the config if so.
CKPT_FILE = f"NN_{AE_RUNMODE.lower()}_embd600_{LATENT_DIM}Ltnt_512_{AE_DATETIME}_{AE_CKPTTYPE}_{RUN_DATETIME}_ep_{{ep}}"
print(CKPT_FILE)

NN_snnl_embd600_150Ltnt_512_20240906_2201_LAST_20240909_2130_ep_{ep}


In [12]:
## Row budget of the embedding files:
##   total rows  = 346,542
##   Trn file sz : 312,000
##   Train       : 277,200    = 312,000 - (21,600 + 12,600 + 600)
##   Validation  :  21,600
##   Test        :  12,600
##   Leftover    :     600
# Row windows for each split; every split starts at row 0.
cellpainting_args = dict(
    compounds_per_batch = COMPOUNDS_PER_BATCH,
    train_start         = 0,
    train_end           = 277_200,
    val_start           = 0,
    val_end             = 21_600,
    test_start          = 0,
    test_end            = 12_600,
)

In [13]:
# Build the data loaders from the embedding CSVs of the selected autoencoder run.
# Presumably returns a mapping keyed by 'train' / 'val' / 'test' — confirm in utils_notebooks.define_datasets.
data_loader = define_datasets(cellpainting_args, AE_RUNMODE, AE_DATETIME, input_cols, AE_CKPTTYPE, INPUT_PATH)

2024-10-03 19:55:35,937 - utils.dataloader - INFO: -  Building CellPantingDataset for train
2024-10-03 19:55:35,938 - utils.dataloader - INFO: -  filename:  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_161_HashOrder_snnl_20240906_2201_LAST_train.csv
2024-10-03 19:55:35,939 - utils.dataloader - INFO: -  type    :  train
2024-10-03 19:55:35,939 - utils.dataloader - INFO: -  start   :  0
2024-10-03 19:55:35,940 - utils.dataloader - INFO: -  end     :  277200
2024-10-03 19:55:35,940 - utils.dataloader - INFO: -  numrows :  277200
2024-10-03 19:55:35,941 - utils.dataloader - INFO: -  names   :  None     usecols :  None
2024-10-03 19:55:35,942 - utils.dataloader - INFO: -  batch_size  :  1
2024-10-03 19:55:35,942 - utils.dataloader - INFO: -  sample_size :  3
2024-10-03 19:55:35,943 - utils.dataloader - INFO: -  compounds_per_batch :  600
2024-10-03 19:55:35,943 - utils.dataloader - INFO: -  rows per batch (chunksize) :  1800
2024-1

 TRAIN_INPUT:  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_161_HashOrder_snnl_20240906_2201_LAST_train.csv
 TEST_INPUT :  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_161_HashOrder_snnl_20240906_2201_LAST_train_sub_test.csv
 ALL_INPUT  :  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_161_HashOrder_snnl_20240906_2201_LAST_train_sub_val.csv
 load {}
 Dataset size: 277200   rows per batch: 1800  tpsa_threshold: 100
 Dataset size: 21600   rows per batch: 1800  tpsa_threshold: 100
 Dataset size: 12600   rows per batch: 1800  tpsa_threshold: 100


### Dataloader


In [11]:
# TRAIN_INPUT_FILE = f"3smpl_prfl_embedding_{input_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train.csv"
# TEST_INPUT_FILE  = f"3smpl_prfl_embedding_{input_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train_sub_test.csv"
# VAL_INPUT_FILE   = f"3smpl_prfl_embedding_{input_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train_sub_val.csv"
# # ALL_INPUT_FILE   = f"3smpl_prfl_embedding_{num_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_sub_val.csv"

# TRAIN_INPUT = os.path.join(INPUT_PATH, TRAIN_INPUT_FILE)
# TEST_INPUT  = os.path.join(INPUT_PATH, TEST_INPUT_FILE)
# VAL_INPUT   = os.path.join(INPUT_PATH, VAL_INPUT_FILE)

# print(f" TRAIN_INPUT:  {TRAIN_INPUT}")
# print(f" TEST_INPUT :  {TEST_INPUT }")
# print(f" ALL_INPUT  :  {VAL_INPUT }")

In [12]:
## total rows = 346,542
## Trn file sz: 312,000 
## Train      : 277,200    (312_000 - (21,600 + 12,600 + 600) = 277,200
## Validation :  21,600
## Test       :  12,600
## Leftover   :     600
# cellpainting_args = {'sample_size': 3,
#                      'batch_size': 1,
#                      'compounds_per_batch': 600,
#                      'training_path'  : TRAIN_INPUT,
#                      'validation_path': TRAIN_INPUT,
#                      'test_path'      : TRAIN_INPUT,
#                      'train_start'    : 0,
#                      'train_end'      : 277_200,  # 277,200 samples
#                      'val_start'      : 277_200,  # 
#                      'val_end'        : 298_800,  # 21_600 samples
#                      'test_start'     : 298_800,  # 
#                      'test_end'       : 311_400,  # 12_600 samples
#                     }

In [13]:
# cellpainting_args

In [14]:
#### Load CellPainting Dataset
# data : keys to the dataset settings (and resulting keys in output dictionary)
# dataset = dict()
# data_loader = dict()

# print(f" load {dataset}")
# for datatype in ['train', 'val', 'test']:
#     dataset[datatype] = CellpaintingDataset(type = datatype, **cellpainting_args)
#     data_loader[datatype] = InfiniteDataLoader(dataset = dataset[datatype], batch_size=1, shuffle = False, num_workers = 0, collate_fn = custom_collate_fn)

In [15]:
# %%timeit
# for dataset in ['train', 'val', 'test']:
#     for idx, batch in enumerate(data_loader[dataset]):
#         print(batch[0].shape[0], batch[1].sum())
#         # display_cellpainting_batch(idx, batch)
#         if idx == 1:
#             break

In [16]:
# # -----------------------------------------
# #  Count pos/neg labels in each dataset
# # -----------------------------------------
# for datatype in ['train', 'val', 'test']:
#     MINIBATCH_SIZE = data_loader[datatype].dataset.sample_size * data_loader[datatype].dataset.compounds_per_batch
#     print(f" {datatype.capitalize()} Minibatch size : {MINIBATCH_SIZE} \n") 
# # for datatype in ['val', 'test']:
#     minibatches = len(data_loader[datatype]) // MINIBATCH_SIZE
#     ttl_rows = 0
#     ttl_pos_labels = 0
#     with tqdm.tqdm(enumerate(data_loader[datatype]), initial=0, total = minibatches, position=0, file=sys.stdout,
#                    leave= False, desc=f" Count labels ") as t_warmup:
#         for batch_count, (batch_features, batch_labels, _, _, _, _) in t_warmup:
#             ttl_rows += batch_labels.shape[0]
#             ttl_pos_labels += batch_labels.sum()
#     ttl_neg_labels = ttl_rows - ttl_pos_labels
#     ttl = f"\n Dataset: {datatype} -  len of {datatype} data loader: {len(data_loader[datatype])}   number of batches: {minibatches}"
#     print(ttl)
#     print('-'*len(ttl))
#     print(f" total rows     : {ttl_rows:7d}")
#     print(f" total pos rows : {ttl_pos_labels:7.0f} - {ttl_pos_labels*100.0/ttl_rows:5.2f}%")
#     print(f" total neg rows : {ttl_neg_labels:7.0f} - {ttl_neg_labels*100.0/ttl_rows:5.2f}%")
#     print()

     Minibatch size : 1800 
                                                                                                 
     Dataset: train - len of train data loader: 277200   number of batches: 154  
    ------------------------------
     total rows     :  277200
     total pos rows :   33129 - 11.95%
     total neg rows :  244071 - 88.05%

     Dataset: val - len of val data loader: 21600   number of batches: 12
    ------------------------------
     total rows     :   21600
     total pos rows :    2532 - 11.72%
     total neg rows :   19068 - 88.28%
    
     Dataset: test - len of test data loader: 12600   number of batches: 7
    ------------------------------
     total rows     :   12600
     total pos rows :    1431 - 11.36%
     total neg rows :   11169 - 88.64%

# Define Neural Net Model 

- **4 layer model :**

    Input --> Hidden1 --> (BN/NL) ---> Hidden2 ---> (BN/NL) ---> Hidden3 --->  (BN/NL) ---> 1
   
    -  **20240909_1800** : Run on 4 FC layers model, model configuration UNKNOWN
    -  **20240909_1801** : Run on 4 FC layers model, Relu non linearities (NO Batch Norm)
    -  **20240909_2100** : Run on 4 FC layers model, with BATCH NORM / Tanh 256,256,128

    -  **20240930_1930** : Run on 4 FC layers model, with BATCH NORM/ Tanh 512,512,128
      
 - **Single Hidden Layer - 256**

   Input --> Hidden1 --> (Tanh) --->  1
    -  **20240916_1830** : Run on 1 FC layers model, Input --> 256 --> Tanh --> 1 ,  Read from 20240906_2201 (SNNL - CPB 600, LAT 150, SNN Factor 3)
    -  **20240926_1900** : Run on 1 FC layers model, Input --> 256 --> Tanh --> 1 ,  Read from 20240917_2017 (BASELINE - CPB 600, LAT 250, SNN Factor 0)
    -  **20240926_1930** : Run on 1 FC layers model, Input --> 256 --> Tanh --> 1 ,  Read from 20240917_2004 (SNNL - CPB 600, LAT 250, SNN Factor 3)
    -  **20240926_2000** : Run on 1 FC layers model, Input --> 256 --> Tanh --> 1 ,  Read from 20240924_0146 (SNNL - CPB 600, LAT 250, SNN Factor 30)
<br>

 - **Single Hidden Layer - 512**

    -  **20240921_0700** : Run on 1 FC layers model (includes final layer), Input --> 512 --> Tanh --> 1 ,  Read from 20240906_2201 (SNNL - CPB 600, LAT 150, SNN Factor 3)    


In [14]:
# Instantiate the classifier selected by MODEL_TYPE with the configured layer widths on the chosen device.
model = build_model(MODEL_TYPE, input = n_input, hidden_1 = n_hidden_1, hidden_2 = n_hidden_2, hidden_3=n_hidden_3, device = device)

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %                   Mult-Adds                 Trainable
Sequential                               [30, 150]                 [30, 1]                   --                             --                   --                        True
├─Linear: 1-1                            [30, 150]                 [30, 512]                 77,312                     18.95%                   2,319,360                 True
│    └─weight                                                                                ├─76,800
│    └─bias                                                                                  └─512
├─BatchNorm1d: 1-2                       [30, 512]                 [30, 512]                 1,024                       0.25%                   30,720                    True
│    └─weight                                                                             

In [15]:
# List every parameter tensor with its shape, element count and gradient flag,
# then report the total number of elements.
parameters = model.parameters()
element_counts = []
for p in parameters:
    n_elem = p.nelement()
    print(f"Parm shape: {str(p.shape):35s}    # elements: {n_elem:8d}    Required gradient calc: {p.requires_grad}")
    element_counts.append(n_elem)
ttl_nelements = sum(element_counts)
print(f"Total num of parameters: {ttl_nelements}")  # number of parameters in total

# torchinfo layer-by-layer summary for a dummy batch of 30 rows.
summary_input_size = (30, n_input)
col_names = [
    "input_size",
    "output_size",
    "num_params",
    "params_percent",
    "mult_adds",
    "trainable",
]
_ = summary(model, verbose = 2, input_size=summary_input_size, col_names = col_names)

Parm shape: torch.Size([512, 150])                 # elements:    76800    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512, 512])                 # elements:   262144    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([128, 512])                 # elements:    65536    Required gradient calc: True
Parm shape: torch.Size([128])                      # elements:  

In [16]:
# Containers for the training/validation loss and accuracy history; passed to fit() in the training cell.
metrics = { 'loss_trn' : [], 'acc_trn' : [], 'loss_val' : [], 'acc_val' : []}

# Epoch counters start at zero; the training cell advances them.
start_epoch, end_epoch = 0,0
init_LR = 1.0e-3
# curr_LR = init_LR

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=init_LR)

# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.3 , patience=20, cooldown=10,)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = step_size, gamma=0.1, last_epoch =-1)
# Halve the learning rate when the monitored value has not improved for 50 epochs,
# then wait 10 cooldown epochs before counting bad epochs again.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.5, threshold=1.0e-06, patience=50, cooldown=10,)

### Read checkpoint

In [17]:
# loaded_epoch
# optimizer.state_dict()
# scheduler.state_dict()

In [18]:
# model, optimizer, scheudler, end_epoch = load_checkpoint(model, optimizer, scheduler, checkpoint_file.format(ep=100), ckpt_path = CKPT_PATH)
# model = model.to(device)

In [19]:
# end_epoch
# optimizer.state_dict()
# scheduler.state_dict()

# Run Training

In [20]:
# start_epoch = 0
# start_epoch = loaded_epoch
# Continue from where the previous training window ended and train 600 more epochs.
# NOTE: this cell is not idempotent — each re-run shifts the window (0-600, 600-1200, ...).
start_epoch = end_epoch
end_epoch += 600
# start_epoch, end_epoch = 0,100
print(start_epoch, end_epoch)
# Put the model in training mode; assigning to _ suppresses the echoed module repr.
_ = model.train()

0 600


In [21]:

# Train from start_epoch to end_epoch; the log below shows a checkpoint exported every 100 epochs.
# NOTE(review): in the recorded run the validation loss is lowest within the first few epochs (~0.351)
# and climbs to ~0.487 by epoch 600 while the training loss keeps falling — the model overfits.
metrics = fit(model, optimizer, scheduler, data_loader, metrics, start_epoch, end_epoch, device, CKPT_FILE, CKPT_PATH )


 19:56:03 | Ep:   1/ 600 | Trn loss:  0.405897 - Acc: 86.2926 | Val loss:  0.354494 - Acc: 88.2593 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 19:56:16 | Ep:   2/ 600 | Trn loss:  0.358459 - Acc: 88.0364 | Val loss:  0.351901 - Acc: 88.2778 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 19:56:29 | Ep:   3/ 600 | Trn loss:  0.356163 - Acc: 88.0437 | Val loss:  0.351947 - Acc: 88.2778 | last_lr: 1.00000e-03  bad_ep: 1  cdwn: 0                              
 19:56:42 | Ep:   4/ 600 | Trn loss:  0.355132 - Acc: 88.0501 | Val loss:  0.352421 - Acc: 88.2870 | last_lr: 1.00000e-03  bad_ep: 2  cdwn: 0                              
 19:56:55 | Ep:   5/ 600 | Trn loss:  0.354414 - Acc: 88.0534 | Val loss:  0.352092 - Acc: 88.2639 | last_lr: 1.00000e-03  bad_ep: 3  cdwn: 0                              
 19:57:08 | Ep:   6/ 600 | Trn loss:  0.353851 - Acc: 88.0548 | Val loss:  0.351427 - Acc: 88.2639 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 

2024-10-03 20:17:14,650 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_150Ltnt_512_20240906_2201_LAST_20240909_2130_ep_100.pt - epoch: 100


 20:17:14 | Ep: 100/ 600 | Trn loss:  0.331571 - Acc: 88.4729 | Val loss:  0.360854 - Acc: 87.9583 | last_lr: 5.00000e-04  bad_ep: 0  cdwn: 1 
 20:17:27 | Ep: 101/ 600 | Trn loss:  0.330945 - Acc: 88.4812 | Val loss:  0.361946 - Acc: 87.9491 | last_lr: 5.00000e-04  bad_ep: 0  cdwn: 0                              
 20:17:40 | Ep: 102/ 600 | Trn loss:  0.330297 - Acc: 88.5036 | Val loss:  0.362928 - Acc: 87.9491 | last_lr: 5.00000e-04  bad_ep: 1  cdwn: 0                              
 20:17:53 | Ep: 103/ 600 | Trn loss:  0.329652 - Acc: 88.5206 | Val loss:  0.364184 - Acc: 87.9259 | last_lr: 5.00000e-04  bad_ep: 2  cdwn: 0                              
 20:18:06 | Ep: 104/ 600 | Trn loss:  0.329007 - Acc: 88.5364 | Val loss:  0.365260 - Acc: 87.9120 | last_lr: 5.00000e-04  bad_ep: 3  cdwn: 0                              
 20:18:19 | Ep: 105/ 600 | Trn loss:  0.328351 - Acc: 88.5595 | Val loss:  0.366208 - Acc: 87.9120 | last_lr: 5.00000e-04  bad_ep: 4  cdwn: 0                            

2024-10-03 20:38:46,512 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_150Ltnt_512_20240906_2201_LAST_20240909_2130_ep_200.pt - epoch: 200


 20:38:46 | Ep: 200/ 600 | Trn loss:  0.265579 - Acc: 90.2612 | Val loss:  0.424539 - Acc: 86.6019 | last_lr: 2.50000e-04  bad_ep: 38  cdwn: 0 
 20:38:59 | Ep: 201/ 600 | Trn loss:  0.264910 - Acc: 90.2810 | Val loss:  0.425535 - Acc: 86.6204 | last_lr: 2.50000e-04  bad_ep: 39  cdwn: 0                             
 20:39:12 | Ep: 202/ 600 | Trn loss:  0.264240 - Acc: 90.3070 | Val loss:  0.426537 - Acc: 86.6019 | last_lr: 2.50000e-04  bad_ep: 40  cdwn: 0                             
 20:39:25 | Ep: 203/ 600 | Trn loss:  0.263569 - Acc: 90.3294 | Val loss:  0.427530 - Acc: 86.5833 | last_lr: 2.50000e-04  bad_ep: 41  cdwn: 0                             
 20:39:38 | Ep: 204/ 600 | Trn loss:  0.262897 - Acc: 90.3514 | Val loss:  0.428532 - Acc: 86.5694 | last_lr: 2.50000e-04  bad_ep: 42  cdwn: 0                             
 20:39:51 | Ep: 205/ 600 | Trn loss:  0.262224 - Acc: 90.3672 | Val loss:  0.429513 - Acc: 86.5509 | last_lr: 2.50000e-04  bad_ep: 43  cdwn: 0                          

2024-10-03 21:00:12,710 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_150Ltnt_512_20240906_2201_LAST_20240909_2130_ep_300.pt - epoch: 300


 21:00:12 | Ep: 300/ 600 | Trn loss:  0.236462 - Acc: 91.1216 | Val loss:  0.457684 - Acc: 86.6852 | last_lr: 6.25000e-05  bad_ep: 16  cdwn: 0 
 21:00:25 | Ep: 301/ 600 | Trn loss:  0.236146 - Acc: 91.1342 | Val loss:  0.457961 - Acc: 86.6759 | last_lr: 6.25000e-05  bad_ep: 17  cdwn: 0                             
 21:00:38 | Ep: 302/ 600 | Trn loss:  0.235833 - Acc: 91.1421 | Val loss:  0.458240 - Acc: 86.6620 | last_lr: 6.25000e-05  bad_ep: 18  cdwn: 0                             
 21:00:52 | Ep: 303/ 600 | Trn loss:  0.235523 - Acc: 91.1512 | Val loss:  0.458520 - Acc: 86.6528 | last_lr: 6.25000e-05  bad_ep: 19  cdwn: 0                             
 21:01:05 | Ep: 304/ 600 | Trn loss:  0.235215 - Acc: 91.1616 | Val loss:  0.458802 - Acc: 86.6389 | last_lr: 6.25000e-05  bad_ep: 20  cdwn: 0                             
 21:01:18 | Ep: 305/ 600 | Trn loss:  0.234908 - Acc: 91.1717 | Val loss:  0.459085 - Acc: 86.6343 | last_lr: 6.25000e-05  bad_ep: 21  cdwn: 0                          

2024-10-03 21:21:59,568 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_150Ltnt_512_20240906_2201_LAST_20240909_2130_ep_400.pt - epoch: 400


 21:21:59 | Ep: 400/ 600 | Trn loss:  0.220179 - Acc: 91.7049 | Val loss:  0.476747 - Acc: 86.7315 | last_lr: 1.56250e-05  bad_ep: 0  cdwn: 6 
 21:22:12 | Ep: 401/ 600 | Trn loss:  0.220003 - Acc: 91.7157 | Val loss:  0.476843 - Acc: 86.7407 | last_lr: 1.56250e-05  bad_ep: 0  cdwn: 5                              
 21:22:25 | Ep: 402/ 600 | Trn loss:  0.219841 - Acc: 91.7255 | Val loss:  0.476943 - Acc: 86.7454 | last_lr: 1.56250e-05  bad_ep: 0  cdwn: 4                              
 21:22:38 | Ep: 403/ 600 | Trn loss:  0.219688 - Acc: 91.7334 | Val loss:  0.477049 - Acc: 86.7315 | last_lr: 1.56250e-05  bad_ep: 0  cdwn: 3                              
 21:22:51 | Ep: 404/ 600 | Trn loss:  0.219543 - Acc: 91.7403 | Val loss:  0.477155 - Acc: 86.7269 | last_lr: 1.56250e-05  bad_ep: 0  cdwn: 2                              
 21:23:04 | Ep: 405/ 600 | Trn loss:  0.219403 - Acc: 91.7435 | Val loss:  0.477262 - Acc: 86.7222 | last_lr: 1.56250e-05  bad_ep: 0  cdwn: 1                            

2024-10-03 21:43:49,429 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_150Ltnt_512_20240906_2201_LAST_20240909_2130_ep_500.pt - epoch: 500


 21:43:49 | Ep: 500/ 600 | Trn loss:  0.211755 - Acc: 92.0501 | Val loss:  0.484878 - Acc: 86.4861 | last_lr: 7.81250e-06  bad_ep: 33  cdwn: 0 
 21:44:02 | Ep: 501/ 600 | Trn loss:  0.211704 - Acc: 92.0512 | Val loss:  0.484939 - Acc: 86.4815 | last_lr: 7.81250e-06  bad_ep: 34  cdwn: 0                             
 21:44:15 | Ep: 502/ 600 | Trn loss:  0.211654 - Acc: 92.0519 | Val loss:  0.484999 - Acc: 86.4815 | last_lr: 7.81250e-06  bad_ep: 35  cdwn: 0                             
 21:44:29 | Ep: 503/ 600 | Trn loss:  0.211605 - Acc: 92.0548 | Val loss:  0.485059 - Acc: 86.4815 | last_lr: 7.81250e-06  bad_ep: 36  cdwn: 0                             
 21:44:42 | Ep: 504/ 600 | Trn loss:  0.211555 - Acc: 92.0566 | Val loss:  0.485119 - Acc: 86.4815 | last_lr: 7.81250e-06  bad_ep: 37  cdwn: 0                             
 21:44:55 | Ep: 505/ 600 | Trn loss:  0.211505 - Acc: 92.0581 | Val loss:  0.485179 - Acc: 86.4815 | last_lr: 7.81250e-06  bad_ep: 38  cdwn: 0                          

2024-10-03 22:05:49,379 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_150Ltnt_512_20240906_2201_LAST_20240909_2130_ep_600.pt - epoch: 600


 22:05:49 | Ep: 600/ 600 | Trn loss:  0.208090 - Acc: 92.1649 | Val loss:  0.487311 - Acc: 86.3704 | last_lr: 1.95313e-06  bad_ep: 11  cdwn: 0 


In [None]:
# print(CKPT_FILE)
# save_checkpoint(end_epoch, model, optimizer, scheduler, metrics = metrics,
#                 filename = CKPT_FILE.format(ep=end_epoch),
#                 ckpt_path = CKPT_PATH, verbose = True)

In [None]:
# start_epoch, end_epoch

# for mtrc in ['loss_trn', 'loss_val']:
#     for i in range(len(metrics[mtrc])):
#         # print(i)
#         metrics[mtrc][i] = metrics[mtrc][i].item()
 

# Modify TPSA Threshold

In [None]:

# Allocate an all-zero array with the shape and dtype of train_y
# (the name suggests labels for a TPSA threshold of 72 — TODO confirm).
# NOTE(review): train_y is not defined in any cell shown above; this cell depends on state created elsewhere.
train_y_72 = np.zeros_like(train_y)
# Row count divided by 3 (the dataloader log reports sample_size 3, so presumably the number of compounds — confirm).
train_y_72.shape[0]/3

In [None]:
# Summary statistics of the three TPSA metadata columns of the training frame.
# The columns carry the "Metadata_" prefix (see METADATA_COLS and the other cells that
# read df_train), so the prefix is added here; the row label stays the short name.
print(f"                  min           max           std          mean           median")
for x in ['TPSA', 'lnTPSA', 'log10TPSA']:
    col = df_train[f"Metadata_{x}"]
    print(f"{x:12s} {col.min():13.7f} {col.max():13.7f} {col.std():13.7f} {col.mean():13.7f} {col.median():13.7f}") 

# Number of non-null TPSA values, then the fraction at/above and below THRESHOLD.
# NOTE(review): THRESHOLD is not defined in the cells shown; it must be set before running this cell.
df_train.Metadata_TPSA.count()
df_train[df_train.Metadata_TPSA >= THRESHOLD].Metadata_TPSA.count()/df_train.Metadata_TPSA.count()
df_train[df_train.Metadata_TPSA < THRESHOLD].Metadata_TPSA.count()/df_train.Metadata_TPSA.count()


In [None]:
# Count how often each value of the Metadata_Permiation column occurs in the training frame,
# then echo the counts stored under index labels 0 and 1.
_tmp = df_train.Metadata_Permiation.value_counts()
_tmp[0], _tmp[1]

In [None]:
# For several candidate TPSA cut-offs, report how the training rows split into
# label 0 (below the cut-off) and label 1 (at or above it).
for threshold in (68, 69, 70, 71, 72, 100):
    is_positive = df_train['Metadata_TPSA'] >= threshold
    _tmp = is_positive.value_counts()
    n_total = _tmp.sum()
    print(f"\n TPSA threshold {threshold} \n Total samples: {n_total}")
    print(f" Label 0: {_tmp[False]:>7d}      % {_tmp[False]*100/n_total:2.2f} ")
    print(f" Label 1: {_tmp[True]:>7d}      % {_tmp[True]*100/n_total:2.2f} ")

In [None]:
# fig, ax = plt.subplots(figsize=(4,4))
# fig.canvas.draw()  # Need to draw the figure to define renderer
# ax.set_title("AngleLabel example")
# # Plot two crossing lines and label each angle between them with the above
# center = (4.5, 650)
# p1 = [(2.5, 710), (6.0, 605)]
# p2 = [(3.0, 275), (5.5, 900)]
# line1, = ax.plot(*zip(*p1))
# line2, = ax.plot(*zip(*p2))
# point, = ax.plot(*center, marker="o")


In [None]:
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

# Histogram of the TPSA values in the training frame with the median marked in red.
num_bins = 200
# fig, ax = plt.subplots()
fig = plt.figure(figsize=(10,5))
sigma = df_train.Metadata_TPSA.std()
mu = df_train.Metadata_TPSA.mean()
med = df_train.Metadata_TPSA.median()
# the histogram of the data
# We can set the number of bins with the *bins* keyword argument.
n, bins, patches = plt.hist(df_train.Metadata_TPSA, num_bins, density=False, range=[0, 500],)
# p1 = [(med, 710), (6.0, 605)]
# _ = plt.vlines(x=med, ymin=10, ymax=17000, colors='red', linestyles='-', lw=1.75, label='Single Short Line')
# The legend entry now names what the line marks (it was the placeholder 'Single Short Line').
_ = plt.axvline(x=med, ymin=0, ymax=.97, color='red', linestyle='-', lw=1.75, label=f'Median = {med:.3f}')
_ = plt.xlabel('TPSA Value');
_ = plt.ylabel('# Compounds');
_ = plt.title(fr'TPSA distribution -  $\mu={mu:.3f}$    $\sigma={sigma:.3f}$')
# Place the legend outside the axes on the right; assign to _ so the Legend repr is not echoed.
_ = plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
# axs[1].hist(dist2, bins=n_bins)
plt.show()

# Stratified CV data splits

In [None]:
def visualize_groups(classes, groups, name):
    """Draw the group and class membership of every sample as two colour bars.

    Parameters
    ----------
    classes : sequence
        Per-sample class values; drawn as the upper bar (y = 3.5).
    groups : sequence
        Per-sample group values; drawn as the lower bar (y = 0.5). Its length
        sets the number of sample positions on the x axis.
    name : str
        Used as the plot title (the argument was previously accepted but ignored).

    Notes
    -----
    Relies on the module-level colormap ``cmap_data``. Creates a new figure and
    returns nothing.
    """
    fig, ax = plt.subplots(figsize=(10,5))
    n_samples = len(groups)
    sample_idx = range(n_samples)

    # One horizontal band of thick "_" markers per variable.
    for y_pos, values in ((0.5, groups), (3.5, classes)):
        ax.scatter(
            sample_idx,
            [y_pos] * n_samples,
            c=values,
            marker="_",
            lw=50,
            cmap=cmap_data,
        )

    ax.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
        title=name,
    )

In [None]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object.

    Each split produced by ``cv`` is drawn as one row in which test samples and
    training samples get different colours; rows for the class labels and, when
    given, the group labels are appended underneath.

    Parameters
    ----------
    cv : cross-validation splitter
        Object exposing ``split(X=, y=, groups=)``. ``group`` is forwarded to it
        only when the splitter's class name contains "Group".
    X, y : array-like
        Samples and their class labels; ``len(X)`` sets the x extent.
    group : array-like or None
        Per-sample group labels. When None, the "group" row is omitted instead
        of being passed to ``scatter`` as an empty colour array.
    ax : matplotlib Axes
        Axes to draw on.
    n_splits : int
        Number of CV rows used for the tick labels and the y limits.
    lw : int, default 10
        Marker line width.

    Returns
    -------
    The ``ax`` that was passed in.

    Notes
    -----
    Relies on the module-level colormaps ``cmap_cv`` and ``cmap_data``.
    """
    use_groups = "Group" in type(cv).__name__
    groups = group if use_groups else None

    n_samples = len(X)
    sample_idx = range(n_samples)

    # Generate the training/testing visualizations for each CV split
    n_rows = 0   # stays 0 if the splitter yields nothing, so the rows below are still well defined
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=groups)):
        # Fill in indices with the training/test groups
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(
            sample_idx,
            [ii + 0.5] * n_samples,
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )
        n_rows = ii + 1

    # Plot the data classes and (if available) groups at the end
    extra_rows = [("class", y)]
    if group is not None:
        extra_rows.append(("group", group))
    for offset, (_, values) in enumerate(extra_rows):
        ax.scatter(
            sample_idx,
            [n_rows + offset + 0.5] * n_samples,
            c=values,
            marker="_",
            lw=lw,
            cmap=cmap_data,
        )

    # Formatting
    n_extra = len(extra_rows)
    yticklabels = list(range(n_splits)) + [label for label, _ in extra_rows]
    ax.set(
        yticks=np.arange(n_splits + n_extra) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + n_extra + 0.2, -0.2],
        # Span all samples; the limit was previously fixed at 100 regardless of len(X).
        xlim=[0, n_samples],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch


In [None]:
# Colour maps read as globals by the plotting helpers in this notebook:
# `cmap_cv` colours the train/test bands, `cmap_data` the class/group bands.
cmap_cv = plt.cm.coolwarm
cmap_data = plt.cm.Paired

# Number of folds, and a seeded generator for the synthetic example below.
n_splits = 4
rng = np.random.RandomState(1338)

# Synthetic class/group data (disabled):
# n_points = 100
# X = rng.randn(100, 10)

# percentiles_classes = [0.1, 0.3, 0.6]
# y = np.hstack([[ii] * int(100 * perc) for ii, perc in enumerate(percentiles_classes)])

In [None]:
# Generate uneven groups

# group_prior = rng.dirichlet([2] * 10)
# group_prior.sum()
# group_prior

# groups = np.repeat(np.arange(10), rng.multinomial(100, group_prior))
# groups.shape
# groups

In [None]:
# Put every training sample into the same group (label 0).
# NOTE(review): `train_X` is assigned further down in this section; confirm it
# is also defined before this cell on a fresh kernel.
groups = np.full(train_X.shape[0], 0)
groups.shape

In [None]:
# Draw the class/group overview for the training labels with the single
# all-zero group array built above.
visualize_groups(train_y, groups, "no groups")

In [None]:
# Switch to 5 folds and drop the group labels for the plain K-fold plot.
n_splits, groups = 5, None

In [None]:
# Build the K-fold splitter first, then draw its splits on a fresh figure.
cv = KFold(n_splits=n_splits)
fig, ax = plt.subplots()
plot_cv_indices(cv=cv, X=train_X, y=train_y, group=groups, ax=ax, n_splits=n_splits)

# Input 

## Read Embedded Features CSV file

In [None]:
# Build the four embedded-feature CSV paths (run mode x data split) in one
# pass; the order is BASE train, BASE test, SNNL train, SNNL test.
BASE_TRAIN_INPUT, BASE_TEST_INPUT, SNNL_TRAIN_INPUT, SNNL_TEST_INPUT = (
    os.path.join(OUTPUT_PATH, INPUT_FILE.format(runmode=mode, datatype=split))
    for mode in (BASE_runmode, SNNL_runmode)
    for split in ("train", "test")
)

# Echo each resolved path.
BASE_TRAIN_INPUT
BASE_TEST_INPUT
SNNL_TRAIN_INPUT
SNNL_TEST_INPUT

In [None]:
# Load the SNNL-encoded features: test split first, then train split.
df_test, df_train = map(pd.read_csv, (SNNL_TEST_INPUT, SNNL_TRAIN_INPUT))

# Alternative inputs kept for reference:
# df_train = pd.read_csv(BASE_TRAIN_INPUT)
# df_test = pd.read_csv(BASE_TEST_INPUT )
# df_train = pd.read_csv(TRAIN_INPUT, nrows = 100 )
# df_train = pd.read_csv(TRAIN_INPUT, usecols = ['Metadata_Batch'])

In [None]:
# Inspect the test frame: summary, dimensions, column labels, and the first
# 5 rows of the first 13 columns. Each bare expression is displayed because
# the setup cell sets ast_node_interactivity = "all".
df_test.info()
df_test.shape
df_test.columns
df_test.iloc[:5,:13]

In [None]:
# df_train = pd.read_csv(TRAIN_INPUT, nrows = 100 )
# df_train = pd.read_csv(TRAIN_INPUT, usecols = ['Metadata_Batch'])

In [None]:
# Labels: the column at position 10, flattened to 1-D and cast to uint8.
# NOTE(review): the positional indices 10 / 11 assume a fixed CSV column
# layout — TODO confirm against the file schema.
test_y = df_test.iloc[:,10:11].to_numpy().ravel().astype(np.uint8)
test_y.sum()
test_y.shape, type(test_y), test_y.dtype
# Features: every column from position 11 onward.
test_X = df_test.iloc[:,11:].to_numpy()
test_X.shape,type(test_X), test_X.dtype

In [None]:
# Inspect the training frame: dimensions, summary, and the first 5 rows of
# the first 16 columns.
df_train.shape
df_train.info()
df_train.iloc[:5,:16]

In [None]:
# NOTE(review): scratch arithmetic with hard-coded counts; presumably the
# combined train + test row total — TODO confirm, or compute it from the frames.
312000+34542

In [None]:
# Labels: the column at position 10, flattened to 1-D and cast to uint8.
# NOTE(review): the positional indices 10 / 11 assume a fixed CSV column
# layout — TODO confirm against the file schema.
train_y = df_train.iloc[:,10:11].to_numpy().ravel().astype(np.uint8)
train_y.sum()
train_y.shape, type(train_y), train_y.dtype

# Features: every column from position 11 onward.
train_X = df_train.iloc[:,11:].to_numpy()
train_X.shape,type(train_X) ,train_X.dtype

## Standardize inputs

In [None]:
# Global min / max / mean / std over all feature values, before scaling.
print(f"Train_X :  Min: {train_X.min():.4f}    Max: {train_X.max():.4f}   Mean: {train_X.mean():.4f}  Std: {train_X.std():.4f}")
print(f"Test_X  :  Min: {test_X.min():.4f}    Max: {test_X.max():.4f}    Mean: {test_X.mean():.4f}  Std: {test_X.std():.4f}")

In [None]:
# print(f"Train_X :  Min: {train_X.min():.4f}    Max: {train_X.max():.4f}   Mean: {train_X.mean():.4f}  Std: {train_X.std():.4f}")
# print(f"Test_X  :  Min: {test_X.min():.4f}    Max: {test_X.max():.4f}    Mean: {test_X.mean():.4f}  Std: {test_X.std():.4f}")

In [None]:
# Fit the scaler on the training features only, then apply that fitted
# scaler to both splits.
scaler = StandardScaler(copy=True).fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)

In [None]:
# Same global statistics as before, now on the scaled arrays.
print("After Standard Scaler Transformation")
print(f"Train_X :  Min: {train_X.min():.4f}    Max: {train_X.max():.4f}   Mean: {train_X.mean():.4f}  Std: {train_X.std():.4f}")
print(f"Test_X  :  Min: {test_X.min():.4f}    Max: {test_X.max():.4f}    Mean: {test_X.mean():.4f}  Std: {test_X.std():.4f}")


In [None]:
# Pass the (name, labels) pairs for both splits to the label_counts helper,
# which is defined elsewhere in this notebook.
label_counts([("Training", train_y), ("Test", test_y)])

# TQDM Examples

In [None]:
#-----------------------------------------
# TRANGE example
#-----------------------------------------   
# with trange(+1, ns.trn_iters_warmup+1 , initial = 0 , total = ns.trn_iters_warmup, position=0, file=sys.stdout,
#             leave= False, disable = disable_tqdm, desc=f" Warmup Epoch {ns.current_epoch}/{ns.stop_epoch_warmup}") as t_warmup :
#     for _ in t_warmup:
#         ns.current_iter += 1            

#         batch = next(dldrs.warmup_trn_loader)            
#         environ.set_inputs(batch, input_size)

#         environ.optimize(is_policy=False, 
#                          num_train_layers=ns.num_train_layers,
#                          flag='update_weights', 
#                          verbose = verbose)
    
#         t_warmup.set_postfix({'curr_iter':ns.current_iter, 
#                             'Loss': f"{environ.losses['total']['total'].item():.4f}"})

#-----------------------------------------
# TQDM example
#-----------------------------------------   
# current_epoch = 1
# total_epochs = 20
# current_iter = 0 
# train_minibatches = len(data_loader['train']) // minibatch_size
# val_minibatches = len(data_loader['val']) // minibatch_size

# # with batch_count, (batch_features, batch_labels, _, _, _, _) in tqdm(enumerate(data_loader['train']), initial=0, total = 400, position=0, file=sys.stdout,
#             # leave= False, desc=f" Epoch {current_epoch}/{total_epochs}") as t_warmup :
# t_warmup =  tqdm(enumerate(data_loader['train']), initial=0, total = train_minibatches, position=0, file=sys.stdout,
#             leave= False, desc=f" Epoch {current_epoch}/{total_epochs}") 
# for batch_count, (batch_features, batch_labels, _, _, _, _) in t_warmup:
#     # batch_count, (batch_features, batch_labels, _, _, _, _) = pp
#     loss = random.random()
#     t_warmup.set_postfix({'curr_iter':batch_count, 'Loss': f"{loss:.4f}"})