  # Apply encoder to morphological profiles to get latent space representations :

# Setup

In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload  
%autoreload 2
from IPython.display import display, HTML, Image
from IPython.core.interactiveshell import InteractiveShell
# Widen the notebook container to 98% of the browser window.
display(HTML("<style>.container { width:98% !important; }</style>"))
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import sys
import random
from typing import List, Tuple
from types import SimpleNamespace
import pprint
import logging
from datetime import datetime
# Put the project-local source directories on the import path.
# Each new entry is inserted at index 0, so the last directory listed ends up first.
for p in ('./src', '../pt-snnl', '../..'):
    if p in sys.path:
        continue   # already registered, e.g. when the cell is re-run
    print(f"insert {p}")
    sys.path.insert(0, p)
print(sys.path)

import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

import scipy
import scipy.stats as sps
import sklearn.metrics as skm
from sklearn.model_selection import KFold
from scipy.spatial.distance import pdist, squareform, euclidean

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt  # for making figures
from torchinfo import summary

# Display options: wide tensor print-outs (every other torch print option is passed as None).
torch.set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=180, profile=None, sci_mode=None)
# NOTE(review): only torch's RNG is seeded in this cell; `random` and numpy are imported but not seeded here.
torch.manual_seed(42);   # seed rng for reproducibility
pp = pprint.PrettyPrinter(indent=4)
pd.options.display.width = 132
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')

# Environment settings read by wandb / CUDA.
os.environ["WANDB_NOTEBOOK_NAME"] = "AE-MAIN-SNNL.ipynb"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE(review): this is assigned after `import torch`; presumably it still takes effect because
# CUDA has not been initialised at this point -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

torch.set_num_threads(4)  ## <--- limit to ~ 2 CPUs
# Bare expression: echoes the thread count actually in effect.
torch.get_num_threads()

insert ./src
insert ../pt-snnl
insert ../..
['../..', '../pt-snnl', './src', '/home/kevin/WSL-shared/cellpainting/cj-datasets', '/home/kevin/miniforge3/envs/cp311/lib/python311.zip', '/home/kevin/miniforge3/envs/cp311/lib/python3.11', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/lib-dynload', '', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages/huggingface_hub-0.20.3-py3.8.egg']


<torch._C.Generator at 0x7f67eb8aff50>

4

In [3]:
# from KevinsRoutines.utils.utils_general import list_namespace, save_to_pickle, load_from_pickle, get_device
import KevinsRoutines.utils as myutils
# import snnl.utils as utils
# from utils.utils_ptsnnl import display_cellpainting_batch, get_device
from utils.utils_cellpainting import label_counts, balance_datasets,save_checkpoint, load_checkpoint

from utils.utils_notebooks import plot_cls_metrics, compute_classification_metrics, run_model_on_test_data,\
                                train, validation, accuracy_fn, fit, build_model, define_datasets



In [4]:
# Project helper (KevinsRoutines.utils): with verbose=True it prints the device table shown below
# and returns the current device string.
myutils.get_device(verbose = True)

Dev Id   Device Name                    Total Memory                     InUse                            Free Memory 
   0     NVIDIA TITAN Xp                12,774,539,264 B/ (11.90 GB)  	 918,683,648 B / (0.86 GB)  	 11,855,855,616 B / (11.04 GB)   *** CURRENT DEVICE *** 

 Current CUDA Device is:  "cuda:0"  Device Name: NVIDIA TITAN Xp


'cuda:0'

In [5]:
# Configure notebook logging and record the run start time plus the library versions in use.
timestamp = datetime.now().strftime('%Y_%m_%d_%H:%M:%S')
logger = logging.getLogger(__name__)
# Verbosity comes from the LOG_LEVEL environment variable (default INFO).
# logging.getLevelName() returns an int for a registered level name and a "Level ..." string
# for anything else, so an unrecognised value falls back to INFO instead of raising in basicConfig.
logLevel = os.environ.get('LOG_LEVEL', 'INFO').upper()
if not isinstance(logging.getLevelName(logLevel), int):
    logLevel = 'INFO'
FORMAT = '%(asctime)s - %(name)s - %(levelname)s: - %(message)s'
# Previously the level was hard-coded to "INFO", which left logLevel unused.
logging.basicConfig(level=logLevel, format= FORMAT)
logger.info(f" Excution started : {timestamp} ")
logger.info(f" Pytorch version  : {torch.__version__}")
logger.info(f" Scipy version    : {scipy.__version__}  \t\t Numpy version : {np.__version__}")
logger.info(f" Pandas version   : {pd.__version__}  ")

2024-09-30 19:53:03,200 - __main__ - INFO: -  Excution started : 2024_09_30_19:53:03 
2024-09-30 19:53:03,201 - __main__ - INFO: -  Pytorch version  : 2.2.0
2024-09-30 19:53:03,202 - __main__ - INFO: -  Scipy version    : 1.11.4  		 Numpy version : 1.26.2
2024-09-30 19:53:03,203 - __main__ - INFO: -  Pandas version   : 2.2.0  


## Helper routines

# Args 

In [6]:
# --- Embedding / batching settings ---
LATENT_DIM, COMPOUNDS_PER_BATCH = 250, 600

# --- MLP layer widths ---
n_input = LATENT_DIM            # the embedding dimensionality
n_hidden_1 = n_hidden_2 = 512   # neurons in the first and second hidden layers
n_hidden_3 = 128                # neurons in the third hidden layer

# Non-feature columns expected alongside the LATENT_DIM embedding columns.
METADATA_COLS = [
    'Metadata_Source',
    'Metadata_Batch',
    'Metadata_Plate',
    'Metadata_Well',
    'Metadata_JCP2022',
    'Metadata_Hash',
    'Metadata_Bin',
    'Metadata_TPSA',
    'Metadata_lnTPSA',
    'Metadata_log10TPSA',
    'Metadata_Permiation',
]
# METADATA_COLS += [f'Feature_{x:03d}' for x in range(LATENT_DIM)]
input_cols = len(METADATA_COLS) + LATENT_DIM
print(len(METADATA_COLS), input_cols, sep='\n')

# OUTPUT_PATH = "/home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/"
INPUT_PATH = "/home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/"
CKPT_PATH = "./saved_models/embedding_models"

11
261


In [9]:
# Identifier of this classifier run; it is embedded in the checkpoint file-name template (CKPT_FILE).
#
# Previously the timestamp was computed and then unconditionally overwritten by a literal, which
# made the datetime.now() line a dead store. The override is now explicit:
#   - set RUN_DATETIME_OVERRIDE to None to start a new run stamped with the current time;
#   - set it to an existing identifier to keep working with that run's checkpoints.
# Earlier run identifiers:
#   '20240909_1801'  '20240909_1800'  '20240909_2100'  '20240916_1830'  '20240921_0700'
#   '20240926_1900'  '20240926_1930'  '20240927_2355'  '20240929_2030'
RUN_DATETIME_OVERRIDE = '20240930_1945'
RUN_DATETIME = RUN_DATETIME_OVERRIDE or datetime.now().strftime('%Y%m%d_%H%M')
print(RUN_DATETIME)

20240930_1945


In [10]:
# SNNL AUTOENCODERS 
AE_RUNMODE = "snnl"
# AE_DATETIME = "20240718_1956"
# AE_DATETIME = "20240906_2201"     # Autoencoder training - SNNL, CPB = 600, Latent 150, WD = 0.001, SNN Factor 3
# AE_DATETIME = "20240917_2004"     # Autoencoder training - SNNL, CPB = 600, Latent 250, WD = 0.001, SNN Factor 3
AE_DATETIME = "20240924_0146"     # Autoencoder training - SNNL, CPB = 600, Latent 250, WD = 0.001, SNN Factor 30

## BASELINE AUTOENCODERS 
# AE_RUNMODE = 'base'
# AE_DATETIME = "20240917_2017"     # Autoencoder training - Baseline, CPB = 600, Latent 250, WD = 0.001 (SNN Factor 0)

# AE_CKPTTYPE = "BEST"
AE_CKPTTYPE = "LAST"

In [11]:
# Checkpoint file-name template. The doubled braces leave a literal "{ep}" placeholder in the
# string (see the printed value), to be filled later with CKPT_FILE.format(ep=<epoch>).
# NOTE(review): "embd600" and "512" are literals rather than derived from COMPOUNDS_PER_BATCH /
# n_hidden_1 -- confirm whether they are meant to track those settings.
CKPT_FILE = f"NN_{AE_RUNMODE.lower()}_embd600_{LATENT_DIM}Ltnt_512_{AE_DATETIME}_{AE_CKPTTYPE}_{RUN_DATETIME}_ep_{{ep}}"
print(CKPT_FILE)

NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_{ep}


In [12]:
## total rows = 346,542
## Trn file sz: 312,000 
## Train      : 277,200    = 312,000 - (21,600 + 12,600 + 600)
## Validation :  21,600
## Test       :  12,600
## Leftover   :     600
# Row window (start, end) used for each split, passed to define_datasets().
# Every window starts at row 0; the loader log below shows one CSV file per split.
cellpainting_args = {'compounds_per_batch': COMPOUNDS_PER_BATCH,
                     'train_start'        : 0,
                     'train_end'          : 277_200,
                     'val_start'          : 0,
                     'val_end'            : 21_600,
                     'test_start'         : 0,
                     'test_end'           : 12_600, 
                    }

In [13]:
# Project helper (utils.utils_notebooks): builds the datasets/loaders for the embedding CSVs found
# under INPUT_PATH for the selected autoencoder run. The result is handed to fit() further down.
data_loader = define_datasets(cellpainting_args, AE_RUNMODE, AE_DATETIME, input_cols, AE_CKPTTYPE, INPUT_PATH)

2024-09-30 19:53:40,379 - utils.dataloader - INFO: -  Building CellPantingDataset for train
2024-09-30 19:53:40,379 - utils.dataloader - INFO: -  filename:  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_261_HashOrder_snnl_20240924_0146_LAST_train.csv
2024-09-30 19:53:40,380 - utils.dataloader - INFO: -  type    :  train
2024-09-30 19:53:40,381 - utils.dataloader - INFO: -  start   :  0
2024-09-30 19:53:40,382 - utils.dataloader - INFO: -  end     :  277200
2024-09-30 19:53:40,382 - utils.dataloader - INFO: -  numrows :  277200
2024-09-30 19:53:40,383 - utils.dataloader - INFO: -  names   :  None     usecols :  None
2024-09-30 19:53:40,384 - utils.dataloader - INFO: -  batch_size  :  1
2024-09-30 19:53:40,384 - utils.dataloader - INFO: -  sample_size :  3
2024-09-30 19:53:40,385 - utils.dataloader - INFO: -  compounds_per_batch :  600
2024-09-30 19:53:40,385 - utils.dataloader - INFO: -  rows per batch (chunksize) :  1800
2024-0

 TRAIN_INPUT:  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_261_HashOrder_snnl_20240924_0146_LAST_train.csv
 TEST_INPUT :  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_261_HashOrder_snnl_20240924_0146_LAST_train_sub_test.csv
 ALL_INPUT  :  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_261_HashOrder_snnl_20240924_0146_LAST_train_sub_val.csv
 load {}
 Dataset size: 277200   rows per batch: 1800
 Dataset size: 21600   rows per batch: 1800
 Dataset size: 12600   rows per batch: 1800


## Input Dataloader

 ### Split the training portion of the dataset into train, val and test


In [14]:
# TRAIN_INPUT_FILE = f"3smpl_prfl_embedding_{num_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train.csv"
# TEST_INPUT_FILE  = f"3smpl_prfl_embedding_{num_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train_sub_test.csv"
# VAL_INPUT_FILE   = f"3smpl_prfl_embedding_{num_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train_sub_val.csv"
# # ALL_INPUT_FILE   = f"3smpl_prfl_embedding_{num_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_sub_val.csv"

# TRAIN_INPUT = os.path.join(INPUT_PATH, TRAIN_INPUT_FILE)
# TEST_INPUT  = os.path.join(INPUT_PATH, TEST_INPUT_FILE)
# VAL_INPUT   = os.path.join(INPUT_PATH, VAL_INPUT_FILE)

# print(f" TRAIN_INPUT:  {TRAIN_INPUT}")
# print(f" TEST_INPUT :  {TEST_INPUT }")
# print(f" ALL_INPUT  :  {VAL_INPUT }")

In [15]:
## total rows = 346,542
## Trn file sz: 312,000 
## Train      : 277,200    (312_000 - (21,600 + 12,600 + 600) = 277,200
## Validation :  21,600
## Test       :  12,600
## Leftover   :     600
# cellpainting_args = {'sample_size': 3,
#                      'batch_size': 1,
#                      'compounds_per_batch': 600,
#                      'training_path'  : TRAIN_INPUT,
#                      'validation_path': TRAIN_INPUT,
#                      'test_path'      : TRAIN_INPUT,
#                      'train_start'    : 0,
#                      'train_end'      : 277_200,  # 277,200 samples
#                      'val_start'      : 277_200,  # 
#                      'val_end'        : 298_800,  # 21_600 samples
#                      'test_start'     : 298_800,  # 
#                      'test_end'       : 311_400,  # 12_600 samples
#                     }


# cellpainting_args = {'compounds_per_batch': COMPOUNDS_PER_BATCH,
#                      'training_path'      : TRAIN_INPUT,
#                      'validation_path'    :  VAL_INPUT,
#                      'test_path'          : TEST_INPUT,
#                      'train_start'        : 0,
#                      'train_end'          : 277_200,
#                      'val_start'          : 0,
#                      'val_end'            : 21_600,
#                      'test_start'         : 0,
#                      'test_end'           : 12_600, 
#                     }

In [16]:
# cellpainting_args

In [17]:
#### Load CellPainting Dataset
# data : keys to the dataset settings (and resulting keys in output dictionary)
# dataset = dict()
# data_loader = dict()

# print(f" load {dataset}")
# for datatype in ['train', 'val', 'test']:
#     dataset[datatype] = CellpaintingDataset(type = datatype, **cellpainting_args)
#     data_loader[datatype] = InfiniteDataLoader(dataset = dataset[datatype], batch_size=1, shuffle = False, num_workers = 0, collate_fn = custom_collate_fn)

In [18]:
# %%timeit
# for dataset in ['train', 'val', 'test']:
#     for idx, batch in enumerate(data_loader[dataset]):
#         print(batch[0].shape[0], batch[1].sum())
#         # display_cellpainting_batch(idx, batch)
#         if idx == 1:
#             break

In [19]:
# # -----------------------------------------
# #  Count pos/neg labels in each dataset
# # -----------------------------------------
# for datatype in ['train', 'val', 'test']:
#     MINIBATCH_SIZE = data_loader[datatype].dataset.sample_size * data_loader[datatype].dataset.compounds_per_batch
#     print(f" {datatype.capitalize()} Minibatch size : {MINIBATCH_SIZE} \n") 
# # for datatype in ['val', 'test']:
#     minibatches = len(data_loader[datatype]) // MINIBATCH_SIZE
#     ttl_rows = 0
#     ttl_pos_labels = 0
#     with tqdm.tqdm(enumerate(data_loader[datatype]), initial=0, total = minibatches, position=0, file=sys.stdout,
#                    leave= False, desc=f" Count labels ") as t_warmup:
#         for batch_count, (batch_features, batch_labels, _, _, _, _) in t_warmup:
#             ttl_rows += batch_labels.shape[0]
#             ttl_pos_labels += batch_labels.sum()
#     ttl_neg_labels = ttl_rows - ttl_pos_labels
#     ttl = f"\n Dataset: {datatype} -  len of {datatype} data loader: {len(data_loader[datatype])}   number of batches: {minibatches}"
#     print(ttl)
#     print('-'*len(ttl))
#     print(f" total rows     : {ttl_rows:7d}")
#     print(f" total pos rows : {ttl_pos_labels:7.0f} - {ttl_pos_labels*100.0/ttl_rows:5.2f}%")
#     print(f" total neg rows : {ttl_neg_labels:7.0f} - {ttl_neg_labels*100.0/ttl_rows:5.2f}%")
#     print()

     Minibatch size : 1800 
                                                                                                 
     Dataset: train - len of train data loader: 277200   number of batches: 154  
    ------------------------------
     total rows     :  277200
     total pos rows :   33129 - 11.95%
     total neg rows :  244071 - 88.05%

     Dataset: val - len of val data loader: 21600   number of batches: 12
    ------------------------------
     total rows     :   21600
     total pos rows :    2532 - 11.72%
     total neg rows :   19068 - 88.28%
    
     Dataset: test - len of test data loader: 12600   number of batches: 7
    ------------------------------
     total rows     :   12600
     total pos rows :    1431 - 11.36%
     total neg rows :   11169 - 88.64%

# Define Neural Net Model 

- **4 layer model :**

    Input --> Hidden1 --> (BN/NL) ---> Hidden2 ---> (BN/NL) ---> Hidden3 --->  (BN/NL) ---> 1
   
    -  **20240909_1800** : Run on 4 FC layers model (includes final layer), model configuration UNKNOWN
    -  **20240909_1801** : Run on 4 FC layers model (includes final layer), Relu non linearities (NO Batch Norm)
    -  **20240909_2100** : Run on 4 FC layers model (includes final layer), with BATCH NORM and tanh non linearities  
 - **Single Hidden Layer - 256**

   Input --> Hidden1 --> (Tanh) --->  1
    -  **20240916_1830** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240906_2201 (SNNL - CPB 600, LAT 150, SNN Factor 3)
    -  **20240926_1900** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240917_2017 (BASELINE - CPB 600, LAT 250, SNN Factor 0)
    -  **20240926_1930** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240917_2004 (SNNL - CPB 600, LAT 250, SNN Factor 3)
    -  **20240926_2000** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240924_0146 (SNNL - CPB 600, LAT 250, SNN Factor 30)
<br>

 - **Single Hidden Layer - 512**

    -  **20240921_0700** : Run on 1 FC layers model (includes final layer), Input --> 512 --> Tanh --> 1 ,  Read from 20240906_2201 (SNNL - CPB 600, LAT 150, SNN Factor 3)    


In [20]:
# Select the torch device used for the model and the training loop
# ----------------------------------------------
# 'cuda:0' is the first GPU visible to this process; fall back to the CPU when CUDA is unavailable.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Discard any model left over from a previous execution so the next cell builds a fresh network.
try:
    del model
except NameError:
    # Fresh kernel: no previous model exists. Only this expected case is ignored;
    # any other error is allowed to surface instead of being silently swallowed.
    pass

In [21]:

# Build the classifier with the project helper build_model(); exactly one call should be active.
# Alternatives kept for reference: 'single_layer' (one hidden layer) and 'relu' (three hidden layers).
# model = build_model('single_layer', input = n_input, hidden_1 = n_hidden_1, device = device)
# model = build_model('relu', input = n_input, hidden_1 = n_hidden_1, hidden_2 = n_hidden_2, hidden_3=n_hidden_3, device = device)
# Active: 'batch_norm' variant with hidden widths n_hidden_1 / n_hidden_2 / n_hidden_3, created on `device`.
model = build_model('batch_norm', input = n_input, hidden_1 = n_hidden_1, hidden_2 = n_hidden_2, hidden_3=n_hidden_3, device = device)


In [22]:
# Per-tensor report: shape, element count and whether the tensor takes part in gradient updates.
for p in model.parameters():
    print(f"Parm shape: {str(p.shape):35s}    # elements: {p.nelement():8d}    Required gradient calc: {p.requires_grad}")

# Overall parameter count across all tensors.
ttl_nelements = sum(tensor.nelement() for tensor in model.parameters())
print(f"Total num of parameters: {ttl_nelements}")  # number of parameters in total

# torchinfo layer-by-layer summary for an input of 30 rows x n_input features.
summary_input_size = (30, n_input)
col_names = ["input_size", "output_size", "num_params", "params_percent", "mult_adds", "trainable"]
_ = summary(model, input_size=summary_input_size, col_names=col_names, verbose=2)

Parm shape: torch.Size([512, 250])                 # elements:   128000    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512, 512])                 # elements:   262144    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([512])                      # elements:      512    Required gradient calc: True
Parm shape: torch.Size([128, 512])                 # elements:    65536    Required gradient calc: True
Parm shape: torch.Size([128])                      # elements:  

In [24]:
# ## parameter initialization
# with torch.no_grad():
#     model[-1].weight *= 0.1 # last layer make less confident
# model[-1].weight.sum()

In [25]:
# Checkpoint file-name template (same expression as in the earlier CKPT_FILE cell); "{ep}" is
# left as a placeholder for the epoch number.
CKPT_FILE = f"NN_{AE_RUNMODE.lower()}_embd600_{LATENT_DIM}Ltnt_512_{AE_DATETIME}_{AE_CKPTTYPE}_{RUN_DATETIME}_ep_{{ep}}"
print(CKPT_FILE)

# Metric history handed to fit(); presumably one entry is appended per epoch -- TODO confirm in fit().
metrics = { 'loss_trn' : [], 'acc_trn' : [], 'loss_val' : [], 'acc_val' : []}

# Epoch window; both start at 0 and are advanced in the "Run Training" cell.
start_epoch, end_epoch = 0,0
init_LR = 1.0e-3
# curr_LR = init_LR

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=init_LR)

# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.3 , patience=20, cooldown=10,)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = step_size, gamma=0.1, last_epoch =-1)
# Halve the learning rate after more than 50 epochs without improvement of the monitored value,
# then wait 10 cool-down epochs. The monitored value is whatever fit() passes to scheduler.step()
# -- not visible in this notebook.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.5, threshold=1.0e-06, patience=50, cooldown=10,)

NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_{ep}


### Read checkpoint

In [26]:
# loaded_epoch
# optimizer.state_dict()
# scheduler.state_dict()

In [27]:
# from utils.utils_cellpainting import load_checkpoint
# model, optimizer, scheduler, end_epoch = load_checkpoint(model, optimizer, scheduler, CKPT_FILE.format(ep=100), ckpt_path = CKPT_PATH)
# model = model.to(device)

In [28]:
# end_epoch
# optimizer.state_dict()
# scheduler.state_dict()

# Run Training

In [29]:
# Define the next epoch window: start where the previous window ended and train 600 more epochs.
# NOTE: this cell is not idempotent -- every execution moves end_epoch another 600 epochs out.
# start_epoch = 0
# start_epoch = loaded_epoch
start_epoch = end_epoch
end_epoch += 600
# start_epoch, end_epoch = 0,100
print(start_epoch, end_epoch)
# Switch the model to training mode; assigning to `_` keeps the returned module out of the cell output.
_ = model.train()

0 600


In [30]:

# Run the project training loop for epochs start_epoch..end_epoch. The log below shows one line per
# epoch and a checkpoint exported every 100 epochs using the CKPT_FILE template. The return value
# is discarded.
_ = fit(model, optimizer, scheduler, data_loader, metrics, start_epoch, end_epoch, device, CKPT_FILE, CKPT_PATH )


 19:55:00 | Ep:   1/ 600 | Trn loss:  0.408001 - Acc: 85.7172 | Val loss:  0.355314 - Acc: 88.1481 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 19:55:24 | Ep:   2/ 600 | Trn loss:  0.359419 - Acc: 87.9827 | Val loss:  0.353672 - Acc: 88.1620 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 19:55:48 | Ep:   3/ 600 | Trn loss:  0.357948 - Acc: 88.0339 | Val loss:  0.353858 - Acc: 88.1806 | last_lr: 1.00000e-03  bad_ep: 1  cdwn: 0                              
 19:56:11 | Ep:   4/ 600 | Trn loss:  0.357275 - Acc: 88.0447 | Val loss:  0.352512 - Acc: 88.2315 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 19:56:35 | Ep:   5/ 600 | Trn loss:  0.356700 - Acc: 88.0602 | Val loss:  0.352397 - Acc: 88.2407 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 19:56:59 | Ep:   6/ 600 | Trn loss:  0.356045 - Acc: 88.0584 | Val loss:  0.351732 - Acc: 88.2454 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 

2024-09-30 20:34:24,016 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_100.pt - epoch: 100


 20:34:24 | Ep: 100/ 600 | Trn loss:  0.345923 - Acc: 88.2976 | Val loss:  0.359026 - Acc: 88.2130 | last_lr: 1.00000e-03  bad_ep: 46  cdwn: 0 
 20:34:48 | Ep: 101/ 600 | Trn loss:  0.345713 - Acc: 88.3034 | Val loss:  0.366218 - Acc: 88.1944 | last_lr: 1.00000e-03  bad_ep: 47  cdwn: 0                             
 20:35:12 | Ep: 102/ 600 | Trn loss:  0.345536 - Acc: 88.3056 | Val loss:  0.365543 - Acc: 88.2083 | last_lr: 1.00000e-03  bad_ep: 48  cdwn: 0                             
 20:35:37 | Ep: 103/ 600 | Trn loss:  0.345315 - Acc: 88.3120 | Val loss:  0.364026 - Acc: 88.1944 | last_lr: 1.00000e-03  bad_ep: 49  cdwn: 0                             
 20:36:01 | Ep: 104/ 600 | Trn loss:  0.345209 - Acc: 88.3153 | Val loss:  0.361716 - Acc: 88.2176 | last_lr: 1.00000e-03  bad_ep: 50  cdwn: 0                             
 20:36:24 | Ep: 105/ 600 | Trn loss:  0.344988 - Acc: 88.3236 | Val loss:  0.367332 - Acc: 88.2130 | last_lr: 5.00000e-04  bad_ep: 0  cdwn: 10                          

2024-09-30 21:14:10,340 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_200.pt - epoch: 200


 21:14:10 | Ep: 200/ 600 | Trn loss:  0.319071 - Acc: 89.0357 | Val loss:  0.381215 - Acc: 88.0231 | last_lr: 2.50000e-04  bad_ep: 24  cdwn: 0 
 21:14:34 | Ep: 201/ 600 | Trn loss:  0.318814 - Acc: 89.0455 | Val loss:  0.383711 - Acc: 88.0185 | last_lr: 2.50000e-04  bad_ep: 25  cdwn: 0                             
 21:14:57 | Ep: 202/ 600 | Trn loss:  0.318557 - Acc: 89.0523 | Val loss:  0.386305 - Acc: 88.0185 | last_lr: 2.50000e-04  bad_ep: 26  cdwn: 0                             
 21:15:21 | Ep: 203/ 600 | Trn loss:  0.318301 - Acc: 89.0570 | Val loss:  0.387980 - Acc: 88.0185 | last_lr: 2.50000e-04  bad_ep: 27  cdwn: 0                             
 21:15:44 | Ep: 204/ 600 | Trn loss:  0.318049 - Acc: 89.0689 | Val loss:  0.389960 - Acc: 88.0324 | last_lr: 2.50000e-04  bad_ep: 28  cdwn: 0                             
 21:16:08 | Ep: 205/ 600 | Trn loss:  0.317795 - Acc: 89.0786 | Val loss:  0.393174 - Acc: 88.0417 | last_lr: 2.50000e-04  bad_ep: 29  cdwn: 0                          

2024-09-30 21:54:09,458 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_300.pt - epoch: 300


 21:54:09 | Ep: 300/ 600 | Trn loss:  0.305493 - Acc: 89.4170 | Val loss:  0.497982 - Acc: 88.0185 | last_lr: 6.25000e-05  bad_ep: 2  cdwn: 0 
 21:54:33 | Ep: 301/ 600 | Trn loss:  0.305314 - Acc: 89.4203 | Val loss:  0.496125 - Acc: 88.0185 | last_lr: 6.25000e-05  bad_ep: 3  cdwn: 0                                                                 
 21:54:57 | Ep: 302/ 600 | Trn loss:  0.305142 - Acc: 89.4300 | Val loss:  0.494564 - Acc: 88.0185 | last_lr: 6.25000e-05  bad_ep: 4  cdwn: 0                                                                 
 21:55:21 | Ep: 303/ 600 | Trn loss:  0.304976 - Acc: 89.4297 | Val loss:  0.493407 - Acc: 88.0185 | last_lr: 6.25000e-05  bad_ep: 5  cdwn: 0                                                                 
 21:55:45 | Ep: 304/ 600 | Trn loss:  0.304816 - Acc: 89.4333 | Val loss:  0.492522 - Acc: 88.0185 | last_lr: 6.25000e-05  bad_ep: 6  cdwn: 0                                                                 
 21:56:09 | Ep: 305/ 600 | Tr

2024-09-30 22:34:27,306 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_400.pt - epoch: 400


 22:34:27 | Ep: 400/ 600 | Trn loss:  0.297309 - Acc: 89.6335 | Val loss:  0.395052 - Acc: 87.9722 | last_lr: 3.12500e-05  bad_ep: 41  cdwn: 0 
 22:34:51 | Ep: 401/ 600 | Trn loss:  0.297251 - Acc: 89.6338 | Val loss:  0.395070 - Acc: 87.9722 | last_lr: 3.12500e-05  bad_ep: 42  cdwn: 0                                                                
 22:35:15 | Ep: 402/ 600 | Trn loss:  0.297193 - Acc: 89.6353 | Val loss:  0.395000 - Acc: 87.9722 | last_lr: 3.12500e-05  bad_ep: 43  cdwn: 0                                                                
 22:35:39 | Ep: 403/ 600 | Trn loss:  0.297135 - Acc: 89.6367 | Val loss:  0.395134 - Acc: 87.9722 | last_lr: 3.12500e-05  bad_ep: 44  cdwn: 0                                                                
 22:36:03 | Ep: 404/ 600 | Trn loss:  0.297077 - Acc: 89.6396 | Val loss:  0.395152 - Acc: 87.9676 | last_lr: 3.12500e-05  bad_ep: 45  cdwn: 0                                                                
 22:36:27 | Ep: 405/ 600 | T

2024-09-30 23:14:45,633 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_500.pt - epoch: 500


 23:14:45 | Ep: 500/ 600 | Trn loss:  0.294463 - Acc: 89.7060 | Val loss:  0.381716 - Acc: 87.9769 | last_lr: 7.81250e-06  bad_ep: 19  cdwn: 0 
 23:15:10 | Ep: 501/ 600 | Trn loss:  0.294443 - Acc: 89.7060 | Val loss:  0.381714 - Acc: 87.9769 | last_lr: 7.81250e-06  bad_ep: 20  cdwn: 0                                                                
 23:15:35 | Ep: 502/ 600 | Trn loss:  0.294424 - Acc: 89.7074 | Val loss:  0.381705 - Acc: 87.9769 | last_lr: 7.81250e-06  bad_ep: 21  cdwn: 0                                                                
 23:16:00 | Ep: 503/ 600 | Trn loss:  0.294405 - Acc: 89.7074 | Val loss:  0.381721 - Acc: 87.9769 | last_lr: 7.81250e-06  bad_ep: 22  cdwn: 0                                                                
 23:16:25 | Ep: 504/ 600 | Trn loss:  0.294386 - Acc: 89.7089 | Val loss:  0.381713 - Acc: 87.9769 | last_lr: 7.81250e-06  bad_ep: 23  cdwn: 0                                                                
 23:16:50 | Ep: 505/ 600 | T

2024-09-30 23:55:16,197 - utils.utils_cellpainting - INFO: -  Model exported to NN_snnl_embd600_250Ltnt_512_20240924_0146_LAST_20240930_1945_ep_600.pt - epoch: 600


 23:55:16 | Ep: 600/ 600 | Trn loss:  0.293122 - Acc: 89.7594 | Val loss:  0.380778 - Acc: 87.9722 | last_lr: 1.95313e-06  bad_ep: 0  cdwn: 3 


In [None]:
# print(CKPT_FILE)
# save_checkpoint(end_epoch, model, optimizer, scheduler, metrics = metrics,
#                 filename = CKPT_FILE.format(ep=end_epoch),
#                 ckpt_path = CKPT_PATH, verbose = True)

In [None]:
# start_epoch, end_epoch

# for mtrc in ['loss_trn', 'loss_val']:
#     for i in range(len(metrics[mtrc])):
#         # print(i)
#         metrics[mtrc][i] = metrics[mtrc][i].item()
 

# Modify TPSA Threshold

In [None]:

# All-zero array with the same shape and dtype as train_y.
# NOTE(review): train_y is not assigned in the visible part of this notebook -- this cell depends on
# kernel state from elsewhere; confirm where it is defined.
train_y_72 = np.zeros_like(train_y)
# Row count divided by 3 -- presumably the number of compounds, given 3 samples per compound
# (the loader log reports sample_size 3) -- TODO confirm.
train_y_72.shape[0]/3

In [None]:
# Summary statistics of the three TPSA representations in the training frame.
# The columns carry the `Metadata_` prefix (see METADATA_COLS), as in the other TPSA cells of this
# notebook; the unprefixed names used previously were inconsistent with them.
# NOTE(review): THRESHOLD is not assigned in the visible part of this notebook -- define it before
# running this cell.
print("                  min           max           std          mean           median")
for x in ['TPSA', 'lnTPSA', 'log10TPSA']:
    col = df_train[f'Metadata_{x}']
    print(f"{x:12s} {col.min():13.7f} {col.max():13.7f} {col.std():13.7f} {col.mean():13.7f} {col.median():13.7f}") 

# Number of non-null TPSA values, then the fractions at/above and below THRESHOLD.
df_train.Metadata_TPSA.count()
df_train[df_train.Metadata_TPSA >= THRESHOLD].Metadata_TPSA.count()/df_train.Metadata_TPSA.count()
df_train[df_train.Metadata_TPSA < THRESHOLD].Metadata_TPSA.count()/df_train.Metadata_TPSA.count()


In [None]:
# Frequency of each value in the 'Metadata_Permiation' column (the spelling is the actual column name).
_tmp = df_train.Metadata_Permiation.value_counts()
# Counts for the values 0 and 1 -- presumably the negative / positive class labels; TODO confirm.
_tmp[0], _tmp[1]

In [None]:
# Class balance that would result from labelling rows with TPSA >= threshold as 1, for several thresholds.
for threshold in [68, 69, 70, 71, 72, 100]:
    # value_counts() omits a class that never occurs; reindex pins both keys so that a threshold
    # leaving one class empty reports a count of 0 instead of raising KeyError.
    _tmp = (df_train['Metadata_TPSA'] >= threshold).value_counts().reindex([False, True], fill_value=0)
    print(f"\n TPSA threshold {threshold} \n Total samples: {_tmp.sum()}")
    print(f" Label 0: {_tmp[False]:>7d}      % {_tmp[False]*100/_tmp.sum():2.2f} ")
    print(f" Label 1: {_tmp[True]:>7d}      % {_tmp[True]*100/_tmp.sum():2.2f} ")

In [None]:
# fig, ax = plt.subplots(figsize=(4,4))
# fig.canvas.draw()  # Need to draw the figure to define renderer
# ax.set_title("AngleLabel example")
# # Plot two crossing lines and label each angle between them with the above
# center = (4.5, 650)
# p1 = [(2.5, 710), (6.0, 605)]
# p2 = [(3.0, 275), (5.5, 900)]
# line1, = ax.plot(*zip(*p1))
# line2, = ax.plot(*zip(*p2))
# point, = ax.plot(*center, marker="o")


In [None]:
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

# Histogram of the TPSA values in the training frame with the median marked by a vertical line.
num_bins = 200
fig, ax = plt.subplots(figsize=(10,5))
sigma = df_train.Metadata_TPSA.std()
mu = df_train.Metadata_TPSA.mean()
med = df_train.Metadata_TPSA.median()

# Raw counts (density=False) in num_bins bins over the range 0..500.
n, bins, patches = ax.hist(df_train.Metadata_TPSA, num_bins, density=False, range=[0, 500],)

# The red line sits at the median; ymin/ymax are axes fractions, so it spans 97% of the plot height.
# The legend entry now names what the line represents (it used to be a leftover placeholder text).
_ = ax.axvline(x=med, ymin=0, ymax=.97, color='red', linestyle='-', lw=1.75, label=f'Median = {med:.3f}')
_ = ax.set_xlabel('TPSA Value')
_ = ax.set_ylabel('# Compounds')
_ = ax.set_title(fr'TPSA distribution -  $\mu={mu:.3f}$    $\sigma={sigma:.3f}$')
# Legend is anchored outside the axes, to the right.
_ = ax.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
plt.show()

# Stratified CV data splits

In [None]:
def visualize_groups(classes, groups, name):
    """Draw the group and class membership of every sample as two coloured bands.

    Parameters
    ----------
    classes : sequence
        Class label per sample; colours the upper band (y = 3.5).
    groups : sequence
        Group id per sample; colours the lower band (y = 0.5). Its length sets
        the number of points drawn on both bands.
    name : str
        Shown as the plot title (this argument used to be ignored).

    Notes
    -----
    Reads the notebook-level colour map ``cmap_data``, which must be defined
    before the function is called. A new figure is created on every call and
    nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10,5))
    n_points = len(groups)
    sample_idx = range(n_points)

    # One horizontal band per quantity: groups at the bottom, classes at the top.
    for y_pos, values in ((0.5, groups), (3.5, classes)):
        ax.scatter(
            sample_idx,
            [y_pos] * n_points,
            c=values,
            marker="_",
            lw=50,
            cmap=cmap_data,
        )

    ax.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
        title=name,
    )

In [None]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Plot the train/test assignment of every sample for each split of a CV object.

    Parameters
    ----------
    cv : cross-validator
        Object exposing ``split(X=..., y=..., groups=...)`` that yields
        ``(train_indices, test_indices)`` pairs.
    X : array-like
        Samples; only ``len(X)`` is used by this function itself.
    y : array-like
        Class label per sample, drawn as its own row below the splits.
    group : array-like or None
        Group id per sample, drawn as the last row. It is forwarded to
        ``cv.split`` only when the class name of ``cv`` contains "Group".
    ax : matplotlib Axes
        Axes to draw on.
    n_splits : int
        Number of split rows used for the y-axis ticks and limits.
    lw : int, default 10
        Marker line width of the bands.

    Returns
    -------
    The ``ax`` that was passed in.

    Notes
    -----
    Reads the notebook-level colour maps ``cmap_cv`` and ``cmap_data``.
    """
    use_groups = "Group" in type(cv).__name__
    groups = group if use_groups else None

    n_samples = len(X)
    sample_idx = range(n_samples)

    # One row per CV split: 1 marks test samples, 0 training samples, NaN samples in neither.
    n_rows = 0
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=groups)):
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(
            sample_idx,
            [ii + 0.5] * n_samples,
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )
        n_rows = ii + 1

    # Class and group rows go right below the last split row. n_rows is tracked explicitly so
    # this no longer depends on the loop variable, which is undefined when cv yields no splits.
    ax.scatter(
        sample_idx, [n_rows + 0.5] * n_samples, c=y, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        sample_idx, [n_rows + 1.5] * n_samples, c=group, marker="_", lw=lw, cmap=cmap_data
    )

    # Formatting: the y-axis is inverted so the first split is at the top; the x-axis spans all
    # samples (it was previously fixed at 100, hiding everything beyond the first 100 samples).
    yticklabels = list(range(n_splits)) + ["class", "group"]
    ax.set(
        yticks=np.arange(n_splits + 2) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 2.2, -0.2],
        xlim=[0, n_samples],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch


In [None]:
# Shared configuration for the cross-validation visualizations.
rng = np.random.RandomState(1338)  # seeded generator; in this section only the commented-out synthetic-data cells reference it
# The two colormaps are read as module-level globals by the plotting helpers
# (cmap_cv for the train/test strips, cmap_data for the class/group strips).
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 4  # reassigned to 5 in a later cell, before the KFold plot

# Generate the class/group data
# n_points = 100
# X = rng.randn(100, 10)

# percentiles_classes = [0.1, 0.3, 0.6]
# y = np.hstack([[ii] * int(100 * perc) for ii, perc in enumerate(percentiles_classes)])

In [None]:
# Generate uneven groups

# group_prior = rng.dirichlet([2] * 10)
# group_prior.sum()
# group_prior

# groups = np.repeat(np.arange(10), rng.multinomial(100, group_prior))
# groups.shape
# groups

In [None]:
# Placeholder grouping: every training sample gets group id 0.
# NOTE(review): train_X is assigned further down the notebook (in the
# feature-loading section), so this cell only runs after those cells.
groups = np.zeros(train_X.shape[0], dtype=int)
groups.shape

In [None]:
# Draw the class/group strips for the training labels with the current `groups` array.
# NOTE(review): the helper's signature is not visible from this section; the
# (classes, groups, name) argument order is assumed from this call — confirm.
visualize_groups(train_y, groups, "no groups")

In [None]:
# Settings for the KFold visualization below.
n_splits = 5   # overrides the earlier n_splits = 4
groups = None  # drops the all-zero placeholder; no group ids are used from here on

In [None]:
# Visualize plain K-fold splits of the training set.
# NOTE(review): KFold does not appear in the notebook's setup-cell imports —
# presumably sklearn.model_selection.KFold; confirm it is imported before this cell.
fig, ax = plt.subplots()
cv = KFold(n_splits)
# "KFold" has no "Group" in its class name, so plot_cv_indices does not forward
# `groups` to cv.split; it is only used for the bottom "group" strip.
plot_cv_indices(cv, train_X, train_y, groups, ax, n_splits)

# Input 

## Read Embedded Features CSV file

In [None]:
# Build the four embedded-feature CSV paths: one per (run mode, data split).
# The nested iteration order (BASE then SNNL, train then test) matches the
# order of the names on the left-hand side.
BASE_TRAIN_INPUT, BASE_TEST_INPUT, SNNL_TRAIN_INPUT, SNNL_TEST_INPUT = (
    os.path.join(OUTPUT_PATH, INPUT_FILE.format(runmode=mode, datatype=split))
    for mode in (BASE_runmode, SNNL_runmode)
    for split in ("train", "test")
)
# Echo each resolved path (every bare expression is displayed in this notebook).
BASE_TRAIN_INPUT
BASE_TEST_INPUT
SNNL_TRAIN_INPUT
SNNL_TEST_INPUT

In [None]:
# Load the SNNL-run embedded features; the BASE-run alternative is kept
# commented out just below.
df_test = pd.read_csv(SNNL_TEST_INPUT )
df_train = pd.read_csv(SNNL_TRAIN_INPUT)

# df_train = pd.read_csv(BASE_TRAIN_INPUT)
# df_test = pd.read_csv(BASE_TEST_INPUT )
# df_train = pd.read_csv(TRAIN_INPUT, nrows = 100 )
# df_train = pd.read_csv(TRAIN_INPUT, usecols = ['Metadata_Batch'])

In [None]:
# Inspect the test frame: dtype/memory summary, shape, column index, and the
# first 5 rows of the first 13 columns. Each bare expression is displayed
# because the setup cell sets ast_node_interactivity = "all".
df_test.info()
df_test.shape
df_test.columns
df_test.iloc[:5,:13]

In [None]:
# df_train = pd.read_csv(TRAIN_INPUT, nrows = 100 )
# df_train = pd.read_csv(TRAIN_INPUT, usecols = ['Metadata_Batch'])

In [None]:
# Split the test frame by column position: column 10 is used as the label
# vector (cast to uint8), columns 11 onward as the feature matrix.
test_y = df_test.iloc[:, 10].to_numpy().astype(np.uint8)
test_y.sum()
(test_y.shape, type(test_y), test_y.dtype)
test_X = df_test.iloc[:, 11:].to_numpy()
(test_X.shape, type(test_X), test_X.dtype)

In [None]:
# Inspect the training frame: shape, dtype/memory summary, and the first
# 5 rows of the first 16 columns.
df_train.shape
df_train.info()
df_train.iloc[:5,:16]

In [None]:
# Scratch arithmetic; displays 346542.
# NOTE(review): the operands look like row counts (train + test?) — confirm, or delete this cell.
312000+34542

In [None]:
# Split the training frame by column position, mirroring the test split:
# column 10 is used as the label vector (cast to uint8), columns 11 onward
# as the feature matrix.
train_y = df_train.iloc[:, 10].to_numpy().astype(np.uint8)
train_y.sum()
(train_y.shape, type(train_y), train_y.dtype)

train_X = df_train.iloc[:, 11:].to_numpy()
(train_X.shape, type(train_X), train_X.dtype)

## Standardize inputs

In [None]:
# Summary statistics of the raw feature matrices, before standardization.
print("Train_X :  Min: {:.4f}    Max: {:.4f}   Mean: {:.4f}  Std: {:.4f}".format(
    train_X.min(), train_X.max(), train_X.mean(), train_X.std()))
print("Test_X  :  Min: {:.4f}    Max: {:.4f}    Mean: {:.4f}  Std: {:.4f}".format(
    test_X.min(), test_X.max(), test_X.mean(), test_X.std()))

In [None]:
# print(f"Train_X :  Min: {train_X.min():.4f}    Max: {train_X.max():.4f}   Mean: {train_X.mean():.4f}  Std: {train_X.std():.4f}")
# print(f"Test_X  :  Min: {test_X.min():.4f}    Max: {test_X.max():.4f}    Mean: {test_X.mean():.4f}  Std: {test_X.std():.4f}")

In [None]:
# Standardize the features: the scaler is fitted on the training matrix only,
# then the same fitted statistics are applied to both train and test.
# Both arrays are overwritten with their scaled versions.
scaler = StandardScaler(copy=True)
scaler.fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)

In [None]:
# Same summary statistics, now on the standardized matrices.
print("After Standard Scaler Transformation")
print("Train_X :  Min: {:.4f}    Max: {:.4f}   Mean: {:.4f}  Std: {:.4f}".format(
    train_X.min(), train_X.max(), train_X.mean(), train_X.std()))
print("Test_X  :  Min: {:.4f}    Max: {:.4f}    Mean: {:.4f}  Std: {:.4f}".format(
    test_X.min(), test_X.max(), test_X.mean(), test_X.std()))


In [None]:
# Pass the (name, label array) pairs for both splits to label_counts.
# NOTE(review): label_counts is defined outside this section; presumably it
# reports per-split label frequencies — verify against its definition.
label_counts([("Training", train_y), ("Test", test_y)])

# TQDM Examples

In [None]:
#-----------------------------------------
# TRANGE example
#-----------------------------------------   
# with trange(+1, ns.trn_iters_warmup+1 , initial = 0 , total = ns.trn_iters_warmup, position=0, file=sys.stdout,
#             leave= False, disable = disable_tqdm, desc=f" Warmup Epoch {ns.current_epoch}/{ns.stop_epoch_warmup}") as t_warmup :
#     for _ in t_warmup:
#         ns.current_iter += 1            

#         batch = next(dldrs.warmup_trn_loader)            
#         environ.set_inputs(batch, input_size)

#         environ.optimize(is_policy=False, 
#                          num_train_layers=ns.num_train_layers,
#                          flag='update_weights', 
#                          verbose = verbose)
    
#         t_warmup.set_postfix({'curr_iter':ns.current_iter, 
#                             'Loss': f"{environ.losses['total']['total'].item():.4f}"})

#-----------------------------------------
# TQDM example
#-----------------------------------------   
# current_epoch = 1
# total_epochs = 20
# current_iter = 0 
# train_minibatches = len(data_loader['train']) // minibatch_size
# val_minibatches = len(data_loader['val']) // minibatch_size

# # with batch_count, (batch_features, batch_labels, _, _, _, _) in tqdm(enumerate(data_loader['train']), initial=0, total = 400, position=0, file=sys.stdout,
#             # leave= False, desc=f" Epoch {current_epoch}/{total_epochs}") as t_warmup :
# t_warmup =  tqdm(enumerate(data_loader['train']), initial=0, total = train_minibatches, position=0, file=sys.stdout,
#             leave= False, desc=f" Epoch {current_epoch}/{total_epochs}") 
# for batch_count, (batch_features, batch_labels, _, _, _, _) in t_warmup:
#     # batch_count, (batch_features, batch_labels, _, _, _, _) = pp
#     loss = random.random()
#     t_warmup.set_postfix({'curr_iter':batch_count, 'Loss': f"{loss:.4f}"})