  # Apply encoder to morphological profiles to get latent space representations :

# Setup

In [3]:
%load_ext autoreload  
%autoreload 2
from IPython.display import display, HTML, Image
from IPython.core.interactiveshell import InteractiveShell
# Inject CSS so the notebook's `.container` element uses 98% of the browser width.
display(HTML("<style>.container { width:98% !important; }</style>"))
# Echo the value of every top-level expression in a cell, not only the last one.
# Many cells below rely on this to show several bare expressions at once.
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import os
import sys
import random
from typing import List, Tuple
from types import SimpleNamespace
from functools import partial
import pprint
import logging
from datetime import datetime
# Make the project source trees importable. Each missing entry is pushed to the
# front of sys.path, so the last entry listed ends up with the highest priority.
_project_dirs = ('./src', '../pt-snnl', '../..')
for p in _project_dirs:
    if p in sys.path:
        continue
    print(f"insert {p}")
    sys.path.insert(0, p)
print(sys.path)

import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

import scipy
import scipy.stats as sps
import sklearn.metrics as skm
from scipy.spatial.distance import pdist, squareform, euclidean

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt  # for making figures
from torchinfo import summary

# Display options: wider lines for torch / pandas / numpy printouts.
torch.set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=180, profile=None, sci_mode=None)
torch.manual_seed(42);   # seed torch's rng for reproducibility (numpy / random are NOT seeded here)
pp = pprint.PrettyPrinter(indent=4)
pd.options.display.width = 132
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')

# Environment variables read by wandb / CUDA.
# NOTE(review): CUDA_VISIBLE_DEVICES is set after `import torch`; it is only honoured
# if CUDA has not been initialised yet at this point -- confirm.
os.environ["WANDB_NOTEBOOK_NAME"] = "AE-MAIN-SNNL.ipynb"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

# Cap the number of intra-op CPU threads torch may use, then echo the setting.
torch.set_num_threads(4)  ## <--- limit to ~ 2 CPUs
torch.get_num_threads()

['../..', '../pt-snnl', './src', '/home/kevin/WSL-shared/cellpainting/cj-datasets', '/home/kevin/miniforge3/envs/cp311/lib/python311.zip', '/home/kevin/miniforge3/envs/cp311/lib/python3.11', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/lib-dynload', '', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages/huggingface_hub-0.20.3-py3.8.egg']


<torch._C.Generator at 0x7f90e4923f10>

4

In [5]:
# from KevinsRoutines.utils.utils_general import list_namespace, save_to_pickle, load_from_pickle, get_device
import KevinsRoutines.utils as myutils
# import snnl.utils as utils
# from utils.utils_ptsnnl import display_cellpainting_batch, get_device
from utils.utils_cellpainting import label_counts, balance_datasets,save_checkpoint, load_checkpoint
from utils.dataloader import custom_collate_fn, dynamic_collate_fn, CellpaintingDataset, InfiniteDataLoader
from utils.utils_notebooks import plot_cls_metrics, compute_classification_metrics, run_model_on_test_data,\
                                train, validation, accuracy_fn, fit, build_model, define_datasets



In [5]:
# Run timestamp and notebook-wide logging configuration.
timestamp = datetime.now().strftime('%Y_%m_%d_%H:%M:%S')
logger = logging.getLogger(__name__)

# Honour the LOG_LEVEL environment variable. It used to be read into `logLevel`
# and then ignored because basicConfig was hard-coded to "INFO". Names that are
# not logging levels fall back to INFO instead of making basicConfig raise.
logLevel = os.environ.get('LOG_LEVEL', 'INFO').upper()
if not isinstance(getattr(logging, logLevel, None), int):
    logLevel = 'INFO'

FORMAT = '%(asctime)s - %(name)s - %(levelname)s: - %(message)s'
logging.basicConfig(level=logLevel, format=FORMAT)

# Record the library versions alongside the run so results can be traced back.
logger.info(f" Excution started : {timestamp} ")
logger.info(f" Pytorch version  : {torch.__version__}")
logger.info(f" Scipy version    : {scipy.__version__}  \t\t Numpy version : {np.__version__}")
logger.info(f" Pandas version   : {pd.__version__}  ")

2024-10-02 20:01:51,365 - __main__ - INFO: -  Excution started : 2024_10_02_20:01:51 
2024-10-02 20:01:51,366 - __main__ - INFO: -  Pytorch version  : 2.2.0
2024-10-02 20:01:51,367 - __main__ - INFO: -  Scipy version    : 1.11.4  		 Numpy version : 1.26.2
2024-10-02 20:01:51,368 - __main__ - INFO: -  Pandas version   : 2.2.0  


In [16]:
# Set visible GPU device 
# ----------------------------------------------
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# if torch.cuda.is_available():
#     device = torch.device('cuda:1')
# else:
#     device = torch.device('cpu')
# print(device)
# Drop a `model` left over from a previous run of the notebook, if there is one.
# Only the "name does not exist yet" case is expected on a fresh kernel, so only
# NameError is swallowed; any other failure is allowed to surface.
try:
    del model
except NameError:
    pass

In [17]:
# Select device index 1 through the project helpers and keep the resulting device
# for model construction and training below.
# NOTE(review): the index is presumably relative to CUDA_VISIBLE_DEVICES -- confirm.
myutils.set_device(1)
device  = myutils.get_device(verbose = True)
print(device)

 Switched to: "cuda:1"   Device Name: Quadro GV100                  


'cuda:1'

Dev Id   Device Name                    Total Memory                     InUse                            Free Memory 
   0     Quadro GV100                   34,069,872,640 B/ (31.73 GB)  	 325,058,560 B / (0.30 GB)  	 33,744,814,080 B / (31.43 GB)  
   1     Quadro GV100                   34,069,872,640 B/ (31.73 GB)  	 771,751,936 B / (0.72 GB)  	 33,298,120,704 B / (31.01 GB)   *** CURRENT DEVICE *** 
   2     NVIDIA TITAN Xp                12,774,539,264 B/ (11.90 GB)  	 390,201,344 B / (0.36 GB)  	 12,384,337,920 B / (11.53 GB)  

 Current CUDA Device is:  "cuda:1"  Device Name: Quadro GV100
cuda:1


## Helper routines

# Args 

In [19]:
# Size of the autoencoder embedding used as classifier input, and the number of
# compounds per batch passed to the dataset definition below.
LATENT_DIM    = 150
COMPOUNDS_PER_BATCH = 600

# Classifier architecture selector handed to build_model().
MODEL_TYPE = 'batch_norm'
# MODEL_TYPE = 'single_layer'
# MODEL_TYPE = 'relu'
n_input    = LATENT_DIM  # the embedding dimensionality

# Hidden layer widths handed to build_model().
n_hidden_1 = 256  # the number of neurons in the 1st hidden layer of the MLP
n_hidden_2 = 256  # the number of neurons in the 2nd hidden layer of the MLP
n_hidden_3 = 128  # the number of neurons in the 3rd hidden layer of the MLP

# Metadata columns that precede the embedding features in the input CSV files.
# ('Metadata_Permiation' is spelled this way on purpose: it must match the CSV header.)
METADATA_COLS = ['Metadata_Source', 'Metadata_Batch', 'Metadata_Plate', 'Metadata_Well', 'Metadata_JCP2022', 'Metadata_Hash', 'Metadata_Bin', 'Metadata_TPSA', 'Metadata_lnTPSA', 'Metadata_log10TPSA', 'Metadata_Permiation']
# METADATA_COLS += [f'Feature_{x:03d}' for x in range(LATENT_DIM)]
# Total column count (metadata + features); it is part of the input file names.
input_cols = LATENT_DIM + len(METADATA_COLS)
print(len(METADATA_COLS))
print(input_cols)


# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
INPUT_PATH = f"/home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/"
CKPT_PATH = "./saved_models/embedding_models"

11
161


In [20]:
# RUN_DATETIME = datetime.now().strftime('%Y%m%d_%H%M')

# RUN_DATETIME = '20240926_1900'   ## Baseline CPB 600, Latent 150  - Single layer 256
# RUN_DATETIME = '20240927_2300'   ## Baseline CPB 600, Latent 150  - Single layer 512
# RUN_DATETIME = '20240929_2000'   ## Baseline CPB 600, Latent 150  - Batch Norm 256/256/128
# RUN_DATETIME = '20240929_1900'   ## Baseline CPB 600, Latent 150  - Batch Norm 512/512/128

# RUN_DATETIME = '20240930_2100'   ## Baseline CPB 600, Latent 250  - Balanced TPSA labels - Batch Norm 256/256/128
# RUN_DATETIME = '20241001_2100'   ## Baseline CPB 600, Latent 250  - Balanced TPSA labels - Batch Norm 512/512/256

# RUN_DATETIME = '20241002_1915'   ## Baseline CPB 600, Latent 150  - Single layer 256
# RUN_DATETIME = '20241002_1930'   ## Baseline CPB 600, Latent 150  - Single layer 512
# RUN_DATETIME = '20241002_1945'   ## Baseline CPB 600, Latent 150  - Batch Norm 256/256/128
# RUN_DATETIME = '20241002_2000'   ## Baseline CPB 600, Latent 150  - Batch Norm 512/512/128

RUN_DATETIME = '20241002_1945'   ## Baseline CPB 600, Latent 150, Batch Norm 256/256/128
print(RUN_DATETIME)

20241002_1945


In [21]:
# SNNL AUTOENCODERS 
# AE_RUNMODE = "snnl"
# AE_DATETIME = "20240718_1956"
# AE_DATETIME = "20240906_2201"     # Autoencoder training - SNNL, CPB = 600, Latent 150, WD = 0.001, SNN Factor 3
# AE_DATETIME = "20240917_2004"     # Autoencoder training - SNNL, CPB = 600, Latent 250, WD = 0.001, SNN Factor 3

## BASELINE AUTOENCODERS 
AE_RUNMODE = 'base'
AE_DATETIME = "20240923_1943"     # Autoencoder training - Baseline, CPB = 600, Latent 150, WD = 0.001 (SNN Factor 0)
# AE_DATETIME = "20240917_2017"     # Autoencoder training - Baseline, CPB = 600, Latent 250, WD = 0.001 (SNN Factor 0)

AE_CKPTTYPE = "BEST"
# AE_CKPTTYPE = "LAST"

In [22]:
# Checkpoint file-name template. `{{ep}}` survives the f-string as a literal
# `{ep}` placeholder that is filled later with `.format(ep=...)`.
# NOTE(review): "embd600" and "512" are hard-coded here and do not follow
# COMPOUNDS_PER_BATCH or the n_hidden_* settings -- confirm this is intended.
CKPT_FILE = f"NN_{AE_RUNMODE.lower()}_embd600_{LATENT_DIM}Ltnt_512_{AE_DATETIME}_{AE_CKPTTYPE}_{RUN_DATETIME}_ep_{{ep}}"
print(CKPT_FILE)

NN_base_embd600_150Ltnt_512_20240923_1943_BEST_20241002_1945_ep_{ep}


## Input Dataloader

 ### Split the training portion of the dataset into train, val and test

In [23]:
## total rows = 346,542
## Trn file sz: 312,000
## Train      : 277,200    (312,000 - (21,600 + 12,600 + 600)) = 277,200
## Validation :  21,600
## Test       :  12,600
## Leftover   :     600
# Row ranges for each split plus the TPSA cut-off, handed to define_datasets().
# Every split starts at row 0; the loader output below shows a separate CSV per split.
cellpainting_args = {'compounds_per_batch': COMPOUNDS_PER_BATCH,
                     'train_start'        : 0,
                     'train_end'          : 277_200,
                     'val_start'          : 0,
                     'val_end'            : 21_600,
                     'test_start'         : 0,
                     'test_end'           : 12_600,
                     'tpsa_threshold'     : 100
                    }

In [24]:
# Build the data loaders with the project helper. Judging by the output below it
# resolves train / val / test CSVs under INPUT_PATH -- see utils.utils_notebooks
# for the exact contract.
data_loader = define_datasets(cellpainting_args, AE_RUNMODE, AE_DATETIME, input_cols, AE_CKPTTYPE, INPUT_PATH)

 TRAIN_INPUT:  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_161_HashOrder_base_20240923_1943_BEST_train.csv
 TEST_INPUT :  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_161_HashOrder_base_20240923_1943_BEST_train_sub_test.csv
 ALL_INPUT  :  /home/kevin/WSL-shared/cellpainting/cj-datasets/output_11102023/3_sample_embeddings/3smpl_prfl_embedding_161_HashOrder_base_20240923_1943_BEST_train_sub_val.csv
 load {}
 Dataset size: 277200   rows per batch: 1800  tpsa_threshold: 100
 Dataset size: 21600   rows per batch: 1800  tpsa_threshold: 100
 Dataset size: 12600   rows per batch: 1800  tpsa_threshold: 100


In [25]:
# TRAIN_INPUT_FILE = f"3smpl_prfl_embedding_{input_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train.csv"
# TEST_INPUT_FILE  = f"3smpl_prfl_embedding_{input_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train_sub_test.csv"
# VAL_INPUT_FILE   = f"3smpl_prfl_embedding_{input_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_train_sub_val.csv"
# # ALL_INPUT_FILE   = f"3smpl_prfl_embedding_{num_cols}_HashOrder_{AE_RUNMODE}_{AE_DATETIME}_{AE_CKPTTYPE}_sub_val.csv"

# print(TRAIN_INPUT_FILE)
# print(TEST_INPUT_FILE)
# print(VAL_INPUT_FILE)

# TRAIN_INPUT = os.path.join(INPUT_PATH, TRAIN_INPUT_FILE)
# TEST_INPUT  = os.path.join(INPUT_PATH, TEST_INPUT_FILE)
# VAL_INPUT   = os.path.join(INPUT_PATH, VAL_INPUT_FILE)

# print(f" TRAIN_INPUT:  {TRAIN_INPUT}")
# print(f" TEST_INPUT :  {TEST_INPUT }")
# print(f" ALL_INPUT  :  {VAL_INPUT }")

In [26]:
## total rows = 346,542
## Trn file sz: 312,000 
## Train      : 277,200    (312_000 - (21,600 + 12,600 + 600) = 277,200
## Validation :  21,600
## Test       :  12,600
## Leftover   :     600
# cellpainting_args = {'sample_size': 3,
#                      'batch_size': 1,
#                      'compounds_per_batch': 600,
#                      'training_path'  : TRAIN_INPUT,
#                      'validation_path': TRAIN_INPUT,
#                      'test_path'      : TRAIN_INPUT,
#                      'train_start'    : 0,
#                      'train_end'      : 277_200,  # 277,200 samples
#                      'val_start'      : 277_200,  # 
#                      'val_end'        : 298_800,  # 21_600 samples
#                      'test_start'     : 298_800,  # 
#                      'test_end'       : 311_400,  # 12_600 samples
#                     }

# cellpainting_args = {'compounds_per_batch': COMPOUNDS_PER_BATCH,
#                      'training_path'      : TRAIN_INPUT,
#                      'validation_path'    :  VAL_INPUT,
#                      'test_path'          : TEST_INPUT,
#                      'train_start'        : 0,
#                      'train_end'          : 277_200,
#                      'val_start'          : 0,
#                      'val_end'            : 21_600,
#                      'test_start'         : 0,
#                      'test_end'           : 12_600,
#                      'tpsa_threshold'     : 100
#                     }

In [27]:
# cellpainting_args

In [28]:
#### Load CellPainting Dataset
# data : keys to the dataset settings (and resulting keys in output dictionary)
# dataset = dict()
# data_loader = dict()

# for datatype in ['train', 'val', 'test']:
#     dataset[datatype] = CellpaintingDataset(type = datatype, **cellpainting_args)
#     data_loader[datatype] = InfiniteDataLoader(dataset = dataset[datatype], batch_size=1, shuffle = False, num_workers = 0, 
#                                                collate_fn = partial(dynamic_collate_fn, tpsa_threshold = dataset[datatype].tpsa_threshold) )

In [29]:
# def display_cellpainting_batch(batch_id, batch):
#     # data, labels, plates, compounds, cmphash, other, labels_2
#     features, label, well_ids, compound_id, cmphash, tpsa, label_2 = batch
#     # label_2 = np.zeros_like(label)
#     print("-"*135)
#     print(f"  Batch Id: {batch_id}   {type(batch)}  Rows returned {len(batch[0])} features: {features.shape}  ")
#     print(f"+-----+------------------------------------------+----------------+--------------------------+------------------------------+-----+-----+--------------------------------------------------------+")
#     print(f"| idx |   batch[2]                               |    batch[3]    |      batch[2]            |          batch[5]            | [1] | [1] |     batch[0]                                           | ") 
#     print(f"|     | SRCE      BATCH     PLATE     WELL       |   COMPOUND_ID  |       CMPHASH / BIN      |  TPSA / Ln(TPSA) / Log(TPSA) | LBL |LBL2 |     FEATURES                                           | ")
#     print(f"+-----+------------------------------------------+----------------+--------------------------+------------------------------+-----+-----+--------------------------------------------------------+")
#          ###    0 | source_11 Batch2    EC000046  K04      | JCP2022_009278 |  7406361908543180200 -  8  |   0   |   62.78000    4.13964   1.79782 | [-0.4377299 -0.4474466  1.1898487  0.2051901]
#          # "  1 | source_10    | JCP2022_006020 | -9223347314827979542 |   10 |  0 | tensor([-0.6346, -0.6232, -1.6046])"
    
#     for i in range(len(label)):
#         print(f"| {i:3d} | {well_ids[i,0][:9]:9s} {well_ids[i,1][:12]:12s}  {well_ids[i,2][:10]:10s}  {well_ids[i,3]:4s} |"\
#               f" {compound_id[i]:14s} | {cmphash[i,0]:20d}  {cmphash[i,1]:2d} |"\
#               f" {tpsa[i,0]:7.3f}  {tpsa[i,1]:8.5f}  {tpsa[i,2]:8.5f}  |"
#               f" {int(label[i]):2d}  | {int(label_2[i]):2d}  |"\
#               f" {features[i,:4].detach().cpu().numpy()}")
#         # print(f"| {i:3d} | {batch[2][i,0]:9s} {batch[2][i,1][:9]:9s} {str(batch[2][i,2])[:9]:9s} {batch[2][i,3]:>4s}       "\
#         #       f"|{batch[3][i]:12s} | {batch[4][i,0]:20d} - {batch[4][i,1]:2d}  "\
#         #       f"{batch[5][i,0]:11.5f}   {batch[5][i,1]:8.5f}  {batch[5][i,2]:8.5f} "\
#         #       f"|  {int(batch[1][i]):1d}  | {batch[0][i,:4].detach().cpu().numpy()}")


In [30]:
# # %%timeit
# # for dataset in ['train', 'val', 'test']:
# for dataset in ['test']:
#     for idx, batch in enumerate(data_loader[dataset]):
#         for b in batch :
#             print(b.shape)
#         display_cellpainting_batch(idx, batch)
#         if idx == 0:
#             break

In [31]:
# # -----------------------------------------
# #  Count pos/neg labels in each dataset
# # -----------------------------------------
# for datatype in ['train', 'val', 'test']:
#     MINIBATCH_SIZE = data_loader[datatype].dataset.sample_size * data_loader[datatype].dataset.compounds_per_batch
#     print(f" {datatype.capitalize()} Minibatch size : {MINIBATCH_SIZE}") 
# print()

# for datatype in ['train', 'val', 'test']:
#     minibatches = len(data_loader[datatype]) // MINIBATCH_SIZE
#     ttl_rows, ttl_rows_2 = 0, 0
#     ttl_pos_labels, ttl_pos_labels_2 = 0, 0
#     with tqdm.tqdm(enumerate(data_loader[datatype]), initial=0, total = minibatches, position=0, file=sys.stdout,
#                    leave= False, desc=f" Count labels ") as t_warmup:
#         for batch_count, (_, batch_labels, _, _, _, _, batch_labels_2) in t_warmup:
#             ttl_rows += batch_labels.shape[0]
#             ttl_rows_2 += batch_labels_2.shape[0]
#             ttl_pos_labels += batch_labels.sum()
#             ttl_pos_labels_2 += batch_labels_2.sum()
#     ttl_neg_labels = ttl_rows - ttl_pos_labels
#     ttl_neg_labels_2 = ttl_rows_2 - ttl_pos_labels_2
#     ttl = f"\n Dataset: {datatype} -  len of {datatype} data loader: {len(data_loader[datatype])}   number of batches: {minibatches}"
#     print(ttl)
#     print('-'*len(ttl))
#     print(f" total rows     : {ttl_rows:7d}")
#     print(f" total pos rows : {ttl_pos_labels:7.0f} - {ttl_pos_labels*100.0/ttl_rows:5.2f}%         alternative pos rows : {ttl_pos_labels_2:7.0f} - {ttl_pos_labels_2*100.0/ttl_rows:5.2f}%      ")
#     print(f" total neg rows : {ttl_neg_labels:7.0f} - {ttl_neg_labels*100.0/ttl_rows:5.2f}%         alternative neg rows : {ttl_neg_labels_2:7.0f} - {ttl_neg_labels_2*100.0/ttl_rows:5.2f}%")
#     print()

     Minibatch size : 1800 
                                                                                                 
     Dataset: train - len of train data loader: 277200   number of batches: 154  
    ------------------------------
     total rows     :  277200
     total pos rows :   33129 - 11.95%
     total neg rows :  244071 - 88.05%

     Dataset: val - len of val data loader: 21600   number of batches: 12
    ------------------------------
     total rows     :   21600
     total pos rows :    2532 - 11.72%
     total neg rows :   19068 - 88.28%
    
     Dataset: test - len of test data loader: 12600   number of batches: 7
    ------------------------------
     total rows     :   12600
     total pos rows :    1431 - 11.36%
     total neg rows :   11169 - 88.64%

# Define Neural Net Model 

- **4 layer model :**

    Input --> Hidden1 --> (BN/NL) ---> Hidden2 ---> (BN/NL) ---> Hidden3 --->  (BN/NL) ---> 1
   
    -  **20240909_1800** : Run on 4 FC layers model (includes final layer), model configuration UNKNOWN
    -  **20240909_1801** : Run on 4 FC layers model (includes final layer), Relu non linearities (NO Batch Norm)
    -  **20240909_2100** : Run on 4 FC layers model (includes final layer), with BATCH NORM and tanh non linearities

      
 - **Single Hidden Layer - 256**

   Input --> Hidden1 --> (Tanh) --->  1
    -  **20240916_1830** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240906_2201 (SNNL - CPB 600, LAT 150, SNN Factor 3)
    -  **20240926_1900** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240917_2017 (BASELINE - CPB 600, LAT 250, SNN Factor 0)
    -  **20240926_1930** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240917_2004 (SNNL - CPB 600, LAT 250, SNN Factor 3)
    -  **20240926_2000** : Run on 1 FC layers model (includes final layer), Input --> 256 --> Tanh --> 1 ,  Read from 20240924_0146 (SNNL - CPB 600, LAT 250, SNN Factor 30)
<br>

 - **Single Hidden Layer - 512**

    -  **20240921_0700** : Run on 1 FC layers model (includes final layer), Input --> 512 --> Tanh --> 1 ,  Read from 20240906_2201 (SNNL - CPB 600, LAT 150, SNN Factor 3)    


In [32]:

# Instantiate the classifier selected by MODEL_TYPE with the layer sizes from the Args section.
model = build_model(MODEL_TYPE, input = n_input, hidden_1 = n_hidden_1, hidden_2 = n_hidden_2, hidden_3=n_hidden_3, device = device)
 

In [34]:
# Columns to include in the torchinfo layer table.
col_names = ["input_size", "output_size", "num_params", "params_percent", "mult_adds", "trainable"]

# Dummy input shape (batch of 30 rows, n_input features) used only to trace layer shapes.
summary_input_size = (30, n_input)
_ = summary(model, verbose = 2, input_size=summary_input_size, col_names = col_names)

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %                   Mult-Adds                 Trainable
Sequential                               [30, 150]                 [30, 1]                   --                             --                   --                        True
├─Linear: 1-1                            [30, 150]                 [30, 256]                 38,656                     27.86%                   1,159,680                 True
│    └─weight                                                                                ├─38,400
│    └─bias                                                                                  └─256
├─BatchNorm1d: 1-2                       [30, 256]                 [30, 256]                 512                         0.37%                   15,360                    True
│    └─weight                                                                             

In [35]:
# Per-epoch history, passed to and returned by fit().
metrics = { 'loss_trn' : [], 'acc_trn' : [], 'loss_val' : [], 'acc_val' : []}

# Epoch counters; the "Run Training" cell advances them before each call to fit().
start_epoch, end_epoch = 0,0
init_LR = 1.0e-3
# curr_LR = init_LR

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=init_LR)

# Halve the learning rate when the monitored metric (mode='min') has not improved
# for 50 epochs, then wait 10 cooldown epochs before counting bad epochs again.
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.3 , patience=20, cooldown=10,)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = step_size, gamma=0.1, last_epoch =-1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.5, threshold=1.0e-06, patience=50, cooldown=10,)

### Read checkpoint

In [39]:
# loaded_epoch
# optimizer.state_dict()
# scheduler.state_dict()

In [40]:
# model, optimizer, scheduler, end_epoch = load_checkpoint(model, optimizer, scheduler, CKPT_FILE.format(ep=100), ckpt_path = CKPT_PATH)
# model = model.to(device)

In [41]:
# end_epoch
# optimizer.state_dict()
# scheduler.state_dict()

# Run Training

In [42]:
# Continue from where the previous run stopped and train for another 600 epochs.
# NOTE: this cell is not idempotent -- every re-run moves the window forward by
# 600 epochs (0-600, 600-1200, ...), even if fit() was not executed in between.
# start_epoch = 0
# start_epoch = loaded_epoch
start_epoch = end_epoch
end_epoch += 600
# start_epoch, end_epoch = 0,100
print(start_epoch, end_epoch)
# Put the model into training mode; the return value is discarded to avoid echoing it.
_ = model.train()

0 600


In [43]:

# Train for epochs [start_epoch, end_epoch) with the project helper; the returned
# history replaces `metrics`. CKPT_FILE / CKPT_PATH are passed for checkpointing.
metrics = fit(model, optimizer, scheduler, data_loader, metrics, start_epoch, end_epoch, device, CKPT_FILE, CKPT_PATH )


 20:26:02 | Ep:   1/ 600 | Trn loss:  0.456108 - Acc: 84.2908 | Val loss:  0.367906 - Acc: 88.2778 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 20:26:16 | Ep:   2/ 600 | Trn loss:  0.366941 - Acc: 88.0606 | Val loss:  0.358349 - Acc: 88.2870 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 20:26:30 | Ep:   3/ 600 | Trn loss:  0.362299 - Acc: 88.0631 | Val loss:  0.357601 - Acc: 88.2963 | last_lr: 1.00000e-03  bad_ep: 0  cdwn: 0                              
 20:26:44 | Ep:   4/ 600 | Trn loss:  0.360787 - Acc: 88.0606 | Val loss:  0.358432 - Acc: 88.2639 | last_lr: 1.00000e-03  bad_ep: 1  cdwn: 0                              
 20:26:58 | Ep:   5/ 600 | Trn loss:  0.359764 - Acc: 88.0617 | Val loss:  0.358954 - Acc: 88.2685 | last_lr: 1.00000e-03  bad_ep: 2  cdwn: 0                              
 20:27:12 | Ep:   6/ 600 | Trn loss:  0.358939 - Acc: 88.0639 | Val loss:  0.358520 - Acc: 88.2639 | last_lr: 1.00000e-03  bad_ep: 3  cdwn: 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 21:28:00 | Ep: 260/ 600 | Trn loss:  0.031130 - Acc: 99.5429 | Val loss:  1.175304 - Acc: 80.9074 | last_lr: 6.25000e-05  bad_ep: 4  cdwn: 0                              
 21:28:15 | Ep: 261/ 600 | Trn loss:  0.030635 - Acc: 99.5602 | Val loss:  1.176528 - Acc: 80.9352 | last_lr: 6.25000e-05  bad_ep: 5  cdwn: 0                              
 21:28:30 | Ep: 262/ 600 | Trn loss:  0.030156 - Acc: 99.5765 | Val loss:  1.177783 - Acc: 80.9444 | last_lr: 6.25000e-05  bad_ep: 6  cdwn: 0                              
 21:28:44 | Ep: 263/ 600 | Trn loss:  0.029691 - Acc: 99.5891 | Val loss:  1.179067 - Acc: 80.9722 | last_lr: 6.25000e-05  bad_ep: 7  cdwn: 0                              
 21:28:59 | Ep: 264/ 600 | Trn loss:  0.029239 - Acc: 99.6136 | Val loss:  1.180379 - Acc: 80.9630 | last_lr: 6.25000e-05  bad_ep: 8  cdwn: 0                              
 21:29:14 | Ep: 265/ 600 | Trn loss:  0.028802 - Acc: 99.6295 | Val loss:  1.181717 - Acc: 81.0000 | last_lr: 6.25000e-05  bad_ep: 9  cdwn: 

In [56]:
# print(filename)
 
# save_checkpoint(end_epoch, model, optimizer, scheduler, metrics = metrics,
#                 filename = CKPT_FILE.format(ep=end_epoch),
#                 ckpt_path = CKPT_PATH, verbose = True)

start_epoch, end_epoch

# for mtrc in ['loss_trn', 'loss_val']:
#     for i in range(len(metrics[mtrc])):
#         # print(i)
#         metrics[mtrc][i] = metrics[mtrc][i].item()

(600, 1200)

In [44]:
#         metrics['loss_trn'].append(trn_loss.item())
#         metrics['acc_trn'].append(trn_acc)
#         metrics['loss_val'].append(val_loss.item())
#         metrics['acc_val'].append(val_acc)
# for idx, (trn_loss, trn_acc, val_loss, val_acc) in enumerate(zip(metrics['loss_trn'],metrics['acc_trn'],metrics['loss_val'],metrics['acc_val'])):
#     print(f" {datetime.now().strftime('%X')} | Ep: {idx:3d}/{end_epoch:4d} | Trn loss: {trn_loss:9.6f} - Acc: {trn_acc:.4f} |"
#       f" Val loss: {val_loss:9.6f} - Acc: {val_acc:.4f} | ")

# Modify TPSA Threshold

In [None]:

# Zero-filled label array with the same shape/dtype as train_y.
# NOTE(review): `train_y` is only created further down (section "Read Embedded
# Features CSV file"); this cell fails on a fresh kernel unless that section runs first.
train_y_72 = np.zeros_like(train_y)
# Number of rows divided by 3 -- presumably the compound count at 3 samples per compound; confirm.
train_y_72.shape[0]/3

In [None]:
# Summary statistics of the three TPSA metadata columns.
# The columns carry a `Metadata_` prefix (see METADATA_COLS and the cells below);
# the un-prefixed names used previously raise KeyError. Printed row labels keep
# the short names.
print(f"                  min           max           std          mean           median")
for x in ['TPSA', 'lnTPSA', 'log10TPSA']:
    _col = df_train[f'Metadata_{x}']
    print(f"{x:12s} {_col.min():13.7f} {_col.max():13.7f} {_col.std():13.7f} {_col.mean():13.7f} {_col.median():13.7f}") 

# Row count, then the fraction of rows at/above and below the TPSA threshold.
# NOTE(review): THRESHOLD is not defined in the visible cells; it must be set
# before this cell runs (e.g. to cellpainting_args['tpsa_threshold']).
df_train.Metadata_TPSA.count()
df_train[df_train.Metadata_TPSA >= THRESHOLD].Metadata_TPSA.count()/df_train.Metadata_TPSA.count()
df_train[df_train.Metadata_TPSA < THRESHOLD].Metadata_TPSA.count()/df_train.Metadata_TPSA.count()


In [None]:
# Count rows per value of the binary label column and show the counts for labels 0 and 1.
_tmp = df_train.Metadata_Permiation.value_counts()
_tmp[0], _tmp[1]

In [None]:
# Class balance obtained for several candidate TPSA thresholds.
# Counts come straight from the boolean mask, so a threshold that leaves one
# class empty prints 0 instead of raising KeyError on a missing value_counts() key.
for threshold in [68, 69, 70, 71, 72, 100]:
    _is_pos = df_train['Metadata_TPSA'] >= threshold
    _n_total = int(_is_pos.shape[0])
    _n_pos = int(_is_pos.sum())
    _n_neg = _n_total - _n_pos
    _denom = _n_total or 1          # avoid ZeroDivisionError on an empty frame
    print(f"\n TPSA threshold {threshold} \n Total samples: {_n_total}")
    print(f" Label 0: {_n_neg:>7d}      % {_n_neg*100/_denom:2.2f} ")
    print(f" Label 1: {_n_pos:>7d}      % {_n_pos*100/_denom:2.2f} ")

In [None]:
# fig, ax = plt.subplots(figsize=(4,4))
# fig.canvas.draw()  # Need to draw the figure to define renderer
# ax.set_title("AngleLabel example")
# # Plot two crossing lines and label each angle between them with the above
# center = (4.5, 650)
# p1 = [(2.5, 710), (6.0, 605)]
# p2 = [(3.0, 275), (5.5, 900)]
# line1, = ax.plot(*zip(*p1))
# line2, = ax.plot(*zip(*p2))
# point, = ax.plot(*center, marker="o")


In [None]:
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

# Histogram of the TPSA values in the training frame, with the median marked.
num_bins = 200
# fig, ax = plt.subplots()
fig = plt.figure(figsize=(10,5))
sigma = df_train.Metadata_TPSA.std()
mu = df_train.Metadata_TPSA.mean()
med = df_train.Metadata_TPSA.median()
# the histogram of the data
# We can set the number of bins with the *bins* keyword argument.
n, bins, patches = plt.hist(df_train.Metadata_TPSA, num_bins, density=False, range=[0, 500],)
# p1 = [(med, 710), (6.0, 605)]
# _ = plt.vlines(x=med, ymin=10, ymax=17000, colors='red', linestyles='-', lw=1.75, label='Single Short Line')
# Vertical red line at the median. The legend entry now says what the line is;
# it previously carried the placeholder label 'Single Short Line'.
_ = plt.axvline(x=med, ymin=0, ymax=.97, color='red', linestyle='-', lw=1.75, label=f'Median = {med:.3f}')
_ = plt.xlabel('TPSA Value');
_ = plt.ylabel('# Compounds');
_ = plt.title(fr'TPSA distribution -  $\mu={mu:.3f}$    $\sigma={sigma:.3f}$')
# Legend placed outside the axes, to the right; assigned to `_` so its repr is not echoed.
_ = plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
# axs[1].hist(dist2, bins=n_bins)
plt.show()

# Stratified CV data splits

In [None]:
def visualize_groups(classes, groups, name):
    """Draw the group and class membership of every sample as two colour bands.

    Parameters
    ----------
    classes : sequence
        Per-sample class labels; drawn as the upper band (y = 3.5).
    groups : sequence
        Per-sample group ids; drawn as the lower band (y = 0.5). Its length
        determines how many points are drawn in both bands.
    name : str
        Title of the plot. (The argument used to be accepted but ignored.)

    Notes
    -----
    Creates a new 10x5 figure and returns nothing. The colours come from the
    notebook-level ``cmap_data`` colormap, which must exist when this is called.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    n_samples = len(groups)
    # Lower band: groups, upper band: classes.
    for y_level, colour_values in ((0.5, groups), (3.5, classes)):
        ax.scatter(
            range(n_samples),
            [y_level] * n_samples,
            c=colour_values,
            marker="_",
            lw=50,
            cmap=cmap_data,
        )
    ax.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
    )
    ax.set_title(name)

In [None]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Plot, for each split of a cross-validation object, which samples are train/test.

    One row is drawn per split (test samples = 1, train samples = 0, unused =
    NaN), followed by a row coloured by class and, when ``group`` is given, a
    row coloured by group.

    Parameters
    ----------
    cv : object
        Cross-validator exposing ``split(X=, y=, groups=)``. ``group`` is only
        forwarded to ``split`` when the class name of ``cv`` contains "Group".
    X : array-like
        Samples; only ``len(X)`` is used for plotting.
    y : array-like
        Class label per sample, used to colour the "class" row.
    group : array-like or None
        Group id per sample. When None, the "group" row is omitted instead of
        being passed to ``scatter`` as ``c=None`` together with a colormap.
    ax : matplotlib Axes
        Axes to draw on.
    n_splits : int
        Number of splits, used for the y tick labels and limits.
    lw : int, default 10
        Marker line width.

    Returns
    -------
    The ``ax`` that was passed in.

    Notes
    -----
    Uses the notebook-level colormaps ``cmap_cv`` and ``cmap_data``. The x axis
    spans all ``len(X)`` samples; it used to be fixed to the first 100.
    """
    use_groups = "Group" in type(cv).__name__
    groups = group if use_groups else None
    n_samples = len(X)
    sample_idx = range(n_samples)

    # One row per CV split.
    n_drawn = 0  # stays 0 if the splitter yields nothing (previously an unbound `ii`)
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=groups)):
        # 1 = test, 0 = train, NaN = sample not used in this split
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(
            sample_idx,
            [ii + 0.5] * n_samples,
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )
        n_drawn = ii + 1

    # Extra rows below the splits: classes always, groups only when provided.
    extra_rows = [("class", y)]
    if group is not None:
        extra_rows.append(("group", group))
    for offset, (_, colour_values) in enumerate(extra_rows):
        ax.scatter(
            sample_idx,
            [n_drawn + offset + 0.5] * n_samples,
            c=colour_values,
            marker="_",
            lw=lw,
            cmap=cmap_data,
        )

    # Formatting
    n_rows = n_splits + len(extra_rows)
    yticklabels = list(range(n_splits)) + [label for label, _ in extra_rows]
    ax.set(
        yticks=np.arange(n_rows) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_rows + 0.2, -0.2],   # inverted so split 0 is at the top
        xlim=[0, n_samples],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch


In [None]:
# Notebook-level settings for the CV plots. `cmap_data` and `cmap_cv` are read as
# globals by visualize_groups() / plot_cv_indices(), so this cell must run before
# either function is called.
rng = np.random.RandomState(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 4

# Generate the class/group data
# n_points = 100
# X = rng.randn(100, 10)

# percentiles_classes = [0.1, 0.3, 0.6]
# y = np.hstack([[ii] * int(100 * perc) for ii, perc in enumerate(percentiles_classes)])

In [None]:
# Generate uneven groups

# group_prior = rng.dirichlet([2] * 10)
# group_prior.sum()
# group_prior

# groups = np.repeat(np.arange(10), rng.multinomial(100, group_prior))
# groups.shape
# groups

In [None]:
# A single group id (0) for every training row, i.e. no real grouping.
groups = np.zeros(train_X.shape[0], dtype=np.int_)
groups.shape

In [None]:
visualize_groups(train_y, groups, "no groups")

In [None]:
# Settings for the K-fold plot below: 5 folds and no group information.
# Note: this overwrites the `n_splits` and `groups` values set in earlier cells.
n_splits = 5
groups = None

In [None]:
# KFold was used here without being imported anywhere in the notebook.
from sklearn.model_selection import KFold

# Visualise the train/test membership of each sample for a plain K-fold split.
fig, ax = plt.subplots()
cv = KFold(n_splits)
plot_cv_indices(cv, train_X, train_y, groups, ax, n_splits)

# Input 

## Read Embedded Features CSV file

In [None]:
# Full paths of the baseline / SNNL train and test embedding CSVs.
# NOTE(review): OUTPUT_PATH, INPUT_FILE, BASE_runmode and SNNL_runmode are not
# defined in the visible cells -- they must be set before this cell can run.
# INPUT_FILE is expected to contain `{runmode}` and `{datatype}` placeholders.
BASE_TRAIN_INPUT = os.path.join(OUTPUT_PATH, INPUT_FILE.format(runmode = BASE_runmode ,datatype='train'))
BASE_TEST_INPUT  = os.path.join(OUTPUT_PATH, INPUT_FILE.format(runmode = BASE_runmode ,datatype='test'))
SNNL_TRAIN_INPUT = os.path.join(OUTPUT_PATH, INPUT_FILE.format(runmode = SNNL_runmode ,datatype='train'))
SNNL_TEST_INPUT  = os.path.join(OUTPUT_PATH, INPUT_FILE.format(runmode = SNNL_runmode ,datatype='test'))
# Echo the four paths (every bare expression is displayed, see the Setup cell).
BASE_TRAIN_INPUT
BASE_TEST_INPUT 
SNNL_TRAIN_INPUT
SNNL_TEST_INPUT 

In [None]:
# Load the SNNL test and train embedding files into DataFrames.
df_test = pd.read_csv(SNNL_TEST_INPUT )
df_train = pd.read_csv(SNNL_TRAIN_INPUT)

# df_train = pd.read_csv(BASE_TRAIN_INPUT)
# df_test = pd.read_csv(BASE_TEST_INPUT )
# df_train = pd.read_csv(TRAIN_INPUT, nrows = 100 )
# df_train = pd.read_csv(TRAIN_INPUT, usecols = ['Metadata_Batch'])

In [None]:
df_test.info()
df_test.shape
df_test.columns
df_test.iloc[:5,:13]

In [None]:
# df_train = pd.read_csv(TRAIN_INPUT, nrows = 100 )
# df_train = pd.read_csv(TRAIN_INPUT, usecols = ['Metadata_Batch'])

In [None]:
# Labels: column index 10, flattened to a 1-D uint8 array.
# NOTE(review): index 10 is 'Metadata_Permiation' if the CSV follows the
# METADATA_COLS order -- confirm against df_test.columns.
test_y = df_test.iloc[:,10:11].to_numpy().ravel().astype(np.uint8)
test_y.sum()
test_y.shape, type(test_y), test_y.dtype
# Features: every column from index 11 onward.
test_X = df_test.iloc[:,11:].to_numpy()
test_X.shape,type(test_X), test_X.dtype

In [None]:
df_train.shape
df_train.info()
df_train.iloc[:5,:16]

In [None]:
312000+34542

In [None]:
# Labels: column index 10, flattened to a 1-D uint8 array.
# NOTE(review): index 10 is 'Metadata_Permiation' if the CSV follows the
# METADATA_COLS order -- confirm against df_train.columns.
train_y = df_train.iloc[:,10:11].to_numpy().ravel().astype(np.uint8)
train_y.sum()
train_y.shape, type(train_y), train_y.dtype

# Features: every column from index 11 onward.
train_X = df_train.iloc[:,11:].to_numpy()
train_X.shape,type(train_X) ,train_X.dtype

## Standardize inputs

In [None]:
print(f"Train_X :  Min: {train_X.min():.4f}    Max: {train_X.max():.4f}   Mean: {train_X.mean():.4f}  Std: {train_X.std():.4f}")
print(f"Test_X  :  Min: {test_X.min():.4f}    Max: {test_X.max():.4f}    Mean: {test_X.mean():.4f}  Std: {test_X.std():.4f}")

In [None]:
# print(f"Train_X :  Min: {train_X.min():.4f}    Max: {train_X.max():.4f}   Mean: {train_X.mean():.4f}  Std: {train_X.std():.4f}")
# print(f"Test_X  :  Min: {test_X.min():.4f}    Max: {test_X.max():.4f}    Mean: {test_X.mean():.4f}  Std: {test_X.std():.4f}")

In [None]:
# StandardScaler was used here without being imported anywhere in the notebook.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same transform
# to the test features so no test statistics leak into the scaling.
# NOTE: train_X / test_X are overwritten; re-running this cell re-scales data
# that has already been scaled.
scaler = StandardScaler(copy = True)
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [None]:
print("After Standard Scaler Transformation")
print(f"Train_X :  Min: {train_X.min():.4f}    Max: {train_X.max():.4f}   Mean: {train_X.mean():.4f}  Std: {train_X.std():.4f}")
print(f"Test_X  :  Min: {test_X.min():.4f}    Max: {test_X.max():.4f}    Mean: {test_X.mean():.4f}  Std: {test_X.std():.4f}")


In [None]:
label_counts([("Training", train_y), ("Test", test_y)])