##  MTCV : Multi Task Cross Validation

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import scipy.sparse
import argparse
pd.options.display.width = 132
pd.options.display.max_rows = 100
import os

In [4]:
def load_sparse(dataroot = None , filename = None):
    """Loads sparse from Matrix market or Numpy .npy file."""
    if filename is None or dataroot is None:
        return None
    full_path = os.path.join(dataroot, filename)
    print(f" load sparse file from {full_path}")
    if filename.endswith('.mtx'):
        return scipy.io.mmread(full_path).tocsr()
    elif filename.endswith('.npy'):
        return np.load(full_path, allow_pickle=True).item().tocsr()
    elif filename.endswith('.npz'):
        return scipy.sparse.load_npz(full_path).tocsr()
    raise ValueError(f"Loading '{full_path}' failed. It must have a suffix '.mtx', '.npy', '.npz'.")


In [3]:
def mtcv_clustered(Y_in, clusters, nfolds=None, pfolds=None, seed=None):
    """
    splits rows of Y_in, based on clusters into either 
        a) equal Y_in into nfolds 
        b) or according to the ratios defined bY_in pfolds
        
        Y_in: is the compound x target matrix
    """

    assert clusters.shape[0] == Y_in.shape[0]
    print(f"* Y_in       : {type(Y_in)} \t {Y_in.shape}")
    print(f"* clusters: {type(clusters)}  \t {clusters.shape} ")
    print(f"           Min: {clusters.min()} Max:{clusters.max()}\n{clusters} \n")
    
    
#     cl_uniq = np.sort(clusters.unique())
    cl_uniq = clusters.unique()
    print(f"* cl_uniq: {type(cl_uniq)} {len(cl_uniq)}\n{cl_uniq}\n")

          
    ## assign a unqiue id to the cluster numbers  
    cl2id   = pd.Series(np.arange(cl_uniq.shape[0]), index=cl_uniq)
        
#     print(f"* cl2id: {type(cl2id)} {len(cl2id)}\n{cl2id}\n")
#     print(f" cl2id[0]: {cl2id[0]}")
#     print(f" cl2id[2321]: {cl2id[2321]}")
#     print(f" cl2id[13660]: {cl2id[13660]}")
#     print(f" cl2id[2321]: {cl2id[2321]}")
#     print(f" cl2id[5601]: \n{cl2id[5599:5603]} \n")
    
    
    cid     = cl2id[clusters]
    print(f"* cid:   {type(cid)} {len(cid)}\n{cid}\n")
#     print(f" cid.values:\n{cid.values}")
    
    ## creating cluster2compound matrix
    ##
    ## Number of rows: cid.values
    print(f"* Create C sparse matrix - indicating the clusters compounds are assigned to.... ")
    C    = scipy.sparse.csr_matrix(
            (np.ones(cid.shape[0], dtype=np.int8), (cid.values, np.arange(cid.shape[0])))
            )

    print(f"* Shape of sparse matrix: Rows(#clusters):{C.shape[0]} Columns(#compounds): {C.shape[1]} \n")
    ## Create 
    Y_bin      = Y_in.copy()
    print(f"* Y_bin     :   {type(Y_bin)} {Y_bin.shape}")
    print(f"* Y_bin.data:   {type(Y_bin.data)} {Y_bin.data.shape}")
    Y_bin_neg = Y_bin < 0
    Y_bin_pos = Y_bin > 0 
    print(f" Sum: {(Y_bin_neg).sum()}")
    print(f" Sum: {(Y_bin_pos).sum()}")
    print(f" Sum: {(Y_bin).sum()}")
    
    Y_bin.data = np.ones(Y_bin.data.shape[0], dtype=np.int8)
    print(f"* Y_bin.data:   {type(Y_bin.data)} {Y_bin.data.shape}\n")
    Y_bin_neg = Y_bin < 0
    Y_bin_pos = Y_bin > 0 
    print(f" Sum: {(Y_bin_neg).sum()}")
    print(f" Sum: {(Y_bin_pos).sum()}")
    print(f" Sum: {(Y_bin).sum()}")
    print() 
    
    ## compute number of compounds per cluster/fingerprint
    cl_counts = C.dot(Y_bin)
    print(f"* cl_counts     :   {type(cl_counts)} {cl_counts.shape}")    
    print(f"* cl_counts.data:   {type(cl_counts.data)} {cl_counts.data.shape}")
    print(f" {cl_counts}\n")
    
    cl_counts_neg = cl_counts < 0 
    print(f"* cl_counts_neg     :   {type(cl_counts_neg)}      {cl_counts_neg.shape}")    
    print(f"* cl_counts_neg.data:   {type(cl_counts_neg.data)} {cl_counts_neg.data.shape}")
    print(f" Sum: {cl_counts_neg.sum()}")
    print(f" {cl_counts_neg} \n")    

    ## get cluster folds
    print(f"* Call mtcv with nfolds: {nfolds}  \t pfolds: {pfolds} \t  seed: {seed}")
    folds = mtcv(mask=cl_counts, nfolds=nfolds, pfolds=pfolds, seed=seed)

    print(f"* cid:   {type(cid)} {len(cid)}\n{cid}\n")
    print(f'* folds:  {folds.shape} \n {folds[cid]} \n')
    return folds[cid]


In [4]:
def mtcv(mask, nfolds=None, pfolds=None, seed=None):
    """
    Return a vector of folds

    Args:
    mask     binary mask matrix of [compounds x targets]
    nfolds   number of folds (integer)
    pfolds   array, specifying fold sizes (probability)
             If specified nfolds is ignored.
             Must sum to 1.
    """
    if nfolds is None and pfolds is None:
        raise ValueError("nfolds or pfolds must be specified.")
    if pfolds is not None and np.abs(np.sum(pfolds) - 1.0) > 1e-5:
        raise ValueError("pfolds must sum to 1.0.")

    if pfolds is None:
        pfolds = np.ones(nfolds) / nfolds
    else:
        nfolds = len(pfolds)
    
    print(f" Call mtcv with nfolds: {nfolds}  \t pfolds: {pfolds} \t  seed: {seed}\n\n")
    
    target_sizes = np.array(mask.sum(0)).flatten()
    comp_sizes   = np.array(mask.sum(1)).flatten()

    print(f'* mask        : {mask.shape} \t\n {mask} \n')
    print(f'* Target sizes (mask sum along rows) : {target_sizes.shape} \t {target_sizes} \n')
    print(f'* comp sizes   (mask sum over columns): {comp_sizes.shape}   \t {comp_sizes} \n')
    if seed is not None:
        np.random.seed(seed)

    df = pd.DataFrame({"row": np.arange(mask.shape[0]), "size": comp_sizes})
    print(f'\n* df info')
    print(df.info())
    print(df)
    
    df1 = df.sample(frac=1)
    print(f'\n df1 info \n')
    print(df1.info())
    print(df1)
    
    df1.sort_values("size", inplace=True, ascending=False)
    print(f'\n df1 info (after sort by size descending)\n')
    print(df1.info())
    print(df1)
    
    fold_sizes = np.zeros((nfolds, mask.shape[1]), dtype=np.int8)
#     #max_fold_sizes = np.ceil(target_sizes / nfolds).astype(np.int)
    max_fold_sizes = np.ceil(np.outer(target_sizes, pfolds))

    print(f'* fold_sizes : {fold_sizes.shape} \n {fold_sizes} \n')
#     print(f'* non zero folds: {(fold_sizes>0).sum()}\n')
    print(f'* max fold sizes: {max_fold_sizes.shape} \n {max_fold_sizes}\n')
    
    ## output vector (initialized to -1)
    folds = np.zeros(mask.shape[0], dtype=np.int8) - 1
    print(f'* folds : {folds.shape} \n {folds} \n')
    
    farray = np.arange(nfolds)    
    
    ## Starting at the cluster with the highest number of positive actions
    ## 
    print(f'* mask        : {mask.shape} ')
    for i in range(mask.shape[0]):
        idx   = df1.index[i]
 

        #for j in np.random.permutation(nfolds):
        choices = np.random.choice(farray, size=nfolds, replace=False, p=pfolds)
#         print(f' i: {i}   idx (cluster_id): {idx}   folds[{idx}] = {folds[idx]}   choices : {choices}')
        
        
        for j in choices: ## np.random.choice(farray, size=nfolds, replace=False, p=pfolds):
            fit_criteria = mask[idx,:].dot((fold_sizes[j,:] + mask[idx,:] > max_fold_sizes[:,j]).transpose())
#             print(f'    fit_criteria = {fit_criteria}')
            if fit_criteria == 0:
                ## compound fits into fold j
                folds[idx] = j
                fold_sizes[j,:] += mask[idx,:]
                
#                 print(f'    folds[idx] <- j:  {folds[idx]}')
#                 print(f'    fold_sizes[j] <- mask[{idx}]: {fold_sizes[j]}')
                break
                
        if folds[idx] == -1:
            j = np.random.randint(0, nfolds)
#             print(f'    force to fold {j}')
            folds[idx] = j
            fold_sizes[j,:] += mask[idx,:]
        
#         if (i == 100):
#             break
            
    print(' finished loop - i is:', i)
    return folds





In [5]:
parser = argparse.ArgumentParser(description="Multi-task clustered crossvalidation")
parser.add_argument('--thresh', type=str, required=True)
parser.add_argument('--y', type=str, required=True)
parser.add_argument('--compounds', type=str, required=True)
parser.add_argument('--clusters', type=str, required=True)
parser.add_argument('--folding', type=str, default="output/folding.npy")
parser.add_argument('--nfolds', type=int, default=5)
VERSION = "chembl_29_dev"

in_args = f" --thresh    output/{VERSION}/{VERSION}_thresholds.csv"\
          f" --clusters  output/{VERSION}/{VERSION}_clustering.npy " \
          f" --compounds output/{VERSION}/{VERSION}_X_cmpds.csv" \
          f" --y         output/{VERSION}/{VERSION}_Y_all.npy"\
          f" --folding   output/{VERSION}/{VERSION}_folding.npy"\
          " --nfolds    5"
in_args = in_args.split()

conf = parser.parse_args(in_args)
conf_vars = vars(conf)
print()
print(f" input parm - thresh  :   value: {conf_vars['thresh']}")
print(f" input parm - clusters:   value: {conf_vars['clusters']}")
print(f" input parm - compounds:  value: {conf_vars['compounds']}")

print(f" input parm - nfolds:     value: {conf_vars['nfolds']}")
print(f" output parm - y :        value: {conf_vars['y']}")
print(f" output parm - folding:   value: {conf_vars['folding']}")


 input parm - thresh  :   value: output/chembl_29_dev/chembl_29_dev_thresholds.csv
 input parm - clusters:   value: output/chembl_29_dev/chembl_29_dev_clustering.npy
 input parm - compounds:  value: output/chembl_29_dev/chembl_29_dev_X_cmpds.csv
 input parm - nfolds:     value: 5
 output parm - y :        value: output/chembl_29_dev/chembl_29_dev_Y_all.npy
 output parm - folding:   value: output/chembl_29_dev/chembl_29_dev_folding.npy


### Load compound id / target id /  threshold file `output/chembl_29_thresh.csv`

In [6]:
df = pd.read_csv(conf.thresh)
print(df)
df.info()
df.nunique()

             target_id        cmpd_id  task_group  variable  value
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0
2        CHEMBL1075097  CHEMBL1812662           1       5.5   -1.0
3        CHEMBL1075097  CHEMBL2326084           1       5.5    1.0
4        CHEMBL1075097  CHEMBL2326085           1       5.5    1.0
...                ...            ...         ...       ...    ...
2615830     CHEMBL6175  CHEMBL4646564         836       8.5   -1.0
2615831     CHEMBL6175  CHEMBL4648732         836       8.5   -1.0
2615832     CHEMBL6175  CHEMBL4649004         836       8.5   -1.0
2615833     CHEMBL6175   CHEMBL578512         836       8.5   -1.0
2615834     CHEMBL6175    CHEMBL90852         836       8.5   -1.0

[2615835 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2615835 entries, 0 to 2615834
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   t

target_id        888
cmpd_id       423809
task_group        10
variable           4
value              2
dtype: int64

In [7]:
print(f"Number of rows              : {len(df):7d}")
print(f"Number of unique targets    : {len(df['target_id'].unique()):7d}   ")
print(f"Number of unique compounds  : {len(df['cmpd_id'].unique()):7d}   ")
print(f"Number of unique thresholds : {len(df['variable'].unique()):7d}")
print(f"Number of unique task groups: {len(df['task_group'].unique()):7d}")
print(f'Number of lables (targets x thresholds) : {len(df["target_id"].unique()) * len(df["variable"].unique()):7d}')

df["task_group"].value_counts(normalize = False)

Number of rows              : 2615835
Number of unique targets    :     888   
Number of unique compounds  :  423809   
Number of unique thresholds :       4
Number of unique task groups:      10
Number of lables (targets x thresholds) :    3552


6       702358
11      336268
1028    323550
1       310129
0       270580
10      155195
1005    149430
836     130131
1031    126561
643     111633
Name: task_group, dtype: int64

In [8]:
disp = df[df['target_id'] == 'CHEMBL1938210']
disp[-30:]

Unnamed: 0,target_id,cmpd_id,task_group,variable,value
2040646,CHEMBL1938210,CHEMBL3621882,836,8.5,-1.0
2040647,CHEMBL1938210,CHEMBL3774537,836,8.5,-1.0
2040648,CHEMBL1938210,CHEMBL3774598,836,8.5,-1.0
2040649,CHEMBL1938210,CHEMBL3774692,836,8.5,-1.0
2040650,CHEMBL1938210,CHEMBL3774747,836,8.5,-1.0
2040651,CHEMBL1938210,CHEMBL3775262,836,8.5,-1.0
2040652,CHEMBL1938210,CHEMBL3775272,836,8.5,-1.0
2040653,CHEMBL1938210,CHEMBL3775380,836,8.5,-1.0
2040654,CHEMBL1938210,CHEMBL3775668,836,8.5,-1.0
2040655,CHEMBL1938210,CHEMBL3775867,836,8.5,-1.0


### Load compound list `output/chembl_29_X_cmpds.csv`

In [9]:
cmpd_list = pd.read_csv(conf.compounds, header=None)
cmpd_list.info()
cmpd_list.nunique()
cmpd_list.drop(columns=0, inplace = True)

print(f"length of cmpd_list:  {len(cmpd_list)}")
print(cmpd_list)
print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423737 entries, 0 to 423736
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       423737 non-null  int64 
 1   1       423737 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.5+ MB
length of cmpd_list:  423737
                    1
0        CHEMBL405398
1        CHEMBL403325
2        CHEMBL501943
3        CHEMBL501094
4        CHEMBL505943
...               ...
423732  CHEMBL4297644
423733  CHEMBL4297666
423734  CHEMBL4297674
423735  CHEMBL4298138
423736  CHEMBL4298140

[423737 rows x 1 columns]



In [10]:
cmpd_list["cid"] = range(len(cmpd_list))
print(f"Size of cmpd_list: {len(cmpd_list)}") 
print(cmpd_list)

Size of cmpd_list: 423737
                    1     cid
0        CHEMBL405398       0
1        CHEMBL403325       1
2        CHEMBL501943       2
3        CHEMBL501094       3
4        CHEMBL505943       4
...               ...     ...
423732  CHEMBL4297644  423732
423733  CHEMBL4297666  423733
423734  CHEMBL4297674  423734
423735  CHEMBL4298138  423735
423736  CHEMBL4298140  423736

[423737 rows x 2 columns]


### Load the clustering info `output/clustering.npy`

In [11]:
#Load the clustering assignment
clusters = np.load(conf.clusters, allow_pickle=True)
print(len(clusters), clusters)

423737 [11919 11919 22648 ... 10256 26021 11919]


In [12]:
assert len(clusters) == len(cmpd_list), "Number of compound must agree with the size of clustering vector"

### Check length of cluster list and compute `Ncmpd` and `Ncol`

In [13]:
print()
Ncmpd = len(set(cmpd_list['cid']))
Ncol  = len(df["target_id"].unique()) * len(df["variable"].unique())


print(f" # of compounds Ncmpd: {Ncmpd}    ")
print(f" # of columns   Ncol : (# unique target_ids x # unique thresholds ) = ({len(df['target_id'].unique())} x {len(df['variable'].unique())}): {Ncol}")


 # of compounds Ncmpd: 423737    
 # of columns   Ncol : (# unique target_ids x # unique thresholds ) = (888 x 4): 3552


### Create dataframe of unique target_ids 

In [14]:
target_list = pd.DataFrame(df["target_id"].unique())
target_list["tid"] = range(len(target_list))
print(target_list)

                 0  tid
0    CHEMBL1075097    0
1    CHEMBL1075104    1
2    CHEMBL1075138    2
3    CHEMBL1075145    3
4    CHEMBL1075165    4
..             ...  ...
883     CHEMBL6154  883
884     CHEMBL6164  884
885     CHEMBL6166  885
886     CHEMBL6167  886
887     CHEMBL6175  887

[888 rows x 2 columns]


### create data frame of unique thresholds

In [15]:
variable_list = pd.DataFrame(df["variable"].unique())
variable_list["vid"] = range(len(variable_list))
print(variable_list)

Nvar = len(variable_list)
print(f" len(variable_list) : {Nvar}")

     0  vid
0  5.5    0
1  6.5    1
2  7.5    2
3  8.5    3
 len(variable_list) : 4


### `tmp1` : Add unique compound identifier to tmp1 

In [16]:
print(df.info())
print()
print(cmpd_list.info())
print()
print(df)
print()
print(cmpd_list)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2615835 entries, 0 to 2615834
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
dtypes: float64(2), int64(1), object(2)
memory usage: 99.8+ MB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423737 entries, 0 to 423736
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   1       423737 non-null  object
 1   cid     423737 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.5+ MB
None

             target_id        cmpd_id  task_group  variable  value
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0
2        CHEMBL1075097  CHEMBL1812662           1       5.5   -1.0
3        CHEMBL1075097  CHEMBL2326084           1       5.5   

In [17]:
tmp1 = pd.merge(df, cmpd_list, left_on="cmpd_id", right_on=1)

In [18]:
print(tmp1.info())
print(tmp1.nunique())
print(tmp1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615084 entries, 0 to 2615083
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 159.6+ MB
None
target_id        888
cmpd_id       423737
task_group        10
variable           4
value              2
1             423737
cid           423737
dtype: int64
             target_id        cmpd_id  task_group  variable  value              1     cid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  CHEMBL1234777  173927
1        CHEMBL1075097  CHEMBL1234777           1       6.5   -1.0  CHEMBL1234777  173927
2        CHEMBL1075097  CHEMBL1234777           1       7.5   -1.0  CHEMBL1234777  173927
3        CHEMBL1075097  CHEMBL1234777           1       8.5   -1.0  CHEMBL

In [18]:
# cid_value_counts = tmp2['cid'].value_counts()
# print(f'cid value counts : {cid_value_counts.shape} \n')
# print(cid_value_counts)
# print(f"\nNumber of unqiue cids:  {tmp2['cid'].nunique()}")

In [19]:
tmp1.to_csv('Step40/1_merge_with_cmpd_id.csv')

### `tmp2` : add unique threshold identifier to tmp2

In [20]:
# tmp1 = pd.read_csv('Step40/1_merge_with_cmpd_id.csv')

In [23]:
variable_list.info()
print()
print(variable_list)
print('=======================================')
tmp1.info()
print()
print(tmp1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4 non-null      float64
 1   vid     4 non-null      int64  
dtypes: float64(1), int64(1)
memory usage: 192.0 bytes

     0  vid
0  5.5    0
1  6.5    1
2  7.5    2
3  8.5    3
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615084 entries, 0 to 2615083
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 159.6+ MB

             target_id        cmpd_id  task_group  variable  value              1     cid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  CHEMBL1234777  173927
1        CHEMBL1075097  CHEMBL1234777           1     

In [24]:
tmp2 = pd.merge(tmp1, variable_list, left_on="variable", right_on=0) 

In [25]:
tmp2.info()
print()
print(tmp2.nunique())
print()
print(tmp2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615084 entries, 0 to 2615083
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
 7   0           float64
 8   vid         int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 199.5+ MB

target_id        888
cmpd_id       423737
task_group        10
variable           4
value              2
1             423737
cid           423737
0                  4
vid                4
dtype: int64

             target_id        cmpd_id  task_group  variable  value              1     cid    0  vid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  CHEMBL1234777  173927  5.5    0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0  CHEMBL1812661  150148  5.5    0
2        CHEMBL1075097  CHEMBL1812662        

In [26]:
tmp2.to_csv('Step40/2_merge_with_variable_list.csv')

### `join` : Add unique target identifier to dataframe

In [27]:
# tmp2 = pd.read_csv('Step40/2_merge_with_variable_list.csv')

In [29]:
print(target_list.info())
print()
print(target_list)
print("===============================================\n")
print(tmp2.info())
print(tmp2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       888 non-null    object
 1   tid     888 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.0+ KB
None

                 0  tid
0    CHEMBL1075097    0
1    CHEMBL1075104    1
2    CHEMBL1075138    2
3    CHEMBL1075145    3
4    CHEMBL1075165    4
..             ...  ...
883     CHEMBL6154  883
884     CHEMBL6164  884
885     CHEMBL6166  885
886     CHEMBL6167  886
887     CHEMBL6175  887

[888 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615084 entries, 0 to 2615083
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
 7   0           float64
 8   vid         int6

In [30]:
join = pd.merge(tmp2,target_list, how="inner", left_on="target_id", right_on=0, suffixes = ['_left', '_right'])
join.info()
print(join)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615084 entries, 0 to 2615083
Data columns (total 11 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
 7   0_left      float64
 8   vid         int64  
 9   0_right     object 
 10  tid         int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 239.4+ MB
             target_id        cmpd_id  task_group  variable  value              1     cid  0_left  vid        0_right  tid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  CHEMBL1234777  173927     5.5    0  CHEMBL1075097    0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0  CHEMBL1812661  150148     5.5    0  CHEMBL1075097    0
2        CHEMBL1075097  CHEMBL1812662           1       5.5   -1.0  CHEMBL1812662  143524     5.5    0  CHEMBL1075097    0
3     

In [31]:
join.drop(columns=[1, '0_left', '0_right'],inplace=True)

In [32]:
join.info()
print()
print(join.nunique())
print()
print(join)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615084 entries, 0 to 2615083
Data columns (total 8 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   cid         int64  
 6   vid         int64  
 7   tid         int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 179.6+ MB

target_id        888
cmpd_id       423737
task_group        10
variable           4
value              2
cid           423737
vid                4
tid              888
dtype: int64

             target_id        cmpd_id  task_group  variable  value     cid  vid  tid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  173927    0    0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0  150148    0    0
2        CHEMBL1075097  CHEMBL1812662           1       5.5   -1.0  143524    0    0
3        CHEMBL1075097  CHEMBL2326084           1  

In [33]:
print(join['task_group'].value_counts())
ttl_counts = 0
for i in join['task_group'].value_counts():
    ttl_counts += i
print (f"total   {ttl_counts}")

6       702258
11      336091
1028    323448
1       309909
0       270448
10      155187
1005    149426
836     130131
1031    126553
643     111633
Name: task_group, dtype: int64
total   2615084


In [34]:
join.to_csv('Step40/2_merge_with_target_list.csv')

## `join` data analysis

In [35]:
# join = pd.read_csv('Step40/2_merge_with_target_list.csv')

Note: 
    The following targets are connected to 2 different task groups:
    
    CHEMBL3521       2
    CHEMBL4105860    2
    CHEMBL4958       2
    CHEMBL4081       2

In [36]:
# join.groupby(by=['target_id']).nunique('task_group').sort_values(by='task_group')
join.groupby(by=['target_id']).nunique('task_group').sort_values(by='task_group')

Unnamed: 0_level_0,cmpd_id,task_group,variable,value,cid,vid,tid
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CHEMBL1075097,120,1,4,2,120,4,1
CHEMBL3988583,1094,1,4,2,1094,4,1
CHEMBL3988599,321,1,4,2,321,4,1
CHEMBL3989381,114,1,4,1,114,4,1
CHEMBL3991,211,1,4,2,211,4,1
...,...,...,...,...,...,...,...
CHEMBL245,887,1,4,2,887,4,1
CHEMBL3521,214,2,4,2,214,4,1
CHEMBL4105860,204,2,4,1,204,4,1
CHEMBL4958,181,2,4,2,181,4,1


In [37]:
join.groupby('target_id')['task_group'].nunique().sort_values().nlargest(10)

target_id
CHEMBL3521       2
CHEMBL4105860    2
CHEMBL4958       2
CHEMBL4081       2
CHEMBL1075097    1
CHEMBL3988583    1
CHEMBL3988599    1
CHEMBL3989381    1
CHEMBL3991       1
CHEMBL3991501    1
Name: task_group, dtype: int64

In [38]:
# join.groupby(by=['target_id']).nunique('task_group').sort_values(by='task_group')
# join.groupby(by=['tid','cid']).nunique('task_group').sort_values(by=['task_group'])
tst = join.drop(columns=['cid', 'tid', 'vid'])
tst['counter'] = 1
tst

Unnamed: 0,target_id,cmpd_id,task_group,variable,value,counter
0,CHEMBL1075097,CHEMBL1234777,1,5.5,1.0,1
1,CHEMBL1075097,CHEMBL1812661,1,5.5,1.0,1
2,CHEMBL1075097,CHEMBL1812662,1,5.5,-1.0,1
3,CHEMBL1075097,CHEMBL2326084,1,5.5,1.0,1
4,CHEMBL1075097,CHEMBL2326085,1,5.5,1.0,1
...,...,...,...,...,...,...
2615079,CHEMBL5966,CHEMBL526861,0,8.5,-1.0,1
2615080,CHEMBL5966,CHEMBL527054,0,8.5,-1.0,1
2615081,CHEMBL5966,CHEMBL527072,0,8.5,-1.0,1
2615082,CHEMBL5966,CHEMBL527080,0,8.5,-1.0,1


In [39]:
dups =join.drop_duplicates(subset=['target_id', 'task_group'])
print(dups)
dups[['target_id']].value_counts()

             target_id        cmpd_id  task_group  variable  value     cid  vid  tid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  173927    0    0
480         CHEMBL1808  CHEMBL4476621        1028       5.5    1.0   82782    0   74
3219        CHEMBL1951  CHEMBL4476621          10       5.5    1.0   82782    0  133
14968        CHEMBL204  CHEMBL4476621        1028       5.5    1.0   82782    0  172
26765        CHEMBL205  CHEMBL4476621           1       5.5    1.0   82782    0  177
...                ...            ...         ...       ...    ...     ...  ...  ...
2611004     CHEMBL5413  CHEMBL1163732          11       5.5    1.0  159280    0  804
2611408     CHEMBL5480  CHEMBL1795350        1005       5.5   -1.0  105276    0  812
2612828     CHEMBL5649  CHEMBL3448261          11       5.5   -1.0  270814    0  831
2613472     CHEMBL5689  CHEMBL2103855          10       5.5    1.0  228385    0  835
2614252     CHEMBL5966  CHEMBL1089944           0       5.5   -1.

target_id    
CHEMBL4105860    2
CHEMBL4081       2
CHEMBL3521       2
CHEMBL4958       2
CHEMBL1075097    1
                ..
CHEMBL2447       1
CHEMBL245        1
CHEMBL2473       1
CHEMBL2474       1
CHEMBL6175       1
Length: 888, dtype: int64

In [40]:
tst1 = tst.groupby(by=['target_id','cmpd_id', 'variable']).count() ##.sort_values(by=['task_group'])
tst1.reset_index(inplace=True)
tst1.info()
tst1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2612254 entries, 0 to 2612253
Data columns (total 6 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   variable    float64
 3   task_group  int64  
 4   value       int64  
 5   counter     int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 119.6+ MB


Unnamed: 0,target_id,cmpd_id,variable,task_group,value,counter
0,CHEMBL1075097,CHEMBL1234777,5.5,1,1,1
1,CHEMBL1075097,CHEMBL1234777,6.5,1,1,1
2,CHEMBL1075097,CHEMBL1234777,7.5,1,1,1
3,CHEMBL1075097,CHEMBL1234777,8.5,1,1,1
4,CHEMBL1075097,CHEMBL1812661,5.5,1,1,1


In [45]:
tst1['counter'].sum()

2615084

In [46]:
# tst[tst['task_group']!= 1]
tst_lt_0  = tst1[tst1['task_group'] <  0]
tst_gt_0  = tst1[tst1['task_group'] >  0]
tst_0     = tst1[tst1['task_group'] == 0]
tst_1     = tst1[tst1['task_group'] == 1]
tst_2     = tst1[tst1['task_group'] == 2]
tst_3     = tst1[tst1['task_group'] == 3]
print('tst shape: ', tst.shape)
print(' tst[task_group] < 0 ', tst_lt_0.shape[0])
print(' tst[task_group] > 0 ', tst_gt_0.shape[0])
print(' tst[task_group] = 0 ', tst_0.shape[0])
print(' tst[task_group] = 1 ', tst_1.shape[0])
print(' tst[task_group] = 2 ', tst_2.shape[0])
print(' tst[task_group] = 3 ', tst_3.shape[0])


tst shape:  (2615084, 6)
 tst[task_group] < 0  (0, 6)
 tst[task_group] > 0  (2612254, 6)
 tst[task_group] = 0  (0, 6)
 tst[task_group] = 1  (2609424, 6)
 tst[task_group] = 2  (2830, 6)
 tst[task_group] = 3  (0, 6)


In [47]:
print(f" Join Shape   : {join.shape}")    
print()
print(f" #(join < 0) : {tst_lt_0.shape[0]}")
print(f" #(join > 0) : {tst_gt_0.shape[0]}")


 Join Shape   : (2615084, 8)

 #(join < 0) : 0
 #(join > 0) : 2612254


In [48]:
disp = join[join['target_id'] == 'CHEMBL1938210']
disp

Unnamed: 0,target_id,cmpd_id,task_group,variable,value,cid,vid,tid
2462632,CHEMBL1938210,CHEMBL51979,836,5.5,1.0,7715,0,122
2462633,CHEMBL1938210,CHEMBL1229308,836,5.5,1.0,163717,0,122
2462634,CHEMBL1938210,CHEMBL1230640,836,5.5,-1.0,171856,0,122
2462635,CHEMBL1938210,CHEMBL1255743,836,5.5,-1.0,180434,0,122
2462636,CHEMBL1938210,CHEMBL1614745,836,5.5,-1.0,172828,0,122
...,...,...,...,...,...,...,...,...
2463026,CHEMBL1938210,CHEMBL4542928,836,8.5,-1.0,122939,3,122
2463027,CHEMBL1938210,CHEMBL4585876,836,8.5,-1.0,142470,3,122
2463028,CHEMBL1938210,CHEMBL4592908,836,8.5,-1.0,144809,3,122
2463029,CHEMBL1938210,CHEMBL90852,836,8.5,-1.0,14505,3,122


In [124]:
disp = join[join['target_id'] == 'CHEMBL1938210']
disp

Unnamed: 0,target_id,cmpd_id,task_group,variable,value,cid,vid,tid
2545436,CHEMBL1938210,CHEMBL51979,836,5.5,1.0,7715,0,122
2545437,CHEMBL1938210,CHEMBL51979,836,5.5,1.0,7715,0,122
2545438,CHEMBL1938210,CHEMBL1229308,836,5.5,1.0,163717,0,122
2545439,CHEMBL1938210,CHEMBL1229308,836,5.5,1.0,163717,0,122
2545440,CHEMBL1938210,CHEMBL1230640,836,5.5,-1.0,171856,0,122
...,...,...,...,...,...,...,...,...
2546229,CHEMBL1938210,CHEMBL4592908,836,8.5,-1.0,144809,3,122
2546230,CHEMBL1938210,CHEMBL90852,836,8.5,-1.0,14505,3,122
2546231,CHEMBL1938210,CHEMBL90852,836,8.5,-1.0,14505,3,122
2546232,CHEMBL1938210,CHEMBL4646431,836,8.5,-1.0,145467,3,122


In [50]:
tst2 = tst1[tst1['value'] == +2]
tst2

Unnamed: 0,target_id,cmpd_id,variable,task_group,value,counter
1684987,CHEMBL3521,CHEMBL1077438,5.5,2,2,2
1684988,CHEMBL3521,CHEMBL1077438,6.5,2,2,2
1684989,CHEMBL3521,CHEMBL1077438,7.5,2,2,2
1684990,CHEMBL3521,CHEMBL1077438,8.5,2,2,2
1684991,CHEMBL3521,CHEMBL108721,5.5,2,2,2
...,...,...,...,...,...,...
2390234,CHEMBL4958,CHEMBL69590,8.5,2,2,2
2390235,CHEMBL4958,CHEMBL77030,5.5,2,2,2
2390236,CHEMBL4958,CHEMBL77030,6.5,2,2,2
2390237,CHEMBL4958,CHEMBL77030,7.5,2,2,2


In [51]:
# join.groupby(by=['target_id']).nunique('task_group').sort_values(by='task_group')
# join.groupby(by=['tid','cid']).nunique('task_group').sort_values(by=['task_group'])
# tst = join.groupby(by=['target_id','cmpd_id', 'variable']).sum() ##.sort_values(by=['task_group'])

# tst_2.info()
# tst_2.head()

print('\n tgt_id       entries')
print(tst_2['target_id'].value_counts())
print(len(tst_2['target_id'].unique()))


 tgt_id       entries
CHEMBL3521       856
CHEMBL4105860    816
CHEMBL4958       724
CHEMBL4081       434
Name: target_id, dtype: int64
4


In [52]:
# join.groupby(by=['target_id']).nunique('task_group').sort_values(by='task_group')
# join.groupby(by=['tid','cid']).nunique('task_group').sort_values(by=['task_group'])
# tst = join.groupby(by=['target_id','cmpd_id', 'variable']).sum() ##.sort_values(by=['task_group'])

tst_2.info()
# tst_2.head()

print('\n tgt_id       entries')
print(tst_2['target_id'].value_counts())
print(len(tst_2['target_id'].unique()))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76889 entries, 134185 to 2539764
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   target_id   76889 non-null  object 
 1   cmpd_id     76889 non-null  object 
 2   variable    76889 non-null  float64
 3   task_group  76889 non-null  int64  
 4   value       76889 non-null  int64  
 5   counter     76889 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 4.1+ MB

 tgt_id       entries
CHEMBL2971       22864
CHEMBL2835       15859
CHEMBL2148       14684
CHEMBL3553        5449
CHEMBL5747        1988
CHEMBL2424504     1917
CHEMBL1795177     1872
CHEMBL2345        1595
CHEMBL3784        1185
CHEMBL3906        1176
CHEMBL3521         856
CHEMBL5500         847
CHEMBL4105860      816
CHEMBL3108638      732
CHEMBL4958         724
CHEMBL2163176      638
CHEMBL3588739      516
CHEMBL2424510      495
CHEMBL4237         494
CHEMBL2176772      460
CHEMBL2553  

In [60]:
# tst2 = tst[tst['value'] == +2]
# print(tst2.info())
# print(tst2.head())

# print(tst2['target_id'].nunique())
# print(tst2['target_id'].unique())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76889 entries, 134185 to 2539764
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   target_id   76889 non-null  object 
 1   cmpd_id     76889 non-null  object 
 2   variable    76889 non-null  float64
 3   task_group  76889 non-null  int64  
 4   value       76889 non-null  int64  
 5   counter     76889 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 4.1+ MB
None
            target_id        cmpd_id  variable  task_group  value  counter
134185  CHEMBL1795177  CHEMBL1829304       5.5           2      2        2
134186  CHEMBL1795177  CHEMBL1829304       6.5           2      2        2
134187  CHEMBL1795177  CHEMBL1829304       7.5           2      2        2
134188  CHEMBL1795177  CHEMBL1829304       8.5           2      2        2
134189  CHEMBL1795177  CHEMBL2204995       5.5           2      2        2


In [52]:
tst22 = tst2[['target_id', 'cmpd_id']].groupby(by=['target_id']).count()
tst22

Unnamed: 0_level_0,cmpd_id
target_id,Unnamed: 1_level_1
CHEMBL3521,856
CHEMBL4081,434
CHEMBL4105860,816
CHEMBL4958,724


In [74]:
tst22 = tst2[['target_id', 'cmpd_id']].groupby(by=['target_id']).count()
tst22

Unnamed: 0_level_0,cmpd_id
target_id,Unnamed: 1_level_1
CHEMBL1795177,1872
CHEMBL1938210,399
CHEMBL1938212,434
CHEMBL2148,14684
CHEMBL2163176,638
CHEMBL2176772,460
CHEMBL2345,1595
CHEMBL2424504,1917
CHEMBL2424510,495
CHEMBL2553,455


In [53]:
tst3 = tst1[tst1['value'] == +3]
print(tst3.info())
print(tst3.head())

print(tst3['target_id'].nunique())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   target_id   0 non-null      object 
 1   cmpd_id     0 non-null      object 
 2   variable    0 non-null      float64
 3   task_group  0 non-null      int64  
 4   value       0 non-null      int64  
 5   counter     0 non-null      int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 0.0+ bytes
None
Empty DataFrame
Columns: [target_id, cmpd_id, variable, task_group, value, counter]
Index: []
0


In [113]:
print(tst3[['target_id','cmpd_id', 'task_group']].value_counts())

target_id      cmpd_id        task_group
CHEMBL3132741  CHEMBL1346130  3             4
CHEMBL6175     CHEMBL3775262  3             4
               CHEMBL3775431  3             4
               CHEMBL3775380  3             4
               CHEMBL3775366  3             4
                                           ..
               CHEMBL3786963  3             1
               CHEMBL3786862  3             1
               CHEMBL3786644  3             1
               CHEMBL3785491  3             1
CHEMBL5896     CHEMBL3786644  3             1
Length: 1206, dtype: int64


In [106]:
tst32 = tst[['target_id', 'task_group']].groupby(by=['target_id', 'task_group']).count()  
tst32

target_id,task_group
CHEMBL1075097,1
CHEMBL1075104,1
CHEMBL1075138,1
CHEMBL1075145,1
CHEMBL1075165,1
...,...
CHEMBL6154,1
CHEMBL6164,1
CHEMBL6166,1
CHEMBL6167,1


In [101]:
 tst3[['target_id', 'task_group']] 

Unnamed: 0,target_id,task_group
1440117,CHEMBL3132741,3
1440118,CHEMBL3132741,3
1440119,CHEMBL3132741,3
1440120,CHEMBL3132741,3
1440121,CHEMBL3132741,3


In [10]:
# print(tst2.nunique())
# print(tst2['target_id'].value_counts())
# print(tst2)

In [11]:
# tst22 = tst2.groupby(by=['target_id']).sum()
# tst22

In [12]:
# tst2['target_id'].value_counts()

In [13]:
# tst2_sum = tst2.groupby(by=['target_id']).count().sum(axis=0)
# tst2_sum

In [14]:
# tst3 = tst[tst['value'] == +3]
# tst3.info()
# tst3.head()

# print(tst3.nunique())
# tst3

In [15]:
# tst2.groupby(by=['target_id']).count()

## Create `Ymask` sparse matrix

    #-----------------------------------------------------------------------------------------------------
    # Build COOrdiante formatted sparse matrices for Ymask and  Y
    #
    # Ymask =  coo_matrix((data, (i, j)), [shape=(M, N)])
    # to construct from three arrays:
    # data[:] the entries of the matrix, in any order
    # 
    # i[:] the row indices of the matrix entries
    # j[:] the column indices of the matrix entries
    # 
    # Where A[i[k], j[k]] = data[k]. When shape is not specified, it is inferred from the index arrays
    #
    # Number of rows    : Ncmpd - Unique compound ids 
    # Number of columns : Ncol  - Number of unique target ids x number of unqiue thresholds 
    #
    #
    #-----------------------------------------------------------------------------------------------------
    
    Dims:   `Ncmpd x NCol`

In [54]:
print()
Ncmpd = len(set(cmpd_list['cid']))
Ncol  = len(df["target_id"].unique()) * len(df["variable"].unique())

print(f" # of compounds Ncmpd: {Ncmpd}    ")
print(f" # of columns   Ncol : (# unique target_ids x # unique thresholds ) = ({len(df['target_id'].unique())} x {len(df['variable'].unique())}): {Ncol}")

I = join["cid"].to_numpy()                         ## Rows: Compund Ids
J = (Nvar * join["tid"] + join["vid"]).to_numpy()  ## Columns: (4 x tid) + vid
V = np.ones(len(I))

Ymask = scipy.sparse.coo_matrix((V,(I,J)),(Ncmpd,Ncol))


 # of compounds Ncmpd: 423737    
 # of columns   Ncol : (# unique target_ids x # unique thresholds ) = (888 x 4): 3552


In [55]:
print(f' I (Row coordiantes) Len: {len(I)} - {I}') ## Rows 
print(f' J (Col coordiantes) Len: {len(J)} - {J}') ## Columns 
print(f' V (Data)            Len: {len(V)} - {V} min: {min(V)} max: {max(V)}') # Data

 I (Row coordiantes) Len: 2615084 - [173927 150148 143524 ...    211 118961    159]
 J (Col coordiantes) Len: 2615084 - [   0    0    0 ... 3467 3467 3467]
 V (Data)            Len: 2615084 - [1. 1. 1. ... 1. 1. 1.] min: 1.0 max: 1.0


In [56]:
Ymask_neg = (Ymask < 0)

print(f" Ymask        : {type(Ymask)}      Shape: {Ymask.shape}")    
print(f" Ymask.data   : {type(Ymask.data)} Shape: {Ymask.data.shape}")
print()
print(f" #(Ymask < 0) : {(Ymask<0).sum()}")
print(f" #(Ymask > 0) : {(Ymask>0).sum()}")
print(f" #(Ymask = 1) : {(Ymask==1).sum()}")
print(f" #(Ymask = 2) : {(Ymask==2).sum()}")
print(f" #(Ymask = 3) : {(Ymask==3).sum()}")
print(f" #(Ymask = 4) : {(Ymask==4).sum()}")
print(f" #(Ymask = 5) : {(Ymask==5).sum()}")
print(f" #(Ymask > 5) : {(Ymask>5).sum()}")
print(f" Ymask.sum()  : {Ymask.sum()}")
# print(Ymask)

 Ymask        : <class 'scipy.sparse.coo.coo_matrix'>      Shape: (423737, 3552)
 Ymask.data   : <class 'numpy.ndarray'> Shape: (2615084,)

 #(Ymask < 0) : 0
 #(Ymask > 0) : 2612254
 #(Ymask = 1) : 2609424
 #(Ymask = 2) : 2830
 #(Ymask = 3) : 0
 #(Ymask = 4) : 0
 #(Ymask = 5) : 0
 #(Ymask > 5) : 0
 Ymask.sum()  : 2615084.0


##  Create Y sparse matrix 

In [57]:
y_data = join["value"].to_numpy()

print(f' I (Row coordiantes) Len: {len(I)} - {I}') ## Rows 
print(f' J (Col coordiantes) Len: {len(J)} - {J}') ## Columns 
print(f' V (Data)            Len: {len(y_data)} - {y_data} min: {min(y_data)} max: {max(y_data)}') # Data

 I (Row coordiantes) Len: 2615084 - [173927 150148 143524 ...    211 118961    159]
 J (Col coordiantes) Len: 2615084 - [   0    0    0 ... 3467 3467 3467]
 V (Data)            Len: 2615084 - [ 1.  1. -1. ... -1. -1. -1.] min: -1.0 max: 1.0


In [58]:
Y_coo = scipy.sparse.coo_matrix((y_data,(I,J)),(Ncmpd, Ncol))

In [59]:
Y = scipy.sparse.csr_matrix((y_data,(I,J)),(Ncmpd, Ncol))

In [62]:
print(repr(Y))
print(f" #  (Y < 0 ) : {(Y<0).sum()}")
print(f" #  (Y != 0) : {(Y!=0).sum()}")
print(f" #  (Y > 0)  : {(Y>0).sum()}")
print()
print(f" #  (Y <= -3): {(Y <=-3).sum()}")
print(f" #  (Y = -2) : {(Y ==-2).sum()}")
print(f" #  (Y = -1) : {(Y ==-1).sum()}")
print(f" #  (Y = 1)  : {(Y == 1).sum()}")
print(f" #  (Y = 2)  : {(Y == 2).sum()}")
print(f" #  (Y >= 3) : {(Y >= 3).sum()}")
print(f" Y.sum()     : {Y.sum()}")

<423737x3552 sparse matrix of type '<class 'numpy.float64'>'
	with 2612254 stored elements in Compressed Sparse Row format>
 #  (Y < 0 ) : 1679110
 #  (Y != 0) : 2612254
 #  (Y > 0)  : 933144

 #  (Y <= -3): 0
 #  (Y = -2) : 0
 #  (Y = -1) : 1679110
 #  (Y = 1)  : 933144
 #  (Y = 2)  : 0
 #  (Y >= 3) : 0
 Y.sum()     : -745966.0


In [61]:
if (Y >= +2).sum():
    print(" ============ MODIFY > +1 ===========")
    Y[Y >= +2] = +1
#     modify = True

if (Y <= -2).sum():
    print(" ============ MODIFY < -1 ===========")
    Y[Y <= -1] = -1
#     modify = True# print(Y)



## Write Y file

In [63]:
conf.y
all_Y_filename = conf.y[:-3]+"npy"
print(all_Y_filename)

output/chembl_29_dev/chembl_29_dev_Y_all.npy


In [64]:
np.save(all_Y_filename, Y)
print(f"write Y file to: {all_Y_filename}")

write Y file to: output/chembl_29_dev/chembl_29_dev_Y_all.npy


## Write Y file for each task group

In [65]:
task_groups = join['task_group'].unique()
print(len(task_groups), task_groups)

threshold_count = len(df["variable"].unique())
print(f" threshold count {threshold_count}")
print(join['task_group'].value_counts())

10 [   1 1028   10   11    0 1005    6  836  643 1031]
 threshold count 4
6       702258
11      336091
1028    323448
1       309909
0       270448
10      155187
1005    149426
836     130131
1031    126553
643     111633
Name: task_group, dtype: int64


    Previous:
    ----------------
    10 [   1 1028   10   11    0 1005    6  836  643 1031]
     threshold count 4
    6       764834
    11      336091
    1028    323448
    1       309909
    0       270448
    10      155187
    836     150758
    1005    149426
    1031    126553
    643     111633
    Name: task_group, dtype: int64

In [115]:
ttl_cols = 0

for i in task_groups:
    print("\n---------------------")
    print(f"   Taskgroup {i:4d} ")
    print("---------------------")
    
    tg_join = join[join['task_group'] == i]
    tg_target_list = pd.DataFrame(tg_join['tid'].unique())    
    tg_target_count = len(tg_target_list) 
    tg_target_list["col_id"] = range(len(tg_target_list))
    tg_join = pd.merge(tg_join, tg_target_list, how="inner", left_on="tid", right_on=0)    
    
#     print(tg_join)    
#     print(tg_join['tid'].unique())
#     print(tg_target_list)    
#     print(tg_join)        

    Ncol = tg_target_count * threshold_count   
    ttl_cols += Ncol
    I = tg_join["cid"].to_numpy()
    J = (Nvar * tg_join["col_id"] + tg_join["vid"]).to_numpy()
    V = tg_join["value"].to_numpy()
    tg_Y = scipy.sparse.csr_matrix((V,(I,J)),(Ncmpd, Ncol))
    
#     tg_Ymask_data = np.ones(len(I))
#     tg_Ymask = scipy.sparse.coo_matrix((y_mask_data,(I,J)),(Ncmpd,Ncol))
    print()
    print(f" Taskgroup {i:6d} - rows: {len(tg_join)}")
    print(f" Target_list shape      : {tg_target_list.shape}")
    print(f" tg_targer_count        : {tg_target_count}")
    print(f' Number of columns      : {Ncol}')
    print(f' I (Row coordiantes) Len: {len(I)} - {I}') ## Rows 
    print(f' J (Col coordiantes) Len: {len(J)} - {J}') ## Columns 
    print(f' V (Data)            Len: {len(V)} - {V} min: {min(V)} max: {max(V)}') # Data   
    print(f' Sparse matrix shape    : {tg_Y.shape}')

    print(repr(tg_Y))
    print(f" #  (Y < 0 ) : {(tg_Y < 0).sum()}")
    print(f" #  (Y > 0)  : {(tg_Y > 0).sum()}")
    print(f" #  (Y != 0) : {(tg_Y !=0).sum()}")    
    print()
    print(f" #  (Y < -3) : {(tg_Y  < -3).sum()}")
    print(f" #  (Y = -3) : {(tg_Y == -3).sum()}")
    print(f" #  (Y = -2) : {(tg_Y == -2).sum()}")
    print(f" #  (Y = -1) : {(tg_Y == -1).sum()}")
    print(f" #  (Y = +1) : {(tg_Y == 1).sum()}")
    print(f" #  (Y = +2) : {(tg_Y == 2).sum()}")
    print(f" #  (Y = +3) : {(tg_Y == 3).sum()}")
    print(f" #  (Y > +3) : {(tg_Y  > 3).sum()}")
    print(f" Y.sum() +   : {tg_Y.sum()}")  
    
    modify = False
    if (tg_Y >= +2).sum():
        print(" ============ MODIFY > +1 ===========")
        tg_Y[tg_Y >= +2] = +1
        modify = True
        
    if (tg_Y <= -2).sum():
        print(" ============ MODIFY < -1 ===========")
        tg_Y[tg_Y <= -1] = -1
        modify = True
        
    if modify:
        print(f" Write Y file for task group {i} to: {tg_Y_filename}")
        print(f" #  (Y < 0 ) : {(tg_Y < 0).sum()}")
        print(f" #  (Y > 0)  : {(tg_Y > 0).sum()}")
        print(f" #  (Y != 0) : {(tg_Y !=0).sum()}")        
        print()
        print(f" #  (Y < -5) : {(tg_Y  < -5).sum()}")
        print(f" #  (Y = -5) : {(tg_Y == -5).sum()}")
        print(f" #  (Y = -4) : {(tg_Y == -4).sum()}")
        print(f" #  (Y = -3) : {(tg_Y == -3).sum()}")
        print(f" #  (Y = -2) : {(tg_Y == -2).sum()}")
        print(f" #  (Y = -1) : {(tg_Y == -1).sum()}")
        print(f" #  (Y = +1)  : {(tg_Y == 1).sum()}")
        print(f" #  (Y = +2)  : {(tg_Y == 2).sum()}")
        print(f" #  (Y = +3)  : {(tg_Y == 3).sum()}")
        print(f" #  (Y = +4)  : {(tg_Y == 4).sum()}")
        print(f" #  (Y = +5)  : {(tg_Y == 5).sum()}")
        print(f" #  (Y > +5)  : {(tg_Y  > 5).sum()}")
        print(f" Y.sum() +    : {tg_Y.sum()}")  

        
    tg_Y_filename= f"{conf.y[:-8]}_tg_{i}_cols_{Ncol}.npy"

#     np.save(tg_Y_filename, tg_Y)
    print(f" Taskgroup {i} written to {tg_Y_filename } ")
print(f"ttl_cols: {ttl_cols}")


---------------------
   Taskgroup    1 
---------------------

 Taskgroup      1 - rows: 309909
 Target_list shape      : (156, 2)
 tg_targer_count        : 156
 Number of columns      : 624
 I (Row coordiantes) Len: 309909 - [173927 150148 143524 ... 226340 225143 225274]
 J (Col coordiantes) Len: 309909 - [  0   0   0 ... 623 623 623]
 V (Data)            Len: 309909 - [ 1.  1. -1. ... -1. -1. -1.] min: -1.0 max: 1.0
 Sparse matrix shape    : (423737, 624)
<423737x624 sparse matrix of type '<class 'numpy.float64'>'
	with 309909 stored elements in Compressed Sparse Row format>
 #  (Y < 0 ) : 219244
 #  (Y > 0)  : 90665
 #  (Y != 0) : 309909

 #  (Y < -3) : 0
 #  (Y = -3) : 0
 #  (Y = -2) : 0
 #  (Y = -1) : 219244
 #  (Y = +1) : 90665
 #  (Y = +2) : 0
 #  (Y = +3) : 0
 #  (Y > +3) : 0
 Y.sum() +   : -128579.0
 Taskgroup 1 written to output/chembl_29_dev/chembl_29_dev_Y_tg_1_cols_624.npy 

---------------------
   Taskgroup 1028 
---------------------

 Taskgroup   1028 - rows: 323448

##  Create Folding file and save to `xxxx_folding.npy`

In [67]:
print(Ymask.shape)

print(f"Calculate folding... nfolds: {conf.nfolds}")

(423737, 3552)
Calculate folding... nfolds: 5


In [68]:
folds = mtcv_clustered(Ymask, pd.Series(clusters), conf.nfolds)

* Y_in       : <class 'scipy.sparse.coo.coo_matrix'> 	 (423737, 3552)
* clusters: <class 'pandas.core.series.Series'>  	 (423737,) 
           Min: 0 Max:26090
0         11919
1         11919
2         22648
3         11919
4         22648
          ...  
423732     3361
423733    26021
423734    10256
423735    26021
423736    11919
Length: 423737, dtype: int64 

* cl_uniq: <class 'numpy.ndarray'> 15835
[11919 22648 22247 ...  4024 25988 10256]

* cid:   <class 'pandas.core.series.Series'> 423737
11919        0
11919        0
22648        1
11919        0
22648        1
         ...  
3361      1643
26021        4
10256    15834
26021        4
11919        0
Length: 423737, dtype: int64

* Create C sparse matrix - indicating the clusters compounds are assigned to.... 
* Shape of sparse matrix: Rows(#clusters):15835 Columns(#compounds): 423737 

* Y_bin     :   <class 'scipy.sparse.coo.coo_matrix'> (423737, 3552)
* Y_bin.data:   <class 'numpy.ndarray'> (2615084,)
 Sum: 0
 Sum: 2612254


In [69]:
print(folds.shape)
print(folds.min(), folds.max())

(423737,)
0 4


In [70]:
print(conf.folding)

output/chembl_29_dev/chembl_29_dev_folding.npy


In [71]:
#TODO: save folding
np.save(conf.folding, folds)
print(f"save folding file to {conf.folding}")

save folding file to output/chembl_29_dev/chembl_29_dev_folding.npy


## Check Y file for > +1 or <-1 entries

In [11]:
import glob
filelist = [i.replace('output/chembl_29/', '') for i in glob.glob('output/chembl_29/chembl_29_Y_tg*')]
for i in filelist:
    print(i)
 

chembl_29_Y_tg_1031_cols_72.npy
chembl_29_Y_tg_0_cols_472.npy
chembl_29_Y_tg_6_cols_688.npy
chembl_29_Y_tg_1028_cols_344.npy
chembl_29_Y_tg_836_cols_224.npy
chembl_29_Y_tg_11_cols_620.npy
chembl_29_Y_tg_10_cols_192.npy
chembl_29_Y_tg_1_cols_624.npy
chembl_29_Y_tg_643_cols_184.npy
chembl_29_Y_tg_1005_cols_148.npy


In [13]:
filelist = ['chembl_29_Y_all.npy']

In [14]:
for test_file in filelist:
    Y_new = load_sparse(dataroot = 'output/chembl_29_dev', filename = test_file)

    Y_old = load_sparse(dataroot = 'output/chembl_29_10task', filename = test_file)

    repr(Y_new)
    # Y_new[0,672] = 5
    # print(Y_new)

    repr(Y_old)

    tst_equal = ((Y_new != Y_old).nnz == 0)
    print(type(tst_equal), tst_equal)
    if (tst_equal):
        print( f'{test_file}  -- files are  equal')
    else:
        print( f'{test_file}  -- files are NOT equial')
        print(Y_new != Y_old)




 load sparse file from output/chembl_29_dev/chembl_29_Y_all.npy
 load sparse file from output/chembl_29_10task/chembl_29_Y_all.npy
<class 'bool'> False
chembl_29_Y_all.npy  -- files are NOT equial
  (38, 2180)	True
  (38, 2181)	True
  (38, 2182)	True
  (38, 2183)	True
  (38, 3264)	True
  (38, 3265)	True
  (38, 3266)	True
  (38, 3267)	True
  (360, 828)	True
  (360, 829)	True
  (360, 830)	True
  (360, 831)	True
  (360, 1600)	True
  (360, 1601)	True
  (360, 1602)	True
  (360, 1603)	True
  (533, 1088)	True
  (533, 1089)	True
  (533, 1090)	True
  (533, 1091)	True
  (533, 2280)	True
  (533, 2281)	True
  (533, 2282)	True
  (533, 2283)	True
  (533, 2528)	True
  :	:
  (423698, 497)	True
  (423698, 498)	True
  (423698, 499)	True
  (423702, 1600)	True
  (423702, 1601)	True
  (423702, 1602)	True
  (423702, 1603)	True
  (423705, 1160)	True
  (423705, 1161)	True
  (423705, 1162)	True
  (423705, 1163)	True
  (423705, 3424)	True
  (423705, 3425)	True
  (423705, 3426)	True
  (423705, 3427)	True
  (4237

In [16]:
print(Y_old[38,2180:2184])
print(Y_new[38,2180:2184])

  (0, 0)	-2.0
  (0, 1)	-2.0
  (0, 2)	-2.0
  (0, 3)	-2.0
  (0, 0)	-1.0
  (0, 1)	-1.0
  (0, 2)	-1.0
  (0, 3)	-1.0


###  Dedupe `chembl_29_Y_all_with_dups`

This will previously contained > +1 and < -1 entries. Convery those entires to +/-1 and make sure it matches the Y generated by the modified pipeline process

In [19]:
Y = load_sparse(dataroot = 'output/chembl_29_10task', filename = 'chembl_29_Y_all_with_dups.npy')

 load sparse file from output/chembl_29_10task/chembl_29_Y_all_with_dups.npy


In [31]:
Y_new = load_sparse(dataroot = 'output/chembl_29', filename = 'chembl_29_Y_all.npy')

 load sparse file from output/chembl_29/chembl_29_Y_all.npy


In [20]:
print(repr(Y))
print(f" #  (Y < 0 ) : {(Y <0).sum()}")
print(f" #  (Y != 0) : {(Y !=0).sum()}")
print(f" #  (Y > 0)  : {(Y >0).sum()}")
print(f" #  (Y <= -3): {(Y <=-3).sum()}")
print(f" #  (Y = -2) : {(Y ==-2).sum()}")
print(f" #  (Y = -1) : {(Y ==-1).sum()}")
print(f" #  (Y = 1)  : {(Y == 1).sum()}")
print(f" #  (Y = 2)  : {(Y == 2).sum()}")
print(f" #  (Y >= 3) : {(Y >= 3).sum()}")
print(f" Y.sum()     :  {Y.sum()}")

<423737x3552 sparse matrix of type '<class 'numpy.float64'>'
	with 2612254 stored elements in Compressed Sparse Row format>
 #  (Y < 0 ) : 1679110
 #  (Y != 0) : 2612254
 #  (Y > 0)  : 933144
 #  (Y <= -3): 3322
 #  (Y = -2) : 40605
 #  (Y = -1) : 1635183
 #  (Y = 1)  : 895610
 #  (Y = 2)  : 36284
 #  (Y >= 3) : 1250
 Y.sum()     :  -754431.0


In [21]:
if (Y >= +2).sum():
    print(" ============ MODIFY > +1 ===========")
    Y[Y >= +2] = +1
#     modify = True

if (Y <= -2).sum():
    print(" ============ MODIFY < -1 ===========")
    Y[Y <= -1] = -1
#     modify = True# print(Y)



In [22]:
print(repr(Y))
print(f" #  (Y < 0 ) : {(Y <0).sum()}")
print(f" #  (Y != 0) : {(Y !=0).sum()}")
print(f" #  (Y > 0)  : {(Y >0).sum()}")
print(f" #  (Y <= -3): {(Y <=-3).sum()}")
print(f" #  (Y = -2) : {(Y ==-2).sum()}")
print(f" #  (Y = -1) : {(Y ==-1).sum()}")
print(f" #  (Y = 1)  : {(Y == 1).sum()}")
print(f" #  (Y = 2)  : {(Y == 2).sum()}")
print(f" #  (Y >= 3) : {(Y >= 3).sum()}")
print(f" Y.sum()     :  {Y.sum()}")

<423737x3552 sparse matrix of type '<class 'numpy.float64'>'
	with 2612254 stored elements in Compressed Sparse Row format>
 #  (Y < 0 ) : 1679110
 #  (Y != 0) : 2612254
 #  (Y > 0)  : 933144
 #  (Y <= -3): 0
 #  (Y = -2) : 0
 #  (Y = -1) : 1679110
 #  (Y = 1)  : 933144
 #  (Y = 2)  : 0
 #  (Y >= 3) : 0
 Y.sum()     :  -745966.0


In [32]:
tst_equal = ((Y_new != Y).nnz == 0)
print(type(tst_equal), tst_equal)
if (tst_equal):
    print( f'{test_file}  -- files are  equal')
else:
    print( f'{test_file}  -- files are NOT equial')
    print(Y_new != Y_old)



<class 'bool'> True
chembl_29_Y_all.npy  -- files are  equal


In [27]:
all_Y_filename = 'output/chembl_29_10task/chembl_29_Y_all.npy'
print(all_Y_filename)

np.save(all_Y_filename, Y)
print(f"write Y file to: {all_Y_filename}")

output/chembl_29_10task/chembl_29_Y_all.npy


## Step by Step

### `mtcv_clustered` step by step

In [275]:
print(clusters.shape)
clusters_ser = pd.Series(clusters)
clusters_ser

(423737,)


0         12266
1         10766
2         10766
3          4557
4          4557
          ...  
423732     4557
423733    15081
423734     8125
423735    25220
423736    15081
Length: 423737, dtype: int64

In [276]:
assert clusters_ser.shape[0] == Y.shape[0]

In [None]:
## Assign a sequential number to the cluster ids
cl_uniq = clusters_ser.unique()
cl2id   = pd.Series(np.arange(cl_uniq.shape[0]), index=cl_uniq)
cid     = cl2id[clusters]

In [283]:
print(f" Number of unique clusters: {cl_uniq.shape}")
print(f" cl2id: {cl2id.shape}")
print(cl2id)


 Number of unique clusters: (17672,)
 cl2id: (17672,)
12266        0
10766        1
4557         2
9771         3
14746        4
         ...  
7565     17667
1755     17668
13401    17669
22801    17670
25220    17671
Length: 17672, dtype: int64
 cid  : (423737,)
12266        0
10766        1
10766        1
4557         2
4557         2
         ...  
4557         2
15081        7
8125      6788
25220    17671
15081        7
Length: 423737, dtype: int64


In [278]:
print(cl_uniq)
print(cl_uniq.shape, cl_uniq.min(), cl_uniq.max())

[12266 10766  4557 ... 13401 22801 25220]
(17672,) 0 25993


In [284]:
print(f" cid  : {cid.shape}")
print(' cid.values: ', cid.values)
print(' cid.shape : ', cid.shape)
print(cid)

 cid  : (423737,)
 cid.values:  [    0     1     1 ...  6788 17671     7]
 cid.shape :  (423737,)
12266        0
10766        1
10766        1
4557         2
4557         2
         ...  
4557         2
15081        7
8125      6788
25220    17671
15081        7
Length: 423737, dtype: int64


In [132]:
## creating cluster2compound matrix
## csr_matrix( (data,(row_ind,col_ind)))
##    where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].
##   
##   Number of rows:     cid.values (cluster ids)
##   Number of columns : number of rows in Y file ()
##   Clusters x Y Rows
## 
##   in each row (cluster) there will be a '1' in the columns (compounds) 
##   where the compound has been assigned to the cluster
##   each compound is only assigned to one cluster, so the sum of each column 
##   is == 1 (i.e., C[:,i].sum() == 1)

C    = scipy.sparse.csr_matrix(
        (np.ones(cid.shape[0], dtype=np.int8), 
        (cid.values, np.arange(cid.shape[0])) )
        )


In [294]:
print(repr(C))
print(C[:,1].sum())

<17672x423737 sparse matrix of type '<class 'numpy.int8'>'
	with 423737 stored elements in Compressed Sparse Row format>
1


In [296]:
## Ybin : Inidcates which compound / labels are present in the Y dataset

In [139]:
Ybin      = Y.copy()

In [140]:
print(Ybin.shape)
print(Ybin.data.shape)
print(Ybin.data)

(423737, 3552)
(2698287,)
[ 1.  1. -1. ... -1. -1. -1.]


In [141]:
Ybin.data = np.ones(Ybin.data.shape[0], dtype=np.int8)

In [301]:
print(Ybin.shape)
print(Ybin.data.shape)
# print(Ybin)
print(f" #  (Y < 0 ) : {(Ybin<0).sum()}")
print(f" #  (Y != 0) : {(Ybin!=0).sum()}")
print(f" #  (Y > 0)  : {(Ybin>0).sum()}")
print()
print(f" #  (Y < -5) : {(Ybin<-5).sum()}")
print(f" #  (Y = -5) : {(Ybin==-5).sum()}")
print(f" #  (Y = -4) : {(Ybin==-4).sum()}")
print(f" #  (Y = -3) : {(Ybin==-3).sum()}")
print(f" #  (Y = -2) : {(Ybin==-2).sum()}")
print(f" #  (Y = -1) : {(Ybin==-1).sum()}")
print(f" #  (Y = 1)  : {(Ybin==1).sum()}")
print(f" #  (Y = 2)  : {(Ybin==2).sum()}")
print(f" #  (Y = 3)  : {(Ybin==3).sum()}")
print(f" #  (Y = 4)  : {(Ybin==4).sum()}")
print(f" #  (Y = 5)  : {(Ybin==5).sum()}")
print(f" #  (Y > 5)  : {(Ybin>5).sum()}")
print(f" Y.sum()     : {Ybin.sum()}")

(423737, 3552)
(2698287,)
 #  (Y < 0 ) : 0
 #  (Y != 0) : 2612254
 #  (Y > 0)  : 2612254

 #  (Y < -5) : 0
 #  (Y = -5) : 0
 #  (Y = -4) : 0
 #  (Y = -3) : 0
 #  (Y = -2) : 0
 #  (Y = -1) : 0
 #  (Y = 1)  : 2530793
 #  (Y = 2)  : 76889
 #  (Y = 3)  : 4572
 #  (Y = 4)  : 0
 #  (Y = 5)  : 0
 #  (Y > 5)  : 0
 Y.sum()     : 2698287


In [146]:
## Get a count of labels in each cluster  
## C (17672x423737) dot (423737, 3552) = 17672 x 3552
cl_counts = C.dot(Ybin)

In [300]:
print(cl_counts.shape)
print(cl_counts.min(), cl_counts.max())
print(cl_counts)

(17672, 3552)
-128 127
  (0, 672)	14
  (0, 673)	14
  (0, 674)	14
  (0, 675)	14
  (1, 0)	24
  (1, 1)	24
  (1, 2)	24
  (1, 3)	24
  (1, 4)	-31
  (1, 5)	-32
  (1, 6)	-32
  (1, 7)	-33
  (1, 8)	104
  (1, 9)	104
  (1, 10)	104
  (1, 11)	104
  (1, 12)	61
  (1, 13)	59
  (1, 14)	58
  (1, 15)	58
  (1, 16)	26
  (1, 17)	26
  (1, 18)	26
  (1, 19)	26
  (1, 20)	31
  :	:
  (17669, 1811)	1
  (17669, 2832)	1
  (17669, 2833)	1
  (17669, 2834)	1
  (17669, 2835)	1
  (17670, 1180)	1
  (17670, 1181)	1
  (17670, 1182)	1
  (17670, 1183)	1
  (17671, 1364)	1
  (17671, 1365)	1
  (17671, 1366)	1
  (17671, 1367)	1
  (17671, 1500)	1
  (17671, 1501)	1
  (17671, 1502)	1
  (17671, 1503)	1
  (17671, 2536)	1
  (17671, 2537)	1
  (17671, 2538)	1
  (17671, 2539)	1
  (17671, 3300)	1
  (17671, 3301)	1
  (17671, 3302)	1
  (17671, 3303)	1


### `mtcv` step by step

In [157]:
nfolds = conf.nfolds
pfolds = None
if nfolds is None and pfolds is None:
    raise ValueError("nfolds or pfolds must be specified.")
if pfolds is not None and np.abs(np.sum(pfolds) - 1.0) > 1e-5:
    raise ValueError("pfolds must sum to 1.0.")
if pfolds is None:
    pfolds = np.ones(nfolds) / nfolds
else:
    nfolds = len(pfolds)

In [199]:
print(pfolds)
print(nfolds)

[0.2 0.2 0.2 0.2 0.2]
5


In [314]:
target_sizes = np.array(cl_counts.sum(0)).flatten() ## Sum cl_counts over targets
comp_sizes   = np.array(cl_counts.sum(1)).flatten() ## sum cl_counts over compounds

In [311]:
print(' cl_counts shape   : ',cl_counts.shape)
print()
print(' target_sizes shape: ',target_sizes.shape)
print(' Number of targets in each cluster: ', target_sizes, target_sizes.sum())
print(' target_sizes < 0 : ', (target_sizes < 0).sum())
print(' target_sizes > 0 : ', (target_sizes > 0).sum())
print()
print(' comp_sizes shape  : ', comp_sizes.shape)
print(' Number of compounds in each cluster: ', comp_sizes)
print(' comp_sizes < 0 : ', (comp_sizes < 0).sum())
print(' comp_sizes > 0 : ', (comp_sizes > 0).sum())

 cl_counts shape   :  (17672, 3552)

 target_sizes shape:  (3552,)
 Number of targets in each cluster:  [120 120 120 ... 498 761 752] 917039
 target_sizes < 0 :  153
 target_sizes > 0 :  3394

 comp_sizes shape  :  (17672,)
 Number of compounds in each cluster:  [   56 82909 56604 ...    12     4    16]
 comp_sizes < 0 :  48
 comp_sizes > 0 :  17624


In [312]:
df_10 = pd.DataFrame({"row": np.arange(cl_counts.shape[0]), "size": comp_sizes})
df_10.shape

(17672, 2)

In [315]:
df_10.info()
df_10

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17672 entries, 0 to 17671
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   row     17672 non-null  int64
 1   size    17672 non-null  int64
dtypes: int64(2)
memory usage: 276.2 KB


Unnamed: 0,row,size
0,0,56
1,1,82909
2,2,56604
3,3,472
4,4,272
...,...,...
17667,17667,8
17668,17668,8
17669,17669,12
17670,17670,4


In [None]:
# Randomize df_10

In [316]:
df_11 = df_10.sample(frac=1)

In [317]:
df_11.info()
df_11.shape
df_11

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17672 entries, 6420 to 3767
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   row     17672 non-null  int64
 1   size    17672 non-null  int64
dtypes: int64(2)
memory usage: 414.2 KB


Unnamed: 0,row,size
6420,6420,40
13961,13961,20
16567,16567,16
156,156,28
15820,15820,4
...,...,...
2908,2908,12
16049,16049,16
1413,1413,20
17577,17577,24


In [None]:
## Sort in descending compound count order 

In [318]:
df_11.sort_values("size", inplace=True, ascending=False)

In [319]:
df_11.info()
df_11.shape
df_11

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17672 entries, 1 to 15151
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   row     17672 non-null  int64
 1   size    17672 non-null  int64
dtypes: int64(2)
memory usage: 414.2 KB


Unnamed: 0,row,size
1,1,82909
7,7,79806
2,2,56604
49,49,6921
3969,3969,1745
...,...,...
13443,13443,-539
13272,13272,-620
16191,16191,-740
606,606,-750


In [320]:
fold_sizes = np.zeros((nfolds, cl_counts.shape[1]), dtype=np.int8)
#max_fold_sizes = np.ceil(target_sizes / nfolds).astype(np.int)
max_fold_sizes = np.ceil(np.outer(target_sizes, pfolds))


In [321]:
print('\n fold_sizes \n')
print(fold_sizes.shape)
print(fold_sizes)
print('\n pfolds & target_sizes \n')
print(pfolds.shape)
print(pfolds)
print(target_sizes.shape)
print(target_sizes)
print('\n np.outer(target_sizes, pfolds) \n')
print(np.outer(target_sizes, pfolds).shape)
print(np.outer(target_sizes, pfolds))
print('\n max_fold_sizes \n')
print(max_fold_sizes.shape)
print(max_fold_sizes)



 fold_sizes 

(5, 3552)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

 pfolds & target_sizes 

(5,)
[0.2 0.2 0.2 0.2 0.2]
(3552,)
[120 120 120 ... 498 761 752]

 np.outer(target_sizes, pfolds) 

(3552, 5)
[[ 24.   24.   24.   24.   24. ]
 [ 24.   24.   24.   24.   24. ]
 [ 24.   24.   24.   24.   24. ]
 ...
 [ 99.6  99.6  99.6  99.6  99.6]
 [152.2 152.2 152.2 152.2 152.2]
 [150.4 150.4 150.4 150.4 150.4]]

 max_fold_sizes 

(3552, 5)
[[ 24.  24.  24.  24.  24.]
 [ 24.  24.  24.  24.  24.]
 [ 24.  24.  24.  24.  24.]
 ...
 [100. 100. 100. 100. 100.]
 [153. 153. 153. 153. 153.]
 [151. 151. 151. 151. 151.]]


In [366]:
## output vector (initialized to -1)
folds = np.zeros(cl_counts.shape[0], dtype=np.int8) - 1
farray = np.arange(nfolds)    

In [367]:
print('\n folds\n')
print(folds.shape)
print(folds)
print(farray)


 folds

(17672,)
[-1 -1 -1 ... -1 -1 -1]
[0 1 2 3 4]


In [368]:
cl_counts.shape

(17672, 3552)

In [369]:
# for i in range(cl_counts.shape[0]):
i= 0

In [358]:
i += 1


In [370]:
# for i in range(mask.shape[0]):
idx   = df_11.index[i]
print(f"row: {i}  index: {idx}         df_11['row'][idx]: {df_11['row'][idx]}         df_11['size'][idx]: {df_11['size'][idx]}")

row: 0  index: 1         df_11['row'][idx]: 1         df_11['size'][idx]: 82909


In [371]:
random_folds =  np.random.choice(farray, size=nfolds, replace=False, p=pfolds)
print(random_folds)
random_folds = iter(random_folds)

[3 1 4 2 0]


In [361]:
# j  = next(random_folds)
print(j)

print('cl_counts[idx] : ', cl_counts[idx].shape, cl_counts[idx].sum())
print((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).shape , (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]))
print((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).any())
print(cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()))
print(cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()) == 0)

2


In [372]:
#for j in np.random.permutation(nfolds):
# make a random list of the folds, and go through the list 
for j in random_folds:
    print(f'fold: {j}')
    print('   cl_counts[idx] : ', cl_counts[idx].shape, cl_counts[idx].sum())
    print('   test shape     : ', (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).shape , (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]))
    print('   test.any()     : ', (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).any())
    print('   dot operation  : ', cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()))
    print('   dot op == 0    : ', cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()) == 0)    
    if cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()) == 0:
        ## compound fits into fold j
        print(f" *** compound fits into fold {j}")
        folds[idx] = j
        fold_sizes[j,:] += mask[idx,:]
        break

fold: 3
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 1
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 4
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 2
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 0
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-11

In [373]:
print(folds[idx])

-1


In [374]:
        
## If still havent assigned, randomly assign to a fold.
if folds[idx] == -1:
    picked_fold = np.random.randint(0, nfolds)
    folds[idx] = picked_fold
    fold_sizes[folds[idx],:] += cl_counts[idx,:]
    print(f" random assignment into {picked_fold} ")
    print(f" cl_counts[{idx}]  :    {cl_counts[idx,:]} ")
    print(f" fold_sizes[{folds[idx]},:]  :    {fold_sizes[folds[idx],:]} ")

 random assignment into 4 
 cl_counts[1]  :      (0, 0)	24
  (0, 1)	24
  (0, 2)	24
  (0, 3)	24
  (0, 4)	-31
  (0, 5)	-32
  (0, 6)	-32
  (0, 7)	-33
  (0, 8)	104
  (0, 9)	104
  (0, 10)	104
  (0, 11)	104
  (0, 12)	61
  (0, 13)	59
  (0, 14)	58
  (0, 15)	58
  (0, 16)	26
  (0, 17)	26
  (0, 18)	26
  (0, 19)	26
  (0, 20)	31
  (0, 21)	31
  (0, 22)	31
  (0, 23)	31
  (0, 24)	31
  :	:
  (0, 3527)	56
  (0, 3528)	38
  (0, 3529)	38
  (0, 3530)	38
  (0, 3531)	38
  (0, 3532)	-94
  (0, 3533)	-94
  (0, 3534)	-94
  (0, 3535)	-94
  (0, 3536)	-80
  (0, 3537)	-80
  (0, 3538)	-80
  (0, 3539)	-80
  (0, 3540)	40
  (0, 3541)	40
  (0, 3542)	40
  (0, 3543)	40
  (0, 3544)	116
  (0, 3545)	122
  (0, 3546)	122
  (0, 3547)	122
  (0, 3548)	-59
  (0, 3549)	-59
  (0, 3550)	-128
  (0, 3551)	-128 
 fold_sizes[4,:]  :    [  24   24   24 ...  -59 -128 -128] 


In [365]:
print(folds)

[3 4 4 ... 3 2 0]
