##  MTCV : Multi Task Cross Validation

In [10]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import numpy as np
import pandas as pd
import scipy.sparse
import argparse
pd.options.display.width = 132

In [12]:
def mtcv_clustered(Y_in, clusters, nfolds=None, pfolds=None, seed=None):
    """
    splits rows of Y_in, based on clusters into either 
        a) equal Y_in into nfolds 
        b) or according to the ratios defined bY_in pfolds
        
        Y_in: is the compound x target matrix
    """

    assert clusters.shape[0] == Y_in.shape[0]
    print(f"* Y_in       : {type(Y_in)} \t {Y_in.shape}")
    print(f"* clusters: {type(clusters)}  \t {clusters.shape} ")
    print(f"           Min: {clusters.min()} Max:{clusters.max()}\n{clusters} \n")
    
    
#     cl_uniq = np.sort(clusters.unique())
    cl_uniq = clusters.unique()
    print(f"* cl_uniq: {type(cl_uniq)} {len(cl_uniq)}\n{cl_uniq}\n")

          
    ## assign a unqiue id to the cluster numbers  
    cl2id   = pd.Series(np.arange(cl_uniq.shape[0]), index=cl_uniq)
        
#     print(f"* cl2id: {type(cl2id)} {len(cl2id)}\n{cl2id}\n")
#     print(f" cl2id[0]: {cl2id[0]}")
#     print(f" cl2id[2321]: {cl2id[2321]}")
#     print(f" cl2id[13660]: {cl2id[13660]}")
#     print(f" cl2id[2321]: {cl2id[2321]}")
#     print(f" cl2id[5601]: \n{cl2id[5599:5603]} \n")
    
    
    cid     = cl2id[clusters]
    print(f"* cid:   {type(cid)} {len(cid)}\n{cid}\n")
#     print(f" cid.values:\n{cid.values}")
    
    ## creating cluster2compound matrix
    ##
    ## Number of rows: cid.values
    print(f"* Create C sparse matrix - indicating the clusters compounds are assigned to.... ")
    C    = scipy.sparse.csr_matrix(
            (np.ones(cid.shape[0], dtype=np.int8), (cid.values, np.arange(cid.shape[0])))
            )

    print(f"* Shape of sparse matrix: Rows(#clusters):{C.shape[0]} Columns(#compounds): {C.shape[1]} \n")
    ## Create 
    Y_bin      = Y_in.copy()
    print(f"* Y_bin     :   {type(Y_bin)} {Y_bin.shape}")
    print(f"* Y_bin.data:   {type(Y_bin.data)} {Y_bin.data.shape}")
    Y_bin_neg = Y_bin < 0
    Y_bin_pos = Y_bin > 0 
    print(f" Sum: {(Y_bin_neg).sum()}")
    print(f" Sum: {(Y_bin_pos).sum()}")
    print(f" Sum: {(Y_bin).sum()}")
    
    Y_bin.data = np.ones(Y_bin.data.shape[0], dtype=np.int8)
    print(f"* Y_bin.data:   {type(Y_bin.data)} {Y_bin.data.shape}\n")
    Y_bin_neg = Y_bin < 0
    Y_bin_pos = Y_bin > 0 
    print(f" Sum: {(Y_bin_neg).sum()}")
    print(f" Sum: {(Y_bin_pos).sum()}")
    print(f" Sum: {(Y_bin).sum()}")
    print() 
    
    ## compute number of compounds per cluster/fingerprint
    cl_counts = C.dot(Y_bin)
    print(f"* cl_counts     :   {type(cl_counts)} {cl_counts.shape}")    
    print(f"* cl_counts.data:   {type(cl_counts.data)} {cl_counts.data.shape}")
    print(f" {cl_counts}\n")
    
    cl_counts_neg = cl_counts < 0 
    print(f"* cl_counts_neg     :   {type(cl_counts_neg)}      {cl_counts_neg.shape}")    
    print(f"* cl_counts_neg.data:   {type(cl_counts_neg.data)} {cl_counts_neg.data.shape}")
    print(f" Sum: {cl_counts_neg.sum()}")
    print(f" {cl_counts_neg} \n")    

    ## get cluster folds
    print(f"* Call mtcv with nfolds: {nfolds}  \t pfolds: {pfolds} \t  seed: {seed}")
    folds = mtcv(mask=cl_counts, nfolds=nfolds, pfolds=pfolds, seed=seed)

    print(f"* cid:   {type(cid)} {len(cid)}\n{cid}\n")
    print(f'* folds:  {folds.shape} \n {folds[cid]} \n')
    return folds[cid]


In [13]:
def mtcv(mask, nfolds=None, pfolds=None, seed=None):
    """
    Return a vector of folds

    Args:
    mask     binary mask matrix of [compounds x targets]
    nfolds   number of folds (integer)
    pfolds   array, specifying fold sizes (probability)
             If specified nfolds is ignored.
             Must sum to 1.
    """
    if nfolds is None and pfolds is None:
        raise ValueError("nfolds or pfolds must be specified.")
    if pfolds is not None and np.abs(np.sum(pfolds) - 1.0) > 1e-5:
        raise ValueError("pfolds must sum to 1.0.")

    if pfolds is None:
        pfolds = np.ones(nfolds) / nfolds
    else:
        nfolds = len(pfolds)
    
    print(f" Call mtcv with nfolds: {nfolds}  \t pfolds: {pfolds} \t  seed: {seed}\n\n")
    
    target_sizes = np.array(mask.sum(0)).flatten()
    comp_sizes   = np.array(mask.sum(1)).flatten()

    print(f'* mask        : {mask.shape} \t\n {mask} \n')
    print(f'* Target sizes (mask sum along rows) : {target_sizes.shape} \t {target_sizes} \n')
    print(f'* comp sizes   (mask sum over columns): {comp_sizes.shape}   \t {comp_sizes} \n')
    if seed is not None:
        np.random.seed(seed)

    df = pd.DataFrame({"row": np.arange(mask.shape[0]), "size": comp_sizes})
    print(f'\n* df info')
    print(df.info())
    print(df)
    
    df1 = df.sample(frac=1)
    print(f'\n df1 info \n')
    print(df1.info())
    print(df1)
    
    df1.sort_values("size", inplace=True, ascending=False)
    print(f'\n df1 info (after sort by size descending)\n')
    print(df1.info())
    print(df1)
    
    fold_sizes = np.zeros((nfolds, mask.shape[1]), dtype=np.int8)
#     #max_fold_sizes = np.ceil(target_sizes / nfolds).astype(np.int)
    max_fold_sizes = np.ceil(np.outer(target_sizes, pfolds))

    print(f'* fold_sizes : {fold_sizes.shape} \n {fold_sizes} \n')
#     print(f'* non zero folds: {(fold_sizes>0).sum()}\n')
    print(f'* max fold sizes: {max_fold_sizes.shape} \n {max_fold_sizes}\n')
    
    ## output vector (initialized to -1)
    folds = np.zeros(mask.shape[0], dtype=np.int8) - 1
    print(f'* folds : {folds.shape} \n {folds} \n')
    
    farray = np.arange(nfolds)    
    
    ## Starting at the cluster with the highest number of positive actions
    ## 
    print(f'* mask        : {mask.shape} ')
    for i in range(mask.shape[0]):
        idx   = df1.index[i]
 

        #for j in np.random.permutation(nfolds):
        choices = np.random.choice(farray, size=nfolds, replace=False, p=pfolds)
#         print(f' i: {i}   idx (cluster_id): {idx}   folds[{idx}] = {folds[idx]}   choices : {choices}')
        
        
        for j in choices: ## np.random.choice(farray, size=nfolds, replace=False, p=pfolds):
            fit_criteria = mask[idx,:].dot((fold_sizes[j,:] + mask[idx,:] > max_fold_sizes[:,j]).transpose())
#             print(f'    fit_criteria = {fit_criteria}')
            if fit_criteria == 0:
                ## compound fits into fold j
                folds[idx] = j
                fold_sizes[j,:] += mask[idx,:]
                
#                 print(f'    folds[idx] <- j:  {folds[idx]}')
#                 print(f'    fold_sizes[j] <- mask[{idx}]: {fold_sizes[j]}')
                break
                
        if folds[idx] == -1:
            j = np.random.randint(0, nfolds)
#             print(f'    force to fold {j}')
            folds[idx] = j
            fold_sizes[j,:] += mask[idx,:]
        
#         if (i == 100):
#             break
            
    print(' finished loop - i is:', i)
    return folds





In [26]:
parser = argparse.ArgumentParser(description="Multi-task clustered crossvalidation")
parser.add_argument('--thresh', type=str, required=True)
parser.add_argument('--y', type=str, required=True)
parser.add_argument('--compounds', type=str, required=True)
parser.add_argument('--clusters', type=str, required=True)
parser.add_argument('--folding', type=str, default="output/folding.npy")
parser.add_argument('--nfolds', type=int, default=5)

in_args = " --thresh    output/chembl_29/chembl_29_thresholds.csv"\
          " --clusters  output/chembl_29/chembl_29_clustering.npy " \
          " --compounds output/chembl_29/chembl_29_X_cmpds.csv" \
          " --y         output/chembl_29/chembl_29_Y_all.npy"\
          " --folding   output/chembl_29/chembl_29_folding.npy"\
          " --nfolds    5"
in_args = in_args.split()

conf = parser.parse_args(in_args)
conf_vars = vars(conf)
print()
print(f" input parm - thresh  :   value: {conf_vars['thresh']}")
print(f" input parm - clusters:   value: {conf_vars['clusters']}")
print(f" input parm - compounds:  value: {conf_vars['compounds']}")
print(f" input parm - nfolds:     value: {conf_vars['nfolds']}")
print(f" output parm - y :        value: {conf_vars['y']}")
print(f" output parm - folding:   value: {conf_vars['folding']}")


 input parm - thresh  :   value: output/chembl_29/chembl_29_thresholds.csv
 input parm - clusters:   value: output/chembl_29/chembl_29_clustering.npy
 input parm - compounds:  value: output/chembl_29/chembl_29_X_cmpds.csv
 input parm - nfolds:     value: 5
 output parm - y :        value: output/chembl_29/chembl_29_Y_all.npy
 output parm - folding:   value: output/chembl_29/chembl_29_folding.npy


### Load compound id / target id /  threshold file `output/chembl_29_thresh.csv`

In [32]:
df = pd.read_csv(conf.thresh)
print(df)
df.info()
df.nunique()

             target_id        cmpd_id  task_group  variable  value
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0
2        CHEMBL1075097  CHEMBL1812662           1       5.5   -1.0
3        CHEMBL1075097  CHEMBL2326084           1       5.5    1.0
4        CHEMBL1075097  CHEMBL2326085           1       5.5    1.0
...                ...            ...         ...       ...    ...
2699033     CHEMBL6175   CHEMBL578512         836       8.5   -1.0
2699034     CHEMBL6175   CHEMBL578512         836       8.5   -1.0
2699035     CHEMBL6175    CHEMBL90852         836       8.5   -1.0
2699036     CHEMBL6175    CHEMBL90852         836       8.5   -1.0
2699037     CHEMBL6175    CHEMBL90852         836       8.5   -1.0

[2699038 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2699038 entries, 0 to 2699037
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   t

target_id        888
cmpd_id       423809
task_group        10
variable           4
value              2
dtype: int64

In [33]:
print(f"Number of rows              : {len(df):7d}")
print(f"Number of unique targets    : {len(df['target_id'].unique()):7d}   ")
print(f"Number of unique compounds  : {len(df['cmpd_id'].unique()):7d}   ")
print(f"Number of unique thresholds : {len(df['variable'].unique()):7d}")
print(f"Number of unique task groups: {len(df['task_group'].unique()):7d}")
print(f'Number of lables (targets x thresholds) : {len(df["target_id"].unique()) * len(df["variable"].unique()):7d}')

df["task_group"].value_counts(normalize = False)

Number of rows              : 2699038
Number of unique targets    :     888   
Number of unique compounds  :  423809   
Number of unique thresholds :       4
Number of unique task groups:      10
Number of lables (targets x thresholds) :    3552


6       764934
11      336268
1028    323550
1       310129
0       270580
10      155195
836     150758
1005    149430
1031    126561
643     111633
Name: task_group, dtype: int64

### Load compound list `output/chembl_29_X_cmpds.csv`

In [24]:
cmpd_list = pd.read_csv(conf.compounds, header=None)
cmpd_list.info()
cmpd_list.nunique()
cmpd_list.drop(columns=0, inplace = True)
print(f"length of cmpd_list:  {len(cmpd_list)}")
print(cmpd_list)
print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423737 entries, 0 to 423736
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       423737 non-null  int64 
 1   1       423737 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.5+ MB
length of cmpd_list:  423737
                    1
0        CHEMBL405398
1        CHEMBL403325
2        CHEMBL501943
3        CHEMBL501094
4        CHEMBL505943
...               ...
423732  CHEMBL4297644
423733  CHEMBL4297666
423734  CHEMBL4297674
423735  CHEMBL4298138
423736  CHEMBL4298140

[423737 rows x 1 columns]



In [25]:
cmpd_list["cid"] = range(len(cmpd_list))
print(f"Size of cmpd_list: {len(cmpd_list)}") 
print(cmpd_list)

Size of cmpd_list: 423737
                    1     cid
0        CHEMBL405398       0
1        CHEMBL403325       1
2        CHEMBL501943       2
3        CHEMBL501094       3
4        CHEMBL505943       4
...               ...     ...
423732  CHEMBL4297644  423732
423733  CHEMBL4297666  423733
423734  CHEMBL4297674  423734
423735  CHEMBL4298138  423735
423736  CHEMBL4298140  423736

[423737 rows x 2 columns]


### Load the clustering info `output/clustering.npy`

In [29]:
#Load the clustering assignment
clusters = np.load(conf.clusters, allow_pickle=True)
print(len(clusters), clusters)

423737 [24163 24163  8923 ... 18967 15079 12696]


### Create mask

In [30]:
assert len(clusters) == len(cmpd_list), "Number of compound must agree with the size of clustering vector"

In [34]:
print()
Ncmpd = len(set(cmpd_list['cid']))
Ncol  = len(df["target_id"].unique()) * len(df["variable"].unique())
print(f" # of compounds Ncmpd: {Ncmpd}    ")
print(f" # of columns   Ncol : (# unique target_ids x # unique thresholds ) = ({len(df['target_id'].unique())} x {len(df['variable'].unique())}): {Ncol}")


 # of compounds Ncmpd: 423737    
 # of columns   Ncol : (# unique target_ids x # unique thresholds ) = (888 x 4): 3552


### Create dataframe of unique target_ids 

In [35]:
target_list = pd.DataFrame(df["target_id"].unique())
target_list["tid"] = range(len(target_list))
print(target_list)

                 0  tid
0    CHEMBL1075097    0
1    CHEMBL1075104    1
2    CHEMBL1075138    2
3    CHEMBL1075145    3
4    CHEMBL1075165    4
..             ...  ...
883     CHEMBL6154  883
884     CHEMBL6164  884
885     CHEMBL6166  885
886     CHEMBL6167  886
887     CHEMBL6175  887

[888 rows x 2 columns]


### create data frame of unique thresholds

In [36]:
variable_list = pd.DataFrame(df["variable"].unique())
variable_list["vid"] = range(len(variable_list))
print(variable_list)

Nvar = len(variable_list)
print(f" len(variable_list) : {Nvar}")

     0  vid
0  5.5    0
1  6.5    1
2  7.5    2
3  8.5    3
 len(variable_list) : 4


### `tmp1` : Add unique compound identifier to tmp1 

In [37]:
print(df)
print(cmpd_list)

             target_id        cmpd_id  task_group  variable  value
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0
2        CHEMBL1075097  CHEMBL1812662           1       5.5   -1.0
3        CHEMBL1075097  CHEMBL2326084           1       5.5    1.0
4        CHEMBL1075097  CHEMBL2326085           1       5.5    1.0
...                ...            ...         ...       ...    ...
2699033     CHEMBL6175   CHEMBL578512         836       8.5   -1.0
2699034     CHEMBL6175   CHEMBL578512         836       8.5   -1.0
2699035     CHEMBL6175    CHEMBL90852         836       8.5   -1.0
2699036     CHEMBL6175    CHEMBL90852         836       8.5   -1.0
2699037     CHEMBL6175    CHEMBL90852         836       8.5   -1.0

[2699038 rows x 5 columns]
                    1     cid
0        CHEMBL405398       0
1        CHEMBL403325       1
2        CHEMBL501943       2
3        CHEMBL501094       3
4        CHEMBL505

In [38]:
tmp1 = pd.merge(df, cmpd_list, left_on="cmpd_id", right_on=1)

In [52]:
print(df.info())
print(tmp1.info())
print(tmp1.nunique())
print(tmp1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2699038 entries, 0 to 2699037
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
dtypes: float64(2), int64(1), object(2)
memory usage: 103.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698287 entries, 0 to 2698286
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 164.7+ MB
None
target_id        888
cmpd_id       423737
task_group        10
variable           4
value              2
1             423737
cid           423737
dtype: int64
             target_id        cmpd_id  task_group  variable  value            

In [18]:
# cid_value_counts = tmp2['cid'].value_counts()
# print(f'cid value counts : {cid_value_counts.shape} \n')
# print(cid_value_counts)
# print(f"\nNumber of unqiue cids:  {tmp2['cid'].nunique()}")

### `tmp2` : add unique threshold identifier to tmp2

In [42]:
print(tmp1)
print(variable_list)

             target_id        cmpd_id  task_group  variable  value              1     cid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  CHEMBL1234777  173927
1        CHEMBL1075097  CHEMBL1234777           1       6.5   -1.0  CHEMBL1234777  173927
2        CHEMBL1075097  CHEMBL1234777           1       7.5   -1.0  CHEMBL1234777  173927
3        CHEMBL1075097  CHEMBL1234777           1       8.5   -1.0  CHEMBL1234777  173927
4        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0  CHEMBL1812661  150148
...                ...            ...         ...       ...    ...            ...     ...
2698282     CHEMBL5113  CHEMBL4647127          11       8.5   -1.0  CHEMBL4647127  145687
2698283     CHEMBL5141  CHEMBL2147092        1031       8.5   -1.0  CHEMBL2147092  227992
2698284     CHEMBL5145  CHEMBL3754116           6       8.5   -1.0  CHEMBL3754116  328259
2698285     CHEMBL5406   CHEMBL570550         836       8.5   -1.0   CHEMBL570550  140043
2698286   

In [43]:
tmp2 = pd.merge(tmp1, variable_list, left_on="variable", right_on=0) #WHY [0]

In [54]:
df.info()
tmp1.info()
tmp2.info()
print(tmp2.nunique())
print(tmp2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2699038 entries, 0 to 2699037
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
dtypes: float64(2), int64(1), object(2)
memory usage: 103.0+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698287 entries, 0 to 2698286
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 164.7+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698287 entries, 0 to 2698286
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variab

### `join` : Add unique target identifier to dataframe

In [55]:
print(tmp2)
print(target_list)

             target_id        cmpd_id  task_group  variable  value              1     cid    0  vid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  CHEMBL1234777  173927  5.5    0
1        CHEMBL1075097  CHEMBL1812661           1       5.5    1.0  CHEMBL1812661  150148  5.5    0
2        CHEMBL1075097  CHEMBL1812662           1       5.5   -1.0  CHEMBL1812662  143524  5.5    0
3        CHEMBL1075097  CHEMBL2326084           1       5.5    1.0  CHEMBL2326084  233209  5.5    0
4        CHEMBL1075097  CHEMBL2326085           1       5.5    1.0  CHEMBL2326085  237772  5.5    0
...                ...            ...         ...       ...    ...            ...     ...  ...  ...
2698282     CHEMBL5113  CHEMBL4647127          11       8.5   -1.0  CHEMBL4647127  145687  8.5    3
2698283     CHEMBL5141  CHEMBL2147092        1031       8.5   -1.0  CHEMBL2147092  227992  8.5    3
2698284     CHEMBL5145  CHEMBL3754116           6       8.5   -1.0  CHEMBL3754116  328259  8.5    3


In [47]:
tmp2.info()
target_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698287 entries, 0 to 2698286
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
 7   0           float64
 8   vid         int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 205.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       888 non-null    object
 1   tid     888 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.0+ KB


In [56]:
join = pd.merge(tmp2,target_list, how="inner", left_on="target_id", right_on=0, suffixes = ['_left', '_right'])

In [49]:
join.drop(columns=[1, '0_left', '0_right'],inplace=True)

In [57]:
join.info()
print(join.nunique())
print(join)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698287 entries, 0 to 2698286
Data columns (total 11 columns):
 #   Column      Dtype  
---  ------      -----  
 0   target_id   object 
 1   cmpd_id     object 
 2   task_group  int64  
 3   variable    float64
 4   value       float64
 5   1           object 
 6   cid         int64  
 7   0_left      float64
 8   vid         int64  
 9   0_right     object 
 10  tid         int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 247.0+ MB
target_id        888
cmpd_id       423737
task_group        10
variable           4
value              2
1             423737
cid           423737
0_left             4
vid                4
0_right          888
tid              888
dtype: int64
             target_id        cmpd_id  task_group  variable  value              1     cid  0_left  vid        0_right  tid
0        CHEMBL1075097  CHEMBL1234777           1       5.5    1.0  CHEMBL1234777  173927     5.5    0  CHEMBL1075097    0
1       

In [58]:
print(join['vid'].nunique())
print(join['tid'].nunique())
print(join['cid'].nunique())

4
888
423737


In [59]:
print(join['task_group'].value_counts())
ttl_counts = 0
for i in join['task_group'].value_counts():
    ttl_counts += i
print (f"total   {ttl_counts}")

6       764834
11      336091
1028    323448
1       309909
0       270448
10      155187
836     150758
1005    149426
1031    126553
643     111633
Name: task_group, dtype: int64
total   2698287


Note: 
    The following targets are connected to 2 different task groups:
    
    CHEMBL3521       2
    CHEMBL4105860    2
    CHEMBL4958       2
    CHEMBL4081       2

In [42]:
# join.groupby(by=['target_id']).nunique('task_group').sort_values(by='task_group')
join.groupby(by=['target_id']).nunique('task_group').sort_values(by='task_group')

Unnamed: 0_level_0,cmpd_id,task_group,variable,value,cid,vid,tid
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CHEMBL1075097,120,1,4,2,120,4,1
CHEMBL3988583,1094,1,4,2,1094,4,1
CHEMBL3988599,321,1,4,2,321,4,1
CHEMBL3989381,114,1,4,1,114,4,1
CHEMBL3991,211,1,4,2,211,4,1
...,...,...,...,...,...,...,...
CHEMBL245,887,1,4,2,887,4,1
CHEMBL3521,214,2,4,2,214,4,1
CHEMBL4105860,204,2,4,1,204,4,1
CHEMBL4958,181,2,4,2,181,4,1


In [54]:
join.groupby('target_id')['task_group'].nunique().sort_values().nlargest(10)

target_id
CHEMBL3521       2
CHEMBL4105860    2
CHEMBL4958       2
CHEMBL4081       2
CHEMBL1075097    1
CHEMBL3988583    1
CHEMBL3988599    1
CHEMBL3989381    1
CHEMBL3991       1
CHEMBL3991501    1
Name: task_group, dtype: int64

## Create `Ymask` sparse matrix
    #-----------------------------------------------------------------------------------------------------
    # Build COOrdiante formatted sparse matrices for Ymask and  Y
    #
    # Ymask =  coo_matrix((data, (i, j)), [shape=(M, N)])
    # to construct from three arrays:
    # data[:] the entries of the matrix, in any order
    # 
    # i[:] the row indices of the matrix entries
    # j[:] the column indices of the matrix entries
    # 
    # Where A[i[k], j[k]] = data[k]. When shape is not specified, it is inferred from the index arrays
    #
    # Number of rows    : Ncmpd - Unique compound ids 
    # Number of columns : Ncol  - Number of unique target ids x number of unqiue thresholds 
    #
    #
    #-----------------------------------------------------------------------------------------------------
    
    Dims:   `Ncmpd x NCol`

In [28]:
print()
Ncmpd = len(set(cmpd_list['cid']))
Ncol  = len(df["target_id"].unique()) * len(df["variable"].unique())

print(f" # of compounds Ncmpd: {Ncmpd}    ")
print(f" # of columns   Ncol : (# unique target_ids x # unique thresholds ) = ({len(df['target_id'].unique())} x {len(df['variable'].unique())}): {Ncol}")


 # of compounds Ncmpd: 423737    
 # of columns   Ncol : (# unique target_ids x # unique thresholds ) = (888 x 4): 3552


In [29]:
I = join["cid"].to_numpy()                         ## Rows: Compund Ids
J = (Nvar * join["tid"] + join["vid"]).to_numpy()  ## Columns: (4 x tid) + vid
V = np.ones(len(I))

Ymask = scipy.sparse.coo_matrix((V,(I,J)),(Ncmpd,Ncol))

In [30]:
print(f' I (Row coordiantes) Len: {len(I)} - {I}') ## Rows 
print(f' J (Col coordiantes) Len: {len(J)} - {J}') ## Columns 
print(f' V (Data)            Len: {len(V)} - {V} min: {min(V)} max: {max(V)}') # Data

 I (Row coordiantes) Len: 2698287 - [173927 150148 143524 ...    211 118961    159]
 J (Col coordiantes) Len: 2698287 - [   0    0    0 ... 3467 3467 3467]
 V (Data)            Len: 2698287 - [1. 1. 1. ... 1. 1. 1.] min: 1.0 max: 1.0


In [31]:
Ymask_neg = (Ymask < 0)

print(f" Ymask        : {type(Ymask)}      Shape: {Ymask.shape}")    
print(f" Ymask.data   : {type(Ymask.data)} Shape: {Ymask.data.shape}")
print()
print(f" #(Ymask < 0) : {(Ymask<0).sum()}")
print(f" #(Ymask > 0) : {(Ymask>0).sum()}")
print(f" #(Ymask = 1) : {(Ymask==1).sum()}")
print(f" #(Ymask = 2) : {(Ymask==2).sum()}")
print(f" #(Ymask = 3) : {(Ymask==3).sum()}")
print(f" #(Ymask = 4) : {(Ymask==4).sum()}")
print(f" #(Ymask = 5) : {(Ymask==5).sum()}")
print(f" #(Ymask > 5) : {(Ymask>5).sum()}")
print(f" Ymask.sum()  : {Ymask.sum()}")
# print(Ymask)

 Ymask        : <class 'scipy.sparse.coo.coo_matrix'>      Shape: (423737, 3552)
 Ymask.data   : <class 'numpy.ndarray'> Shape: (2698287,)

 #(Ymask < 0) : 0
 #(Ymask > 0) : 2612254
 #(Ymask = 1) : 2530793
 #(Ymask = 2) : 76889
 #(Ymask = 3) : 4572
 #(Ymask = 4) : 0
 #(Ymask = 5) : 0
 #(Ymask > 5) : 0
 Ymask.sum()  : 2698287.0


##  Create Y sparse matrix 

In [32]:
y_data = join["value"].to_numpy()

print(f' I (Row coordiantes) Len: {len(I)} - {I}') ## Rows 
print(f' J (Col coordiantes) Len: {len(J)} - {J}') ## Columns 
print(f' V (Data)            Len: {len(y_data)} - {y_data} min: {min(y_data)} max: {max(y_data)}') # Data

 I (Row coordiantes) Len: 2698287 - [173927 150148 143524 ...    211 118961    159]
 J (Col coordiantes) Len: 2698287 - [   0    0    0 ... 3467 3467 3467]
 V (Data)            Len: 2698287 - [ 1.  1. -1. ... -1. -1. -1.] min: -1.0 max: 1.0


In [33]:
Y = scipy.sparse.coo_matrix((y_data,(I,J)),(Ncmpd, Ncol))

In [34]:
print(repr(Y))
print(f" #  (Y < 0 ) : {(Y<0).sum()}")
print(f" #  (Y != 0) : {(Y!=0).sum()}")
print(f" #  (Y > 0)  : {(Y>0).sum()}")
print()
print(f" #  (Y < -5) : {(Y<-5).sum()}")
print(f" #  (Y = -5) : {(Y==-5).sum()}")
print(f" #  (Y = -4) : {(Y==-4).sum()}")
print(f" #  (Y = -3) : {(Y==-3).sum()}")
print(f" #  (Y = -2) : {(Y==-2).sum()}")
print(f" #  (Y = -1) : {(Y==-1).sum()}")
print(f" #  (Y = 1)  : {(Y==1).sum()}")
print(f" #  (Y = 2)  : {(Y==2).sum()}")
print(f" #  (Y = 3)  : {(Y==3).sum()}")
print(f" #  (Y = 4)  : {(Y==4).sum()}")
print(f" #  (Y = 5)  : {(Y==5).sum()}")
print(f" #  (Y > 5)  : {(Y>5).sum()}")
print(f" Y.sum()     : {Y.sum()}")

<423737x3552 sparse matrix of type '<class 'numpy.float64'>'
	with 2698287 stored elements in COOrdinate format>
 #  (Y < 0 ) : 1679110
 #  (Y != 0) : 2612254
 #  (Y > 0)  : 933144

 #  (Y < -5) : 0
 #  (Y = -5) : 0
 #  (Y = -4) : 0
 #  (Y = -3) : 3322
 #  (Y = -2) : 40605
 #  (Y = -1) : 1635183
 #  (Y = 1)  : 895610
 #  (Y = 2)  : 36284
 #  (Y = 3)  : 1250
 #  (Y = 4)  : 0
 #  (Y = 5)  : 0
 #  (Y > 5)  : 0
 Y.sum()     : -754431.0


In [178]:
# print(Y)

## Write Y file

In [35]:
conf.y
all_Y_filename = conf.y[:-3]+"npy"
print(all_Y_filename)

output/chembl_29/chembl_29_Y_all.npy


In [36]:
np.save(all_Y_filename, Y)
print(f"write Y file to: {all_Y_filename}")

write Y file to: output/chembl_29/chembl_29_Y_all.npy


## Write Y file for each task group

In [35]:
task_groups = join['task_group'].unique()
print(len(task_groups), task_groups)

threshold_count = len(df["variable"].unique())
print(f" threshold count {threshold_count}")
print(join['task_group'].value_counts())

10 [   1 1028   10   11    0 1005    6  836  643 1031]
 threshold count 4
6       764834
11      336091
1028    323448
1       309909
0       270448
10      155187
836     150758
1005    149426
1031    126553
643     111633
Name: task_group, dtype: int64


In [50]:
ttl_cols = 0

for i in task_groups:
    print("\n---------------------")
    print(f"   Taskgroup {i:4d} ")
    print("---------------------")
    
    tg_join = join[join['task_group'] == i]
    tg_target_list = pd.DataFrame(tg_join['tid'].unique())    
    tg_target_count = len(tg_target_list) 
    tg_target_list["col_id"] = range(len(tg_target_list))
    tg_join = pd.merge(tg_join, tg_target_list, how="inner", left_on="tid", right_on=0)    
    
#     print(tg_join)    
#     print(tg_join['tid'].unique())
#     print(tg_target_list)    
#     print(tg_join)        

    Ncol = tg_target_count * threshold_count   
    ttl_cols += Ncol
    I = tg_join["cid"].to_numpy()
    J = (Nvar * tg_join["col_id"] + tg_join["vid"]).to_numpy()
    V = tg_join["value"].to_numpy()
    tg_Y = scipy.sparse.csr_matrix((V,(I,J)),(Ncmpd, Ncol))
    
#     tg_Ymask_data = np.ones(len(I))
#     tg_Ymask = scipy.sparse.coo_matrix((y_mask_data,(I,J)),(Ncmpd,Ncol))
    print()
    print(f" Taskgroup {i:6d} - rows: {len(tg_join)}")
    print(f" Target_list shape      : {tg_target_list.shape}")
    print(f" tg_targer_count        : {tg_target_count}")
    print(f' Number of columns      : {Ncol}')
    print(f' I (Row coordiantes) Len: {len(I)} - {I}') ## Rows 
    print(f' J (Col coordiantes) Len: {len(J)} - {J}') ## Columns 
    print(f' V (Data)            Len: {len(V)} - {V} min: {min(V)} max: {max(V)}') # Data   
    print(f' Sparse matrix shape    : {tg_Y.shape}')

    print(repr(tg_Y))
    print(f" #  (Y < 0 ) : {(tg_Y < 0).sum()}")
    print(f" #  (Y > 0)  : {(tg_Y > 0).sum()}")
    print(f" #  (Y != 0) : {(tg_Y !=0).sum()}")    
    print()
    print(f" #  (Y < -5) : {(tg_Y  < -5).sum()}")
    print(f" #  (Y = -5) : {(tg_Y == -5).sum()}")
    print(f" #  (Y = -4) : {(tg_Y == -4).sum()}")
    print(f" #  (Y = -3) : {(tg_Y == -3).sum()}")
    print(f" #  (Y = -2) : {(tg_Y == -2).sum()}")
    print(f" #  (Y = -1) : {(tg_Y == -1).sum()}")
    print(f" #  (Y = +1)  : {(tg_Y == 1).sum()}")
    print(f" #  (Y = +2)  : {(tg_Y == 2).sum()}")
    print(f" #  (Y = +3)  : {(tg_Y == 3).sum()}")
    print(f" #  (Y = +4)  : {(tg_Y == 4).sum()}")
    print(f" #  (Y = +5)  : {(tg_Y == 5).sum()}")
    print(f" #  (Y > +5)  : {(tg_Y  > 5).sum()}")
    print(f" Y.sum() +    : {tg_Y.sum()}")  
    
    modify = False
    if (tg_Y >= +2).sum():
        print(" ============ MODIFY > +1 ===========")
        tg_Y[tg_Y >= +2] = +1
        modify = True
        
    if (tg_Y <= -2).sum():
        print(" ============ MODIFY < -1 ===========")
        tg_Y[tg_Y <= -1] = -1
        modify = True
        
    if modify:
        print(f" Write Y file for task group {i} to: {tg_Y_filename}")
        print(f" #  (Y < 0 ) : {(tg_Y < 0).sum()}")
        print(f" #  (Y > 0)  : {(tg_Y > 0).sum()}")
        print(f" #  (Y != 0) : {(tg_Y !=0).sum()}")        
        print()
        print(f" #  (Y < -5) : {(tg_Y  < -5).sum()}")
        print(f" #  (Y = -5) : {(tg_Y == -5).sum()}")
        print(f" #  (Y = -4) : {(tg_Y == -4).sum()}")
        print(f" #  (Y = -3) : {(tg_Y == -3).sum()}")
        print(f" #  (Y = -2) : {(tg_Y == -2).sum()}")
        print(f" #  (Y = -1) : {(tg_Y == -1).sum()}")
        print(f" #  (Y = +1)  : {(tg_Y == 1).sum()}")
        print(f" #  (Y = +2)  : {(tg_Y == 2).sum()}")
        print(f" #  (Y = +3)  : {(tg_Y == 3).sum()}")
        print(f" #  (Y = +4)  : {(tg_Y == 4).sum()}")
        print(f" #  (Y = +5)  : {(tg_Y == 5).sum()}")
        print(f" #  (Y > +5)  : {(tg_Y  > 5).sum()}")
        print(f" Y.sum() +    : {tg_Y.sum()}")  

        
    tg_Y_filename= f"{conf.y[:-4]}_tg_{i}_cols_{Ncol}.npy"

    np.save(tg_Y_filename, tg_Y)
    print(f" Taskgroup {i} written to tg_Y_filename  ")
print(f"ttl_cols: {ttl_cols}")


---------------------
   Taskgroup    1 
---------------------

 Taskgroup      1 - rows: 309909
 Target_list shape      : (156, 2)
 tg_targer_count        : 156
 Number of columns      : 624
 I (Row coordiantes) Len: 309909 - [173927 150148 143524 ... 226340 225143 225274]
 J (Col coordiantes) Len: 309909 - [  0   0   0 ... 623 623 623]
 V (Data)            Len: 309909 - [ 1.  1. -1. ... -1. -1. -1.] min: -1.0 max: 1.0
 Sparse matrix shape    : (423737, 624)
<423737x624 sparse matrix of type '<class 'numpy.float64'>'
	with 309909 stored elements in Compressed Sparse Row format>
 #  (Y < 0 ) : 219244
 #  (Y > 0)  : 90665
 #  (Y != 0) : 309909

 #  (Y < -5) : 0
 #  (Y = -5) : 0
 #  (Y = -4) : 0
 #  (Y = -3) : 0
 #  (Y = -2) : 0
 #  (Y = -1) : 219244
 #  (Y = +1)  : 90665
 #  (Y = +2)  : 0
 #  (Y = +3)  : 0
 #  (Y = +4)  : 0
 #  (Y = +5)  : 0
 #  (Y > +5)  : 0
 Y.sum() +    : -128579.0
 Taskgroup 1 written to tg_Y_filename  

---------------------
   Taskgroup 1028 
--------------------

 #  (Y = -3) : 0
 #  (Y = -2) : 0
 #  (Y = -1) : 91904
 #  (Y = +1)  : 38227
 #  (Y = +2)  : 0
 #  (Y = +3)  : 0
 #  (Y = +4)  : 0
 #  (Y = +5)  : 0
 #  (Y > +5)  : 0
 Y.sum() +    : -53677.0
 Taskgroup 836 written to tg_Y_filename  

---------------------
   Taskgroup  643 
---------------------

 Taskgroup    643 - rows: 111633
 Target_list shape      : (46, 2)
 tg_targer_count        : 46
 Number of columns      : 184
 I (Row coordiantes) Len: 111633 - [200706 346235  78309 ... 148868 147673 146621]
 J (Col coordiantes) Len: 111633 - [  0   0   0 ... 183 183 183]
 V (Data)            Len: 111633 - [ 1.  1.  1. ... -1. -1. -1.] min: -1.0 max: 1.0
 Sparse matrix shape    : (423737, 184)
<423737x184 sparse matrix of type '<class 'numpy.float64'>'
	with 111633 stored elements in Compressed Sparse Row format>
 #  (Y < 0 ) : 69820
 #  (Y > 0)  : 41813
 #  (Y != 0) : 111633

 #  (Y < -5) : 0
 #  (Y = -5) : 0
 #  (Y = -4) : 0
 #  (Y = -3) : 0
 #  (Y = -2) : 0
 #  (Y = -1) : 69820
 #  (Y = +

##  Create Folding file

In [43]:
print(Ymask.shape)

print(f"Calculate folding... nfolds: {conf.nfolds}")

(423737, 3552)
Calculate folding... nfolds: 5


In [44]:
folds = mtcv_clustered(Ymask, pd.Series(clusters), conf.nfolds)

* Y_in       : <class 'scipy.sparse.coo.coo_matrix'> 	 (423737, 3552)
* clusters: <class 'pandas.core.series.Series'>  	 (423737,) 
           Min: 2 Max:26057
0         24163
1         24163
2          8923
3         24163
4         12269
          ...  
423732    24163
423733    24670
423734    18967
423735    15079
423736    12696
Length: 423737, dtype: int64 

* cl_uniq: <class 'numpy.ndarray'> 16747
[24163  8923 12269 ... 21287 10892 15079]

* cid:   <class 'pandas.core.series.Series'> 423737
24163        0
24163        0
8923         1
24163        0
12269        2
         ...  
24163        0
24670        4
18967       12
15079    16746
12696    15924
Length: 423737, dtype: int64

* Create C sparse matrix - indicating the clusters compounds are assigned to.... 
* Shape of sparse matrix: Rows(#clusters):16747 Columns(#compounds): 423737 

* Y_bin     :   <class 'scipy.sparse.coo.coo_matrix'> (423737, 3552)
* Y_bin.data:   <class 'numpy.ndarray'> (2698287,)
 Sum: 0
 Sum: 2612254


In [45]:
print(folds.shape)
print(folds.min(), folds.max())

(423737,)
0 4


In [46]:
print(conf.folding)

output/chembl_29/chembl_29_folding.npy


In [47]:
#TODO: save folding
np.save(conf.folding, folds)
print(f"save folding file to {conf.folding}")

save folding file to output/chembl_29/chembl_29_folding.npy


### `mtcv_clustered` step by step

In [275]:
print(clusters.shape)
clusters_ser = pd.Series(clusters)
clusters_ser

(423737,)


0         12266
1         10766
2         10766
3          4557
4          4557
          ...  
423732     4557
423733    15081
423734     8125
423735    25220
423736    15081
Length: 423737, dtype: int64

In [276]:
assert clusters_ser.shape[0] == Y.shape[0]

In [None]:
## Assign a sequential number to the cluster ids
cl_uniq = clusters_ser.unique()
cl2id   = pd.Series(np.arange(cl_uniq.shape[0]), index=cl_uniq)
cid     = cl2id[clusters]

In [283]:
print(f" Number of unique clusters: {cl_uniq.shape}")
print(f" cl2id: {cl2id.shape}")
print(cl2id)


 Number of unique clusters: (17672,)
 cl2id: (17672,)
12266        0
10766        1
4557         2
9771         3
14746        4
         ...  
7565     17667
1755     17668
13401    17669
22801    17670
25220    17671
Length: 17672, dtype: int64
 cid  : (423737,)
12266        0
10766        1
10766        1
4557         2
4557         2
         ...  
4557         2
15081        7
8125      6788
25220    17671
15081        7
Length: 423737, dtype: int64


In [278]:
print(cl_uniq)
print(cl_uniq.shape, cl_uniq.min(), cl_uniq.max())

[12266 10766  4557 ... 13401 22801 25220]
(17672,) 0 25993


In [284]:
print(f" cid  : {cid.shape}")
print(' cid.values: ', cid.values)
print(' cid.shape : ', cid.shape)
print(cid)

 cid  : (423737,)
 cid.values:  [    0     1     1 ...  6788 17671     7]
 cid.shape :  (423737,)
12266        0
10766        1
10766        1
4557         2
4557         2
         ...  
4557         2
15081        7
8125      6788
25220    17671
15081        7
Length: 423737, dtype: int64


In [132]:
## creating cluster2compound matrix
## csr_matrix( (data,(row_ind,col_ind)))
##    where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].
##   
##   Number of rows:     cid.values (cluster ids)
##   Number of columns : number of rows in Y file ()
##   Clusters x Y Rows
## 
##   in each row (cluster) there will be a '1' in the columns (compounds) 
##   where the compound has been assigned to the cluster
##   each compound is only assigned to one cluster, so the sum of each column 
##   is == 1 (i.e., C[:,i].sum() == 1)

C    = scipy.sparse.csr_matrix(
        (np.ones(cid.shape[0], dtype=np.int8), 
        (cid.values, np.arange(cid.shape[0])) )
        )


In [294]:
print(repr(C))
print(C[:,1].sum())

<17672x423737 sparse matrix of type '<class 'numpy.int8'>'
	with 423737 stored elements in Compressed Sparse Row format>
1


In [296]:
## Ybin : Inidcates which compound / labels are present in the Y dataset

In [139]:
Ybin      = Y.copy()

In [140]:
print(Ybin.shape)
print(Ybin.data.shape)
print(Ybin.data)

(423737, 3552)
(2698287,)
[ 1.  1. -1. ... -1. -1. -1.]


In [141]:
Ybin.data = np.ones(Ybin.data.shape[0], dtype=np.int8)

In [301]:
print(Ybin.shape)
print(Ybin.data.shape)
# print(Ybin)
print(f" #  (Y < 0 ) : {(Ybin<0).sum()}")
print(f" #  (Y != 0) : {(Ybin!=0).sum()}")
print(f" #  (Y > 0)  : {(Ybin>0).sum()}")
print()
print(f" #  (Y < -5) : {(Ybin<-5).sum()}")
print(f" #  (Y = -5) : {(Ybin==-5).sum()}")
print(f" #  (Y = -4) : {(Ybin==-4).sum()}")
print(f" #  (Y = -3) : {(Ybin==-3).sum()}")
print(f" #  (Y = -2) : {(Ybin==-2).sum()}")
print(f" #  (Y = -1) : {(Ybin==-1).sum()}")
print(f" #  (Y = 1)  : {(Ybin==1).sum()}")
print(f" #  (Y = 2)  : {(Ybin==2).sum()}")
print(f" #  (Y = 3)  : {(Ybin==3).sum()}")
print(f" #  (Y = 4)  : {(Ybin==4).sum()}")
print(f" #  (Y = 5)  : {(Ybin==5).sum()}")
print(f" #  (Y > 5)  : {(Ybin>5).sum()}")
print(f" Y.sum()     : {Ybin.sum()}")

(423737, 3552)
(2698287,)
 #  (Y < 0 ) : 0
 #  (Y != 0) : 2612254
 #  (Y > 0)  : 2612254

 #  (Y < -5) : 0
 #  (Y = -5) : 0
 #  (Y = -4) : 0
 #  (Y = -3) : 0
 #  (Y = -2) : 0
 #  (Y = -1) : 0
 #  (Y = 1)  : 2530793
 #  (Y = 2)  : 76889
 #  (Y = 3)  : 4572
 #  (Y = 4)  : 0
 #  (Y = 5)  : 0
 #  (Y > 5)  : 0
 Y.sum()     : 2698287


In [146]:
## Get a count of labels in each cluster  
## C (17672x423737) dot (423737, 3552) = 17672 x 3552
cl_counts = C.dot(Ybin)

In [300]:
print(cl_counts.shape)
print(cl_counts.min(), cl_counts.max())
print(cl_counts)

(17672, 3552)
-128 127
  (0, 672)	14
  (0, 673)	14
  (0, 674)	14
  (0, 675)	14
  (1, 0)	24
  (1, 1)	24
  (1, 2)	24
  (1, 3)	24
  (1, 4)	-31
  (1, 5)	-32
  (1, 6)	-32
  (1, 7)	-33
  (1, 8)	104
  (1, 9)	104
  (1, 10)	104
  (1, 11)	104
  (1, 12)	61
  (1, 13)	59
  (1, 14)	58
  (1, 15)	58
  (1, 16)	26
  (1, 17)	26
  (1, 18)	26
  (1, 19)	26
  (1, 20)	31
  :	:
  (17669, 1811)	1
  (17669, 2832)	1
  (17669, 2833)	1
  (17669, 2834)	1
  (17669, 2835)	1
  (17670, 1180)	1
  (17670, 1181)	1
  (17670, 1182)	1
  (17670, 1183)	1
  (17671, 1364)	1
  (17671, 1365)	1
  (17671, 1366)	1
  (17671, 1367)	1
  (17671, 1500)	1
  (17671, 1501)	1
  (17671, 1502)	1
  (17671, 1503)	1
  (17671, 2536)	1
  (17671, 2537)	1
  (17671, 2538)	1
  (17671, 2539)	1
  (17671, 3300)	1
  (17671, 3301)	1
  (17671, 3302)	1
  (17671, 3303)	1


### `mtcv` step by step

In [157]:
nfolds = conf.nfolds
pfolds = None
if nfolds is None and pfolds is None:
    raise ValueError("nfolds or pfolds must be specified.")
if pfolds is not None and np.abs(np.sum(pfolds) - 1.0) > 1e-5:
    raise ValueError("pfolds must sum to 1.0.")
if pfolds is None:
    pfolds = np.ones(nfolds) / nfolds
else:
    nfolds = len(pfolds)

In [199]:
print(pfolds)
print(nfolds)

[0.2 0.2 0.2 0.2 0.2]
5


In [314]:
target_sizes = np.array(cl_counts.sum(0)).flatten() ## Sum cl_counts over targets
comp_sizes   = np.array(cl_counts.sum(1)).flatten() ## sum cl_counts over compounds

In [311]:
print(' cl_counts shape   : ',cl_counts.shape)
print()
print(' target_sizes shape: ',target_sizes.shape)
print(' Number of targets in each cluster: ', target_sizes, target_sizes.sum())
print(' target_sizes < 0 : ', (target_sizes < 0).sum())
print(' target_sizes > 0 : ', (target_sizes > 0).sum())
print()
print(' comp_sizes shape  : ', comp_sizes.shape)
print(' Number of compounds in each cluster: ', comp_sizes)
print(' comp_sizes < 0 : ', (comp_sizes < 0).sum())
print(' comp_sizes > 0 : ', (comp_sizes > 0).sum())

 cl_counts shape   :  (17672, 3552)

 target_sizes shape:  (3552,)
 Number of targets in each cluster:  [120 120 120 ... 498 761 752] 917039
 target_sizes < 0 :  153
 target_sizes > 0 :  3394

 comp_sizes shape  :  (17672,)
 Number of compounds in each cluster:  [   56 82909 56604 ...    12     4    16]
 comp_sizes < 0 :  48
 comp_sizes > 0 :  17624


In [312]:
df_10 = pd.DataFrame({"row": np.arange(cl_counts.shape[0]), "size": comp_sizes})
df_10.shape

(17672, 2)

In [315]:
df_10.info()
df_10

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17672 entries, 0 to 17671
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   row     17672 non-null  int64
 1   size    17672 non-null  int64
dtypes: int64(2)
memory usage: 276.2 KB


Unnamed: 0,row,size
0,0,56
1,1,82909
2,2,56604
3,3,472
4,4,272
...,...,...
17667,17667,8
17668,17668,8
17669,17669,12
17670,17670,4


In [None]:
# Randomize df_10

In [316]:
df_11 = df_10.sample(frac=1)

In [317]:
df_11.info()
df_11.shape
df_11

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17672 entries, 6420 to 3767
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   row     17672 non-null  int64
 1   size    17672 non-null  int64
dtypes: int64(2)
memory usage: 414.2 KB


Unnamed: 0,row,size
6420,6420,40
13961,13961,20
16567,16567,16
156,156,28
15820,15820,4
...,...,...
2908,2908,12
16049,16049,16
1413,1413,20
17577,17577,24


In [None]:
## Sort in descending compound count order 

In [318]:
df_11.sort_values("size", inplace=True, ascending=False)

In [319]:
df_11.info()
df_11.shape
df_11

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17672 entries, 1 to 15151
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   row     17672 non-null  int64
 1   size    17672 non-null  int64
dtypes: int64(2)
memory usage: 414.2 KB


Unnamed: 0,row,size
1,1,82909
7,7,79806
2,2,56604
49,49,6921
3969,3969,1745
...,...,...
13443,13443,-539
13272,13272,-620
16191,16191,-740
606,606,-750


In [320]:
fold_sizes = np.zeros((nfolds, cl_counts.shape[1]), dtype=np.int8)
#max_fold_sizes = np.ceil(target_sizes / nfolds).astype(np.int)
max_fold_sizes = np.ceil(np.outer(target_sizes, pfolds))


In [321]:
print('\n fold_sizes \n')
print(fold_sizes.shape)
print(fold_sizes)
print('\n pfolds & target_sizes \n')
print(pfolds.shape)
print(pfolds)
print(target_sizes.shape)
print(target_sizes)
print('\n np.outer(target_sizes, pfolds) \n')
print(np.outer(target_sizes, pfolds).shape)
print(np.outer(target_sizes, pfolds))
print('\n max_fold_sizes \n')
print(max_fold_sizes.shape)
print(max_fold_sizes)



 fold_sizes 

(5, 3552)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

 pfolds & target_sizes 

(5,)
[0.2 0.2 0.2 0.2 0.2]
(3552,)
[120 120 120 ... 498 761 752]

 np.outer(target_sizes, pfolds) 

(3552, 5)
[[ 24.   24.   24.   24.   24. ]
 [ 24.   24.   24.   24.   24. ]
 [ 24.   24.   24.   24.   24. ]
 ...
 [ 99.6  99.6  99.6  99.6  99.6]
 [152.2 152.2 152.2 152.2 152.2]
 [150.4 150.4 150.4 150.4 150.4]]

 max_fold_sizes 

(3552, 5)
[[ 24.  24.  24.  24.  24.]
 [ 24.  24.  24.  24.  24.]
 [ 24.  24.  24.  24.  24.]
 ...
 [100. 100. 100. 100. 100.]
 [153. 153. 153. 153. 153.]
 [151. 151. 151. 151. 151.]]


In [366]:
## output vector (initialized to -1)
folds = np.zeros(cl_counts.shape[0], dtype=np.int8) - 1
farray = np.arange(nfolds)    

In [367]:
print('\n folds\n')
print(folds.shape)
print(folds)
print(farray)


 folds

(17672,)
[-1 -1 -1 ... -1 -1 -1]
[0 1 2 3 4]


In [368]:
cl_counts.shape

(17672, 3552)

In [369]:
# for i in range(cl_counts.shape[0]):
i= 0

In [358]:
i += 1


In [370]:
# for i in range(mask.shape[0]):
idx   = df_11.index[i]
print(f"row: {i}  index: {idx}         df_11['row'][idx]: {df_11['row'][idx]}         df_11['size'][idx]: {df_11['size'][idx]}")

row: 0  index: 1         df_11['row'][idx]: 1         df_11['size'][idx]: 82909


In [371]:
random_folds =  np.random.choice(farray, size=nfolds, replace=False, p=pfolds)
print(random_folds)
random_folds = iter(random_folds)

[3 1 4 2 0]


In [361]:
# j  = next(random_folds)
print(j)

print('cl_counts[idx] : ', cl_counts[idx].shape, cl_counts[idx].sum())
print((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).shape , (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]))
print((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).any())
print(cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()))
print(cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()) == 0)

2


In [372]:
#for j in np.random.permutation(nfolds):
# make a random list of the folds, and go through the list 
for j in random_folds:
    print(f'fold: {j}')
    print('   cl_counts[idx] : ', cl_counts[idx].shape, cl_counts[idx].sum())
    print('   test shape     : ', (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).shape , (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]))
    print('   test.any()     : ', (fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).any())
    print('   dot operation  : ', cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()))
    print('   dot op == 0    : ', cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()) == 0)    
    if cl_counts[idx,:].dot((fold_sizes[j,:] + cl_counts[idx,:] > max_fold_sizes[:,j]).transpose()) == 0:
        ## compound fits into fold j
        print(f" *** compound fits into fold {j}")
        folds[idx] = j
        fold_sizes[j,:] += mask[idx,:]
        break

fold: 3
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 1
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 4
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 2
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-119]]
   dot op == 0    :  [[False]]
fold: 0
   cl_counts[idx] :  (1, 3552) 82909
   test shape     :  (1, 3552) [[False False False ... False False False]]
   test.any()     :  True
   dot operation  :  [[-11

In [373]:
print(folds[idx])

-1


In [374]:
        
## If still havent assigned, randomly assign to a fold.
if folds[idx] == -1:
    picked_fold = np.random.randint(0, nfolds)
    folds[idx] = picked_fold
    fold_sizes[folds[idx],:] += cl_counts[idx,:]
    print(f" random assignment into {picked_fold} ")
    print(f" cl_counts[{idx}]  :    {cl_counts[idx,:]} ")
    print(f" fold_sizes[{folds[idx]},:]  :    {fold_sizes[folds[idx],:]} ")

 random assignment into 4 
 cl_counts[1]  :      (0, 0)	24
  (0, 1)	24
  (0, 2)	24
  (0, 3)	24
  (0, 4)	-31
  (0, 5)	-32
  (0, 6)	-32
  (0, 7)	-33
  (0, 8)	104
  (0, 9)	104
  (0, 10)	104
  (0, 11)	104
  (0, 12)	61
  (0, 13)	59
  (0, 14)	58
  (0, 15)	58
  (0, 16)	26
  (0, 17)	26
  (0, 18)	26
  (0, 19)	26
  (0, 20)	31
  (0, 21)	31
  (0, 22)	31
  (0, 23)	31
  (0, 24)	31
  :	:
  (0, 3527)	56
  (0, 3528)	38
  (0, 3529)	38
  (0, 3530)	38
  (0, 3531)	38
  (0, 3532)	-94
  (0, 3533)	-94
  (0, 3534)	-94
  (0, 3535)	-94
  (0, 3536)	-80
  (0, 3537)	-80
  (0, 3538)	-80
  (0, 3539)	-80
  (0, 3540)	40
  (0, 3541)	40
  (0, 3542)	40
  (0, 3543)	40
  (0, 3544)	116
  (0, 3545)	122
  (0, 3546)	122
  (0, 3547)	122
  (0, 3548)	-59
  (0, 3549)	-59
  (0, 3550)	-128
  (0, 3551)	-128 
 fold_sizes[4,:]  :    [  24   24   24 ...  -59 -128 -128] 


In [365]:
print(folds)

[3 4 4 ... 3 2 0]
