In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import argparse

In [2]:
parser = argparse.ArgumentParser(description="Multi-task clustered crossvalidation")
parser.add_argument('--y', type=str, required=True)
parser.add_argument('--compounds', type=str, required=True)
parser.add_argument('--clusters', type=str, required=True)
parser.add_argument('--folding', type=str, default="output/folding.npy")
parser.add_argument('--nfolds', type=int, default=5)

in_args = "--y output/chembl_29/chembl_29_thresh.csv --clusters output/clustering.npy " \
          "--folding output/folding.npy --compounds output/chembl_29_X_cmpds.csv"
in_args = in_args.split()

conf = parser.parse_args(in_args)
conf_vars = vars(conf)
for x in conf_vars:
    print(f' parm: {x:10s}    value: {conf_vars[x]}')

 parm: y             value: output/chembl_29/chembl_29_thresh.csv
 parm: compounds     value: output/chembl_29_X_cmpds.csv
 parm: clusters      value: output/clustering.npy
 parm: folding       value: output/folding.npy
 parm: nfolds        value: 5


In [13]:
#Load the clustering
clusters = np.load(conf.clusters, allow_pickle=True)
print(clusters)


[ 8014  3958  3466 ... 10806 19103 13660]


#### Load y matrix and create the compund threshold file `output/chembl_29_thresh.csv`

In [23]:
df = pd.read_csv(conf.y)

Ncol  = len(df["target_id"].unique()) * len(df["variable"].unique())

print(df)


             target_id        cmpd_id  variable  value
0        CHEMBL1075097  CHEMBL1234777       5.5    1.0
1        CHEMBL1075097  CHEMBL1812661       5.5    1.0
2        CHEMBL1075097  CHEMBL1812662       5.5   -1.0
3        CHEMBL1075097  CHEMBL2326084       5.5    1.0
4        CHEMBL1075097  CHEMBL2326085       5.5    1.0
...                ...            ...       ...    ...
2612992     CHEMBL6175  CHEMBL4646564       8.5   -1.0
2612993     CHEMBL6175  CHEMBL4648732       8.5   -1.0
2612994     CHEMBL6175  CHEMBL4649004       8.5   -1.0
2612995     CHEMBL6175   CHEMBL578512       8.5   -1.0
2612996     CHEMBL6175    CHEMBL90852       8.5   -1.0

[2612997 rows x 4 columns]


In [25]:

print(f"Number of rows   : {len(df):7d}")
print(f"Unique target ids: {len(df['target_id'].unique()):7d}   ")
print(f"Unique thresholds: {len(df['variable'].unique()):7d}")
print(f'Number of cols   : {Ncol:7d}')

Number of rows   : 2612997
Unique target ids:     888   
Unique thresholds:       4
Number of cols   :    3552


#### Load compound list `output/chembl_29_X_cmpds.csv`

In [26]:
cmpd_list = pd.read_csv(conf.compounds, header=None)
print(cmpd_list)

             0              1
0            0   CHEMBL405398
1            1   CHEMBL403325
2            2   CHEMBL501943
3            3   CHEMBL501094
4            4   CHEMBL505943
...        ...            ...
423731  423731  CHEMBL4297644
423732  423732  CHEMBL4297666
423733  423733  CHEMBL4297674
423734  423734  CHEMBL4298138
423735  423735  CHEMBL4298140

[423736 rows x 2 columns]


In [27]:
cmpd_list["cid"] = range(len(cmpd_list))
Ncmpd = len(set(cmpd_list[0]))

In [28]:
print(f"Size of cmpd_list: {len(cmpd_list)}")
print(cmpd_list)
print(Ncmpd)

Size of cmpd_list: 423736
             0              1     cid
0            0   CHEMBL405398       0
1            1   CHEMBL403325       1
2            2   CHEMBL501943       2
3            3   CHEMBL501094       3
4            4   CHEMBL505943       4
...        ...            ...     ...
423731  423731  CHEMBL4297644  423731
423732  423732  CHEMBL4297666  423732
423733  423733  CHEMBL4297674  423733
423734  423734  CHEMBL4298138  423734
423735  423735  CHEMBL4298140  423735

[423736 rows x 3 columns]
423736


#### Load the clustering info `output/clustering.npy`

In [172]:
#Load the clustering
clusters = np.load(conf.clusters, allow_pickle=True)
print(len(clusters), clusters)


423736 [ 8014  3958  3466 ... 10806 19103 13660]


#### Create mask

In [21]:
assert len(clusters) == len(cmpd_list[0]), "Number of compound must agree with the size of clustering vector"

In [29]:
 
print(f"Ncmpd: {Ncmpd}    Ncol: {Ncol}")

Ncmpd: 423736    Ncol: 3552


#### Create dataframe of unique target_ids 

In [16]:
target_list = pd.DataFrame(df["target_id"].unique())
target_list["tid"] = range(len(target_list))
print(target_list)

                 0  tid
0    CHEMBL1075097    0
1    CHEMBL1075104    1
2    CHEMBL1075138    2
3    CHEMBL1075145    3
4    CHEMBL1075165    4
..             ...  ...
883     CHEMBL6154  883
884     CHEMBL6164  884
885     CHEMBL6166  885
886     CHEMBL6167  886
887     CHEMBL6175  887

[888 rows x 2 columns]


In [8]:
variable_list = pd.DataFrame(df["variable"].unique())
variable_list["vid"] = range(len(variable_list))
print(variable_list)

     0  vid
0  5.5    0
1  6.5    1
2  7.5    2
3  8.5    3


In [9]:
Nvar = len(variable_list)
print(Nvar)

4


#### `tmp1` : Add unique target identifier to dataframe

In [33]:
print(df)
print(cmpd_list)

             target_id        cmpd_id  variable  value
0        CHEMBL1075097  CHEMBL1234777       5.5    1.0
1        CHEMBL1075097  CHEMBL1812661       5.5    1.0
2        CHEMBL1075097  CHEMBL1812662       5.5   -1.0
3        CHEMBL1075097  CHEMBL2326084       5.5    1.0
4        CHEMBL1075097  CHEMBL2326085       5.5    1.0
...                ...            ...       ...    ...
2612992     CHEMBL6175  CHEMBL4646564       8.5   -1.0
2612993     CHEMBL6175  CHEMBL4648732       8.5   -1.0
2612994     CHEMBL6175  CHEMBL4649004       8.5   -1.0
2612995     CHEMBL6175   CHEMBL578512       8.5   -1.0
2612996     CHEMBL6175    CHEMBL90852       8.5   -1.0

[2612997 rows x 4 columns]
             0              1     cid
0            0   CHEMBL405398       0
1            1   CHEMBL403325       1
2            2   CHEMBL501943       2
3            3   CHEMBL501094       3
4            4   CHEMBL505943       4
...        ...            ...     ...
423731  423731  CHEMBL4297644  423731
423732  

In [35]:
tmp1 = pd.merge(df,target_list, how="inner", left_on="target_id", right_on=0)
print(tmp1)

             target_id        cmpd_id  variable  value              0  tid
0        CHEMBL1075097  CHEMBL1234777       5.5    1.0  CHEMBL1075097    0
1        CHEMBL1075097  CHEMBL1812661       5.5    1.0  CHEMBL1075097    0
2        CHEMBL1075097  CHEMBL1812662       5.5   -1.0  CHEMBL1075097    0
3        CHEMBL1075097  CHEMBL2326084       5.5    1.0  CHEMBL1075097    0
4        CHEMBL1075097  CHEMBL2326085       5.5    1.0  CHEMBL1075097    0
...                ...            ...       ...    ...            ...  ...
2612992     CHEMBL6175  CHEMBL4646564       8.5   -1.0     CHEMBL6175  887
2612993     CHEMBL6175  CHEMBL4648732       8.5   -1.0     CHEMBL6175  887
2612994     CHEMBL6175  CHEMBL4649004       8.5   -1.0     CHEMBL6175  887
2612995     CHEMBL6175   CHEMBL578512       8.5   -1.0     CHEMBL6175  887
2612996     CHEMBL6175    CHEMBL90852       8.5   -1.0     CHEMBL6175  887

[2612997 rows x 6 columns]


#### `tmp2` : Add unique compound identifier to tmp1 

In [36]:
print(tmp1)
print(cmpd_list)

             target_id        cmpd_id  variable  value              0  tid
0        CHEMBL1075097  CHEMBL1234777       5.5    1.0  CHEMBL1075097    0
1        CHEMBL1075097  CHEMBL1812661       5.5    1.0  CHEMBL1075097    0
2        CHEMBL1075097  CHEMBL1812662       5.5   -1.0  CHEMBL1075097    0
3        CHEMBL1075097  CHEMBL2326084       5.5    1.0  CHEMBL1075097    0
4        CHEMBL1075097  CHEMBL2326085       5.5    1.0  CHEMBL1075097    0
...                ...            ...       ...    ...            ...  ...
2612992     CHEMBL6175  CHEMBL4646564       8.5   -1.0     CHEMBL6175  887
2612993     CHEMBL6175  CHEMBL4648732       8.5   -1.0     CHEMBL6175  887
2612994     CHEMBL6175  CHEMBL4649004       8.5   -1.0     CHEMBL6175  887
2612995     CHEMBL6175   CHEMBL578512       8.5   -1.0     CHEMBL6175  887
2612996     CHEMBL6175    CHEMBL90852       8.5   -1.0     CHEMBL6175  887

[2612997 rows x 6 columns]
             0              1     cid
0            0   CHEMBL405398     

In [60]:
tmp2 = pd.merge(tmp1, cmpd_list, left_on="cmpd_id", right_on=1)
print(tmp2)
print()
cid_value_counts = tmp2['cid'].value_counts()
print(f'cid value counts : {cid_value_counts.shape} \n')
print(cid_value_counts)
print(f"\nNumber of unqiue cids:  {tmp2['cid'].nunique()}")



             target_id        cmpd_id  variable  value            0_x  tid  \
0        CHEMBL1075097  CHEMBL1234777       5.5    1.0  CHEMBL1075097    0   
1        CHEMBL1075097  CHEMBL1234777       6.5   -1.0  CHEMBL1075097    0   
2        CHEMBL1075097  CHEMBL1234777       7.5   -1.0  CHEMBL1075097    0   
3        CHEMBL1075097  CHEMBL1234777       8.5   -1.0  CHEMBL1075097    0   
4        CHEMBL1075097  CHEMBL1812661       5.5    1.0  CHEMBL1075097    0   
...                ...            ...       ...    ...            ...  ...   
2612245     CHEMBL6175  CHEMBL4649004       7.5   -1.0     CHEMBL6175  887   
2612246     CHEMBL6175  CHEMBL4649004       8.5   -1.0     CHEMBL6175  887   
2612247     CHEMBL6175  CHEMBL4202719       6.5   -1.0     CHEMBL6175  887   
2612248     CHEMBL6175  CHEMBL4202719       7.5   -1.0     CHEMBL6175  887   
2612249     CHEMBL6175  CHEMBL4202719       8.5   -1.0     CHEMBL6175  887   

            0_y              1     cid  
0        173926  CHEMB

#### `join` : add unique threshold identifier to tmp2

In [37]:
print(tmp2)
print(variable_list)

             target_id        cmpd_id  variable  value            0_x  tid  \
0        CHEMBL1075097  CHEMBL1234777       5.5    1.0  CHEMBL1075097    0   
1        CHEMBL1075097  CHEMBL1234777       6.5   -1.0  CHEMBL1075097    0   
2        CHEMBL1075097  CHEMBL1234777       7.5   -1.0  CHEMBL1075097    0   
3        CHEMBL1075097  CHEMBL1234777       8.5   -1.0  CHEMBL1075097    0   
4        CHEMBL1075097  CHEMBL1812661       5.5    1.0  CHEMBL1075097    0   
...                ...            ...       ...    ...            ...  ...   
2612245     CHEMBL6175  CHEMBL4649004       7.5   -1.0     CHEMBL6175  887   
2612246     CHEMBL6175  CHEMBL4649004       8.5   -1.0     CHEMBL6175  887   
2612247     CHEMBL6175  CHEMBL4202719       6.5   -1.0     CHEMBL6175  887   
2612248     CHEMBL6175  CHEMBL4202719       7.5   -1.0     CHEMBL6175  887   
2612249     CHEMBL6175  CHEMBL4202719       8.5   -1.0     CHEMBL6175  887   

            0_y              1     cid  
0        173926  CHEMB

In [38]:
join = pd.merge(tmp2, variable_list, left_on="variable", right_on=0) #WHY [0]
print(join)

             target_id        cmpd_id  variable  value            0_x  tid  \
0        CHEMBL1075097  CHEMBL1234777       5.5    1.0  CHEMBL1075097    0   
1        CHEMBL1075097  CHEMBL1812661       5.5    1.0  CHEMBL1075097    0   
2        CHEMBL1075097  CHEMBL1812662       5.5   -1.0  CHEMBL1075097    0   
3        CHEMBL1075097  CHEMBL2326084       5.5    1.0  CHEMBL1075097    0   
4        CHEMBL1075097  CHEMBL2326085       5.5    1.0  CHEMBL1075097    0   
...                ...            ...       ...    ...            ...  ...   
2612245     CHEMBL6175  CHEMBL4646339       8.5   -1.0     CHEMBL6175  887   
2612246     CHEMBL6175  CHEMBL4646564       8.5   -1.0     CHEMBL6175  887   
2612247     CHEMBL6175  CHEMBL4648732       8.5   -1.0     CHEMBL6175  887   
2612248     CHEMBL6175  CHEMBL4649004       8.5   -1.0     CHEMBL6175  887   
2612249     CHEMBL6175  CHEMBL4202719       8.5   -1.0     CHEMBL6175  887   

            0_y              1     cid    0  vid  
0        173

#### Create `Ymask` sparse matrix
Dims:   `Ncmpd x NCol`

In [212]:
I = join["cid"].to_numpy()
J = (Nvar * join["tid"] + join["vid"]).to_numpy()
V = np.ones(len(I))

In [220]:
print(f' Nvar: {Nvar}')
print(f' I Len: {len(I)} - {I}') ## Rows 
print(f' J Len: {len(J)} - {J}') ## Columns 
print(f' V Len: {len(V)} - {V}')

 Nvar: 4
 I Len: 2612250 - [173926 150148 143524 ... 146159 145835 416262]
 J Len: 2612250 - [   0    0    0 ... 3551 3551 3551]
 V Len: 2612250 - [1. 1. 1. ... 1. 1. 1.]


In [None]:
Ymask = scipy.sparse.coo_matrix((V,(I,J)),(Ncmpd,Ncol))

In [313]:
Ymask_neg = (Ymask < 0)
print(Ymask.shape)

print(f" Ymask        : {type(Ymask)} {Ymask.shape}")    
print(f" Ymask.data   : {type(Ymask.data)} {Ymask.data.shape}")
print(f" Ymask sum    : {Ymask.sum()}")
print(f" Ymask < 0    : {(Ymask>0).sum()}")
print(f" Ymask > 0    : {(Ymask<0).sum()}")
print(f" Ymask        : {    Ymask.sum()}")
print(Ymask)

(423736, 3552)
 Ymask        : <class 'scipy.sparse.coo.coo_matrix'> (423736, 3552)
 Ymask.data   : <class 'numpy.ndarray'> (2612250,)
 Ymask sum    : 2612250.0
 Ymask < 0 : 2612250
 Ymask > 0 : 0
 Ymask     : 2612250.0
  (173926, 0)	1.0
  (150148, 0)	1.0
  (143524, 0)	1.0
  (233208, 0)	1.0
  (237771, 0)	1.0
  (237772, 0)	1.0
  (233837, 0)	1.0
  (234711, 0)	1.0
  (234125, 0)	1.0
  (234831, 0)	1.0
  (234129, 0)	1.0
  (234712, 0)	1.0
  (234713, 0)	1.0
  (234873, 0)	1.0
  (234874, 0)	1.0
  (234875, 0)	1.0
  (235156, 0)	1.0
  (233413, 0)	1.0
  (233568, 0)	1.0
  (241433, 0)	1.0
  (241434, 0)	1.0
  (241435, 0)	1.0
  (241436, 0)	1.0
  (241437, 0)	1.0
  (241076, 0)	1.0
  :	:
  (330963, 3551)	1.0
  (330967, 3551)	1.0
  (329646, 3551)	1.0
  (331536, 3551)	1.0
  (330033, 3551)	1.0
  (329648, 3551)	1.0
  (331247, 3551)	1.0
  (330208, 3551)	1.0
  (331399, 3551)	1.0
  (330899, 3551)	1.0
  (330804, 3551)	1.0
  (330811, 3551)	1.0
  (416189, 3551)	1.0
  (416397, 3551)	1.0
  (416876, 3551)	1.0
  (412842

####  Create Y sparse matrix 

In [321]:
y_data = join["value"].to_numpy()
Y = scipy.sparse.coo_matrix((y_data,(I,J)),(Ncmpd, Ncol))
print(repr(Y))


print(f"   Y < 0 : {(Y>0).sum()}")
print(f"   Y > 0 : {(Y<0).sum()}")
print(f"   Y != 0: {(Y!=0).sum()}")
print(Y)


<423736x3552 sparse matrix of type '<class 'numpy.float64'>'
	with 2612250 stored elements in COOrdinate format>
   Y < 0 : 933144
   Y > 0 : 1679106
   Y != 0: 2612250
  (173926, 0)	1.0
  (150148, 0)	1.0
  (143524, 0)	-1.0
  (233208, 0)	1.0
  (237771, 0)	1.0
  (237772, 0)	1.0
  (233837, 0)	1.0
  (234711, 0)	1.0
  (234125, 0)	1.0
  (234831, 0)	1.0
  (234129, 0)	-1.0
  (234712, 0)	1.0
  (234713, 0)	1.0
  (234873, 0)	1.0
  (234874, 0)	1.0
  (234875, 0)	1.0
  (235156, 0)	1.0
  (233413, 0)	1.0
  (233568, 0)	1.0
  (241433, 0)	1.0
  (241434, 0)	1.0
  (241435, 0)	1.0
  (241436, 0)	-1.0
  (241437, 0)	1.0
  (241076, 0)	-1.0
  :	:
  (330963, 3551)	-1.0
  (330967, 3551)	-1.0
  (329646, 3551)	-1.0
  (331536, 3551)	-1.0
  (330033, 3551)	-1.0
  (329648, 3551)	-1.0
  (331247, 3551)	-1.0
  (330208, 3551)	-1.0
  (331399, 3551)	-1.0
  (330899, 3551)	-1.0
  (330804, 3551)	-1.0
  (330811, 3551)	-1.0
  (416189, 3551)	-1.0
  (416397, 3551)	-1.0
  (416876, 3551)	-1.0
  (412842, 3551)	-1.0
  (415697, 3551)	-1

#### `mtcv_clustered()`

In [179]:
clusters

array([ 8014,  3958,  3466, ..., 10806, 19103, 13660])

In [388]:
def mtcv_clustered(Y_in, clusters, nfolds=None, pfolds=None, seed=None):
    """
    splits rows of Y_in, based on clusters into either 
        a) equallY_in into nfolds 
        b) or according to the ratios defined bY_in pfolds
        
        Y_in: is the compound x target matrix
    """

    assert clusters.shape[0] == Y_in.shape[0]
    print(f"* Y_in       : {type(Y_in)} \t {Y_in.shape}")
    print(f"* clusters: {type(clusters)}  \t {clusters.shape} ")
    print(f"           Min: {clusters.min()} Max:{clusters.max()}\n{clusters} \n")
    
    
#     cl_uniq = np.sort(clusters.unique())
    cl_uniq = clusters.unique()
    print(f"* cl_uniq: {type(cl_uniq)} {len(cl_uniq)}\n{cl_uniq}\n")

          
    ## assign a unqiue id to the cluster numbers  
    cl2id   = pd.Series(np.arange(cl_uniq.shape[0]), index=cl_uniq)
        
    print(f"* cl2id: {type(cl2id)} {len(cl2id)}\n{cl2id}\n")
    print(f" cl2id[0]: {cl2id[0]}")
    print(f" cl2id[2321]: {cl2id[2321]}")
    print(f" cl2id[13660]: {cl2id[13660]}")
    print(f" cl2id[2321]: {cl2id[2321]}")
    print(f" cl2id[5601]: \n{cl2id[5599:5603]} \n")
    
    
    cid     = cl2id[clusters]
    print(f"* cid:   {type(cid)} {len(cid)}\n{cid}\n")
#     print(f" cid.values:\n{cid.values}")
    
    ## creating cluster2compound matrix
    ##
    ## Number of rows: cid.values
    print(f"* Create C sparse matrix - indicating the clusters compounds are assigned to.... ")
    C    = scipy.sparse.csr_matrix(
            (np.ones(cid.shape[0], dtype=np.int8), (cid.values, np.arange(cid.shape[0])))
            )

    print(f"* Shape of sparse matrix: Rows(#clusters):{C.shape[0]} Columns(#compounds): {C.shape[1]} \n")
    ## Create 
    Y_bin      = Y_in.copy()
    print(f"* Y_bin     :   {type(Y_bin)} {Y_bin.shape}")
    print(f"* Y_bin.data:   {type(Y_bin.data)} {Y_bin.data.shape}")
    Y_bin_neg = Y_bin < 0
    Y_bin_pos = Y_bin > 0 
    print(f" Sum: {(Y_bin_neg).sum()}")
    print(f" Sum: {(Y_bin_pos).sum()}")
    print(f" Sum: {(Y_bin).sum()}")
    
    Y_bin.data = np.ones(Y_bin.data.shape[0], dtype=np.int8)
    print(f"* Y_bin.data:   {type(Y_bin.data)} {Y_bin.data.shape}\n")
    Y_bin_neg = Y_bin < 0
    Y_bin_pos = Y_bin > 0 
    print(f" Sum: {(Y_bin_neg).sum()}")
    print(f" Sum: {(Y_bin_pos).sum()}")
    print(f" Sum: {(Y_bin).sum()}")
    print() 
    
    ## compute number of compounds per cluster/fingerprint
    cl_counts = C.dot(Y_bin)
    print(f"* cl_counts     :   {type(cl_counts)} {cl_counts.shape}")    
    print(f"* cl_counts.data:   {type(cl_counts.data)} {cl_counts.data.shape}")
    print(f" {cl_counts}\n")
    
    cl_counts_neg = cl_counts < 0 
    print(f"* cl_counts_neg     :   {type(cl_counts_neg)}      {cl_counts_neg.shape}")    
    print(f"* cl_counts_neg.data:   {type(cl_counts_neg.data)} {cl_counts_neg.data.shape}")
    print(f" Sum: {cl_counts_neg.sum()}")
    print(f" {cl_counts_neg} \n")    

    ## get cluster folds
    print(f"* Call mtcv with nfolds: {nfolds}  \t pfolds: {pfolds} \t  seed: {seed}")
    folds = mtcv(mask=cl_counts, nfolds=nfolds, pfolds=pfolds, seed=seed)

    print(f"* cid:   {type(cid)} {len(cid)}\n{cid}\n")
    print(f'* folds:  {folds.shape} \n {folds[cid]} \n')
    return folds[cid]


In [392]:
def mtcv(mask, nfolds=None, pfolds=None, seed=None):
    """
    Return a vector of folds

    Args:
    mask     binary mask matrix of [compounds x targets]
    nfolds   number of folds (integer)
    pfolds   array, specifying fold sizes (probability)
             If specified nfolds is ignored.
             Must sum to 1.
    """
    if nfolds is None and pfolds is None:
        raise ValueError("nfolds or pfolds must be specified.")
    if pfolds is not None and np.abs(np.sum(pfolds) - 1.0) > 1e-5:
        raise ValueError("pfolds must sum to 1.0.")

    if pfolds is None:
        pfolds = np.ones(nfolds) / nfolds
    else:
        nfolds = len(pfolds)
    
    print(f" Call mtcv with nfolds: {nfolds}  \t pfolds: {pfolds} \t  seed: {seed}\n\n")
    
    target_sizes = np.array(mask.sum(0)).flatten()
    comp_sizes   = np.array(mask.sum(1)).flatten()

    print(f'* mask        : {mask.shape} \t\n {mask} \n')
    print(f'* Target sizes (mask sum along rows) : {target_sizes.shape} \t {target_sizes} \n')
    print(f'* comp sizes   (mask sum over columns): {comp_sizes.shape}   \t {comp_sizes} \n')
    if seed is not None:
        np.random.seed(seed)

    df = pd.DataFrame({"row": np.arange(mask.shape[0]), "size": comp_sizes})
    print(f'\n* df info')
    print(df.info())
    print(df)
    
    df1 = df.sample(frac=1)
    print(f'\n df1 info \n')
    print(df1.info())
    print(df1)
    
    df1.sort_values("size", inplace=True, ascending=False)
    print(f'\n df1 info (after sort by size descending)\n')
    print(df1.info())
    print(df1)
    
    fold_sizes = np.zeros((nfolds, mask.shape[1]), dtype=np.int8)
#     #max_fold_sizes = np.ceil(target_sizes / nfolds).astype(np.int)
    max_fold_sizes = np.ceil(np.outer(target_sizes, pfolds))

    print(f'* fold_sizes : {fold_sizes.shape} \n {fold_sizes} \n')
#     print(f'* non zero folds: {(fold_sizes>0).sum()}\n')
    print(f'* max fold sizes: {max_fold_sizes.shape} \n {max_fold_sizes}\n')
    
    ## output vector (initialized to -1)
    folds = np.zeros(mask.shape[0], dtype=np.int8) - 1
    print(f'* folds : {folds.shape} \n {folds} \n')
    
    farray = np.arange(nfolds)    
    
    ## Starting at the cluster with the highest number of positive actions
    ## 
    print(f'* mask        : {mask.shape} ')
    for i in range(mask.shape[0]):
        idx   = df1.index[i]
 

        #for j in np.random.permutation(nfolds):
        choices = np.random.choice(farray, size=nfolds, replace=False, p=pfolds)
#         print(f' i: {i}   idx (cluster_id): {idx}   folds[{idx}] = {folds[idx]}   choices : {choices}')
        
        
        for j in choices: ## np.random.choice(farray, size=nfolds, replace=False, p=pfolds):
            fit_criteria = mask[idx,:].dot((fold_sizes[j,:] + mask[idx,:] > max_fold_sizes[:,j]).transpose())
#             print(f'    fit_criteria = {fit_criteria}')
            if fit_criteria == 0:
                ## compound fits into fold j
                folds[idx] = j
                fold_sizes[j,:] += mask[idx,:]
                
#                 print(f'    folds[idx] <- j:  {folds[idx]}')
#                 print(f'    fold_sizes[j] <- mask[{idx}]: {fold_sizes[j]}')
                break
                
        if folds[idx] == -1:
            j = np.random.randint(0, nfolds)
#             print(f'    force to fold {j}')
            folds[idx] = j
            fold_sizes[j,:] += mask[idx,:]
        
#         if (i == 100):
#             break
            
    print(' finished loop - i is:', i)
    return folds





In [393]:
folds = mtcv_clustered(Ymask, pd.Series(clusters), conf.nfolds)


* Y_in       : <class 'scipy.sparse.coo.coo_matrix'> 	 (423736, 3552)
* clusters: <class 'pandas.core.series.Series'>  	 (423736,) 
           Min: 0 Max:26049
0          8014
1          3958
2          3466
3          2321
4         14018
          ...  
423731     3148
423732    16950
423733    10806
423734    19103
423735    13660
Length: 423736, dtype: int64 

* cl_uniq: <class 'numpy.ndarray'> 26050
[ 8014  3958  3466 ... 19783 20080 19103]

* cl2id: <class 'pandas.core.series.Series'> 26050
8014         0
3958         1
3466         2
2321         3
14018        4
         ...  
21688    26045
23920    26046
19783    26047
20080    26048
19103    26049
Length: 26050, dtype: int64

 cl2id[0]: 21185
 cl2id[2321]: 3
 cl2id[13660]: 5601
 cl2id[2321]: 3
 cl2id[5601]: 
5328     5599
14954    5600
13660    5601
8773     5602
dtype: int64 

* cid:   <class 'pandas.core.series.Series'> 423736
8014         0
3958         1
3466         2
2321         3
14018        4
         ...  
3148   

In [375]:
print(folds)

None


In [63]:
print(Ymask)

NameError: name 'Ymask' is not defined