### Dataset featurization

In [1]:
from get_data import get_data
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder



In [72]:
# Input CSV and the column names handed to get_data below.
data_path = "../../chemprop-master/chemprop/data/bace.csv"
smiles_columns = ['mol']      # passed to get_data as smiles_columns
target_columns = ['Class']    # passed to get_data as target_columns

In [73]:
# Load the BACE CSV through the project-local get_data helper, requesting the
# 'rdkit_2d' features generator. The helper's return type is not visible here;
# later cells only call data.features() on the result.
data = get_data(path=data_path, 
                smiles_columns=smiles_columns, 
                features_generator=['rdkit_2d'],
                target_columns=target_columns, 
                skip_none_targets=True
               )

1513it [00:00, 189596.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1513/1513 [00:30<00:00, 50.05it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1513/1513 [00:00<00:00, 1625.56it/s]


In [74]:
# Wrap whatever data.features() returns in a DataFrame for inspection and saving.
features_df = pd.DataFrame(data.features())

In [17]:
# Persist the BACE feature table to disk in NumPy's binary format.
np.save('bace_rdkit',features_df)

In [12]:
# Split the feature matrix into numeric and categorical columns, one-hot encode
# the categorical part, and join the two back together (numeric columns first).
# NOTE(review): `categorical_idx` is not defined in any visible cell, so it must
# already exist in the kernel for this cell to run.
features_df = np.array(data.features())
column_count = features_df.shape[1]
is_categorical = np.array(
    [position in categorical_idx for position in range(column_count)],
    dtype=bool,
)
numeric = features_df[:, ~is_categorical]
categorical = features_df[:, is_categorical]
# The encoder output is converted with .toarray() before concatenation.
encoder = OneHotEncoder()
categorical = encoder.fit_transform(categorical)
features_df = np.concatenate((numeric, categorical.toarray()), axis=1)

In [35]:
categorical.shape

(1513, 402)

In [147]:
help(load_features)

Help on function load_features in module chemprop.features.utils:

load_features(path: str) -> numpy.ndarray
    Loads features saved in a variety of formats.
    
    Supported formats:
    
    * :code:`.npz` compressed (assumes features are saved with name "features")
    * .npy
    * :code:`.csv` / :code:`.txt` (assumes comma-separated features with a header and with one line per molecule)
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing a sparse numpy array
    
    .. note::
    
       All formats assume that the SMILES loaded elsewhere in the code are in the same
       order as the features loaded here.
    
    :param path: Path to a file containing features.
    :return: A 2D numpy array of size :code:`(num_molecules, features_size)` containing the features.



### ChEMBL data: featurize and fit scalers

In [1]:
from get_data import get_data
import pandas as pd



In [3]:
# Location of the full ChEMBL export and the column names handed to get_data.
# Raw string: the Windows path is full of backslashes, and in a normal string
# literal sequences such as "\J" or "\A" are invalid escapes (they warn today
# and are slated to become errors).
data_path = r"D:\Joe\Acads\Sem8\DDP\Code files\FSR\Data\chembl.csv"
smiles_columns = ['canonical_smiles']
target_columns = ['target']

In [4]:
# Read the full ChEMBL table; the first CSV column becomes the index.
data = pd.read_csv(data_path,index_col=0)

In [5]:
# Draw a 500,000-row random subset; random_state pins the draw so it can be repeated.
sub_data = data.sample(500000, random_state = 42)

In [6]:
# Write the 500k subset to disk. Raw string so the backslashes in the Windows
# path are taken literally instead of being parsed as (invalid) escape sequences.
sub_data.to_csv(r'D:\Joe\Acads\Sem8\DDP\Code files\FSR\Data\chembl_500k.csv')

In [7]:
# Featurize the 500k ChEMBL subset with the 'rdkit_2d' features generator.
# This is expensive: the recorded run below took close to four hours.
# Raw string so the backslashes in the Windows path are taken literally instead
# of being parsed as (invalid) escape sequences.
data_path = r"D:\Joe\Acads\Sem8\DDP\Code files\FSR\Data\chembl_500k.csv"
data_featurized = get_data(path=data_path, 
                smiles_columns=smiles_columns, 
                features_generator=['rdkit_2d'],
                target_columns=target_columns, 
                skip_none_targets=True
               )

500000it [00:03, 161449.09it/s]
100%|████████████████████████████████████████████████████████████████████████| 500000/500000 [3:52:23<00:00, 35.86it/s]
100%|█████████████████████████████████████████████████████████████████████████| 500000/500000 [09:28<00:00, 880.10it/s]


In [9]:
# Put the generated features into a DataFrame (columns are labelled 0..N-1 by
# default) and preview the first rows.
df = pd.DataFrame(data_featurized.features())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,1.961497,1716.071114,43.973598,36.064274,36.880771,27.800748,20.714359,21.594012,16.64749,17.537171,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.083173
1,1.282396,1582.368656,28.061903,22.85855,22.85855,19.439892,13.926419,13.926419,10.41934,10.41934,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.212126
2,2.349904,609.104973,13.664926,10.926102,10.926102,9.274387,6.315262,6.315262,4.104164,4.104164,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.462735
3,1.361308,1372.369714,26.863232,21.677565,22.494061,18.246352,13.250401,14.733564,10.223611,12.110137,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.356467
4,2.59115,703.527523,10.413849,8.111496,8.111496,7.237183,4.656258,4.656258,3.555242,3.555242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52386


In [10]:
# Independent copy of the feature table taken before df is modified further down.
# NOTE(review): df2 is not referenced again in the visible cells.
df2 = df.copy()

In [12]:
import numpy as np
# Per-column maximum computed with every +inf entry counted as 0, so an
# infinite value can never be reported as the column maximum.
maxdf = df.replace(np.inf,0).max()

In [13]:
maxdf[0]

9.583925636408097

In [14]:
# Transpose so that each row of df3 is one feature column of df; the next cell
# walks those rows with iterrows().
df3 = df.transpose()[:]

In [15]:
# Replace every +inf in a feature column with that column's entry in maxdf.
# The row label of df3 doubles as the positional column index into df.
for feature_label, feature_values in df3.iterrows():
    capped_values = feature_values.replace(np.inf, maxdf[feature_label])
    df.iloc[:, feature_label] = capped_values

In [17]:
# Save the feature table (after the +inf replacement above) in NumPy's binary format.
np.save('chembl_500k_features.npy',df)

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import bisect
# Number of distinct values in each feature column ...
cardinality = df.nunique().values
# ... and the same counts in ascending order, used to eyeball a cut-off between
# low-cardinality and high-cardinality features.
a = np.sort(cardinality)

In [19]:
a # We will use the 203 -> 409 jump as division point for categorical and numerical

array([     1,      1,      1,      2,      3,      3,      3,      3,
            3,      3,      4,      4,      4,      4,      4,      5,
            5,      5,      5,      5,      6,      6,      6,      6,
            6,      6,      6,      6,      6,      6,      7,      7,
            7,      7,      7,      7,      7,      7,      8,      8,
            8,      8,      8,      8,      8,      8,      8,     10,
           10,     11,     12,     12,     14,     14,     14,     15,
           15,     15,     15,     15,     16,     18,     18,     19,
           19,     20,     21,     21,     22,     22,     24,     24,
           25,     26,     26,     28,     29,     29,     29,     30,
           31,     32,     33,     34,     36,     37,     39,     39,
           40,     41,     42,     50,     52,     53,     55,     58,
           59,     65,     74,     86,     96,    111,    125,    151,
          154,    188,    194,    203,    409,    782,   1031,   1303,
      

In [20]:
'''cardinality[59]'''

203

In [None]:
'''bisect.bisect(a, 117)'''

In [22]:
'''np.argsort(cardinality)'''

array([102, 186,  89, 149, 163, 167, 152, 151, 195, 164, 147, 191, 193,
       175, 192, 137, 177, 126, 145, 153, 146, 184,  60, 188, 156, 160,
       142, 174, 196, 162, 194, 127, 159, 176, 183, 168, 173, 172, 128,
       132, 190, 198, 166, 170, 165, 118, 171, 133, 138, 189, 187, 185,
       178, 157, 136, 123,  62, 122, 161, 114, 169, 134, 154,  51, 144,
       117, 143, 148, 120, 135, 182, 181, 140, 131,  54, 139, 121, 179,
       180,  63,  52, 158,  64,  53,  83,  55, 116, 115,  56, 150, 155,
       119, 141, 125, 130,  80, 124, 129, 197,  58,  49,  16,  93,  57,
       100,  50,  61,  59,  31,  75,  65,  70,  94,  92,  74,  85,  90,
        97,  29,  71,  69, 101,  73,  26,  15,  72,  84,  27,  68,  91,
        24,  28,  30,  96,  87,  22,  21,  66,  81,  76,  67,  20,  99,
        98,   2,  86,  77, 103,  14,  79,  34,  23,  78,  82,  88,  17,
        18,  19,  35,  32,   5,  95, 105,  36, 113,  25,  48,   3,  46,
         4,  39,  45, 104,  41,  43,  33, 111, 112, 107,  47,   

In [6]:
'''import numpy as np
arr = np.array([     1,      1,      1,      2,      3,      3,      3,      3,
            3,      3,      4,      4,      4,      4,      4,      5,
            5,      5,      5,      5,      6,      6,      6,      6,
            6,      6,      6,      6,      6,      6,      7,      7,
            7,      7,      7,      7,      7,      7,      8,      8,
            8,      8,      8,      8,      8,      8,      8,     10,
           10,     11,     12,     12,     14,     14,     14,     15,
           15,     15,     15,     15,     16,     18,     18,     19,
           19,     20,     21,     21,     22,     22,     24,     24,
           25,     26,     26,     28,     29,     29,     29,     30,
           31,     32,     33,     34,     36,     37,     39,     39,
           40,     41,     42,     50,     52,     53,     55,     58,
           59,     65,     74,     86,     96,    111,    125,    151,
          154,    188,    194,    203,    409,    782,   1031,   1303,
         1446,   1475,   1781,   1818,   2326,   2399,   2649,   3280,
         3360,   3480,   3817,   4726,   4994,   5113,   5959,   6373,
         6945,   7361,   7438,   7540,  12634,  12731,  13267,  14117,
        17982,  18338,  19640,  19997,  23249,  27933,  29535,  35111,
        37846,  40890,  45022,  45414,  47049,  47272,  48733,  49960,
        59870,  60456,  65350,  72356,  74678,  75710,  96681, 110909,
       119646, 179153, 183200, 204343, 214037, 221850, 250681, 280577,
       304492, 329736, 346456, 350986, 395857, 396677, 404856, 407933,
       416168, 416923, 435385, 444792, 448489, 452745, 453862, 454614,
       458404, 463408, 463466, 464756, 470018, 470086, 474499, 474729,
       479038, 479520, 480491, 482540, 482540, 486876, 486990, 487979])'''

In [8]:
arr[107]

203

In [3]:
'''l1 = set([102, 186,  89, 149, 152, 195, 151, 137, 167,  60, 191, 145, 147,
       163, 164, 175, 126, 146, 127, 184, 193, 192, 177, 173, 142, 138,
       196, 153, 174, 156, 188, 159, 160, 162, 172, 171, 170, 118, 183,
       190, 176, 132, 128, 194, 168, 198, 166, 187, 165, 189, 178, 136,
       133, 185,  62, 134, 161, 154, 169, 179, 180, 157, 121, 122, 123,
       117, 114, 182, 181, 144, 148,  51, 143, 120,  54, 135, 131, 140,
        52,  63, 158, 139,  64,  83,  55,  53, 155, 150,  56, 116, 115,
       119, 129, 197,  80, 125, 141, 130, 124,  16, 100,  93,  58,  49,
        57,  50,  59,  61])
l2 = set([163, 102, 186, 151,  89, 145, 146, 164,  60, 147, 149, 195, 137,
       168, 170, 160, 175, 152, 156, 126, 167, 191, 193, 118, 142, 162,
       138, 188, 159, 182, 171, 181, 174, 192, 176, 177, 184, 153, 183,
       194, 190, 196, 173, 172, 127, 132, 165, 136, 166, 187, 198, 128,
       178, 117, 161, 189, 135, 185, 133, 120,  62, 134,  51, 144, 157,
       154,  54, 143, 148, 123, 122, 114, 169, 121, 179, 180,  55, 139,
        63, 119, 158, 140,  83,  52,  64, 131,  56,  53, 115, 116, 150,
       129, 155,  80,  16, 197, 125, 141, 100, 130, 124,  93,  58,  57,
        49,  50,  59,  61])'''

In [24]:
'''print(l1.issubset(l2))
print(l2.issubset(l1))'''

True
True


In [25]:
# Positions of the 108 feature columns treated as categorical (the same values
# as the l2 list in the scratch cell above). The order matters: the per-column
# encoders fitted later follow it.
categorical_col_positions = [
       163, 102, 186, 151,  89, 145, 146, 164,  60, 147, 149, 195, 137,
       168, 170, 160, 175, 152, 156, 126, 167, 191, 193, 118, 142, 162,
       138, 188, 159, 182, 171, 181, 174, 192, 176, 177, 184, 153, 183,
       194, 190, 196, 173, 172, 127, 132, 165, 136, 166, 187, 198, 128,
       178, 117, 161, 189, 135, 185, 133, 120,  62, 134,  51, 144, 157,
       154,  54, 143, 148, 123, 122, 114, 169, 121, 179, 180,  55, 139,
        63, 119, 158, 140,  83,  52,  64, 131,  56,  53, 115, 116, 150,
       129, 155,  80,  16, 197, 125, 141, 100, 130, 124,  93,  58,  57,
        49,  50,  59,  61,
]
# Values are rounded to 4 decimals before being used as categories
# (presumably so nearly-equal floats collapse into one level — confirm).
cat_cols = df.iloc[:, categorical_col_positions].round(4)

In [26]:
cat_cols.nunique().sum()

3038

In [1]:
'''cat_cols_bace = features_df.iloc[:,[163, 102, 186, 151,  89, 145, 146, 164,  60, 147, 149, 195, 137,
       168, 170, 160, 175, 152, 156, 126, 167, 191, 193, 118, 142, 162,
       138, 188, 159, 182, 171, 181, 174, 192, 176, 177, 184, 153, 183,
       194, 190, 196, 173, 172, 127, 132, 165, 136, 166, 187, 198, 128,
       178, 117, 161, 189, 135, 185, 133, 120,  62, 134,  51, 144, 157,
       154,  54, 143, 148, 123, 122, 114, 169, 121, 179, 180,  55, 139,
        63, 119, 158, 140,  83,  52,  64, 131,  56,  53, 115, 116, 150,
       129, 155,  80,  16, 197, 125, 141, 100, 130, 124,  93,  58,  57,
        49,  50,  59,  61]]'''

'cat_cols_bace = features_df.iloc[:,[163, 102, 186, 151,  89, 145, 146, 164,  60, 147, 149, 195, 137,\n       168, 170, 160, 175, 152, 156, 126, 167, 191, 193, 118, 142, 162,\n       138, 188, 159, 182, 171, 181, 174, 192, 176, 177, 184, 153, 183,\n       194, 190, 196, 173, 172, 127, 132, 165, 136, 166, 187, 198, 128,\n       178, 117, 161, 189, 135, 185, 133, 120,  62, 134,  51, 144, 157,\n       154,  54, 143, 148, 123, 122, 114, 169, 121, 179, 180,  55, 139,\n        63, 119, 158, 140,  83,  52,  64, 131,  56,  53, 115, 116, 150,\n       129, 155,  80,  16, 197, 125, 141, 100, 130, 124,  93,  58,  57,\n        49,  50,  59,  61]]'

In [2]:
'''cat_unique = [set(cat_cols[j]) for j in cat_cols.columns]
cat_bace_unique = [set(cat_cols_bace[j]) for j in cat_cols_bace.columns]'''

'cat_unique = [set(cat_cols[j]) for j in cat_cols.columns]\ncat_bace_unique = [set(cat_cols_bace[j]) for j in cat_cols_bace.columns]'

In [27]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [28]:
def _dense_one_hot_encoder():
    """Return a dense-output OneHotEncoder that ignores categories unseen at fit time.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old spelling, so the new name is tried first and the old
    one is kept as a fallback for older installations.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn: `sparse_output` is an unknown keyword there.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# Fit one encoder per categorical column, in the column order of cat_cols.
ohe_list = []
for col in cat_cols.columns:
    ohe = _dense_one_hot_encoder()
    # The encoder expects a 2-D input, hence the reshape to a single column.
    ohe.fit(cat_cols[col].values.reshape(-1, 1))
    ohe_list.append(ohe)

In [29]:
import pickle
# Serialize the list of fitted per-column encoders to disk.
with open(r"D:\Joe\Acads\Sem8\DDP\Code files\FSR\Data\features\ohe_list", "wb") as f:
    pickle.dump(ohe_list, f)

In [3]:
'''inp = cat_cols.values
op = None
for idx in range(108):
    temp = ohe_list[idx].transform(inp[:,idx].reshape(-1,1))
    if not isinstance(op, np.ndarray):
        op = temp
    else:
        op = np.concatenate([op, temp], axis = 1)'''

'inp = cat_cols.values\nop = None\nfor idx in range(108):\n    temp = ohe_list[idx].transform(inp[:,idx].reshape(-1,1))\n    if not isinstance(op, np.ndarray):\n        op = temp\n    else:\n        op = np.concatenate([op, temp], axis = 1)'

#### Numerical features now

In [31]:
# Positions of the feature columns handled as numeric, stored in ascending order.
numeric_col_names = np.array(sorted((
        75,  31,  70,  94,  65,  74,  92,  85,  71,
        97,  90,  69,  29,  73, 101,  15,  72,  68,  26,  84,  24,  91,
        27,  28,  96,  22,  87,  21,  66,  76,  81,  30,  67,  20,  99,
        14,  98,  86,  23,  79,  77,   2,  17,  82,  78,  19, 105,  18,
        88, 113, 103,  34,  32,   5,  95,  35, 104, 111, 112,  25, 107,
        39,  48,  45,  36, 109, 110,   3, 106,  41,  42,  43, 108,  46,
         4,  44,  33,   0,  37,   1,  47,  40,  38,   6, 199,   7,  10,
        12,   8,  11,  13,   9,
)))
numeric_col_names

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  65,  66,  67,  68,
        69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  81,  82,
        84,  85,  86,  87,  88,  90,  91,  92,  94,  95,  96,  97,  98,
        99, 101, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       199])

In [32]:
# Select the numeric feature columns by position.
num_cols = df.iloc[:,numeric_col_names]

In [33]:
# Per-column maximum and minimum of the numeric features, as plain NumPy arrays.
# NOTE(review): maxs and mins are not used again in the visible cells.
maxs = num_cols.max().values
mins = num_cols.min().values

In [34]:
np.sort(num_cols.values.reshape(-1))[-10:]

array([4.40960086e+242, 1.26343659e+249, 3.70017935e+251, 2.53456651e+252,
       1.08404908e+255, 1.15467445e+255, 2.86611779e+263, 1.78133756e+270,
       4.70001914e+275, 2.45138414e+302])

In [35]:
np.argsort(num_cols.max().values)

array([91, 44, 28, 42, 25, 40, 38, 26, 27, 43,  0, 41, 29, 37, 39, 86, 90,
       12, 89, 10, 53, 85, 58,  8, 57, 45, 66, 76, 82,  9,  6, 88, 87,  7,
       70,  5, 11, 56,  3,  4, 21, 79, 20, 52, 34, 73,  2, 18, 55, 75, 65,
       50, 64, 30, 49, 33, 23, 71, 51, 60, 13, 84, 19, 16, 61, 17, 15, 72,
       62, 68, 63, 59, 77, 83, 78, 22, 74, 14, 67, 69, 46, 54, 81, 36, 48,
       80, 31, 24, 47, 35,  1, 32], dtype=int64)

In [36]:
# Replacement table: +inf -> max of the finite entries, -inf -> min of the finite
# entries. Indexing with the np.isfinite mask blanks out the non-finite cells
# before max()/min() are taken per column.
col_min_max = {np.inf: num_cols[np.isfinite(num_cols)].max(),    # column-wise max
              -np.inf: num_cols[np.isfinite(num_cols)].min()}    # column-wise min

# Apply the same table to every column via the nested-dict form of replace().
# NOTE(review): the replacement values are whole Series (one entry per column),
# not per-column scalars — confirm replace() resolves them per column as intended.
num_cols = num_cols.replace({col: col_min_max for col in num_cols.columns})

One of the features has a very large range (on the order of 1e0 to 1e100), so it is converted to a log scale to accommodate the difference between its smallest and largest values.

In [None]:
# Log-compress the wide-range feature sitting at position 32 of num_cols.
# The +1 shift maps a raw value of 0 to 0 on the log scale.
# NOTE(review): this cell reads and overwrites the same column, so running it a
# second time applies the log again.
WIDE_RANGE_POSITION = 32
wide_range_values = num_cols.iloc[:, WIDE_RANGE_POSITION]
num_cols.iloc[:, WIDE_RANGE_POSITION] = np.log(wide_range_values + 1)

In [38]:
num_cols.iloc[:,32]

0         28.538621
1         21.069028
2         10.114249
3         19.870362
4          8.491473
            ...    
499995    11.725137
499996    10.674614
499997    15.792459
499998    20.367895
499999     8.467165
Name: 33, Length: 500000, dtype: float64

In [39]:
# Fit a min-max scaler on the numeric features (after the inf replacement and
# the log transform of the wide-range column above).
scaler = MinMaxScaler()
scaler.fit(num_cols)

MinMaxScaler()

In [40]:
import pickle
# Serialize the fitted scaler to disk.
with open(r"D:\Joe\Acads\Sem8\DDP\Code files\FSR\Data\features\min_max_scaler", "wb") as f:
    pickle.dump(scaler, f)

In [154]:
# Count the non-NaN entries in row 70: NaN is the only value for which i == i is False.
sum([i==i for i in num_cols.iloc[70,:].values])

92