In [None]:
!pip install rdkit

Collecting rdkit
  Using cached rdkit-2023.9.6-cp310-cp310-win_amd64.whl (21.0 MB)
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import FunctionTransformer

In [4]:
data = pd.read_excel('data/19_35000.xlsx', index_col=0).reset_index(drop=True)
data.head()

Unnamed: 0,Title,IC50,SMILES
0,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...
1,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...
2,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...
3,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...
4,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...


In [5]:
data.columns

Index(['Title', 'IC50', 'SMILES'], dtype='object')

In [6]:
data.shape

(36377, 3)

In [7]:
data = data[['Title','SMILES', 'IC50']]

In [8]:
import multiprocessing as mp

In [9]:
from tqdm.auto import tqdm

# 1st Features

In [10]:
def mol_dsc_calc(mols):
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in mols)

# список конституционных и физико-химических дескрипторов из библиотеки RDKit
descriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
               "NHOHCount": Descriptors.NHOHCount,
               "NOCount": Descriptors.NOCount,
               "NumHAcceptors": Descriptors.NumHAcceptors,
               "NumHDonors": Descriptors.NumHDonors,
               "NumHeteroatoms": Descriptors.NumHeteroatoms,
               "NumRotatableBonds": Descriptors.NumRotatableBonds,
               "NumValenceElectrons": Descriptors.NumValenceElectrons,
               "NumAromaticRings": Descriptors.NumAromaticRings,
               "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
               "RingCount": Descriptors.RingCount,
               "MW": Descriptors.MolWt,
               "LogP": Descriptors.MolLogP,
               "MR": Descriptors.MolMR,
               "TPSA": Descriptors.TPSA}

# sklearn трансформер для использования в конвейерном моделировании
descriptors_transformer = FunctionTransformer(mol_dsc_calc)
X = descriptors_transformer.transform(data['SMILES'])
X.head()

data_dsc = data.join(X)
data_dsc.to_excel('data_dsc.xlsx')
data_dsc.head()


[23:49:07] Conflicting single bond directions around double bond at index 55.
[23:49:07]   BondStereo set to STEREONONE and single bond directions set to NONE.
[23:49:07] Conflicting single bond directions around double bond at index 55.
[23:49:07]   BondStereo set to STEREONONE and single bond directions set to NONE.
[23:49:07] Conflicting single bond directions around double bond at index 55.
[23:49:07]   BondStereo set to STEREONONE and single bond directions set to NONE.
[23:49:07] Conflicting single bond directions around double bond at index 55.
[23:49:07]   BondStereo set to STEREONONE and single bond directions set to NONE.
[23:49:07] Conflicting single bond directions around double bond at index 55.
[23:49:07]   BondStereo set to STEREONONE and single bond directions set to NONE.
[23:49:07] Conflicting single bond directions around double bond at index 55.
[23:49:07]   BondStereo set to STEREONONE and single bond directions set to NONE.
[23:49:07] Conflicting single bond direc

Unnamed: 0,Title,SMILES,IC50,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,CHEMBL2206459,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,1.5e-05,24,7,9,4,6,10,7,138,0,0,1,362.367,0.38187,90.4296,157.76
1,CHEMBL3818159,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,1.6e-05,21,4,6,4,3,6,7,120,0,0,1,298.383,1.0545,79.7279,101.65
2,CHEMBL1956716,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,3.3e-05,21,3,6,5,2,6,7,118,0,0,1,295.359,-0.3617,77.0531,104.48
3,CHEMBL1956715,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,3.2e-05,20,3,6,5,2,6,6,114,0,0,1,283.348,-0.5278,72.5301,104.48
4,CHEMBL4444029,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,4.1e-05,24,4,9,6,4,9,3,122,3,0,3,329.268,1.09742,81.9775,149.28


In [3]:
data_dsc = pd.read_excel('data_dsc.xlsx', index_col=0)

In [11]:
data_dsc

Unnamed: 0,Title,SMILES,IC50,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,CHEMBL2206459,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,0.000015,24,7,9,4,6,10,7,138,0,0,1,362.367,0.38187,90.4296,157.76
1,CHEMBL3818159,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,0.000016,21,4,6,4,3,6,7,120,0,0,1,298.383,1.05450,79.7279,101.65
2,CHEMBL1956716,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,0.000033,21,3,6,5,2,6,7,118,0,0,1,295.359,-0.36170,77.0531,104.48
3,CHEMBL1956715,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,0.000032,20,3,6,5,2,6,6,114,0,0,1,283.348,-0.52780,72.5301,104.48
4,CHEMBL4444029,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,0.000041,24,4,9,6,4,9,3,122,3,0,3,329.268,1.09742,81.9775,149.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36372,CHEMBL3699085,CC(=O)N[C@H]1CC[C@@H]([C@@H]12)[C@@H](O[Si](C)...,2366.790000,44,3,10,6,3,11,8,238,2,0,4,622.839,5.88880,171.2541,127.35
36373,CHEMBL109004,CC(=O)Nc(cc1)c(OC(=O)C)cc1C(=O)O,2372.100000,17,2,6,4,2,6,3,90,1,0,1,237.211,1.26850,59.0140,92.70
36374,CHEMBL2259758,c1cccc(c12)cccc2CNC(=O)CCCCCCO[C@]3(C(=O)O)C[C...,2600.970000,61,7,17,13,7,18,21,330,4,1,5,866.987,2.32120,224.7361,250.36
36375,CHEMBL109781,NCCCC(=O)Nc(c(cc1)NC(=O)C)cc1C(=O)O,2793.000000,20,5,7,4,4,7,6,108,1,0,1,279.296,1.02060,74.6231,121.52


In [12]:
x, y = data_dsc.drop(columns=['Title', 'SMILES', 'IC50']), data_dsc['IC50']

In [13]:
from sklearn.model_selection import train_test_split

In [164]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

## CatBoost

In [15]:
from catboost import CatBoostRegressor

In [16]:
cb = CatBoostRegressor(verbose=100, task_type='CPU', loss_function='RMSE', eval_metric='R2')

In [10]:
cb.fit(x_train, y_train, eval_set=(x_test, y_test))

Learning rate set to 0.085571
0:	learn: 0.0119448	test: 0.0000101	best: 0.0000101 (0)	total: 162ms	remaining: 2m 41s
100:	learn: 0.4152909	test: -0.4398710	best: 0.0136328 (7)	total: 536ms	remaining: 4.77s
200:	learn: 0.5357876	test: -0.6761682	best: 0.0136328 (7)	total: 892ms	remaining: 3.55s
300:	learn: 0.6201801	test: -0.7818011	best: 0.0136328 (7)	total: 1.25s	remaining: 2.9s
400:	learn: 0.6799376	test: -0.9104370	best: 0.0136328 (7)	total: 1.6s	remaining: 2.4s
500:	learn: 0.7262149	test: -1.0304158	best: 0.0136328 (7)	total: 1.96s	remaining: 1.96s
600:	learn: 0.7596492	test: -1.1393606	best: 0.0136328 (7)	total: 2.32s	remaining: 1.54s
700:	learn: 0.7850367	test: -1.2348512	best: 0.0136328 (7)	total: 2.68s	remaining: 1.14s
800:	learn: 0.7987940	test: -1.2802035	best: 0.0136328 (7)	total: 3.04s	remaining: 755ms
900:	learn: 0.8120605	test: -1.3126428	best: 0.0136328 (7)	total: 3.4s	remaining: 373ms
999:	learn: 0.8202029	test: -1.3415430	best: 0.0136328 (7)	total: 3.76s	remaining: 0us

<catboost.core.CatBoostRegressor at 0x1feb9aedd80>

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [12]:
linreg = LinearRegression()

In [13]:
linreg.fit(x_train, y_train)

In [14]:
r2_score(y_test, linreg.predict(x_test))

-0.02150295018926074

## Random Forest

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
rf = RandomForestRegressor(n_jobs=-1, criterion='squared_error', verbose=3)
rf.fit(x_train, y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done  87 out of 100 | elapsed:    1.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.9s finished


In [21]:
r2_score(y_test, rf.predict(x_test))

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done  87 out of 100 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished


-0.35715432635511735

## MLP

In [1]:
from torch import nn

In [35]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers):
        super().__init__()
        self.input_layer = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU())
        self.hidden_layers = nn.ModuleList()
        for i in range(n_layers):
            self.hidden_layers.append(nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU()))
        self.output_layer = nn.Linear(hidden_dim, 1)
    def forward(self, x):
        x = self.input_layer(x)
        for layer in self.hidden_layers:
            x = layer(x)
        return self.output_layer(x)

In [22]:
import torch

In [75]:
import numpy as np

In [100]:
class MolDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        super(MolDataset, self).__init__()
        self.x = x.to_numpy(dtype=np.float32)
        self.y = y.to_numpy(dtype=np.float32)
        
    def __len__(self):
        return len(self.x)
        
        
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [101]:
x_train.shape[1]

15

In [181]:
model = MLP(15, 64, 3)

In [192]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

loss_function = torch.nn.MSELoss()

In [193]:
def train_epoch(model, loader, loss_function, optimizer, device):
    model.train()
    model.to(device)
    loss_function.to(device)
    
    preds = []
    targets = []
    total_loss = 0.
    
    for batch in loader:
        data, y = batch
        data, y = data.to(device), y.to(device)
        output = model(data).flatten()
        loss = loss_function(output,y)
        
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        preds.append(output.cpu())
        targets.append(y.cpu())
    
    preds = torch.cat(preds, dim=0).detach().cpu()
    targets = torch.cat(targets, dim=0).detach().cpu()
    mae = loss_function(targets, preds).item()

    return {"TRAIN Loss": {total_loss / len(loader)}, "MAE": {mae}, "R2": r2_score(targets.numpy(), preds.numpy())}
        
@torch.no_grad()        
def eval_epoch(model, loader, loss_function, device):
    model.eval()
    model.to(device)
    loss_function.to(device)
    
    preds = []
    targets = []
    total_loss = 0.
    
    for batch in loader:
        data, y = batch
        data, y = data.to(device), y.to(device)
        
        output = model(data).flatten()
        
        loss = loss_function(output,y)
        
        total_loss += loss.item() 
        
        preds.append(output.cpu())
        targets.append(y.cpu())
    
    preds = torch.cat(preds, dim=0)
    targets = torch.cat(targets, dim=0)
    mae = loss_function(targets, preds).item()

    return {"EVAL Loss": {total_loss / len(loader)}, "MAE": {mae}, "R2": r2_score(targets.numpy(), preds.numpy())}

In [216]:
y_train.mean()

3.785454431697749

In [215]:
y_train.std()

2.6385381048010323

In [207]:
perc_99 = np.percentile(y_train, 99)
x_train = x_train[y_train <= perc_99]


In [208]:
y_train = y_train[y_train <= perc_99]

In [209]:
x_train

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
8987,18,2,3,1,2,5,2,92,2,0,2,264.687,4.12310,70.4934,41.13
5082,19,0,5,6,0,6,3,96,3,0,3,272.333,2.52180,74.5510,52.31
20537,31,1,7,7,1,11,7,162,3,1,4,451.426,4.28270,104.4887,86.48
8576,25,1,5,5,1,6,5,126,3,0,3,350.447,3.88624,100.5327,67.77
19377,21,1,2,3,1,5,3,112,1,1,2,341.307,5.38140,96.3607,24.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4966,18,1,5,5,1,6,3,94,2,0,2,265.290,2.68842,67.0567,68.54
1103,29,2,6,4,3,7,7,150,3,0,3,408.527,4.23070,120.5289,75.61
9750,22,0,3,2,0,7,2,116,2,1,3,376.692,4.78830,90.7668,37.38
1308,27,2,6,5,2,6,4,138,3,0,4,364.401,3.32630,101.2140,84.47


In [210]:
train_dataset, val_dataset = MolDataset(x_train, y_train), MolDataset(x_test, y_test)

In [211]:
from torch.utils.data import DataLoader

In [212]:
train_loader, val_loader = DataLoader(train_dataset, batch_size=16), DataLoader(val_dataset, batch_size=16)

In [213]:
from sklearn.metrics import r2_score

In [214]:
train_logs, eval_logs = [], []

for i in tqdm(range(200)):
    train_log = train_epoch(
        model=model, 
        loader=train_loader, 
        loss_function=loss_function, 
        optimizer=optimizer,
        device='cpu'
    )
    
    eval_log = eval_epoch(
        model=model, 
        loader=val_loader, 
        loss_function=loss_function, 
        device='cpu'
    )

    print(train_log, eval_log)
    
    train_logs.append(train_log)
    eval_logs.append(eval_log)

  0%|          | 0/200 [00:00<?, ?it/s]

{'TRAIN Loss': {6.014655988401608}, 'MAE': {6.013247489929199}, 'R2': 0.13622807899865097} {'EVAL Loss': {2727.311870926294}, 'MAE': {2730.0078125}, 'R2': -0.008549474004163171}
{'TRAIN Loss': {5.992975754449753}, 'MAE': {5.991591453552246}, 'R2': 0.13933888209795675} {'EVAL Loss': {2727.812886809004}, 'MAE': {2730.509521484375}, 'R2': -0.008734726612690702}
{'TRAIN Loss': {5.984808745351878}, 'MAE': {5.983328342437744}, 'R2': 0.1405258976459829} {'EVAL Loss': {2727.383497971642}, 'MAE': {2730.079833984375}, 'R2': -0.008575940773077662}
{'TRAIN Loss': {5.976899441382153}, 'MAE': {5.975529670715332}, 'R2': 0.14164608069204088} {'EVAL Loss': {2726.94023645846}, 'MAE': {2729.6357421875}, 'R2': -0.00841203866857132}
{'TRAIN Loss': {5.9737315385174075}, 'MAE': {5.972428321838379}, 'R2': 0.14209158097271735} {'EVAL Loss': {2727.243002054771}, 'MAE': {2729.939208984375}, 'R2': -0.008524011285481636}
{'TRAIN Loss': {5.967765446729342}, 'MAE': {5.966405868530273}, 'R2': 0.14295669712936598} {'E

KeyboardInterrupt: 

# 2nd Features

In [217]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, AllChem


def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    # morganFP_rdkit
    def desc_gen(mol):
        mol = Chem.MolFromSmiles(mol)
        bit_vec = np.zeros((1,), np.int16)
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits, useChirality=useChirality), bit_vec)
        return bit_vec

    return pd.DataFrame.from_records(smiles_column.apply(func=desc_gen), columns=[f'bit_id_{i}' for i in range(nBits)])


def rdkit_2d(smiles_column: pd.Series):
    # 2d_rdkit
    descriptors = {i[0]: i[1] for i in Descriptors._descList}
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in smiles_column)

In [218]:
Y = rdkit_fp(data['SMILES'])
Y.head()

[00:23:33] Conflicting single bond directions around double bond at index 55.
[00:23:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:23:37] Conflicting single bond directions around double bond at index 7.
[00:23:37]   BondStereo set to STEREONONE and single bond directions set to NONE.


Unnamed: 0,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,bit_id_7,bit_id_8,bit_id_9,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [219]:
data_fp = data.join(Y)
data_fp.head()

Unnamed: 0,Title,SMILES,IC50,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,CHEMBL2206459,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,1.5e-05,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,CHEMBL3818159,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,1.6e-05,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL1956716,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,3.3e-05,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1956715,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,3.2e-05,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL4444029,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,4.1e-05,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [268]:
x, y = data_fp[[f'bit_id_{i}' for i in range(2048)]], data_fp['IC50']

In [269]:
from sklearn.decomposition import TruncatedSVD

In [338]:
num_svd = TruncatedSVD(32)

In [339]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [340]:
x_train = num_svd.fit_transform(x_train)
x_test = num_svd.transform(x_test)

In [341]:
cb = CatBoostRegressor(verbose=100, task_type='CPU', loss_function='RMSE', eval_metric='R2')

In [342]:
cb.fit(x_train, y_train, eval_set=(x_test, y_test))

Learning rate set to 0.085571
0:	learn: 0.0050911	test: 0.0003295	best: 0.0003295 (0)	total: 5.86ms	remaining: 5.86s
100:	learn: 0.4646872	test: 0.0495996	best: 0.0495996 (100)	total: 318ms	remaining: 2.83s
200:	learn: 0.6141535	test: 0.0659466	best: 0.0659466 (200)	total: 636ms	remaining: 2.53s
300:	learn: 0.6964898	test: 0.0728981	best: 0.0731441 (293)	total: 947ms	remaining: 2.2s
400:	learn: 0.7486132	test: 0.0712829	best: 0.0732473 (338)	total: 1.25s	remaining: 1.88s
500:	learn: 0.7878282	test: 0.0727070	best: 0.0732473 (338)	total: 1.56s	remaining: 1.56s
600:	learn: 0.8217719	test: 0.0723003	best: 0.0732473 (338)	total: 1.88s	remaining: 1.25s
700:	learn: 0.8448803	test: 0.0718712	best: 0.0732473 (338)	total: 2.2s	remaining: 937ms
800:	learn: 0.8619836	test: 0.0706067	best: 0.0732473 (338)	total: 2.51s	remaining: 624ms
900:	learn: 0.8780567	test: 0.0699637	best: 0.0732473 (338)	total: 2.84s	remaining: 312ms
999:	learn: 0.8903659	test: 0.0705520	best: 0.0732473 (338)	total: 3.13s	re

<catboost.core.CatBoostRegressor at 0x2aee95900>

In [290]:
import random

In [291]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        # torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
        

In [292]:
seed_everything(42)

In [295]:
train_inds, test_inds = train_test_split(np.arange(data.shape[0]))
train_inds

array([12238, 30853,  4513, ..., 11284,   860, 15795])

In [485]:
x, y = data_fp[[f'bit_id_{i}' for i in range(2048)]], data_fp['IC50']
num_svd = TruncatedSVD(32, algorithm='arpack', random_state=42)
# emb_svd = TruncatedSVD(32)
x_train, x_test, y_train, y_test = x.to_numpy()[train_inds], x.to_numpy()[test_inds], y.to_numpy()[train_inds], y.to_numpy()[test_inds]

In [486]:
x_train = num_svd.fit_transform(x_train.astype(float))
x_test = num_svd.transform(x_test.astype(float))

In [487]:
embs = torch.load('all_embs.pt')

In [488]:
embs_train, embs_test = embs[train_inds], embs[test_inds]
# embs_train = emb_svd.fit_transform(embs_train)
# embs_test = emb_svd.transform(embs_test)

In [489]:
x_train, x_test = pd.DataFrame(x_train), pd.DataFrame(x_test)
x_train['emb'], x_test['emb'] = embs_train.tolist(), embs_test.tolist()

In [492]:
cb = CatBoostRegressor(verbose=100, task_type='CPU', loss_function='RMSE', eval_metric='R2', embedding_features=['emb'])

In [493]:
cb.fit(x_train, y_train, eval_set=(x_test, y_test))

Learning rate set to 0.085571
0:	learn: 0.0226141	test: 0.0013212	best: 0.0013212 (0)	total: 3.64ms	remaining: 3.63s
100:	learn: 0.4642405	test: 0.0603212	best: 0.0608631 (96)	total: 278ms	remaining: 2.47s
200:	learn: 0.5954694	test: 0.0653206	best: 0.0656686 (199)	total: 544ms	remaining: 2.16s
300:	learn: 0.6795844	test: 0.0648004	best: 0.0656686 (199)	total: 810ms	remaining: 1.88s
400:	learn: 0.7353616	test: 0.0653668	best: 0.0659958 (323)	total: 1.08s	remaining: 1.61s
500:	learn: 0.7814218	test: 0.0639057	best: 0.0659958 (323)	total: 1.34s	remaining: 1.34s
600:	learn: 0.8141752	test: 0.0624329	best: 0.0659958 (323)	total: 1.61s	remaining: 1.07s
700:	learn: 0.8415532	test: 0.0632880	best: 0.0659958 (323)	total: 1.88s	remaining: 802ms
800:	learn: 0.8612165	test: 0.0630984	best: 0.0659958 (323)	total: 2.15s	remaining: 535ms
900:	learn: 0.8785521	test: 0.0620002	best: 0.0659958 (323)	total: 2.42s	remaining: 266ms
999:	learn: 0.8919296	test: 0.0616994	best: 0.0659958 (323)	total: 2.69s	r

<catboost.core.CatBoostRegressor at 0x2b62ebd60>

In [None]:
Z = rdkit_2d(data['SMILES'])
Z.head()

[09:54:33] Conflicting single bond directions around double bond at index 55.
[09:54:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:54:33] Conflicting single bond directions around double bond at index 55.
[09:54:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:54:33] Conflicting single bond directions around double bond at index 55.
[09:54:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:54:33] Conflicting single bond directions around double bond at index 55.
[09:54:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:54:33] Conflicting single bond directions around double bond at index 55.
[09:54:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:54:33] Conflicting single bond directions around double bond at index 55.
[09:54:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:54:33] Conflicting single bond direc

In [None]:
data_2d = data.join(Z)
data_2d.head()

In [None]:
data_2d.to_csv('data_2d.csv')

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
cat_columns = ['']