In [None]:
!pip install rdkit

Collecting rdkit
  Using cached rdkit-2023.9.6-cp310-cp310-win_amd64.whl (21.0 MB)
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import FunctionTransformer

In [2]:
data = pd.read_excel('data/19_35000.xlsx', index_col=0).reset_index(drop=True)
data.head()

Unnamed: 0,Title,IC50,SMILES
0,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...
1,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...
2,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...
3,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...
4,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...


In [3]:
data.columns

Index(['Title', 'IC50', 'SMILES'], dtype='object')

In [4]:
data.shape

(36377, 3)

In [5]:
data = data[['Title','SMILES', 'IC50']]

In [6]:
import multiprocessing as mp

In [7]:
from tqdm.auto import tqdm

In [None]:
def mol_dsc_calc(mols):
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in mols)

# список конституционных и физико-химических дескрипторов из библиотеки RDKit
descriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
               "NHOHCount": Descriptors.NHOHCount,
               "NOCount": Descriptors.NOCount,
               "NumHAcceptors": Descriptors.NumHAcceptors,
               "NumHDonors": Descriptors.NumHDonors,
               "NumHeteroatoms": Descriptors.NumHeteroatoms,
               "NumRotatableBonds": Descriptors.NumRotatableBonds,
               "NumValenceElectrons": Descriptors.NumValenceElectrons,
               "NumAromaticRings": Descriptors.NumAromaticRings,
               "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
               "RingCount": Descriptors.RingCount,
               "MW": Descriptors.MolWt,
               "LogP": Descriptors.MolLogP,
               "MR": Descriptors.MolMR,
               "TPSA": Descriptors.TPSA}

# sklearn трансформер для использования в конвейерном моделировании
descriptors_transformer = FunctionTransformer(mol_dsc_calc)
X = descriptors_transformer.transform(data['SMILES'])
X.head()

data_dsc = data.join(X)
data_dsc.to_excel('data_dsc.xlsx')
data_dsc.head()


In [3]:
data_dsc = pd.read_excel('data_dsc.xlsx', index_col=0)

In [4]:
data_dsc

Unnamed: 0,Title,SMILES,IC50,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,CHEMBL2206459,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,0.000015,24,7,9,4,6,10,7,138,0,0,1,362.367,0.38187,90.4296,157.76
1,CHEMBL3818159,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,0.000016,21,4,6,4,3,6,7,120,0,0,1,298.383,1.05450,79.7279,101.65
2,CHEMBL1956716,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,0.000033,21,3,6,5,2,6,7,118,0,0,1,295.359,-0.36170,77.0531,104.48
3,CHEMBL1956715,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,0.000032,20,3,6,5,2,6,6,114,0,0,1,283.348,-0.52780,72.5301,104.48
4,CHEMBL4444029,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,0.000041,24,4,9,6,4,9,3,122,3,0,3,329.268,1.09742,81.9775,149.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36372,CHEMBL3699085,CC(=O)N[C@H]1CC[C@@H]([C@@H]12)[C@@H](O[Si](C)...,2366.790000,44,3,10,6,3,11,8,238,2,0,4,622.839,5.88880,171.2541,127.35
36373,CHEMBL109004,CC(=O)Nc(cc1)c(OC(=O)C)cc1C(=O)O,2372.100000,17,2,6,4,2,6,3,90,1,0,1,237.211,1.26850,59.0140,92.70
36374,CHEMBL2259758,c1cccc(c12)cccc2CNC(=O)CCCCCCO[C@]3(C(=O)O)C[C...,2600.970000,61,7,17,13,7,18,21,330,4,1,5,866.987,2.32120,224.7361,250.36
36375,CHEMBL109781,NCCCC(=O)Nc(c(cc1)NC(=O)C)cc1C(=O)O,2793.000000,20,5,7,4,4,7,6,108,1,0,1,279.296,1.02060,74.6231,121.52


In [5]:
x, y = data_dsc.drop(columns=['Title', 'SMILES', 'IC50']), data_dsc['IC50']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [8]:
from catboost import CatBoostRegressor

In [9]:
cb = CatBoostRegressor(verbose=100, task_type='CPU', loss_function='RMSE', eval_metric='R2')

In [10]:
cb.fit(x_train, y_train, eval_set=(x_test, y_test))

Learning rate set to 0.085571
0:	learn: 0.0119448	test: 0.0000101	best: 0.0000101 (0)	total: 162ms	remaining: 2m 41s
100:	learn: 0.4152909	test: -0.4398710	best: 0.0136328 (7)	total: 536ms	remaining: 4.77s
200:	learn: 0.5357876	test: -0.6761682	best: 0.0136328 (7)	total: 892ms	remaining: 3.55s
300:	learn: 0.6201801	test: -0.7818011	best: 0.0136328 (7)	total: 1.25s	remaining: 2.9s
400:	learn: 0.6799376	test: -0.9104370	best: 0.0136328 (7)	total: 1.6s	remaining: 2.4s
500:	learn: 0.7262149	test: -1.0304158	best: 0.0136328 (7)	total: 1.96s	remaining: 1.96s
600:	learn: 0.7596492	test: -1.1393606	best: 0.0136328 (7)	total: 2.32s	remaining: 1.54s
700:	learn: 0.7850367	test: -1.2348512	best: 0.0136328 (7)	total: 2.68s	remaining: 1.14s
800:	learn: 0.7987940	test: -1.2802035	best: 0.0136328 (7)	total: 3.04s	remaining: 755ms
900:	learn: 0.8120605	test: -1.3126428	best: 0.0136328 (7)	total: 3.4s	remaining: 373ms
999:	learn: 0.8202029	test: -1.3415430	best: 0.0136328 (7)	total: 3.76s	remaining: 0us

<catboost.core.CatBoostRegressor at 0x1feb9aedd80>

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [12]:
linreg = LinearRegression()

In [13]:
linreg.fit(x_train, y_train)

In [14]:
r2_score(y_test, linreg.predict(x_test))

-0.02150295018926074

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
rf = RandomForestRegressor(n_jobs=-1, criterion='squared_error', verbose=3)
rf.fit(x_train, y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done  87 out of 100 | elapsed:    1.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.9s finished


In [21]:
r2_score(y_test, rf.predict(x_test))

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done  87 out of 100 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished


-0.35715432635511735

In [1]:
from torch import nn

ModuleNotFoundError: No module named 'torch'

In [3]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers):
        super().__init__()
        self.input_layer = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU())
        self.hidden_layers = nn.ModuleList()
        for i in n_layers:
            self.layers.append(nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU()))
        self.output_layer = nn.Linear(hidden_dim, 1)
    def forward(self, x):
        x = self.input_layer(x)
        for layer in self.hidden_layers:
            x = layer(x)
        return self.output_layer(x)

NameError: name 'nn' is not defined

In [13]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, AllChem


def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    # morganFP_rdkit
    def desc_gen(mol):
        mol = Chem.MolFromSmiles(mol)
        bit_vec = np.zeros((1,), np.int16)
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits, useChirality=useChirality), bit_vec)
        return bit_vec

    return pd.DataFrame.from_records(smiles_column.apply(func=desc_gen), columns=[f'bit_id_{i}' for i in range(nBits)])


def rdkit_2d(smiles_column: pd.Series):
    # 2d_rdkit
    descriptors = {i[0]: i[1] for i in Descriptors._descList}
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in smiles_column)

In [14]:
Y = rdkit_fp(data['SMILES'])
Y.head()

[22:47:27] Conflicting single bond directions around double bond at index 55.
[22:47:27]   BondStereo set to STEREONONE and single bond directions set to NONE.
[22:47:37] Conflicting single bond directions around double bond at index 7.
[22:47:37]   BondStereo set to STEREONONE and single bond directions set to NONE.


Unnamed: 0,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,bit_id_7,bit_id_8,bit_id_9,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
data_fp = data.join(Y)
data_fp.head()

Unnamed: 0,Title,SMILES,IC50,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,CHEMBL2206459,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,1.5e-05,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,CHEMBL3818159,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,1.6e-05,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL1956716,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,3.3e-05,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1956715,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,3.2e-05,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL4444029,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,4.1e-05,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
data_fp.to_csv('data_fp.csv')

In [None]:
Z = rdkit_2d(data['SMILES'])
Z.head()

In [20]:
data_2d = data.join(Z)
data_2d.head()

NameError: name 'Z' is not defined

In [None]:
data_2d.to_csv('data_2d.csv')

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
cat_columns = ['']