![methods](../../img/image.png)

# libs init

In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import DataStructs
from rdkit.Chem import MACCSkeys
from mordred import Calculator, descriptors
from padelpy import padeldescriptor

# Загрузка предобработанного датасета

In [14]:
# путь к файлу, полученному в мини-таске 1/4

# 1
# path = '../../data/target_cox2_IC50__preprocessed_aggregated.csv'

# 4
path = '../../data/new_molecules__target_cox2_pIC50__smiles.csv'
df = pd.read_csv(path)
df

Unnamed: 0,Smiles
0,C#CCCCC(=O)c1cc2c(c(C(=O)CCCC#C)c1)O[C@H](C(=O...
1,C/C(=N\NC(N)=S)c1nnn(C2Oc3ccc(S(F)(F)(F)(F)F)c...
2,C=C1C(=O)O[C@@H]2[C@H]1[C@@H](c1cc(O)cc(Br)c1O...
3,CC(=O)N1NC(c2ccccc2)=CC1c1nccc2ccccc12
4,CC(C(=O)O)=C(C)C(=O)O
5,CC1=C(c2ccc(N3CC(C)N=C3c3cccc(Br)c3)cn2)SC2Nc3...
6,CCCCC=CCCCC
7,CCCCCCCCCCCCCc1ccc(C)cc1Cl
8,CN(C(=O)CCCl)C(=O)CCCl
9,COC1c2c(C34CC5CC(CC(C5)C3)C4)c(C34CC5CC(CC(C5)...


# Преобразование SMILES → молекулы RDKit

In [15]:
df['Mol'] = df['Smiles'].apply(Chem.MolFromSmiles)
df = df[df['Mol'].notna()].reset_index(drop=True)
print(f"После фильтрации по валидным молекулам: {df.shape[0]} строк")

После фильтрации по валидным молекулам: 42 строк


# Расчёт 2D-дескрипторов RDKit

In [16]:
desc_list = Descriptors._descList	# список (name, function)

def rdkit2d(mol):
    vals = {}
    for name, func in desc_list:
        try:
            vals[name] = func(mol)
        except:
            vals[name] = np.nan
    return vals

rdkit_desc = df['Mol'].apply(rdkit2d).apply(pd.Series)
print(f"RDKit 2D-дескрипторов: {rdkit_desc.shape[1]}")

RDKit 2D-дескрипторов: 217


# Генерация Morgan fingerprints

In [17]:
def morgan_fp(mol, radius=2, nBits=1024):
    arr = np.zeros((nBits,), dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    

    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

## ECFP4 (radius=2)

In [18]:
ecfp4_arr = np.stack(df['Mol'].apply(lambda m: morgan_fp(m, radius=2, nBits=1024)).values)
ecfp4_cols = [f'ECFP4_{i}' for i in range(ecfp4_arr.shape[1])]
ecfp4_df   = pd.DataFrame(ecfp4_arr, columns=ecfp4_cols, index=df.index)
print(f"ECFP4 bits: {ecfp4_df.shape[1]}")

ECFP4 bits: 1024




## ECFP6 (radius=3)

In [19]:
ecfp6_arr = np.stack(df['Mol'].apply(lambda m: morgan_fp(m, radius=3, nBits=1024)).values)
ecfp6_cols = [f'ECFP6_{i}' for i in range(ecfp6_arr.shape[1])]
ecfp6_df   = pd.DataFrame(ecfp6_arr, columns=ecfp6_cols, index=df.index)
print(f"ECFP6 bits: {ecfp6_df.shape[1]}")

ECFP6 bits: 1024




# Генерация MACCS fingerprints

In [20]:
def maccs_fp(mol):
    fp  = MACCSkeys.GenMACCSKeys(mol)
    arr = np.zeros((fp.GetNumBits(),), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr[1:]

maccs_arr = np.stack(df['Mol'].apply(maccs_fp).values)
maccs_cols = [f'MACCS_{i+1}' for i in range(maccs_arr.shape[1])]
maccs_df   = pd.DataFrame(maccs_arr, columns=maccs_cols, index=df.index)
print(f"MACCS bits: {maccs_df.shape[1]}")

MACCS bits: 166


# Расчёт 2D-дескрипторов Mordred

In [21]:
calc = Calculator(descriptors, ignore_3D=True)
mordred_desc = calc.pandas(df['Mol'])
print(f"Mordred-дескрипторов (2D): {mordred_desc.shape[1]}")

100%|██████████| 42/42 [00:19<00:00,  2.13it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
Mordred-дескрипторов (2D): 1613


# Расчёт дескрипторов PaDEL (CDK, PubChem, MACCS, ECFP)

In [22]:
SMI_FILE = '../../data/padel_input.smi'
PAD_OUT  = '../../data/padel_output.csv'
df[['Smiles']].to_csv(SMI_FILE, index=False, header=False)

padeldescriptor(
    mol_dir=SMI_FILE,
    d_file=PAD_OUT,
    fingerprints=True,
    threads=4
)

padel_df = pd.read_csv(PAD_OUT)
padel_df.index = df.index
padel_df = padel_df.drop(columns=['Name'], errors='ignore')
print(f"PaDEL-признаков: {padel_df.shape[1]}")

PaDEL-признаков: 881


# Объединение дескрипторов и fingerprint’ов

In [26]:
X = pd.concat([
    # df[['Smiles','Molecule ChEMBL ID','pIC50']],
	df[['Smiles']],
    rdkit_desc,
    ecfp4_df,
    ecfp6_df,
    maccs_df,
    mordred_desc,
    padel_df
], axis=1)
print(f"Признаков до фильтрации: {X.shape[1]-3}")

Признаков до фильтрации: 4923


# Фильтрация и отбор признаков

- Удаление пропусков
- Удаление признаков с нулевой дисперсией
- Фильтрация признаков с высокой корреляцией (|r|>0.7) —> оставляем одну из пары

In [27]:
# 10.1 Удаляем колонки с NaN
X = X.dropna(axis=1, how='any')


# Удаляем низкую дисперсию
# vt = VarianceThreshold(threshold=1e-6)
vt = VarianceThreshold(threshold=0.0)
# core = X.drop(columns=['Smiles','Molecule ChEMBL ID','pIC50'])
core = X.drop(columns=['Smiles'])
vt.fit(core)
core = core.loc[:, vt.get_support()]

# X_filtered = pd.concat([X[['Smiles','Molecule ChEMBL ID','pIC50']], core], axis=1)
X_filtered = pd.concat([X[['Smiles']], core], axis=1)
print(f"После удаления признаков с низкой дисперсией: {core.shape[1]}")


# Удаляем скоррелированные признаки (|r| > 0.7)
corr  = core.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if any(upper[c] > 0.7)]

X_final = X_filtered.drop(columns=to_drop)
print(f"Удалено скоррелированных: {len(to_drop)}")
print(f"Итоговых признаков: {X_final.shape[1]-3}")

  self.variances_ = np.nanvar(X, axis=0)
  self.variances_ = np.nanmin(compare_arr, axis=0)


После удаления признаков с низкой дисперсией: 3492
Удалено скоррелированных: 3105
Итоговых признаков: 385


# Сохранение готового датасета

In [28]:
X_final

Unnamed: 0,Smiles,MaxAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,...,GATS2Z,GATS3Z,GATS7Z,GATS8Z,GATS1se,GATS5se,BCUTdv-1l,BCUTd-1l,NdsCH,SdssC
0,C#CCCCC(=O)c1cc2c(c(C(=O)CCCC#C)c1)O[C@H](C(=O...,12.99895,-1.37758,0.269472,13.794118,458.51,-0.477762,0.852941,16.49885,9.903533,...,0.832476,0.882454,1.091693,1.121436,0.655346,1.193142,1.85663,0.910541,1,-2.575788
1,C/C(=N\NC(N)=S)c1nnn(C2Oc3ccc(S(F)(F)(F)(F)F)c...,13.126592,-9.970684,0.245493,18.53125,498.459,-0.477718,1.34375,32.486837,10.052356,...,0.445938,1.004942,0.808227,1.076673,0.590332,0.925839,0.652905,0.971497,1,-1.92557
2,C=C1C(=O)O[C@@H]2[C@H]1[C@@H](c1cc(O)cc(Br)c1O...,12.885088,-0.738483,0.394922,38.333333,435.314,-0.507909,1.296296,79.918731,9.675737,...,0.519321,0.694848,1.291322,2.111252,0.755551,0.81593,0.256011,0.976209,0,0.013323
3,CC(=O)N1NC(c2ccccc2)=CC1c1nccc2ccccc12,12.145024,-0.232587,0.78436,16.791667,315.376,-0.294749,1.041667,16.158548,10.031634,...,0.843062,1.021021,1.184799,1.175364,0.639542,1.011518,0.995954,0.973009,1,0.877268
4,CC(C(=O)O)=C(C)C(=O)O,10.123981,-1.194537,0.55291,12.2,144.126,-0.477871,0.9,16.372138,10.147147,...,0.443812,0.915802,0.0,invalid value encountered in scalar divide (GA...,0.899083,1.044928,0.99476,0.971501,0,-2.652963
5,CC1=C(c2ccc(N3CC(C)N=C3c3cccc(Br)c3)cn2)SC2Nc3...,4.89507,0.193122,0.463681,21.53125,504.457,-0.354446,1.09375,79.918731,10.154572,...,0.610533,0.926082,1.124434,1.807114,0.898955,1.096789,0.256008,0.992964,0,3.461614
6,CCCCC=CCCCC,2.318611,1.27799,0.387226,11.0,140.27,-0.088539,0.8,13.850982,10.178337,...,1.369444,0.932143,1.003846,0.845833,1.5,0.938235,0.987144,0.987144,2,0.0
7,CCCCCCCCCCCCCc1ccc(C)cc1Cl,6.262946,0.94637,0.35052,11.0,308.937,-0.084031,0.761905,35.495692,10.025749,...,0.892986,0.841418,0.875224,0.901188,0.669527,1.389308,0.773997,0.98773,0,0.0
8,CN(C(=O)CCCl)C(=O)CCCl,11.036874,-0.263781,0.656922,9.583333,212.076,-0.285701,1.083333,35.496752,10.335314,...,0.966712,0.95647,1.697003,2.544262,0.596879,0.762033,0.767071,0.977289,0,-0.527562
9,COC1c2c(C34CC5CC(CC(C5)C3)C4)c(C34CC5CC(CC(C5)...,6.423192,0.200286,0.468886,50.363636,447.707,-0.374592,0.666667,16.477706,9.437636,...,0.979527,1.210052,1.141792,1.116284,0.887203,1.169709,0.992254,0.987669,0,0.0


In [29]:
# OUT_CSV = '../../data/target_cox2_IC50__descriptors_with_ECFP6.csv'

# ДЛЯ НОВЫХ МОЛЕКУЛ
OUT_CSV = '../../data/new_molecules__target_cox2_pIC50__descriptors_with_ECFP6.csv'
X_final.to_csv(OUT_CSV, index=False)
print(f"Сохранено: {OUT_CSV}")

Сохранено: ../../data/new_molecules__target_cox2_pIC50__descriptors_with_ECFP6.csv
