In [None]:
# Run the external drug_generator.py script on the cox2.fasta input.
# Presumably this generates 1000 candidates and writes them under 'cox2_raw'
# (the flag meanings are inferred from their names — TODO confirm against the script).
!python drug_generator.py --fasta cox2.fasta --number 1000 --output cox2_raw --temperature 0.8 --top_p 0.95 --device cpu

In [10]:
import glob
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import QED, Descriptors, FilterCatalog, AllChem
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import joblib

In [22]:
# --- Configuration -----------------------------------------------------------
SDF_DIR = 'cox2_raw'           # directory scanned for generated *.sdf files
DATASET = 'desc_rdk.csv'       # training table: feature columns plus 'Standard Value'
OUT_CSV = 'filtered_mols.csv'  # destination for molecules that pass the filters

# --- Training data -----------------------------------------------------------
df = pd.read_csv(DATASET, sep=',')
# log10 is undefined for non-positive values, so those rows are dropped first.
# .copy() detaches the filtered frame from the original so that adding a column
# below is not a chained assignment (avoids SettingWithCopyWarning).
df = df[df['Standard Value'] > 0].copy()
# NOTE(review): this is -log10 of the raw value. If 'Standard Value' is stored
# in nM rather than mol/L, the conventional pIC50 would be 9 - log10(value) —
# confirm the units of the dataset.
df['pIC50'] = -np.log10(df['Standard Value'])

# Every remaining column is used as a model feature. The same names are later
# looked up as attributes of rdkit.Chem.Descriptors when scoring molecules, so
# each column must be named after an RDKit descriptor.
desc_cols = [col for col in df.columns if col not in ('Standard Value', 'pIC50')]
X = df[desc_cols].to_numpy()
y = df['pIC50'].to_numpy()

# --- Model -------------------------------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
rf = RandomForestRegressor(n_estimators=200, random_state=42)  # fixed seed for reproducibility
rf.fit(X_scaled, y)

def calc_rdkit_descriptors(mol, desc_names):
    """Compute the named RDKit descriptors for a single molecule.

    Args:
        mol: molecule object handed to each descriptor function.
        desc_names: iterable of attribute names to look up on
            ``rdkit.Chem.Descriptors``.

    Returns:
        list: one value per name, in the same order. ``np.nan`` is stored
        when the name is not an attribute of ``Descriptors`` or when the
        descriptor function raises.
    """
    values = []
    for name in desc_names:
        try:
            # The lookup is inside the try so an unknown name yields NaN for
            # that slot instead of aborting the whole descriptor vector.
            values.append(getattr(Descriptors, name)(mol))
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            values.append(np.nan)
    return values

def qed_score(mol):
    """Return ``QED.qed(mol)``, or ``None`` if the computation raises.

    Only ``Exception`` subclasses are converted to ``None``;
    KeyboardInterrupt and SystemExit propagate so a long run can be stopped.
    """
    try:
        return QED.qed(mol)
    except Exception:
        return None
def lipinski_violations(mol):
    """Count how many of four property limits the molecule exceeds.

    The limits checked are MolWt > 500, MolLogP > 5, NumHDonors > 5 and
    NumHAcceptors > 10, each computed with ``rdkit.Chem.Descriptors``.

    Returns:
        int: number of exceeded limits, from 0 to 4.
    """
    exceeded = [
        Descriptors.MolWt(mol) > 500,
        Descriptors.MolLogP(mol) > 5,
        Descriptors.NumHDonors(mol) > 5,
        Descriptors.NumHAcceptors(mol) > 10,
    ]
    return sum(1 for flag in exceeded if flag)
# Build the BRENK filter catalog once at module level; has_brenk() reuses it
# for every molecule.
fc_params = FilterCatalog.FilterCatalogParams()
fc_params.AddCatalog(
    FilterCatalog.FilterCatalogParams.FilterCatalogs.BRENK
)
brenk_cat = FilterCatalog.FilterCatalog(fc_params)


def has_brenk(mol):
    """Return 1 if the molecule matches any entry of the BRENK catalog, else 0."""
    matches = brenk_cat.GetMatches(mol)
    return 1 if len(matches) > 0 else 0

def get_pic50_ml(mol, desc_names, scaler, rf):
    """Predict pIC50 for one molecule with a fitted scaler and regressor.

    Args:
        mol: molecule passed on to ``calc_rdkit_descriptors``.
        desc_names: descriptor names, in the order the model was trained on.
        scaler: object exposing ``transform(rows)``.
        rf: object exposing ``predict(rows)``.

    Returns:
        float | None: the prediction, or ``None`` when any descriptor value
        is missing or non-finite, or when scaling or prediction raises.
    """
    import logging  # local import: the notebook's import cell lies outside this block

    try:
        features = np.asarray(calc_rdkit_descriptors(mol, desc_names), dtype=float)
        # Reject NaN and +/-inf explicitly instead of relying on the
        # scaler or model to raise on them.
        if not np.all(np.isfinite(features)):
            return None
        X_pred = scaler.transform(features.reshape(1, -1))
        return float(rf.predict(X_pred)[0])
    except Exception as exc:
        # Best-effort: the caller treats None as "no prediction". The reason
        # is logged at debug level so it can be inspected without flooding
        # the notebook output.
        logging.getLogger(__name__).debug("pIC50 prediction failed: %s", exc)
        return None

# Acceptance thresholds, named so they can be tuned in one place.
# NOTE(review): the pIC50 cutoff is only meaningful on the scale the model was
# trained on — confirm it matches the units of the training target.
MIN_PIC50 = 0.50               # predicted pIC50 must be strictly greater
MIN_QED = 0.7                  # QED must be at least this value
MAX_LIPINSKI_VIOLATIONS = 3    # at most this many violations are tolerated
RESULT_COLUMNS = ['SMILES', 'pIC50_pred', 'QED', 'Toxicophore', 'Lipinski_violations']

# Sorted so that row order in the output is reproducible; glob order is arbitrary.
sdf_files = sorted(glob.glob(f'{SDF_DIR}/*.sdf'))
rows = []
for sdf in sdf_files:
    suppl = Chem.SDMolSupplier(sdf)
    for mol in suppl:
        # Records the supplier could not parse come back as None.
        if mol is None:
            continue
        try:
            Chem.SanitizeMol(mol)
        except Exception:
            # Exception (not a bare except) so the loop can still be interrupted.
            continue

        # Cheap filters first, so the descriptor computation and model
        # prediction only run for molecules that can still pass.
        # `not (a >= b)` is used instead of `a < b` so NaN scores are rejected.
        qed = qed_score(mol)
        if qed is None or not (qed >= MIN_QED):
            continue
        tox = has_brenk(mol)
        if tox != 0:
            continue
        lip = lipinski_violations(mol)
        if lip > MAX_LIPINSKI_VIOLATIONS:
            continue
        pic50 = get_pic50_ml(mol, desc_cols, scaler, rf)
        if pic50 is None or not (pic50 > MIN_PIC50):
            continue

        rows.append({
            'SMILES': Chem.MolToSmiles(mol),
            'pIC50_pred': pic50,
            'QED': qed,
            'Toxicophore': tox,
            'Lipinski_violations': lip
        })

print(f"Прошло фильтр: {len(rows)}")

# Explicit columns keep the CSV header even when no molecule passed the filter.
df_out = pd.DataFrame(rows, columns=RESULT_COLUMNS)
df_out.to_csv(OUT_CSV, sep=';', index=False)
print(f"Результаты сохранены в {OUT_CSV}")

[18:00:54] ERROR: Cannot process coordinates on line 18
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] ERROR: Cannot process coordinates on line 33
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] 

****
Post-condition Violation
Element '556' not found
Violation occurred on line 93 in file D:\bld\rdkit-meta_1730203892832\work\Code\GraphMol\PeriodicTable.h
Failed Expression: anum > -1
****

[18:00:54] ERROR: Element '556' not found
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] 

****
Post-condition Violation
Element '487' not found
Violation occurred on line 93 in file D:\bld\rdkit-meta_1730203892832\work\Code\GraphMol\PeriodicTable.h
Failed Expression: anum > -1
****

[18:00:54] ERROR: Element '487' not found
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] 

****
Post-condition Violation
Element '238' not found
Violation occurred on line 93 in file D:\bld\rdkit-meta_1730203892832\w

Прошло фильтр: 0
Результаты сохранены в filtered_mols.csv


[18:00:54] ERROR: Cannot process coordinates on line 11
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] ERROR: Cannot process coordinates on line 20
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] ERROR: Cannot process coordinates on line 21
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] ERROR: Cannot process coordinates on line 21
[18:00:54] ERROR: moving to the beginning of the next molecule
[18:00:54] ERROR: Cannot process coordinates on line 17
[18:00:54] ERROR: moving to the beginning of the next molecule


In [18]:
from rdkit import Chem
import glob

# NOTE(review): this directory differs from SDF_DIR ('cox2_raw') used by the
# filtering cell — confirm which one holds the generated SDF files.
sdf_files = glob.glob('cox2_raw.smi/*.sdf')

total_mols = 0   # every record yielded by the suppliers, parsed or not
valid_mols = 0   # records that parsed and passed Chem.SanitizeMol

for sdf in sdf_files:
    # sanitize=False defers sanitization to the explicit call below, so that
    # parse failures (None) and sanitization failures are handled separately.
    suppl = Chem.SDMolSupplier(sdf, sanitize=False, removeHs=False)
    for mol in suppl:
        total_mols += 1
        if mol is None:
            continue
        try:
            Chem.SanitizeMol(mol)
        except Exception:
            # Not chemically valid: counted in the total only.
            continue
        valid_mols += 1

# Guard against division by zero when no SDF records were found.
percent_valid = 100 * valid_mols / total_mols if total_mols else 0

print(f"Всего молекул в SDF: {total_mols}")
print(f"Валидных с точки зрения химии: {valid_mols}")
print(f"Процент валидных: {percent_valid:.2f}%")

Всего молекул в SDF: 92
Валидных с точки зрения химии: 68
Процент валидных: 73.91%


[17:50:56] ERROR: Cannot process coordinates on line 18
[17:50:56] ERROR: moving to the beginning of the next molecule
[17:50:56] ERROR: Cannot process coordinates on line 33
[17:50:56] ERROR: moving to the beginning of the next molecule
[17:50:56] 

****
Post-condition Violation
Element '556' not found
Violation occurred on line 93 in file D:\bld\rdkit-meta_1730203892832\work\Code\GraphMol\PeriodicTable.h
Failed Expression: anum > -1
****

[17:50:56] ERROR: Element '556' not found
[17:50:56] ERROR: moving to the beginning of the next molecule
[17:50:56] 

****
Post-condition Violation
Element '487' not found
Violation occurred on line 93 in file D:\bld\rdkit-meta_1730203892832\work\Code\GraphMol\PeriodicTable.h
Failed Expression: anum > -1
****

[17:50:56] ERROR: Element '487' not found
[17:50:56] ERROR: moving to the beginning of the next molecule
[17:50:56] 

****
Post-condition Violation
Element '238' not found
Violation occurred on line 93 in file D:\bld\rdkit-meta_1730203892832\w