In [21]:
import pandas as pd

In [22]:
# Load the exported table and order it by the PDSC column, highest value first.
df = pd.read_csv('data/2025-04-13T06-47_export.csv').sort_values(by='PDSC', ascending=False)

In [23]:
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import Descriptors
from collections import OrderedDict

# Property names are stored one per line; only the first 53 lines are used.
with open('SPMM/property_name.txt', 'r') as f:
    names = [line.strip() for line in f][:53]

# Map each property name to the callable that computes it from an RDKit molecule.
# 'QED' is special-cased: it is routed to Chem.QED.qed instead of being looked up
# by name on the Descriptors module.
descriptor_dict = OrderedDict()
for n in names:
    descriptor_dict[n] = (lambda x: Chem.QED.qed(x)) if n == 'QED' else getattr(Descriptors, n)

def calculate_property(smiles):
    """Compute every descriptor in ``descriptor_dict`` for one SMILES string.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        list: One descriptor value per entry of ``descriptor_dict``, in the
        dictionary's insertion order.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    RDLogger.DisableLog('rdApp.*')
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit logging is disabled above, so a parse failure would otherwise
        # only surface as an opaque error inside the first descriptor call.
        raise ValueError(f"Cannot parse SMILES: {smiles!r}")
    return [descriptor(mol) for descriptor in descriptor_dict.values()]

In [24]:
# SMILES strings of the first three rows of df.
vals = df['Smiles'].head(3).tolist()

In [25]:
# One descriptor list per selected SMILES, in the same order as vals.
properties = [calculate_property(smiles) for smiles in vals]

In [32]:
# Write one (property, input_value) table per selected molecule to data/p2s_input_<i>.csv.
for i in range(len(vals)):
    table = pd.DataFrame({'property': names, 'input_value': properties[i]})
    table.to_csv(f'data/p2s_input_{i}.csv', index=False)

In [36]:
# Reload the descriptor table that was written for molecule index 0.
p2s_0 = pd.read_csv('data/p2s_input_0.csv')

In [61]:
# Display the row whose property name is TPSA.
p2s_0[p2s_0['property'] == 'TPSA']

Unnamed: 0,property,input_value
51,TPSA,74.63


In [62]:
# Run the SPMM generation script from inside SPMM/ with --n_generate 100 on CUDA.
# NOTE(review): the property-vector CSV is not passed on the command line; presumably
# pv2smiles_orig.py reads a hard-coded input path. Confirm it points at the intended file.
!cd SPMM && python "pv2smiles_orig.py" --n_generate 100 --checkpoint "../checkpoint_SPMM.ckpt" --device "cuda"

seed: 234 True
Creating model
LOADING PRETRAINED MODEL..
load checkpoint from ../checkpoint_SPMM.ckpt
_IncompatibleKeys(missing_keys=['property_encoder.embeddings.word_embeddings.weight', 'property_encoder_m.embeddings.word_embeddings.weight'], unexpected_keys=['temp'])
PV-to-SMILES generation in stochastic manner with k=2...
mean of controlled properties' normalized RMSE: 0.8478984236717224
validity: 0.78
uniqueness: 1.0
Generated molecules are saved in 'generated_molecules.txt'



  0%|          | 0/100 [00:00<?, ?it/s]
  1%|          | 1/100 [00:06<10:22,  6.28s/it]
  2%|▏         | 2/100 [00:10<08:19,  5.10s/it]
  3%|▎         | 3/100 [00:16<08:55,  5.53s/it]
  4%|▍         | 4/100 [00:21<08:10,  5.11s/it]
  5%|▌         | 5/100 [00:24<07:24,  4.68s/it]
  6%|▌         | 6/100 [00:31<08:04,  5.15s/it]
  7%|▋         | 7/100 [00:36<07:57,  5.13s/it]
  8%|▊         | 8/100 [00:41<08:06,  5.28s/it]
  9%|▉         | 9/100 [00:45<07:16,  4.80s/it]
 10%|█         | 10/100 [00:50<07:13,  4.82s/it]
 11%|█         | 11/100 [00:53<06:15,  4.22s/it]
 12%|█▏        | 12/100 [00:56<05:52,  4.01s/it]
 13%|█▎        | 13/100 [01:01<05:56,  4.09s/it]
 14%|█▍        | 14/100 [01:03<05:08,  3.59s/it]
 15%|█▌        | 15/100 [01:06<04:51,  3.43s/it]
 16%|█▌        | 16/100 [01:11<05:24,  3.86s/it]
 17%|█▋        | 17/100 [01:15<05:31,  3.99s/it]
 18%|█▊        | 18/100 [01:19<05:32,  4.05s/it]
 19%|█▉        | 19/100 [01:23<05:27,  4.05s/it]
 20%|██        | 20/100 [01:28<05:41,

In [63]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

# Silence every RDKit log channel under 'rdApp'.
RDLogger.DisableLog('rdApp.*')

# base is the SMILES file read by this cell; output receives the SMILES that pass validation.
base = 'SPMM/generated_molecules_from_pv.txt'
output = 'SPMM/valid_generated_molecules_from_pv.txt'

def validate_smiles(smiles, error_log_file=None):
    """Run a staged series of filters on one SMILES string.

    The checks run in a fixed order and stop at the first failure, so every
    flag after the failing stage stays ``False``:

    1. ``valid_syntax`` - RDKit parses the string with sanitization off.
    2. ``valid_molecule`` - the parsed molecule sanitizes.
    3. ``neutral`` - no atom has a formal charge or radical electrons.
    4. ``allowed_atoms`` - every atom symbol is one of C, H, O, N, P, S.
    5. ``molecular_weight`` - ``Descriptors.MolWt`` is at most 1000.
    6. ``solubility`` - ``Descriptors.MolLogP`` is greater than 1.
    7. ``is_phenol_or_aromatic_amine`` - the molecule matches the phenol or
       the aromatic-amine SMARTS pattern.

    Args:
        smiles: SMILES string to validate.
        error_log_file: Optional path. When given, exceptions raised during
            validation are appended to it as UTF-8 text.

    Returns:
        dict: The per-stage boolean flags listed above, ``all_valid`` (``True``
        only when every stage passed) and ``error`` (message describing the
        first failed stage, or ``None``).
    """
    results = {
        'valid_syntax': False,
        'valid_molecule': False,
        'neutral': False,
        'allowed_atoms': False,
        'molecular_weight': False,
        'solubility': False,
        'is_phenol_or_aromatic_amine': False,
        'all_valid': False,
        'error': None
    }

    def log_exception(exc):
        # Append the failing SMILES and the exception text to the optional log.
        # UTF-8 is explicit because the message is Cyrillic and the platform
        # default encoding may be unable to represent it.
        if error_log_file:
            with open(error_log_file, 'a', encoding='utf-8') as log:
                log.write(f"SMILES: {smiles}\nОшибка: {str(exc)}\n\n")

    try:
        # Parsing without sanitization separates syntax errors from chemistry errors.
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            results['error'] = "Не удалось разобрать SMILES-строку (синтаксическая ошибка)"
            return results

        results['valid_syntax'] = True

        try:
            # SanitizeMol raises on chemically invalid input; that lands in the
            # inner except below.
            Chem.SanitizeMol(mol)
            sanitized_mol = Chem.MolFromSmiles(smiles)

            if sanitized_mol is None:
                results['error'] = "Не удалось санитизировать молекулу"
                return results

            results['valid_molecule'] = True

            # Per-atom check: any charged or radical atom rejects the molecule.
            for atom in sanitized_mol.GetAtoms():
                if atom.GetFormalCharge() != 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет заряд {atom.GetFormalCharge()}"
                    return results
                if atom.GetNumRadicalElectrons() > 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет неспаренные электроны"
                    return results

            results['neutral'] = True

            allowed_atoms = {'C', 'H', 'O', 'N', 'P', 'S'}
            for atom in sanitized_mol.GetAtoms():
                if atom.GetSymbol() not in allowed_atoms:
                    results['error'] = f"Недопустимый атом: {atom.GetSymbol()}"
                    return results

            results['allowed_atoms'] = True

            mw = Descriptors.MolWt(sanitized_mol)
            if mw > 1000:
                results['error'] = f"Молекулярная масса превышает 1000 г/моль: {mw:.2f}"
                return results

            results['molecular_weight'] = True

            logp = Descriptors.MolLogP(sanitized_mol)
            if logp <= 1:
                results['error'] = f"LogP меньше или равен 1: {logp:.2f}"
                return results

            results['solubility'] = True

            # Aromatic carbon single-bonded to an aliphatic N carrying 2, 1 or 0 H,
            # excluding N attached to C=O and N with a double or triple bond.
            aromatic_amine_pattern = Chem.MolFromSmarts('c-[NH2,NH1,NH0;!$(NC=O);!$(N=*);!$(N#*)]')
            # Aromatic carbon single-bonded to an O carrying one H.
            phenol_pattern = Chem.MolFromSmarts('c-[OH]')

            # mol was sanitized in place by SanitizeMol above, so it is safe to query.
            has_aromatic_amine = mol.HasSubstructMatch(aromatic_amine_pattern)
            has_phenol = mol.HasSubstructMatch(phenol_pattern)

            if not (has_phenol or has_aromatic_amine):
                results['error'] = "Молекула не является фенолом или ароматическим амином"
                return results

            results['is_phenol_or_aromatic_amine'] = True
            results['all_valid'] = True
            return results

        except Exception as e:
            log_exception(e)
            results['error'] = f"Ошибка санитизации: {str(e)}"
            return results

    except Exception as e:
        log_exception(e)
        results['error'] = f"Неожиданная ошибка: {str(e)}"
        return results

# Read the candidate SMILES, stripping whitespace and dropping blank lines.
with open(base, 'r') as f:
    smiles_list = [entry for entry in (raw.strip() for raw in f.readlines()) if entry]

print(f"Найдено {len(smiles_list)} строк в файле")

# One result dict per candidate, in file order; keep the SMILES that pass every stage.
validation_results = [validate_smiles(candidate) for candidate in smiles_list]
valid_smiles = [
    candidate
    for candidate, verdict in zip(smiles_list, validation_results)
    if verdict['all_valid']
]

# Per-stage pass counts: each flag is a bool, so summing counts the True values.
total = len(smiles_list)
total_valid_syntax = sum(verdict['valid_syntax'] for verdict in validation_results)
total_valid_molecule = sum(verdict['valid_molecule'] for verdict in validation_results)
total_neutral = sum(verdict['neutral'] for verdict in validation_results)
total_allowed_atoms = sum(verdict['allowed_atoms'] for verdict in validation_results)
total_mw_valid = sum(verdict['molecular_weight'] for verdict in validation_results)
total_soluble = sum(verdict['solubility'] for verdict in validation_results)
total_is_phenol_or_amine = sum(verdict['is_phenol_or_aromatic_amine'] for verdict in validation_results)
total_all_valid = sum(verdict['all_valid'] for verdict in validation_results)

print(f"Отчёт по валидации для файла {base}:")
print(f"  Синтаксически валидные SMILES: {total_valid_syntax} из {total}")
print(f"  Химически валидные молекулы: {total_valid_molecule} из {total}")
print(f"  Нейтральные молекулы: {total_neutral} из {total}")
print(f"  С допустимыми атомами: {total_allowed_atoms} из {total}")
print(f"  Молекулярная масса ≤ 1000 г/моль: {total_mw_valid} из {total}")
print(f"  Растворимые в гексане (logP > 1): {total_soluble} из {total}")
print(f"  Является фенолом или ароматическим амином: {total_is_phenol_or_amine} из {total}")
print(f"  Прошли ВСЕ проверки: {total_all_valid} из {total}")

# The output file is opened in 'w' mode, so each run replaces its previous contents.
with open(output, 'w') as f:
    f.writelines(f"{smiles}\n" for smiles in valid_smiles)

print(f"  Сохранено {len(valid_smiles)} валидных молекул в файл {output}")

Найдено 100 строк в файле
Отчёт по валидации для файла SPMM/generated_molecules_from_pv.txt:
  Синтаксически валидные SMILES: 90 из 100
  Химически валидные молекулы: 78 из 100
  Нейтральные молекулы: 76 из 100
  С допустимыми атомами: 71 из 100
  Молекулярная масса ≤ 1000 г/моль: 70 из 100
  Растворимые в гексане (logP > 1): 70 из 100
  Является фенолом или ароматическим амином: 64 из 100
  Прошли ВСЕ проверки: 64 из 100
  Сохранено 64 валидных молекул в файл SPMM/valid_generated_molecules_from_pv.txt


In [65]:
# Single seed SMILES whose descriptors are computed and exported in the following cells.
# NOTE(review): presumably picked from the previous round's validated output; confirm.
new_found = ['CCC(C)c1ccc(OCCN(CC(O)COc2ccc(-c3ccc(Nc4ccc5c(c4)c4ccccc4n5C4CCCC4)cc3)cc2)c2ccc(-c3ccc4[nH]ccc4c3)cc2)cc1N']

In [66]:
# Rebinds properties to the descriptor lists of the SMILES in new_found.
properties = [calculate_property(smiles) for smiles in new_found]

In [68]:
# Export the first descriptor list as (property, input_value) rows.
seed_table = pd.DataFrame({'property': names, 'input_value': properties[0]})
seed_table.to_csv(f'data/p2s_input_NEW.csv', index=False)

In [69]:
# Reload the descriptor table that was just written.
p2s_new = pd.read_csv('data/p2s_input_NEW.csv')

In [71]:
# Run the SPMM generation script from inside SPMM/ with --n_generate 100 on CUDA.
# NOTE(review): the property-vector CSV is not passed on the command line; presumably
# pv2smiles_orig.py reads a hard-coded input path. Confirm it points at the intended file.
!cd SPMM && python "pv2smiles_orig.py" --n_generate 100 --checkpoint "../checkpoint_SPMM.ckpt" --device "cuda"

seed: 168 True
Creating model
LOADING PRETRAINED MODEL..
load checkpoint from ../checkpoint_SPMM.ckpt
_IncompatibleKeys(missing_keys=['property_encoder.embeddings.word_embeddings.weight', 'property_encoder_m.embeddings.word_embeddings.weight'], unexpected_keys=['temp'])
PV-to-SMILES generation in stochastic manner with k=2...
mean of controlled properties' normalized RMSE: 0.8727170825004578
validity: 0.67
uniqueness: 1.0
Generated molecules are saved in 'generated_molecules.txt'



  0%|          | 0/100 [00:00<?, ?it/s]
  1%|          | 1/100 [00:05<09:48,  5.95s/it]
  2%|▏         | 2/100 [00:13<11:35,  7.10s/it]
  3%|▎         | 3/100 [00:20<11:25,  7.07s/it]
  4%|▍         | 4/100 [00:28<11:56,  7.47s/it]
  5%|▌         | 5/100 [00:37<12:34,  7.94s/it]
  6%|▌         | 6/100 [00:43<11:31,  7.36s/it]
  7%|▋         | 7/100 [00:53<12:18,  7.94s/it]
  8%|▊         | 8/100 [00:59<11:33,  7.53s/it]
  9%|▉         | 9/100 [01:04<10:12,  6.74s/it]
 10%|█         | 10/100 [01:10<09:34,  6.39s/it]
 11%|█         | 11/100 [01:15<08:44,  5.89s/it]
 12%|█▏        | 12/100 [01:21<08:47,  5.99s/it]
 13%|█▎        | 13/100 [01:25<08:00,  5.53s/it]
 14%|█▍        | 14/100 [01:32<08:23,  5.85s/it]
 15%|█▌        | 15/100 [01:38<08:30,  6.00s/it]
 16%|█▌        | 16/100 [01:44<08:16,  5.91s/it]
 17%|█▋        | 17/100 [01:53<09:22,  6.78s/it]
 18%|█▊        | 18/100 [02:00<09:20,  6.83s/it]
 19%|█▉        | 19/100 [02:05<08:34,  6.36s/it]
 20%|██        | 20/100 [02:11<08:11,

In [72]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

# Silence every RDKit log channel under 'rdApp'.
RDLogger.DisableLog('rdApp.*')

# base is the SMILES file read by this cell; output receives the SMILES that pass validation.
base = 'SPMM/generated_molecules_from_pv.txt'
output = 'SPMM/valid_generated_molecules_from_pv.txt'

def validate_smiles(smiles, error_log_file=None):
    """Run a staged series of filters on one SMILES string.

    The checks run in a fixed order and stop at the first failure, so every
    flag after the failing stage stays ``False``:

    1. ``valid_syntax`` - RDKit parses the string with sanitization off.
    2. ``valid_molecule`` - the parsed molecule sanitizes.
    3. ``neutral`` - no atom has a formal charge or radical electrons.
    4. ``allowed_atoms`` - every atom symbol is one of C, H, O, N, P, S.
    5. ``molecular_weight`` - ``Descriptors.MolWt`` is at most 1000.
    6. ``solubility`` - ``Descriptors.MolLogP`` is greater than 1.
    7. ``is_phenol_or_aromatic_amine`` - the molecule matches the phenol or
       the aromatic-amine SMARTS pattern.

    Args:
        smiles: SMILES string to validate.
        error_log_file: Optional path. When given, exceptions raised during
            validation are appended to it as UTF-8 text.

    Returns:
        dict: The per-stage boolean flags listed above, ``all_valid`` (``True``
        only when every stage passed) and ``error`` (message describing the
        first failed stage, or ``None``).
    """
    results = {
        'valid_syntax': False,
        'valid_molecule': False,
        'neutral': False,
        'allowed_atoms': False,
        'molecular_weight': False,
        'solubility': False,
        'is_phenol_or_aromatic_amine': False,
        'all_valid': False,
        'error': None
    }

    def log_exception(exc):
        # Append the failing SMILES and the exception text to the optional log.
        # UTF-8 is explicit because the message is Cyrillic and the platform
        # default encoding may be unable to represent it.
        if error_log_file:
            with open(error_log_file, 'a', encoding='utf-8') as log:
                log.write(f"SMILES: {smiles}\nОшибка: {str(exc)}\n\n")

    try:
        # Parsing without sanitization separates syntax errors from chemistry errors.
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            results['error'] = "Не удалось разобрать SMILES-строку (синтаксическая ошибка)"
            return results

        results['valid_syntax'] = True

        try:
            # SanitizeMol raises on chemically invalid input; that lands in the
            # inner except below.
            Chem.SanitizeMol(mol)
            sanitized_mol = Chem.MolFromSmiles(smiles)

            if sanitized_mol is None:
                results['error'] = "Не удалось санитизировать молекулу"
                return results

            results['valid_molecule'] = True

            # Per-atom check: any charged or radical atom rejects the molecule.
            for atom in sanitized_mol.GetAtoms():
                if atom.GetFormalCharge() != 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет заряд {atom.GetFormalCharge()}"
                    return results
                if atom.GetNumRadicalElectrons() > 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет неспаренные электроны"
                    return results

            results['neutral'] = True

            allowed_atoms = {'C', 'H', 'O', 'N', 'P', 'S'}
            for atom in sanitized_mol.GetAtoms():
                if atom.GetSymbol() not in allowed_atoms:
                    results['error'] = f"Недопустимый атом: {atom.GetSymbol()}"
                    return results

            results['allowed_atoms'] = True

            mw = Descriptors.MolWt(sanitized_mol)
            if mw > 1000:
                results['error'] = f"Молекулярная масса превышает 1000 г/моль: {mw:.2f}"
                return results

            results['molecular_weight'] = True

            logp = Descriptors.MolLogP(sanitized_mol)
            if logp <= 1:
                results['error'] = f"LogP меньше или равен 1: {logp:.2f}"
                return results

            results['solubility'] = True

            # Aromatic carbon single-bonded to an aliphatic N carrying 2, 1 or 0 H,
            # excluding N attached to C=O and N with a double or triple bond.
            aromatic_amine_pattern = Chem.MolFromSmarts('c-[NH2,NH1,NH0;!$(NC=O);!$(N=*);!$(N#*)]')
            # Aromatic carbon single-bonded to an O carrying one H.
            phenol_pattern = Chem.MolFromSmarts('c-[OH]')

            # mol was sanitized in place by SanitizeMol above, so it is safe to query.
            has_aromatic_amine = mol.HasSubstructMatch(aromatic_amine_pattern)
            has_phenol = mol.HasSubstructMatch(phenol_pattern)

            if not (has_phenol or has_aromatic_amine):
                results['error'] = "Молекула не является фенолом или ароматическим амином"
                return results

            results['is_phenol_or_aromatic_amine'] = True
            results['all_valid'] = True
            return results

        except Exception as e:
            log_exception(e)
            results['error'] = f"Ошибка санитизации: {str(e)}"
            return results

    except Exception as e:
        log_exception(e)
        results['error'] = f"Неожиданная ошибка: {str(e)}"
        return results

# Read the candidate SMILES, stripping whitespace and dropping blank lines.
with open(base, 'r') as f:
    smiles_list = [entry for entry in (raw.strip() for raw in f.readlines()) if entry]

print(f"Найдено {len(smiles_list)} строк в файле")

# One result dict per candidate, in file order; keep the SMILES that pass every stage.
validation_results = [validate_smiles(candidate) for candidate in smiles_list]
valid_smiles = [
    candidate
    for candidate, verdict in zip(smiles_list, validation_results)
    if verdict['all_valid']
]

# Per-stage pass counts: each flag is a bool, so summing counts the True values.
total = len(smiles_list)
total_valid_syntax = sum(verdict['valid_syntax'] for verdict in validation_results)
total_valid_molecule = sum(verdict['valid_molecule'] for verdict in validation_results)
total_neutral = sum(verdict['neutral'] for verdict in validation_results)
total_allowed_atoms = sum(verdict['allowed_atoms'] for verdict in validation_results)
total_mw_valid = sum(verdict['molecular_weight'] for verdict in validation_results)
total_soluble = sum(verdict['solubility'] for verdict in validation_results)
total_is_phenol_or_amine = sum(verdict['is_phenol_or_aromatic_amine'] for verdict in validation_results)
total_all_valid = sum(verdict['all_valid'] for verdict in validation_results)

print(f"Отчёт по валидации для файла {base}:")
print(f"  Синтаксически валидные SMILES: {total_valid_syntax} из {total}")
print(f"  Химически валидные молекулы: {total_valid_molecule} из {total}")
print(f"  Нейтральные молекулы: {total_neutral} из {total}")
print(f"  С допустимыми атомами: {total_allowed_atoms} из {total}")
print(f"  Молекулярная масса ≤ 1000 г/моль: {total_mw_valid} из {total}")
print(f"  Растворимые в гексане (logP > 1): {total_soluble} из {total}")
print(f"  Является фенолом или ароматическим амином: {total_is_phenol_or_amine} из {total}")
print(f"  Прошли ВСЕ проверки: {total_all_valid} из {total}")

# The output file is opened in 'w' mode, so each run replaces its previous contents.
with open(output, 'w') as f:
    f.writelines(f"{smiles}\n" for smiles in valid_smiles)

print(f"  Сохранено {len(valid_smiles)} валидных молекул в файл {output}")

Найдено 100 строк в файле
Отчёт по валидации для файла SPMM/generated_molecules_from_pv.txt:
  Синтаксически валидные SMILES: 87 из 100
  Химически валидные молекулы: 67 из 100
  Нейтральные молекулы: 63 из 100
  С допустимыми атомами: 61 из 100
  Молекулярная масса ≤ 1000 г/моль: 59 из 100
  Растворимые в гексане (logP > 1): 59 из 100
  Является фенолом или ароматическим амином: 53 из 100
  Прошли ВСЕ проверки: 53 из 100
  Сохранено 53 валидных молекул в файл SPMM/valid_generated_molecules_from_pv.txt


In [73]:
# Single seed SMILES whose descriptors are computed and exported in the following cells.
# NOTE(review): presumably picked from the previous round's validated output; confirm.
new_found = ['Cc1cc(NC2CCCC2)ccc1-c1cc2nc(-c3ccc(-c4nc5ccc(-c6cc(NCC(O)CNCCc7ccc(-c8ccccc8)cc7)nc(NCc7ccccc7)n6)cc5[nH]4)cc3)[nH]c2cc1C']

In [74]:
# Rebinds properties to the descriptor lists of the SMILES in new_found.
properties = [calculate_property(smiles) for smiles in new_found]

In [75]:
# Export the first descriptor list as (property, input_value) rows.
seed_table = pd.DataFrame({'property': names, 'input_value': properties[0]})
seed_table.to_csv(f'data/p2s_input_NEW2.csv', index=False)

In [76]:
# Reload the descriptor table that was just written.
p2s_new = pd.read_csv('data/p2s_input_NEW2.csv')

In [77]:
# Run the SPMM generation script from inside SPMM/ with --n_generate 100 on CUDA.
# NOTE(review): the property-vector CSV is not passed on the command line; presumably
# pv2smiles_orig.py reads a hard-coded input path. Confirm it points at the intended file.
!cd SPMM && python "pv2smiles_orig.py" --n_generate 100 --checkpoint "../checkpoint_SPMM.ckpt" --device "cuda"

seed: 133 True
Creating model
LOADING PRETRAINED MODEL..
load checkpoint from ../checkpoint_SPMM.ckpt
_IncompatibleKeys(missing_keys=['property_encoder.embeddings.word_embeddings.weight', 'property_encoder_m.embeddings.word_embeddings.weight'], unexpected_keys=['temp'])
PV-to-SMILES generation in stochastic manner with k=2...
mean of controlled properties' normalized RMSE: 1.326729655265808
validity: 0.47
uniqueness: 1.0
Generated molecules are saved in 'generated_molecules.txt'



  0%|          | 0/100 [00:00<?, ?it/s]
  1%|          | 1/100 [00:15<25:17, 15.33s/it]
  2%|▏         | 2/100 [00:25<19:46, 12.10s/it]
  3%|▎         | 3/100 [00:37<19:41, 12.19s/it]
  4%|▍         | 4/100 [00:44<16:30, 10.31s/it]
  5%|▌         | 5/100 [00:51<14:04,  8.89s/it]
  6%|▌         | 6/100 [00:58<12:51,  8.21s/it]
  7%|▋         | 7/100 [01:04<11:54,  7.68s/it]
  8%|▊         | 8/100 [01:18<14:47,  9.65s/it]
  9%|▉         | 9/100 [01:25<13:23,  8.83s/it]
 10%|█         | 10/100 [01:38<14:58,  9.99s/it]
 11%|█         | 11/100 [01:47<14:17,  9.64s/it]
 12%|█▏        | 12/100 [01:56<13:59,  9.54s/it]
 13%|█▎        | 13/100 [02:05<13:28,  9.30s/it]
 14%|█▍        | 14/100 [02:12<12:18,  8.59s/it]
 15%|█▌        | 15/100 [02:25<14:22, 10.15s/it]
 16%|█▌        | 16/100 [02:32<12:37,  9.02s/it]
 17%|█▋        | 17/100 [02:43<13:16,  9.60s/it]
 18%|█▊        | 18/100 [02:54<13:44, 10.05s/it]
 19%|█▉        | 19/100 [03:04<13:30, 10.01s/it]
 20%|██        | 20/100 [03:18<15:07,

In [78]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

# Silence every RDKit log channel under 'rdApp'.
RDLogger.DisableLog('rdApp.*')

# base is the SMILES file read by this cell; output receives the SMILES that pass validation.
base = 'SPMM/generated_molecules_from_pv.txt'
output = 'SPMM/valid_generated_molecules_from_pv.txt'

def validate_smiles(smiles, error_log_file=None):
    """Run a staged series of filters on one SMILES string.

    The checks run in a fixed order and stop at the first failure, so every
    flag after the failing stage stays ``False``:

    1. ``valid_syntax`` - RDKit parses the string with sanitization off.
    2. ``valid_molecule`` - the parsed molecule sanitizes.
    3. ``neutral`` - no atom has a formal charge or radical electrons.
    4. ``allowed_atoms`` - every atom symbol is one of C, H, O, N, P, S.
    5. ``molecular_weight`` - ``Descriptors.MolWt`` is at most 1000.
    6. ``solubility`` - ``Descriptors.MolLogP`` is greater than 1.
    7. ``is_phenol_or_aromatic_amine`` - the molecule matches the phenol or
       the aromatic-amine SMARTS pattern.

    Args:
        smiles: SMILES string to validate.
        error_log_file: Optional path. When given, exceptions raised during
            validation are appended to it as UTF-8 text.

    Returns:
        dict: The per-stage boolean flags listed above, ``all_valid`` (``True``
        only when every stage passed) and ``error`` (message describing the
        first failed stage, or ``None``).
    """
    results = {
        'valid_syntax': False,
        'valid_molecule': False,
        'neutral': False,
        'allowed_atoms': False,
        'molecular_weight': False,
        'solubility': False,
        'is_phenol_or_aromatic_amine': False,
        'all_valid': False,
        'error': None
    }

    def log_exception(exc):
        # Append the failing SMILES and the exception text to the optional log.
        # UTF-8 is explicit because the message is Cyrillic and the platform
        # default encoding may be unable to represent it.
        if error_log_file:
            with open(error_log_file, 'a', encoding='utf-8') as log:
                log.write(f"SMILES: {smiles}\nОшибка: {str(exc)}\n\n")

    try:
        # Parsing without sanitization separates syntax errors from chemistry errors.
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            results['error'] = "Не удалось разобрать SMILES-строку (синтаксическая ошибка)"
            return results

        results['valid_syntax'] = True

        try:
            # SanitizeMol raises on chemically invalid input; that lands in the
            # inner except below.
            Chem.SanitizeMol(mol)
            sanitized_mol = Chem.MolFromSmiles(smiles)

            if sanitized_mol is None:
                results['error'] = "Не удалось санитизировать молекулу"
                return results

            results['valid_molecule'] = True

            # Per-atom check: any charged or radical atom rejects the molecule.
            for atom in sanitized_mol.GetAtoms():
                if atom.GetFormalCharge() != 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет заряд {atom.GetFormalCharge()}"
                    return results
                if atom.GetNumRadicalElectrons() > 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет неспаренные электроны"
                    return results

            results['neutral'] = True

            allowed_atoms = {'C', 'H', 'O', 'N', 'P', 'S'}
            for atom in sanitized_mol.GetAtoms():
                if atom.GetSymbol() not in allowed_atoms:
                    results['error'] = f"Недопустимый атом: {atom.GetSymbol()}"
                    return results

            results['allowed_atoms'] = True

            mw = Descriptors.MolWt(sanitized_mol)
            if mw > 1000:
                results['error'] = f"Молекулярная масса превышает 1000 г/моль: {mw:.2f}"
                return results

            results['molecular_weight'] = True

            logp = Descriptors.MolLogP(sanitized_mol)
            if logp <= 1:
                results['error'] = f"LogP меньше или равен 1: {logp:.2f}"
                return results

            results['solubility'] = True

            # Aromatic carbon single-bonded to an aliphatic N carrying 2, 1 or 0 H,
            # excluding N attached to C=O and N with a double or triple bond.
            aromatic_amine_pattern = Chem.MolFromSmarts('c-[NH2,NH1,NH0;!$(NC=O);!$(N=*);!$(N#*)]')
            # Aromatic carbon single-bonded to an O carrying one H.
            phenol_pattern = Chem.MolFromSmarts('c-[OH]')

            # mol was sanitized in place by SanitizeMol above, so it is safe to query.
            has_aromatic_amine = mol.HasSubstructMatch(aromatic_amine_pattern)
            has_phenol = mol.HasSubstructMatch(phenol_pattern)

            if not (has_phenol or has_aromatic_amine):
                results['error'] = "Молекула не является фенолом или ароматическим амином"
                return results

            results['is_phenol_or_aromatic_amine'] = True
            results['all_valid'] = True
            return results

        except Exception as e:
            log_exception(e)
            results['error'] = f"Ошибка санитизации: {str(e)}"
            return results

    except Exception as e:
        log_exception(e)
        results['error'] = f"Неожиданная ошибка: {str(e)}"
        return results

# Read the candidate SMILES, stripping whitespace and dropping blank lines.
with open(base, 'r') as f:
    smiles_list = [entry for entry in (raw.strip() for raw in f.readlines()) if entry]

print(f"Найдено {len(smiles_list)} строк в файле")

# One result dict per candidate, in file order; keep the SMILES that pass every stage.
validation_results = [validate_smiles(candidate) for candidate in smiles_list]
valid_smiles = [
    candidate
    for candidate, verdict in zip(smiles_list, validation_results)
    if verdict['all_valid']
]

# Per-stage pass counts: each flag is a bool, so summing counts the True values.
total = len(smiles_list)
total_valid_syntax = sum(verdict['valid_syntax'] for verdict in validation_results)
total_valid_molecule = sum(verdict['valid_molecule'] for verdict in validation_results)
total_neutral = sum(verdict['neutral'] for verdict in validation_results)
total_allowed_atoms = sum(verdict['allowed_atoms'] for verdict in validation_results)
total_mw_valid = sum(verdict['molecular_weight'] for verdict in validation_results)
total_soluble = sum(verdict['solubility'] for verdict in validation_results)
total_is_phenol_or_amine = sum(verdict['is_phenol_or_aromatic_amine'] for verdict in validation_results)
total_all_valid = sum(verdict['all_valid'] for verdict in validation_results)

print(f"Отчёт по валидации для файла {base}:")
print(f"  Синтаксически валидные SMILES: {total_valid_syntax} из {total}")
print(f"  Химически валидные молекулы: {total_valid_molecule} из {total}")
print(f"  Нейтральные молекулы: {total_neutral} из {total}")
print(f"  С допустимыми атомами: {total_allowed_atoms} из {total}")
print(f"  Молекулярная масса ≤ 1000 г/моль: {total_mw_valid} из {total}")
print(f"  Растворимые в гексане (logP > 1): {total_soluble} из {total}")
print(f"  Является фенолом или ароматическим амином: {total_is_phenol_or_amine} из {total}")
print(f"  Прошли ВСЕ проверки: {total_all_valid} из {total}")

# The output file is opened in 'w' mode, so each run replaces its previous contents.
with open(output, 'w') as f:
    f.writelines(f"{smiles}\n" for smiles in valid_smiles)

print(f"  Сохранено {len(valid_smiles)} валидных молекул в файл {output}")

Найдено 100 строк в файле
Отчёт по валидации для файла SPMM/generated_molecules_from_pv.txt:
  Синтаксически валидные SMILES: 71 из 100
  Химически валидные молекулы: 47 из 100
  Нейтральные молекулы: 38 из 100
  С допустимыми атомами: 38 из 100
  Молекулярная масса ≤ 1000 г/моль: 35 из 100
  Растворимые в гексане (logP > 1): 35 из 100
  Является фенолом или ароматическим амином: 32 из 100
  Прошли ВСЕ проверки: 32 из 100
  Сохранено 32 валидных молекул в файл SPMM/valid_generated_molecules_from_pv.txt


In [79]:
# Single seed SMILES whose descriptors are computed and exported in the following cells.
# NOTE(review): presumably picked from the previous round's validated output; confirm.
new_found = ['CC(C)(CCCNc1ncc(-c2ccc(-c3nc(-c4ccc(-c5ccc(-c6ccc7c(c6)[nH]c6ccccc67)cc5)cc4)nc(-c4ccc5c(c4)oc4ccc(-c6ccccc6)cc45)n3)cc2)cn1)NCC(O)CO']

In [80]:
# Rebinds properties to the descriptor lists of the SMILES in new_found.
properties = [calculate_property(smiles) for smiles in new_found]

In [81]:
# Export the first descriptor list as (property, input_value) rows.
seed_table = pd.DataFrame({'property': names, 'input_value': properties[0]})
seed_table.to_csv(f'data/p2s_input_NEW3.csv', index=False)

In [82]:
# Reload the descriptor table that was just written.
p2s_new = pd.read_csv('data/p2s_input_NEW3.csv')

In [88]:
# Run the SPMM generation script from inside SPMM/ with --n_generate 100 on CUDA.
# NOTE(review): the property-vector CSV is not passed on the command line; presumably
# pv2smiles_orig.py reads a hard-coded input path. Confirm it points at the intended file.
!cd SPMM && python "pv2smiles_orig.py" --n_generate 100 --checkpoint "../checkpoint_SPMM.ckpt" --device "cuda"

seed: 82 True
Creating model
LOADING PRETRAINED MODEL..
load checkpoint from ../checkpoint_SPMM.ckpt
_IncompatibleKeys(missing_keys=['property_encoder.embeddings.word_embeddings.weight', 'property_encoder_m.embeddings.word_embeddings.weight'], unexpected_keys=['temp'])
PV-to-SMILES generation in stochastic manner with k=2...
mean of controlled properties' normalized RMSE: 3.940887928009033
validity: 0.3
uniqueness: 0.8333333333333334
Generated molecules are saved in 'generated_molecules.txt'



  0%|          | 0/100 [00:00<?, ?it/s]
  1%|          | 1/100 [00:09<15:56,  9.66s/it]
  2%|▏         | 2/100 [00:17<14:07,  8.65s/it]
  3%|▎         | 3/100 [00:24<12:46,  7.90s/it]
  4%|▍         | 4/100 [00:36<15:20,  9.59s/it]
  5%|▌         | 5/100 [00:50<17:45, 11.22s/it]
  6%|▌         | 6/100 [01:05<19:22, 12.36s/it]
  7%|▋         | 7/100 [01:33<27:08, 17.51s/it]
  8%|▊         | 8/100 [01:44<23:29, 15.32s/it]
  9%|▉         | 9/100 [01:53<20:22, 13.43s/it]
 10%|█         | 10/100 [02:02<17:58, 11.98s/it]
 11%|█         | 11/100 [02:12<17:07, 11.55s/it]
 12%|█▏        | 12/100 [02:23<16:45, 11.43s/it]
 13%|█▎        | 13/100 [02:47<21:56, 15.13s/it]
 14%|█▍        | 14/100 [02:58<19:48, 13.82s/it]
 15%|█▌        | 15/100 [03:10<18:46, 13.26s/it]
 16%|█▌        | 16/100 [03:33<22:31, 16.10s/it]
 17%|█▋        | 17/100 [03:43<20:04, 14.52s/it]
 18%|█▊        | 18/100 [03:56<19:07, 14.00s/it]
 19%|█▉        | 19/100 [04:08<17:58, 13.32s/it]
 20%|██        | 20/100 [04:15<15:20,

In [89]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

# Silence every RDKit log channel under 'rdApp'.
RDLogger.DisableLog('rdApp.*')

# base is the SMILES file read by this cell; output receives the SMILES that pass validation.
base = 'SPMM/generated_molecules_from_pv.txt'
output = 'SPMM/valid_generated_molecules_from_pv.txt'

def validate_smiles(smiles, error_log_file=None):
    """Run a staged series of filters on one SMILES string.

    The checks run in a fixed order and stop at the first failure, so every
    flag after the failing stage stays ``False``:

    1. ``valid_syntax`` - RDKit parses the string with sanitization off.
    2. ``valid_molecule`` - the parsed molecule sanitizes.
    3. ``neutral`` - no atom has a formal charge or radical electrons.
    4. ``allowed_atoms`` - every atom symbol is one of C, H, O, N, P, S.
    5. ``molecular_weight`` - ``Descriptors.MolWt`` is at most 1000.
    6. ``solubility`` - ``Descriptors.MolLogP`` is greater than 1.
    7. ``is_phenol_or_aromatic_amine`` - the molecule matches the phenol or
       the aromatic-amine SMARTS pattern.

    Args:
        smiles: SMILES string to validate.
        error_log_file: Optional path. When given, exceptions raised during
            validation are appended to it as UTF-8 text.

    Returns:
        dict: The per-stage boolean flags listed above, ``all_valid`` (``True``
        only when every stage passed) and ``error`` (message describing the
        first failed stage, or ``None``).
    """
    results = {
        'valid_syntax': False,
        'valid_molecule': False,
        'neutral': False,
        'allowed_atoms': False,
        'molecular_weight': False,
        'solubility': False,
        'is_phenol_or_aromatic_amine': False,
        'all_valid': False,
        'error': None
    }

    def log_exception(exc):
        # Append the failing SMILES and the exception text to the optional log.
        # UTF-8 is explicit because the message is Cyrillic and the platform
        # default encoding may be unable to represent it.
        if error_log_file:
            with open(error_log_file, 'a', encoding='utf-8') as log:
                log.write(f"SMILES: {smiles}\nОшибка: {str(exc)}\n\n")

    try:
        # Parsing without sanitization separates syntax errors from chemistry errors.
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            results['error'] = "Не удалось разобрать SMILES-строку (синтаксическая ошибка)"
            return results

        results['valid_syntax'] = True

        try:
            # SanitizeMol raises on chemically invalid input; that lands in the
            # inner except below.
            Chem.SanitizeMol(mol)
            sanitized_mol = Chem.MolFromSmiles(smiles)

            if sanitized_mol is None:
                results['error'] = "Не удалось санитизировать молекулу"
                return results

            results['valid_molecule'] = True

            # Per-atom check: any charged or radical atom rejects the molecule.
            for atom in sanitized_mol.GetAtoms():
                if atom.GetFormalCharge() != 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет заряд {atom.GetFormalCharge()}"
                    return results
                if atom.GetNumRadicalElectrons() > 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет неспаренные электроны"
                    return results

            results['neutral'] = True

            allowed_atoms = {'C', 'H', 'O', 'N', 'P', 'S'}
            for atom in sanitized_mol.GetAtoms():
                if atom.GetSymbol() not in allowed_atoms:
                    results['error'] = f"Недопустимый атом: {atom.GetSymbol()}"
                    return results

            results['allowed_atoms'] = True

            mw = Descriptors.MolWt(sanitized_mol)
            if mw > 1000:
                results['error'] = f"Молекулярная масса превышает 1000 г/моль: {mw:.2f}"
                return results

            results['molecular_weight'] = True

            logp = Descriptors.MolLogP(sanitized_mol)
            if logp <= 1:
                results['error'] = f"LogP меньше или равен 1: {logp:.2f}"
                return results

            results['solubility'] = True

            # Aromatic carbon single-bonded to an aliphatic N carrying 2, 1 or 0 H,
            # excluding N attached to C=O and N with a double or triple bond.
            aromatic_amine_pattern = Chem.MolFromSmarts('c-[NH2,NH1,NH0;!$(NC=O);!$(N=*);!$(N#*)]')
            # Aromatic carbon single-bonded to an O carrying one H.
            phenol_pattern = Chem.MolFromSmarts('c-[OH]')

            # mol was sanitized in place by SanitizeMol above, so it is safe to query.
            has_aromatic_amine = mol.HasSubstructMatch(aromatic_amine_pattern)
            has_phenol = mol.HasSubstructMatch(phenol_pattern)

            if not (has_phenol or has_aromatic_amine):
                results['error'] = "Молекула не является фенолом или ароматическим амином"
                return results

            results['is_phenol_or_aromatic_amine'] = True
            results['all_valid'] = True
            return results

        except Exception as e:
            log_exception(e)
            results['error'] = f"Ошибка санитизации: {str(e)}"
            return results

    except Exception as e:
        log_exception(e)
        results['error'] = f"Неожиданная ошибка: {str(e)}"
        return results

# Read the candidate SMILES, stripping whitespace and dropping blank lines.
with open(base, 'r') as f:
    smiles_list = [entry for entry in (raw.strip() for raw in f.readlines()) if entry]

print(f"Найдено {len(smiles_list)} строк в файле")

# One result dict per candidate, in file order; keep the SMILES that pass every stage.
validation_results = [validate_smiles(candidate) for candidate in smiles_list]
valid_smiles = [
    candidate
    for candidate, verdict in zip(smiles_list, validation_results)
    if verdict['all_valid']
]

# Per-stage pass counts: each flag is a bool, so summing counts the True values.
total = len(smiles_list)
total_valid_syntax = sum(verdict['valid_syntax'] for verdict in validation_results)
total_valid_molecule = sum(verdict['valid_molecule'] for verdict in validation_results)
total_neutral = sum(verdict['neutral'] for verdict in validation_results)
total_allowed_atoms = sum(verdict['allowed_atoms'] for verdict in validation_results)
total_mw_valid = sum(verdict['molecular_weight'] for verdict in validation_results)
total_soluble = sum(verdict['solubility'] for verdict in validation_results)
total_is_phenol_or_amine = sum(verdict['is_phenol_or_aromatic_amine'] for verdict in validation_results)
total_all_valid = sum(verdict['all_valid'] for verdict in validation_results)

print(f"Отчёт по валидации для файла {base}:")
print(f"  Синтаксически валидные SMILES: {total_valid_syntax} из {total}")
print(f"  Химически валидные молекулы: {total_valid_molecule} из {total}")
print(f"  Нейтральные молекулы: {total_neutral} из {total}")
print(f"  С допустимыми атомами: {total_allowed_atoms} из {total}")
print(f"  Молекулярная масса ≤ 1000 г/моль: {total_mw_valid} из {total}")
print(f"  Растворимые в гексане (logP > 1): {total_soluble} из {total}")
print(f"  Является фенолом или ароматическим амином: {total_is_phenol_or_amine} из {total}")
print(f"  Прошли ВСЕ проверки: {total_all_valid} из {total}")

# The output file is opened in 'w' mode, so each run replaces its previous contents.
with open(output, 'w') as f:
    f.writelines(f"{smiles}\n" for smiles in valid_smiles)

print(f"  Сохранено {len(valid_smiles)} валидных молекул в файл {output}")

Найдено 100 строк в файле
Отчёт по валидации для файла SPMM/generated_molecules_from_pv.txt:
  Синтаксически валидные SMILES: 55 из 100
  Химически валидные молекулы: 30 из 100
  Нейтральные молекулы: 29 из 100
  С допустимыми атомами: 29 из 100
  Молекулярная масса ≤ 1000 г/моль: 26 из 100
  Растворимые в гексане (logP > 1): 20 из 100
  Является фенолом или ароматическим амином: 16 из 100
  Прошли ВСЕ проверки: 16 из 100
  Сохранено 16 валидных молекул в файл SPMM/valid_generated_molecules_from_pv.txt
