In [20]:
import os

# Make sure the folder that will receive the generated molecules exists;
# exist_ok=True makes re-running this cell harmless.
generated_dir = "SPMM/generated_molecules_from_top"
os.makedirs(generated_dir, exist_ok=True)

In [2]:
# Read the property names file: one entry per line, surrounding whitespace removed.
# Line order is preserved because positions in property_list are looked up later.
with open('SPMM/property_name.txt', mode='r', encoding='utf-8') as file:
    lines = list(file)

property_list = list(map(str.strip, lines))

In [3]:
# Names of the properties selected for this run, in the order they will be looked up.
good_properties = """
    LogP
    NumHDonors
    NumAromaticRings
    TPSA
    MolWt
    NHOHCount
    FractionCSP3
    RingCount
    MaxEStateIndex
    MinEStateIndex
    HeavyAtomCount
""".split()

In [4]:
# Translate each selected property name into its position inside property_list.
# Names that are absent from property_list are still skipped (as before), but they
# are now reported so that a shorter-than-expected `indices` list is not silent.
missing_properties = [prop for prop in good_properties if prop not in property_list]
if missing_properties:
    print(f"Предупреждение: свойства не найдены в property_list и пропущены: {', '.join(missing_properties)}")

indices = [property_list.index(prop) for prop in good_properties if prop in property_list]

In [5]:
# Show the positions on one line, separated by single spaces.
print(' '.join(str(index) for index in indices))

42 40 51 32 33 18 50 27 29 20


In [7]:
import pandas as pd

# Load the exported table that the top-SMILES selection below is based on.
export_csv_path = 'data/2025-04-12T12-32_export.csv'
df = pd.read_csv(export_csv_path)

In [17]:
# Select the five highest-PDSC rows with distinct SMILES and save only the Smiles column.
# De-duplication happens on the 'Smiles' column and *before* head(5): truncating first
# (as the previous version did) could leave fewer than five seeds, and whole-row
# de-duplication did not remove repeated SMILES that differ in other columns.
# drop_duplicates keeps the first occurrence, i.e. the highest-PDSC row per SMILES.
top_smiles = (
    df.sort_values(by='PDSC', ascending=False)
    .drop_duplicates(subset='Smiles')
    .head(5)[['Smiles']]
)
top_smiles.to_csv('data/top_smiles_before_topgen.csv', index=False)

In [26]:
# Run d_pv2smiles_single.py from inside the SPMM directory on the saved top-SMILES CSV,
# writing results to SPMM/generated_molecules_from_top.
# NOTE(review): the --property_positions values are typed in by hand and match the
# indices printed earlier in this notebook; re-check them whenever good_properties
# or SPMM/property_name.txt changes.
!cd SPMM && python "d_pv2smiles_single.py" --csv_file "../data/top_smiles_before_topgen.csv" --output_dir "./generated_molecules_from_top" --n_generate 100 --property_positions 42 40 51 32 33 18 50 27 29 20 --k 2 --seed 42 --checkpoint "../checkpoint_SPMM.ckpt" --device "cuda"

^C


In [None]:
import os
import pandas as pd
import glob
import datetime
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

# Turn off RDKit log output for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

# Input folder with generated molecules, and the two output folders
# (validated molecules and validation reports).
base_directory = 'SPMM/generated_molecules_from_top'
output_dir = 'SPMM/valid_molecules_from_top'
reports_dir = 'SPMM/validation_reports_from_top'

# Create both output folders up front; exist_ok=True keeps re-runs harmless.
for _directory in (output_dir, reports_dir):
    os.makedirs(_directory, exist_ok=True)

def _log_validation_error(error_log_file, smiles, exc):
    """Append the failing SMILES and the exception text to ``error_log_file`` (no-op if falsy)."""
    if not error_log_file:
        return
    # The log entry contains Cyrillic text, so pin UTF-8 instead of relying on
    # the platform's default encoding.
    with open(error_log_file, 'a', encoding='utf-8') as log:
        log.write(f"SMILES: {smiles}\nОшибка: {str(exc)}\n\n")


def validate_smiles(smiles, error_log_file=None):
    """Run a fixed sequence of checks on one SMILES string.

    The checks run in this order and stop at the first failure, so every flag
    after the failing one stays ``False``:

    1. ``valid_syntax`` - the string parses with sanitization switched off.
    2. ``valid_molecule`` - the molecule sanitizes and parses with sanitization on.
    3. ``neutral`` - no atom has a non-zero formal charge or radical electrons.
    4. ``allowed_atoms`` - every atom symbol is one of C, H, O, N, P, S.
    5. ``molecular_weight`` - ``Descriptors.MolWt`` is at most 1000.
    6. ``solubility`` - ``Descriptors.MolLogP`` is greater than 1.
    7. ``is_phenol_or_aromatic_amine`` - the molecule matches the phenol SMARTS
       (``c-[OH]``) or the aromatic-amine SMARTS defined below.

    Args:
        smiles: SMILES string to validate.
        error_log_file: Optional path; when given, exceptions raised during
            validation are appended to this file (UTF-8) together with the SMILES.

    Returns:
        dict with one boolean per check listed above, ``all_valid`` (``True`` only
        when every check passed) and ``error`` (message describing the first
        failure, or ``None``). Exceptions are not propagated; they are reported
        through ``error``.
    """
    results = {
        'valid_syntax': False,
        'valid_molecule': False,
        'neutral': False,
        'allowed_atoms': False,
        'molecular_weight': False,
        'solubility': False,
        'is_phenol_or_aromatic_amine': False,
        'all_valid': False,
        'error': None
    }

    try:
        # First pass: syntax only, no chemistry checks.
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            results['error'] = "Не удалось разобрать SMILES-строку (синтаксическая ошибка)"
            return results

        results['valid_syntax'] = True

        try:
            # Raises on chemically invalid input; handled by the inner except below.
            Chem.SanitizeMol(mol)
            sanitized_mol = Chem.MolFromSmiles(smiles)

            if sanitized_mol is None:
                results['error'] = "Не удалось санитизировать молекулу"
                return results

            results['valid_molecule'] = True

            for atom in sanitized_mol.GetAtoms():
                if atom.GetFormalCharge() != 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет заряд {atom.GetFormalCharge()}"
                    return results
                if atom.GetNumRadicalElectrons() > 0:
                    results['error'] = f"Атом {atom.GetIdx()} имеет неспаренные электроны"
                    return results

            results['neutral'] = True

            allowed_atoms = {'C', 'H', 'O', 'N', 'P', 'S'}
            for atom in sanitized_mol.GetAtoms():
                if atom.GetSymbol() not in allowed_atoms:
                    results['error'] = f"Недопустимый атом: {atom.GetSymbol()}"
                    return results

            results['allowed_atoms'] = True

            mw = Descriptors.MolWt(sanitized_mol)
            if mw > 1000:
                results['error'] = f"Молекулярная масса превышает 1000 г/моль: {mw:.2f}"
                return results

            results['molecular_weight'] = True

            logp = Descriptors.MolLogP(sanitized_mol)
            if logp <= 1:
                results['error'] = f"LogP меньше или равен 1: {logp:.2f}"
                return results

            results['solubility'] = True

            # Aromatic carbon single-bonded to a nitrogen that is not attached to a
            # C=O carbon and has no double or triple bond of its own.
            aromatic_amine_pattern = Chem.MolFromSmarts('c-[NH2,NH1,NH0;!$(NC=O);!$(N=*);!$(N#*)]')
            # Aromatic carbon single-bonded to a hydroxyl oxygen.
            phenol_pattern = Chem.MolFromSmarts('c-[OH]')

            # Match on the same sanitized molecule that all other checks use.
            has_aromatic_amine = sanitized_mol.HasSubstructMatch(aromatic_amine_pattern)
            has_phenol = sanitized_mol.HasSubstructMatch(phenol_pattern)

            if not (has_phenol or has_aromatic_amine):
                results['error'] = "Молекула не является фенолом или ароматическим амином"
                return results

            results['is_phenol_or_aromatic_amine'] = True
            results['all_valid'] = True
            return results

        except Exception as e:
            # Anything raised after the syntax check is reported as a sanitization error.
            _log_validation_error(error_log_file, smiles, e)
            results['error'] = f"Ошибка санитизации: {str(e)}"
            return results

    except Exception as e:
        # Failures of the initial parse (or of the logging above) end up here.
        _log_validation_error(error_log_file, smiles, e)
        results['error'] = f"Неожиданная ошибка: {str(e)}"
        return results

def save_report(file_name, report_data, output_path):
    """Write a plain-text validation report for one input file.

    Args:
        file_name: Name of the validated file, shown in the report header.
        report_data: Mapping with the counters ``total``, ``valid_syntax``,
            ``valid_molecule``, ``neutral``, ``allowed_atoms``, ``mw_valid``,
            ``soluble``, ``is_phenol_or_amine``, ``all_valid`` and the
            ``output_file`` name mentioned in the last line.
        output_path: Path of the report to create (overwritten, UTF-8).

    Each counter is written as "<count> из <total> (<percent>%)". When
    ``total`` is 0 (empty input file) the percentage is reported as 0.00
    instead of raising ``ZeroDivisionError``.
    """
    total = report_data['total']

    def summary_line(label, key):
        count = report_data[key]
        # Guard against an empty input file, where total == 0.
        percent = (count / total) * 100 if total else 0.0
        return f"{label}: {count} из {total} ({percent:.2f}%)\n"

    # (report label, key in report_data), in the order they appear in the report.
    checks = [
        ("Синтаксически валидные SMILES", 'valid_syntax'),
        ("Химически валидные молекулы", 'valid_molecule'),
        ("Нейтральные молекулы", 'neutral'),
        ("С допустимыми атомами", 'allowed_atoms'),
        ("Молекулярная масса ≤ 1000 г/моль", 'mw_valid'),
        ("Растворимые в гексане (logP > 1)", 'soluble'),
        ("Является фенолом или ароматическим амином", 'is_phenol_or_amine'),
    ]

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Отчет по валидации для файла: {file_name}\n")
        f.write(f"Дата и время: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write(f"Всего строк в файле: {total}\n")
        for label, key in checks:
            f.write(summary_line(label, key))
        # The overall line is followed by a blank line before the closing sentence.
        f.write(summary_line("Прошли ВСЕ проверки", 'all_valid') + "\n")

        f.write(f"Сохранено {report_data['all_valid']} валидных молекул в файл {report_data['output_file']}\n")

# Gather the generated-molecule files. glob returns matches in arbitrary order,
# so sort them to make the processing order and the summary CSV reproducible.
file_pattern = os.path.join(base_directory, 'generated_molecules_*.txt')
files = sorted(glob.glob(file_pattern))

# One summary dict per processed file; these become the rows of validation_summary.csv.
report_data_list = []

if not files:
    print(f"Файлы не найдены в {base_directory}")
else:
    print(f"Найдено {len(files)} файлов для обработки")

    for file_path in files:
        file_name = os.path.basename(file_path)
        print(f"\nОбработка файла: {file_name}")

        valid_smiles = []
        validation_results = []

        # Every non-blank line is treated as one SMILES string.
        with open(file_path, 'r') as f:
            smiles_list = [line.strip() for line in f if line.strip()]

        print(f"Найдено {len(smiles_list)} строк в файле")

        for smiles in smiles_list:
            result = validate_smiles(smiles)
            validation_results.append(result)
            if result['all_valid']:
                valid_smiles.append(smiles)

        # Count how many molecules passed each individual check.
        total = len(smiles_list)
        total_valid_syntax = sum(1 for r in validation_results if r['valid_syntax'])
        total_valid_molecule = sum(1 for r in validation_results if r['valid_molecule'])
        total_neutral = sum(1 for r in validation_results if r['neutral'])
        total_allowed_atoms = sum(1 for r in validation_results if r['allowed_atoms'])
        total_mw_valid = sum(1 for r in validation_results if r['molecular_weight'])
        total_soluble = sum(1 for r in validation_results if r['solubility'])
        total_is_phenol_or_amine = sum(1 for r in validation_results if r['is_phenol_or_aromatic_amine'])
        total_all_valid = sum(1 for r in validation_results if r['all_valid'])

        print(f"Отчёт по валидации для файла {file_name}:")
        print(f"  Синтаксически валидные SMILES: {total_valid_syntax} из {total}")
        print(f"  Химически валидные молекулы: {total_valid_molecule} из {total}")
        print(f"  Нейтральные молекулы: {total_neutral} из {total}")
        print(f"  С допустимыми атомами: {total_allowed_atoms} из {total}")
        print(f"  Молекулярная масса ≤ 1000 г/моль: {total_mw_valid} из {total}")
        print(f"  Растворимые в гексане (logP > 1): {total_soluble} из {total}")
        print(f"  Является фенолом или ароматическим амином: {total_is_phenol_or_amine} из {total}")
        print(f"  Прошли ВСЕ проверки: {total_all_valid} из {total}")

        # Save the molecules that passed every check, one SMILES per line.
        output_file = os.path.join(output_dir, f"valid_{file_name}")
        with open(output_file, 'w') as f:
            for smiles in valid_smiles:
                f.write(f"{smiles}\n")

        print(f"  Сохранено {len(valid_smiles)} валидных молекул в файл {output_file}")

        # NOTE(review): file_name already ends in '.txt', so the report name ends in
        # '.txt.txt'. Left unchanged in case other tooling relies on this name.
        report_file = os.path.join(reports_dir, f"report_{file_name}.txt")
        report_data = {
            'filename': file_name,
            'total': total,
            'valid_syntax': total_valid_syntax,
            'valid_molecule': total_valid_molecule,
            'neutral': total_neutral,
            'allowed_atoms': total_allowed_atoms,
            'mw_valid': total_mw_valid,
            'soluble': total_soluble,
            'is_phenol_or_amine': total_is_phenol_or_amine,
            'all_valid': total_all_valid,
            'output_file': os.path.basename(output_file)
        }
        report_data_list.append(report_data)

        save_report(file_name, report_data, report_file)
        print(f"  Отчет сохранен в файл {report_file}")

    # Combine the per-file counters into one table, one row per processed file.
    summary_df = pd.DataFrame(report_data_list)
    summary_csv_path = os.path.join(reports_dir, 'validation_summary.csv')
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"\nСводная таблица сохранена в: {summary_csv_path}")