# Analyze generated SMILES
生成したリガンドを分析するノートブック

In [3]:
import os
import pandas as pd
import numpy as np
import yaml
import matplotlib.pyplot as plt
from rdkit import Chem

In [6]:
def _read_yaml(yaml_path):
    """Parse one YAML file with the safe loader and return its contents."""
    with open(yaml_path) as stream:
        return yaml.safe_load(stream)


def _data_dir(key, *subdirs):
    """Join '..' with the `path_config['data'][key]` entry and any extra sub-directories."""
    return os.path.join('..', path_config['data'][key], *subdirs)


# Configuration files read from ../config.
path_config = _read_yaml('../config/filepath.yml')
model_config = _read_yaml('../config/model.yml')
data_config = _read_yaml('../config/data.yml')

# Locations taken from the `data` section of filepath.yml, each prefixed with '..'.
dude_dir = _data_dir('DUD-E')
alphafold_dir = _data_dir('alphafold')
smiles_dir = _data_dir('smiles')
output_dir = _data_dir('plots')
hist_dir = _data_dir('hist')
eval_dir = _data_dir('eval', 'decoder_only')
preprocessed_dir = _data_dir('preprocessed')
sample_dir = _data_dir('samples')
model_dir = _data_dir('docking')  # NOTE(review): model_dir is read from the 'docking' entry — confirm this is intended

# Sub-directory of the plots location named 'ds_regression'.
ds_reg_output_dir = os.path.join(output_dir, 'ds_regression')


In [10]:
def filter_valid_smiles(input_df, smiles_column="Generated_SMILES"):
    """
    Keep only the rows whose SMILES string can be parsed and report the valid fraction.

    A SMILES counts as valid when ``Chem.MolFromSmiles`` returns something other
    than ``None``. Cells that are not strings (for example ``NaN`` produced by
    empty CSV fields) are counted as invalid instead of being handed to the parser.

    Args:
        input_df (pd.DataFrame): Table containing the SMILES strings. It is
            left unmodified; no helper column is added to it.
        smiles_column (str): Name of the column holding the SMILES strings.

    Returns:
        tuple: ``(valid_ratio, valid_data)`` where ``valid_ratio`` is the
        fraction (0.0-1.0) of rows with a parseable SMILES (NaN when
        ``input_df`` has no rows) and ``valid_data`` is a new DataFrame with
        only those rows, keeping the original columns and index labels.

    Raises:
        ValueError: If ``smiles_column`` is not a column of ``input_df``.
    """
    # Fail early with a clear message when the expected column is missing.
    if smiles_column not in input_df.columns:
        raise ValueError(f"Column '{smiles_column}' not found in the input CSV.")

    def _parses(smiles):
        # Guard against non-string cells (NaN, numbers) before calling the parser.
        return isinstance(smiles, str) and Chem.MolFromSmiles(smiles) is not None

    # Build the validity mask as a standalone Series so the caller's frame is
    # never mutated; astype(bool) keeps the mask usable for an empty input too.
    is_valid = input_df[smiles_column].map(_parses).astype(bool)

    # Mean of a boolean mask is the fraction of True values.
    valid_ratio = is_valid.mean()

    # Copy so that later edits to the result never touch (or warn about) input_df.
    valid_data = input_df[is_valid].copy()

    return valid_ratio, valid_data

## DRD3 local model
drd3 のデータセットで学習し、ドッキングスコアの良いもの10個をシード化合物として生成した SMILES

In [8]:
# The timestamp is used as the sub-directory of eval_dir that holds this run's files.
drd3_model_timestamp = '2025-01-10_20-12-20'
drd3_best_docking_csv = os.path.join(eval_dir, drd3_model_timestamp, 'best_docking.csv')

# Read the run's best_docking.csv into a DataFrame.
drd3_best_docking = pd.read_csv(drd3_best_docking_csv)

  SEED_Ligand_ID                                   Generated_SMILES
0   ZINC20605418  CC[C@H](C)[NH+](C)CCNC(=O)c1ccc(-c2c(C)oc3c2CC...
1   ZINC20605418  C[NH+](CCOc1ccc(Br)cc1)CC[NH+]1CCC(O)(c2ccccc2...
2   ZINC20605418                               Nc1nccc2c1sc1ccccc12
3   ZINC20605418                     C[NH+](CCc1ccccc1)Cc1cc(Br)cs1
4   ZINC20605418  CC[C@@H](C)NC(=O)/C(C#N)=C1\S[C@@H]2N=NC(SCc3n...


In [12]:
# Validate the SMILES loaded into drd3_best_docking and summarise the result.
valid_ratio, valid_data = filter_valid_smiles(drd3_best_docking)

# The ratio is a fraction; it is scaled by 100 for display as a percentage.
print(
    f"Valid SMILES ratio: {valid_ratio*100:.2f}",
    f"Number of valid SMILES: {len(valid_data)}",
    sep="\n",
)

Valid SMILES ratio: 86.30
Number of valid SMILES: 863


[00:47:54] Explicit valence for atom # 28 N, 4, is greater than permitted
[00:47:54] SMILES Parse Error: extra close parentheses while parsing: O=S1(=O)[C@H]2CC[C@@H]3CC(CCC4=CCC=CCC4)O3)N=N[C@@H]21
[00:47:54] SMILES Parse Error: check for mistakes around position 43:
[00:47:54] 3CC(CCC4=CCC=CCC4)O3)N=N[C@@H]21
[00:47:54] ~~~~~~~~~~~~~~~~~~~~^
[00:47:54] SMILES Parse Error: Failed parsing SMILES 'O=S1(=O)[C@H]2CC[C@@H]3CC(CCC4=CCC=CCC4)O3)N=N[C@@H]21' for input: 'O=S1(=O)[C@H]2CC[C@@H]3CC(CCC4=CCC=CCC4)O3)N=N[C@@H]21'
[00:47:54] Explicit valence for atom # 7 N, 6, is greater than permitted
[00:47:54] SMILES Parse Error: unclosed ring for input: 'Cc1cc2c(N3CCO[C@@H]4C3)c(C(=O)Nc3ccc(F)cc3C(F)(F)F)ccc2o1'
[00:47:54] SMILES Parse Error: unclosed ring for input: 'C[C@H]1[C@H]2[C@@H](c3ccccc3)Nc3cc(C[NH2+]C[C@@H](C)O)ccc2N(CC(=O)N(C)C)C1=O'
[00:47:54] SMILES Parse Error: unclosed ring for input: 'Cc1ccc2c(C)c(C(=O)NC3CC4CCC(CC(=O)NC5CCCCC5)[C@H](C)C3)sc2n1'
[00:47:54] Explicit valence for a