In [2]:
import torch.nn as nn

class SMILESLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x, _ = self.lstm(x)
        return self.fc(x)


In [3]:
all_chars = ['#',
 '(',
 ')',
 '+',
 '-',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '=',
 'A',
 'B',
 'C',
 'F',
 'H',
 'I',
 'N',
 'O',
 'P',
 'S',
 '[',
 ']',
 'c',
 'e',
 'i',
 'l',
 'n',
 'o',
 'r',
 's']

char2idx = {'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '1': 6,
 '2': 7,
 '3': 8,
 '4': 9,
 '5': 10,
 '6': 11,
 '7': 12,
 '=': 13,
 'A': 14,
 'B': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'S': 23,
 '[': 24,
 ']': 25,
 'c': 26,
 'e': 27,
 'i': 28,
 'l': 29,
 'n': 30,
 'o': 31,
 'r': 32,
 's': 33}

idx2char = {1: '#',
 2: '(',
 3: ')',
 4: '+',
 5: '-',
 6: '1',
 7: '2',
 8: '3',
 9: '4',
 10: '5',
 11: '6',
 12: '7',
 13: '=',
 14: 'A',
 15: 'B',
 16: 'C',
 17: 'F',
 18: 'H',
 19: 'I',
 20: 'N',
 21: 'O',
 22: 'P',
 23: 'S',
 24: '[',
 25: ']',
 26: 'c',
 27: 'e',
 28: 'i',
 29: 'l',
 30: 'n',
 31: 'o',
 32: 'r',
 33: 's'}

vocab_size = 34

In [4]:
import torch

model = SMILESLSTM(34)
model.load_state_dict(torch.load('smiles_lstm.pth', map_location=torch.device('cpu')))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

  model.load_state_dict(torch.load('smiles_lstm.pth', map_location=torch.device('cpu')))


In [5]:
from rdkit import Chem

def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None


import random

def generate_smiles(model, start_token='C', max_len=100, stop_chance=0.2, min_len=15):
    model.eval()
    input_seq = torch.tensor([[char2idx.get(start_token, 1)]], dtype=torch.long).to(device)
    generated = [start_token]

    for i in range(max_len):
        output = model(input_seq)
        prob = torch.softmax(output[0, -1], dim=0)
        next_idx = torch.multinomial(prob, 1).item()
        if next_idx == 0:
            break

        next_char = idx2char[next_idx]
        generated.append(next_char)

        if i >= min_len and random.random() < stop_chance:
            break

        input_seq = torch.tensor([[next_idx]], dtype=torch.long).to(device)

    return ''.join(generated)

print(generate_smiles(model, max_len=60, stop_chance=0.25))

def generate_n_valid_smiles(model, n=100, start_token='C', max_len=100, stop_chance=0.3, min_len=10, max_output_len=40, max_total_tries=25000):
    valid_smiles = set()
    tries = 0

    while len(valid_smiles) < n and tries < max_total_tries:
        smiles = generate_smiles(
            model,
            start_token=start_token,
            max_len=max_len,
            stop_chance=stop_chance,
            min_len=min_len
        )
        tries += 1
        if is_valid_smiles(smiles) and len(smiles) <= max_output_len:
            valid_smiles.add(smiles)

    return list(valid_smiles)



CN#-]c1FCS=[O=Cn2)c1)c1)c1)c


In [6]:
valid_molecules = generate_n_valid_smiles(model, n=100)
print(f"Сгенерировано валидных молекул: {len(valid_molecules)}")

[01:29:06] SMILES Parse Error: extra close parentheses while parsing: CCO=Cc1)c1)c1)c1)-]
[01:29:06] SMILES Parse Error: check for mistakes around position 8:
[01:29:06] CCO=Cc1)c1)c1)c1)-]
[01:29:06] ~~~~~~~^
[01:29:06] SMILES Parse Error: Failed parsing SMILES 'CCO=Cc1)c1)c1)c1)-]' for input: 'CCO=Cc1)c1)c1)c1)-]'
[01:29:06] SMILES Parse Error: extra close parentheses while parsing: CCl)c1)c1)c1
[01:29:06] SMILES Parse Error: check for mistakes around position 4:
[01:29:06] CCl)c1)c1)c1
[01:29:06] ~~~^
[01:29:06] SMILES Parse Error: Failed parsing SMILES 'CCl)c1)c1)c1' for input: 'CCl)c1)c1)c1'
[01:29:06] SMILES Parse Error: extra close parentheses while parsing: Cc1)c1)n1)c1)
[01:29:06] SMILES Parse Error: check for mistakes around position 4:
[01:29:06] Cc1)c1)n1)c1)
[01:29:06] ~~~^
[01:29:06] SMILES Parse Error: Failed parsing SMILES 'Cc1)c1)n1)c1)' for input: 'Cc1)c1)n1)c1)'
[01:29:06] Explicit valence for atom # 6 O, 3, is greater than permitted
[01:29:06] SMILES Parse Error: ex

Сгенерировано валидных молекул: 100


[01:30:48] SMILES Parse Error: extra close parentheses while parsing: CCCN#COCCCNc1)c1)Nc1)c
[01:30:48] SMILES Parse Error: check for mistakes around position 14:
[01:30:48] CCCN#COCCCNc1)c1)Nc1)c
[01:30:48] ~~~~~~~~~~~~~^
[01:30:48] SMILES Parse Error: Failed parsing SMILES 'CCCN#COCCCNc1)c1)Nc1)c' for input: 'CCCN#COCCCNc1)c1)Nc1)c'
[01:30:48] SMILES Parse Error: extra close parentheses while parsing: CNS=CCCCl)c1)c1)c1
[01:30:48] SMILES Parse Error: check for mistakes around position 10:
[01:30:48] CNS=CCCCl)c1)c1)c1
[01:30:48] ~~~~~~~~~^
[01:30:48] SMILES Parse Error: Failed parsing SMILES 'CNS=CCCCl)c1)c1)c1' for input: 'CNS=CCCCl)c1)c1)c1'
[01:30:48] Explicit valence for atom # 2 O, 3, is greater than permitted
[01:30:48] SMILES Parse Error: extra close parentheses while parsing: Cc1)c1)c1)c1)c1)c1)
[01:30:48] SMILES Parse Error: check for mistakes around position 4:
[01:30:48] Cc1)c1)c1)c1)c1)c1)
[01:30:48] ~~~^
[01:30:48] SMILES Parse Error: Failed parsing SMILES 'Cc1)c1)c1)c1)

In [7]:
for i, smi in enumerate(valid_molecules[:100]):
    print(f"{i+1}: {smi}")

1: CCC=S=CCS=CCN
2: CNCCCCCCCCCO
3: CCCS=S=CCCCC
4: CCCCCCCOCCOCC=CC
5: CCC=CCCNS=CN
6: CCCCS=S=NCCC
7: CCCCCCCCCCCl
8: CCN=S=CCOCCC=COC
9: CCCCC=S=S=NS
10: CS=NCCCCNCCOC
11: CSC=CCS=CCCCC
12: CCNCNCC=COCO
13: CS=CS=CCCCNN
14: CNCCCOCCS=CC
15: COCCNCCCCCCN
16: CCCOCCCS=CCC
17: CNCC=P(=CCl)
18: CCCCCNC=CCCN
19: COCCCC=CCCCCCS
20: CCS=CCCCCCCC
21: CCCCCOCCCOCN
22: CCCNS=S=COCCO
23: CCC=CC=S=S=S=C
24: CCOCCC=CCCNCCNC
25: COCCCOCS=S=CCCCCN
26: C=S=S=ICN=S=C
27: CCC#CCCCCS=CCC
28: CCS=S#CCCNCC
29: CCS=CCOCCCCO
30: CCCCCCCCCCCCC
31: CS=S=CCCCS=S
32: CCOCCCCCS=CC
33: CNCCCCCNCCCC
34: CCCCCCS=CC=C
35: CCCCCC=S=CCN
36: COCCCC#CNCCC
37: CCC=CNS=S=S=C
38: CCCCCCNS=COCCO
39: C=CCCCCNCCCCCO
40: CCNCCCCCCCCC
41: CNS=CCS=COCl
42: CNS=CCCCCCCCCCC
43: CCCOCCCNCOCN
44: CCCC=CCCCOCCC
45: CCS=CCCCCN=CC
46: CCCOCS=CCC=S=C
47: CCCCCCCCOCCCNC
48: CCS=COCS=S=C
49: CCCCCNCCCS=CCC
50: CNS=S=COCCOC
51: CCNCNC=CCCCC
52: CCCCS=CCCCCCCO
53: CNS=CCNCOCCC
54: CC#CCOCCC=CO
55: CS=CNCCCCCCNC
56: CNCCCC=CS=CC=P
57: CCOC

In [10]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_sdf(smiles_list, sdf_filename):
    writer = Chem.SDWriter(sdf_filename)

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            print(f"Warning: invalid SMILES skipped: {smi}")
            continue
        
        mol = Chem.AddHs(mol)

        AllChem.EmbedMolecule(mol, randomSeed=0xf00d)
        AllChem.UFFOptimizeMolecule(mol)

        mol.SetProp("_Name", f"Molecule_{i+1}")
        writer.write(mol)

    writer.close()
    print(f"SDF файл '{sdf_filename}' успешно сохранён, молекул: {writer.GetNumWrites()}")


In [12]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def passes_lipinski(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)

    return (
        mw <= 500 and
        logp <= 5 and
        h_donors <= 5 and
        h_acceptors <= 10
    )


In [14]:
filtered = [smi for smi in valid_molecules if passes_lipinski(smi)]
print(f"Прошли правило Липинского: {len(filtered)} из {len(valid_molecules)}")

Прошли правило Липинского: 99 из 100


In [11]:
smiles_to_sdf(valid_molecules, "output_molecules.sdf")



ValueError: Bad Conformer Id