In [2]:
import torch.nn as nn

class SMILESLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x, _ = self.lstm(x)
        return self.fc(x)


In [3]:
all_chars = ['#',
 '(',
 ')',
 '+',
 '-',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '=',
 'A',
 'B',
 'C',
 'F',
 'H',
 'I',
 'N',
 'O',
 'P',
 'S',
 '[',
 ']',
 'c',
 'e',
 'i',
 'l',
 'n',
 'o',
 'r',
 's']

char2idx = {'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '1': 6,
 '2': 7,
 '3': 8,
 '4': 9,
 '5': 10,
 '6': 11,
 '7': 12,
 '=': 13,
 'A': 14,
 'B': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'S': 23,
 '[': 24,
 ']': 25,
 'c': 26,
 'e': 27,
 'i': 28,
 'l': 29,
 'n': 30,
 'o': 31,
 'r': 32,
 's': 33}

idx2char = {1: '#',
 2: '(',
 3: ')',
 4: '+',
 5: '-',
 6: '1',
 7: '2',
 8: '3',
 9: '4',
 10: '5',
 11: '6',
 12: '7',
 13: '=',
 14: 'A',
 15: 'B',
 16: 'C',
 17: 'F',
 18: 'H',
 19: 'I',
 20: 'N',
 21: 'O',
 22: 'P',
 23: 'S',
 24: '[',
 25: ']',
 26: 'c',
 27: 'e',
 28: 'i',
 29: 'l',
 30: 'n',
 31: 'o',
 32: 'r',
 33: 's'}

vocab_size = 34

In [4]:
import torch

model = SMILESLSTM(34)
model.load_state_dict(torch.load('smiles_lstm.pth', map_location=torch.device('cpu')))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

  model.load_state_dict(torch.load('smiles_lstm.pth', map_location=torch.device('cpu')))


In [5]:
from rdkit import Chem

def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None


import random

def generate_smiles(model, start_token='C', max_len=100, stop_chance=0.2, min_len=15):
    model.eval()
    input_seq = torch.tensor([[char2idx.get(start_token, 1)]], dtype=torch.long).to(device)
    generated = [start_token]

    for i in range(max_len):
        output = model(input_seq)
        prob = torch.softmax(output[0, -1], dim=0)
        next_idx = torch.multinomial(prob, 1).item()
        if next_idx == 0:
            break

        next_char = idx2char[next_idx]
        generated.append(next_char)

        if i >= min_len and random.random() < stop_chance:
            break

        input_seq = torch.tensor([[next_idx]], dtype=torch.long).to(device)

    return ''.join(generated)

print(generate_smiles(model, max_len=60, stop_chance=0.25))

def generate_n_valid_smiles(model, n=100, start_token='C', max_len=100, stop_chance=0.3, min_len=10, max_output_len=40, max_total_tries=25000):
    valid_smiles = set()
    tries = 0

    while len(valid_smiles) < n and tries < max_total_tries:
        smiles = generate_smiles(
            model,
            start_token=start_token,
            max_len=max_len,
            stop_chance=stop_chance,
            min_len=min_len
        )
        tries += 1
        if is_valid_smiles(smiles) and len(smiles) <= max_output_len:
            valid_smiles.add(smiles)

    print(len(valid_smiles), '/', tries)
    return list(valid_smiles)



CNCCC=COc1)c1)c1)c1)c


In [6]:
valid_molecules = generate_n_valid_smiles(model, n=100)
print(f"Сгенерировано валидных молекул: {len(valid_molecules)}")

[20:12:28] SMILES Parse Error: extra close parentheses while parsing: Cl)c1)c1)c1)c
[20:12:28] SMILES Parse Error: check for mistakes around position 3:
[20:12:28] Cl)c1)c1)c1)c
[20:12:28] ~~^
[20:12:28] SMILES Parse Error: Failed parsing SMILES 'Cl)c1)c1)c1)c' for input: 'Cl)c1)c1)c1)c'
[20:12:28] SMILES Parse Error: extra close parentheses while parsing: CO=Cc1)c1)c1)c1
[20:12:28] SMILES Parse Error: check for mistakes around position 7:
[20:12:28] CO=Cc1)c1)c1)c1
[20:12:28] ~~~~~~^
[20:12:28] SMILES Parse Error: Failed parsing SMILES 'CO=Cc1)c1)c1)c1' for input: 'CO=Cc1)c1)c1)c1'
[20:12:28] SMILES Parse Error: extra close parentheses while parsing: CCCCc1)n1)c2)c1
[20:12:28] SMILES Parse Error: check for mistakes around position 7:
[20:12:28] CCCCc1)n1)c2)c1
[20:12:28] ~~~~~~^
[20:12:28] SMILES Parse Error: Failed parsing SMILES 'CCCCc1)n1)c2)c1' for input: 'CCCCc1)n1)c2)c1'
[20:12:28] SMILES Parse Error: extra close parentheses while parsing: CCO=CCO=Cc1)
[20:12:28] SMILES Parse Er

100 / 11768
Сгенерировано валидных молекул: 100


[20:14:42] SMILES Parse Error: extra close parentheses while parsing: CSCl)c1)c1)c1)c
[20:14:42] SMILES Parse Error: check for mistakes around position 5:
[20:14:42] CSCl)c1)c1)c1)c
[20:14:42] ~~~~^
[20:14:42] SMILES Parse Error: Failed parsing SMILES 'CSCl)c1)c1)c1)c' for input: 'CSCl)c1)c1)c1)c'
[20:14:42] SMILES Parse Error: syntax error while parsing: CCNCO=CO=[CO=CCCC
[20:14:42] SMILES Parse Error: check for mistakes around position 12:
[20:14:42] CCNCO=CO=[CO=CCCC
[20:14:42] ~~~~~~~~~~~^
[20:14:42] SMILES Parse Error: Failed parsing SMILES 'CCNCO=CO=[CO=CCCC' for input: 'CCNCO=CO=[CO=CCCC'
[20:14:42] SMILES Parse Error: extra close parentheses while parsing: CS=Cc1)o1)c1
[20:14:42] SMILES Parse Error: check for mistakes around position 7:
[20:14:42] CS=Cc1)o1)c1
[20:14:42] ~~~~~~^
[20:14:42] SMILES Parse Error: Failed parsing SMILES 'CS=Cc1)o1)c1' for input: 'CS=Cc1)o1)c1'
[20:14:42] SMILES Parse Error: extra close parentheses while parsing: CO=CCCCO=CNc1)c1)c1
[20:14:42] SMILES 

In [7]:
for i, smi in enumerate(valid_molecules[:100]):
    print(f"{i+1}: {smi}")

1: CCCNS=S=CCOCC
2: COCCCCN=S=CO
3: CCCCCCCC#CCO
4: CCCN=S=S=CCCC
5: CCNCNS=S=CCN
6: CCNCCCCCCCCCC
7: CCCCCCS=COC=C
8: CCOCCN=CCCCCC
9: CCCCCCCCCCCS
10: CCCCCCCCCCC=CO
11: CCNCCCCCCCCC
12: CCCCCCCCCCCN
13: CCCOCCCCCCNC
14: CS=CCCNC=CCCC
15: CCCCS=CCCCCO
16: CCCCCNCS=S=C
17: CCCCCCCCCCCCC
18: CC=CCNS=CCCCC
19: COCS=S=C=S=CCO
20: CCCCOCCCCCNN
21: C=CCOCCCCNS=P
22: CCCCN=CCCCCCN
23: CNCCCCCCCCNC
24: COCCCCCCCCCO
25: COCCCCCNC=C=C
26: CCCNC=CCNCNCCCC
27: CCS=CNS=CCCCO
28: CCC=CNCS=CNC
29: C=SCCCCOCCN=C
30: CCNCC=S=CCCCC
31: CNCCCN=CCS=C
32: CCCS=S=CNCCCCCC
33: CCC=CC=S=CCC
34: CCCCS=S=CCNS=C
35: CCCCCOCCNS=C
36: CNS=CCNCCOCC
37: CCOCS=CCCCNN
38: CCCCCCCCOCNC
39: C=CNCCCC=S=CCO
40: CCCCCCNCCCCCCl
41: CCCCCOCNCCCCCC
42: CS=S=CCCNSCC
43: C=CCCN=CCCCC
44: CCCCNS=S=CCCCC
45: CC=CCCCCCCCCC
46: CCCCCN=CCC=C
47: CS=CCS=CCCCC
48: CCCCNCOCCc1CCc1
49: CCCCCN=CC=CC
50: CCS=CCCCC=CC
51: CCCCNCOCCOCC
52: COCCCCS=COCC
53: C=CCCOCCOCCCCCCC
54: CCCCCS=S=S=CC
55: CS=CCCCCCNCC
56: CCCCNS=CCCNCCC
57: CNCCS=CC

In [12]:
def calculate_uniqueness(valid_smiles):
    unique = set(valid_smiles) 
    return (len(unique) / len(valid_smiles)) * 100

print(calculate_uniqueness(valid_molecules))

100.0


In [13]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_sdf(smiles_list, sdf_filename):
    writer = Chem.SDWriter(sdf_filename)

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            print(f"Warning: invalid SMILES skipped: {smi}")
            continue
        
        mol = Chem.AddHs(mol)

        AllChem.EmbedMolecule(mol, randomSeed=0xf00d)
        AllChem.UFFOptimizeMolecule(mol)

        mol.SetProp("_Name", f"Molecule_{i+1}")
        writer.write(mol)

    writer.close()
    print(f"SDF файл '{sdf_filename}' успешно сохранён, молекул: {writer.GetNumWrites()}")


In [14]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def passes_lipinski(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)

    return (
        mw <= 500 and
        logp <= 5 and
        h_donors <= 5 and
        h_acceptors <= 10
    )


In [15]:
filtered = [smi for smi in valid_molecules if passes_lipinski(smi)]
print(f"Прошли правило Липинского: {len(filtered)} из {len(valid_molecules)}")

Прошли правило Липинского: 99 из 100


In [16]:
smiles_to_sdf(valid_molecules, "output_molecules.sdf")



ValueError: Bad Conformer Id