In [1]:
import torch.nn as nn

class SMILESLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout1 = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        return self.fc(x)


In [2]:
all_chars = ['#',
 '(',
 ')',
 '+',
 '-',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '=',
 'A',
 'B',
 'C',
 'F',
 'H',
 'I',
 'L',
 'N',
 'O',
 'P',
 'R',
 'S',
 '[',
 ']',
 'c',
 'e',
 'i',
 'n',
 'o',
 's']

char2idx = {'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '1': 6,
 '2': 7,
 '3': 8,
 '4': 9,
 '5': 10,
 '6': 11,
 '7': 12,
 '=': 13,
 'A': 14,
 'B': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'L': 20,
 'N': 21,
 'O': 22,
 'P': 23,
 'R': 24,
 'S': 25,
 '[': 26,
 ']': 27,
 'c': 28,
 'e': 29,
 'i': 30,
 'n': 31,
 'o': 32,
 's': 33,
 'К': 34,
 'Н': 35}

idx2char = {1: '#',
 2: '(',
 3: ')',
 4: '+',
 5: '-',
 6: '1',
 7: '2',
 8: '3',
 9: '4',
 10: '5',
 11: '6',
 12: '7',
 13: '=',
 14: 'A',
 15: 'B',
 16: 'C',
 17: 'F',
 18: 'H',
 19: 'I',
 20: 'L',
 21: 'N',
 22: 'O',
 23: 'P',
 24: 'R',
 25: 'S',
 26: '[',
 27: ']',
 28: 'c',
 29: 'e',
 30: 'i',
 31: 'n',
 32: 'o',
 33: 's',
 34: 'К',
 35: 'Н'}

vocab_size = 36

In [3]:
import torch

model = SMILESLSTM(vocab_size)
model.load_state_dict(torch.load('smiles_lstm (1).pth', map_location=torch.device('cpu')))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

  model.load_state_dict(torch.load('smiles_lstm (1).pth', map_location=torch.device('cpu')))


In [25]:
from rdkit import Chem

def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None


import random
max_len = 167
def generate_smiles(model, start_token='Н'):
    model.eval()
    input_seq = torch.tensor([[char2idx.get(start_token, 1)]], dtype=torch.long).to(device)
    generated = []
    for i in range(max_len):
        output = model(input_seq)
        logits = output[0, -1]

        logits = logits / 0.75
        probs = torch.softmax(logits, dim=0)
        next_idx = torch.multinomial(probs, num_samples=1).item()

        if next_idx == 0:
            continue
        if next_idx == char2idx['К']:
            break

        next_char = idx2char[next_idx]
        generated.append(next_char)

        input_seq = torch.tensor([[next_idx]], dtype=torch.long).to(device)
    generated = ['Br' if c == 'R' else 'Cl' if c == 'L' else c for c in generated]
    return ''.join(generated)

print(generate_smiles(model))

print(generate_smiles(model))

tries = 0
def generate_n_valid_smiles(model, n=100, max_total_tries = 25000):
    global tries
    valid_smiles = list()
    tries = 0

    while len(valid_smiles) < n and tries < max_total_tries:
        smiles = generate_smiles(
            model
        )
        tries += 1
        if is_valid_smiles(smiles):
            valid_smiles.append(smiles)

    return list(valid_smiles)



O#N=OCCOC(F)c1
CCN=CN=OCc1


In [27]:
tries = 0
valid_molecules = generate_n_valid_smiles(model, n=100)
print(f"Сгенерировано валидных молекул: {len(valid_molecules)} / {tries}")

[10:31:33] SMILES Parse Error: unclosed ring for input: 'CCCCC(F)c1'
[10:31:33] SMILES Parse Error: unclosed ring for input: 'C(F)c1'
[10:31:33] SMILES Parse Error: extra open parentheses while parsing: C-c2C(C
[10:31:33] SMILES Parse Error: check for mistakes around position 6:
[10:31:33] C-c2C(C
[10:31:33] ~~~~~^
[10:31:33] SMILES Parse Error: Failed parsing SMILES 'C-c2C(C' for input: 'C-c2C(C'
[10:31:33] SMILES Parse Error: unclosed ring for input: 'CC1'
[10:31:33] SMILES Parse Error: extra open parentheses while parsing: C(C(C(F)c1
[10:31:33] SMILES Parse Error: check for mistakes around position 2:
[10:31:33] C(C(C(F)c1
[10:31:33] ~^
[10:31:33] SMILES Parse Error: extra open parentheses while parsing: C(C(C(F)c1
[10:31:33] SMILES Parse Error: check for mistakes around position 4:
[10:31:33] C(C(C(F)c1
[10:31:33] ~~~^
[10:31:33] SMILES Parse Error: Failed parsing SMILES 'C(C(C(F)c1' for input: 'C(C(C(F)c1'
[10:31:33] SMILES Parse Error: unclosed ring for input: 'Cc1'
[10:31:34] SM

Сгенерировано валидных молекул: 100 / 8773


In [28]:
print(f"Сгенерировано валидных молекул: {len(valid_molecules)} / {tries}")

for i, smi in enumerate(valid_molecules[:100]):
    print(f"{i+1}: {smi}")

Сгенерировано валидных молекул: 100 / 8773
1: CCC1C(F)c1
2: C(F)c1C(F)c1
3: C(Cc1)c1
4: CC(C(F)c1)c1
5: OCC1Cn1
6: CC(COc1)c1
7: C(F)c1COC(F)c1
8: C(F)c1C(F)c1
9: Cc1COc1
10: CCC(F)c1CC1
11: c1CCC(F)c1
12: C(CC(CCl)c1)c1
13: O
14: OCN(C(F)c1)c1
15: C1C(CCCl)c1
16: C(F)c1Cc1
17: CC#N
18: OC(CCc1)c1
19: CN=C
20: C(Cl)c1C(F)c1
21: CC(F)c1C(F)n1
22: CBr
23: CC(Cl)c1C(F)c1
24: CO
25: CCCC(CC(F)c1)c1
26: C1COc1
27: C
28: CCc1CC1
29: CSCCCN=C(CCCC(F)n1)c1
30: Oc1C(F)n1
31: C
32: CC(F)c1C(F)c1
33: CCCCN(F)c1CCCOCCCc1
34: C(F)c1Cc1
35: OC=CC(F)c1C(F)c1
36: C(COc1)n1
37: CC1COc1
38: C(F)c1CC(F)c1
39: CCO
40: ON(F)n1CNCC1
41: CC1C(F)c1
42: CC1COCCC(F)c1
43: CC(CCCC(F)c1)c1
44: CCCCCc1CCC(F)c1
45: CN=C(F)c1C(F)c1
46: OCN=O
47: CCCOCCCCN=C(C1)c1
48: O
49: Oc1CC(F)c1
50: CNC(F)c1CC(F)c1
51: Cc1C-c1
52: COC
53: C(F)c1CC(F)n1
54: CC
55: N=O
56: CCCC(F)c1C(F)c1
57: C(C(F)c1)c1
58: CCF
59: CCN=CN=O
60: OCC
61: C1C(F)c1
62: CCCCC(F)c1C(F)c1
63: CO
64: OCC(Cc1)c1
65: OCCCc1C(F)c1
66: CCCC(C(F)c1)c1
67: CN

In [29]:
def calculate_uniqueness(valid_smiles):
    unique = set(valid_smiles) 
    return (len(unique) / len(valid_smiles)) * 100

print(calculate_uniqueness(valid_molecules))

81.0


In [35]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_sdf(smiles_list, sdf_filename):
    writer = Chem.SDWriter(sdf_filename)

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            print(f"Warning: invalid SMILES skipped: {smi}")
            continue
        
        mol = Chem.AddHs(mol)

        AllChem.EmbedMolecule(mol, randomSeed=0xf00d)
        AllChem.UFFOptimizeMolecule(mol)

        mol.SetProp("_Name", f"Molecule_{i+1}")
        writer.write(mol)

    writer.close()
    print(f"SDF файл '{sdf_filename}' успешно сохранён")


In [36]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def passes_lipinski(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)

    return (
        mw <= 500 and
        logp <= 5 and
        h_donors <= 5 and
        h_acceptors <= 10
    )


In [37]:
filtered = [smi for smi in valid_molecules if passes_lipinski(smi)]
print(f"Прошли правило Липинского: {len(filtered)} из {len(valid_molecules)}")

Прошли правило Липинского: 100 из 100


In [38]:
smiles_to_sdf(valid_molecules, "output_molecules.sdf")

SDF файл 'output_molecules.sdf' успешно сохранён
