In [1]:
import torch.nn as nn

class SMILESLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout1 = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        return self.fc(x)


In [2]:
all_chars = ['#',
 '(',
 ')',
 '+',
 '-',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '=',
 'A',
 'B',
 'C',
 'F',
 'H',
 'I',
 'L',
 'N',
 'O',
 'P',
 'R',
 'S',
 '[',
 ']',
 'c',
 'e',
 'i',
 'n',
 'o',
 's']

char2idx = {'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '1': 6,
 '2': 7,
 '3': 8,
 '4': 9,
 '5': 10,
 '6': 11,
 '7': 12,
 '=': 13,
 'A': 14,
 'B': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'L': 20,
 'N': 21,
 'O': 22,
 'P': 23,
 'R': 24,
 'S': 25,
 '[': 26,
 ']': 27,
 'c': 28,
 'e': 29,
 'i': 30,
 'n': 31,
 'o': 32,
 's': 33,
 'К': 34,
 'Н': 35}

idx2char = {1: '#',
 2: '(',
 3: ')',
 4: '+',
 5: '-',
 6: '1',
 7: '2',
 8: '3',
 9: '4',
 10: '5',
 11: '6',
 12: '7',
 13: '=',
 14: 'A',
 15: 'B',
 16: 'C',
 17: 'F',
 18: 'H',
 19: 'I',
 20: 'L',
 21: 'N',
 22: 'O',
 23: 'P',
 24: 'R',
 25: 'S',
 26: '[',
 27: ']',
 28: 'c',
 29: 'e',
 30: 'i',
 31: 'n',
 32: 'o',
 33: 's',
 34: 'К',
 35: 'Н'}

vocab_size = 36

In [3]:
import torch

model = SMILESLSTM(vocab_size)
model.load_state_dict(torch.load('smiles_lstm (1).pth', map_location=torch.device('cpu')))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

  model.load_state_dict(torch.load('smiles_lstm (1).pth', map_location=torch.device('cpu')))


In [47]:
from rdkit import Chem

def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None


import random
max_len = 167
def generate_smiles(model, start_token='Н'):
    model.eval()
    input_seq = torch.tensor([[char2idx.get(start_token, 1)]], dtype=torch.long).to(device)
    generated = []
    for i in range(max_len):
        output = model(input_seq)
        logits = output[0, -1]

        logits = logits / 0.75
        probs = torch.softmax(logits, dim=0)
        next_idx = torch.multinomial(probs, num_samples=1).item()

        if next_idx == 0:
            continue
        if next_idx == char2idx['К']:
            break

        next_char = idx2char[next_idx]
        generated.append(next_char)

        input_seq = torch.tensor([[next_idx]], dtype=torch.long).to(device)
    generated = ['Br' if c == 'R' else 'Cl' if c == 'L' else c for c in generated]
    return ''.join(generated)

print(generate_smiles(model))

print(generate_smiles(model))

tries = 0
def generate_n_valid_smiles(model, n=100, max_total_tries = 25000):
    global tries
    valid_smiles = list()
    tries = 0

    while len(valid_smiles) < n and tries < max_total_tries:
        smiles = generate_smiles(
            model
        )
        tries += 1
        if is_valid_smiles(smiles) and len(smiles) > 5:
            valid_smiles.append(smiles)

    return list(valid_smiles)



CCl)s1
CN=O[n1


In [48]:
tries = 0
valid_molecules = generate_n_valid_smiles(model, n=100)
print(f"Сгенерировано валидных молекул: {len(valid_molecules)} / {tries}")

[12:51:44] SMILES Parse Error: extra open parentheses while parsing: C(CC(CC(CC(CF)s1
[12:51:44] SMILES Parse Error: check for mistakes around position 2:
[12:51:44] C(CC(CC(CC(CF)s1
[12:51:44] ~^
[12:51:44] SMILES Parse Error: extra open parentheses while parsing: C(CC(CC(CC(CF)s1
[12:51:44] SMILES Parse Error: check for mistakes around position 5:
[12:51:44] C(CC(CC(CC(CF)s1
[12:51:44] ~~~~^
[12:51:44] SMILES Parse Error: extra open parentheses while parsing: C(CC(CC(CC(CF)s1
[12:51:44] SMILES Parse Error: check for mistakes around position 8:
[12:51:44] C(CC(CC(CC(CF)s1
[12:51:44] ~~~~~~~^
[12:51:44] SMILES Parse Error: Failed parsing SMILES 'C(CC(CC(CC(CF)s1' for input: 'C(CC(CC(CC(CF)s1'
[12:51:44] SMILES Parse Error: extra open parentheses while parsing: CCC(F)c1C(C(F)s1
[12:51:44] SMILES Parse Error: check for mistakes around position 10:
[12:51:44] CCC(F)c1C(C(F)s1
[12:51:44] ~~~~~~~~~^
[12:51:44] SMILES Parse Error: Failed parsing SMILES 'CCC(F)c1C(C(F)s1' for input: 'CCC(F)c1

Сгенерировано валидных молекул: 100 / 12827


In [49]:
print(f"Сгенерировано валидных молекул: {len(valid_molecules)} / {tries}")

for i, smi in enumerate(valid_molecules[:100]):
    print(f"{i+1}: {smi}")

Сгенерировано валидных молекул: 100 / 12827
1: CS(F)c1CCc1
2: CCC(F)c1C(F)n1
3: CCC(F)c1C(F)sc1
4: C(F)c1CC(F)c1
5: CCc1C(F)c1
6: C(F)c1CC2CC2C(F)n1
7: CC(F)c1C(Cl)c1
8: C(F)c1C(F)c1
9: C(COCCS(F)c1)c1
10: CCC(C#C1)c1
11: CCCCC(Cl)c1COCCCC(F)c1
12: CC(CCC(F)c1)c1
13: C(C1)c1
14: OC(F)c1C(F)c1
15: C(F)c1C(Cl)c1
16: C(Oc1)c1
17: C#CC(C(F)c1)c1
18: C(F)c1C(F)c1
19: SC(Cc1)c1
20: CCCC1Cc1
21: CCCC1C(F)c1
22: CCC(F)c1C(F)c1
23: C(CCc1)c1
24: CC(F)I
25: OC(F)c1C=C1
26: C(Cl)c1CCC(F)c1
27: C(F)c1C(Cl)c1
28: C1CCC(F)c1
29: C(F)c1Cc1
30: OCCC(F)c1OCC(F)c1
31: C(CCc1)c1
32: CCC(CCC1)c1
33: C(C(F)c1)c1
34: CCC(F)c1CC(Cl)c1
35: C(C(F)c1)c1
36: C(F)c1CCCN=C(F)c1
37: CCc1CC(F)c1
38: CCCCC(F)c1CCCC(F)c1
39: CCCCCOCC(CC(F)c1)c1
40: C(C(F)n1)c1
41: CC(CC(F)c1)n1
42: CCc1CCc1
43: C(CCCC(F)c1)c1
44: C(F)c1CCCCc1
45: C(COCc1)c1
46: C(F)c1CCc1
47: C(CCC(F)c1)n1
48: CCCCCc1CCOC(F)c1
49: C(C(F)c1)c1
50: CC(C(F)c1)c1
51: CC(F)c1C(F)c1
52: N=C(F)c1CCCCc1
53: CCc1OCC(F)c1
54: CCC(F)c1COc1
55: OP(CC1)c1
56: CC(F

In [50]:
def calculate_uniqueness(valid_smiles):
    unique = set(valid_smiles) 
    return (len(unique) / len(valid_smiles)) * 100

print(calculate_uniqueness(valid_molecules))

85.0


In [51]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_sdf(smiles_list, sdf_filename):
    writer = Chem.SDWriter(sdf_filename)

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            print(f"Warning: invalid SMILES skipped: {smi}")
            continue
        
        mol = Chem.AddHs(mol)

        AllChem.EmbedMolecule(mol, randomSeed=0xf00d)
        AllChem.UFFOptimizeMolecule(mol)

        mol.SetProp("_Name", f"Molecule_{i+1}")
        writer.write(mol)

    writer.close()
    print(f"SDF файл '{sdf_filename}' успешно сохранён")


In [52]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def passes_lipinski(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)

    return (
        mw <= 500 and
        logp <= 5 and
        h_donors <= 5 and
        h_acceptors <= 10
    )


In [53]:
filtered = [smi for smi in valid_molecules if passes_lipinski(smi)]
print(f"Прошли правило Липинского: {len(filtered)} из {len(valid_molecules)}")

Прошли правило Липинского: 100 из 100


In [54]:
import pandas as pd

df = pd.read_csv('SMILES_Big_Data_Set.csv')
smiles_list = df['SMILES'].tolist()
print(f"Всего молекул: {len(smiles_list)}")

Всего молекул: 16087


In [55]:

common_molecules = set(smiles_list) & set(valid_molecules)
num_common = len(common_molecules)
total_valid = len(valid_molecules)

print(f"Найдено совпадений: {num_common} из {total_valid}")
print("Совпадающие молекулы:", common_molecules)

Найдено совпадений: 0 из 100
Совпадающие молекулы: set()


In [57]:
smiles_to_sdf(valid_molecules, "output_molecules.sdf")

[13:10:28] UFFTYPER: Unrecognized atom type: S_5+4 (1)
[13:10:28] UFFTYPER: Unrecognized atom type: S_5+4 (1)
[13:10:28] UFFTYPER: Unrecognized atom type: S_5+4 (5)
[13:10:28] UFFTYPER: Unrecognized atom type: S_5+4 (5)


ValueError: Bad Conformer Id