In [1]:
import pandas as pd


df = pd.read_csv('SMILES_Big_Data_Set.csv')
smiles_list = df['SMILES'].tolist()  # Предполагаем, что столбец называется 'SMILES'
print(f"Всего молекул: {len(smiles_list)}")

Всего молекул: 16087


In [2]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl (35.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.2


In [4]:
from rdkit import Chem

In [33]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Символы SMILES
all_chars = sorted(list(set(''.join(smiles_list))))
char2idx = {ch: i+1 for i, ch in enumerate(all_chars)}  # 0 — паддинг
idx2char = {i: ch for ch, i in char2idx.items()}
vocab_size = len(char2idx) + 1

# Преобразуем SMILES в индексы
def smiles_to_tensor(smiles):
    return torch.tensor([char2idx[ch] for ch in smiles], dtype=torch.long)

class SMILESDataset(Dataset):
    def __init__(self, smiles_list):
        self.data = [smiles_to_tensor(s) for s in smiles_list]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][:-1]
        y = self.data[idx][1:]
        return x, y

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_batch = pad_sequence(x_batch, batch_first=True)
    y_batch = pad_sequence(y_batch, batch_first=True)
    return x_batch, y_batch

dataset = SMILESDataset(smiles_list)
loader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)


In [34]:
import torch.nn as nn

class SMILESLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x, _ = self.lstm(x)
        return self.fc(x)


In [47]:
model = SMILESLSTM(vocab_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)

for epoch in range(40):
    model.train()
    total_loss = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        output = model(x)
        loss = criterion(output.view(-1, vocab_size), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Эпоха {epoch + 1}: Потеря = {total_loss:.4f}")


Эпоха 1: Потеря = 346.6205
Эпоха 2: Потеря = 235.6616
Эпоха 3: Потеря = 209.4553
Эпоха 4: Потеря = 194.0333
Эпоха 5: Потеря = 182.9324
Эпоха 6: Потеря = 174.1450
Эпоха 7: Потеря = 166.9323
Эпоха 8: Потеря = 161.2043
Эпоха 9: Потеря = 155.7454
Эпоха 10: Потеря = 151.5291
Эпоха 11: Потеря = 147.4075
Эпоха 12: Потеря = 144.0498
Эпоха 13: Потеря = 140.4120
Эпоха 14: Потеря = 137.6425
Эпоха 15: Потеря = 134.8231
Эпоха 16: Потеря = 132.3958
Эпоха 17: Потеря = 151.5099
Эпоха 18: Потеря = 133.6278
Эпоха 19: Потеря = 129.5853
Эпоха 20: Потеря = 127.0538
Эпоха 21: Потеря = 124.9668
Эпоха 22: Потеря = 122.9452
Эпоха 23: Потеря = 121.2491
Эпоха 24: Потеря = 120.3371
Эпоха 25: Потеря = 118.6991
Эпоха 26: Потеря = 117.6745
Эпоха 27: Потеря = 115.9774
Эпоха 28: Потеря = 115.0649
Эпоха 29: Потеря = 113.6369
Эпоха 30: Потеря = 112.5752
Эпоха 31: Потеря = 111.7963
Эпоха 32: Потеря = 110.6979
Эпоха 33: Потеря = 110.0480
Эпоха 34: Потеря = 109.0881
Эпоха 35: Потеря = 107.8975
Эпоха 36: Потеря = 106.9912
Э

In [48]:
from rdkit import Chem

def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None


In [49]:
import random

def generate_smiles(model, start_token='C', max_len=100, stop_chance=0.2, min_len=15):
    model.eval()
    input_seq = torch.tensor([[char2idx.get(start_token, 1)]], dtype=torch.long).to(device)
    generated = [start_token]

    for i in range(max_len):
        output = model(input_seq)
        prob = torch.softmax(output[0, -1], dim=0)
        next_idx = torch.multinomial(prob, 1).item()
        if next_idx == 0:
            break

        next_char = idx2char[next_idx]
        generated.append(next_char)

        if i >= min_len and random.random() < stop_chance:
            break

        input_seq = torch.tensor([[next_idx]], dtype=torch.long).to(device)

    return ''.join(generated)

print(generate_smiles(model, max_len=60, stop_chance=0.25))


CCCN#-]CCNCCCc1Fc


In [55]:
def generate_n_valid_smiles(model, n=100, start_token='C', max_len=100, stop_chance=0.3, min_len=10, max_output_len=40, max_total_tries=25000):
    valid_smiles = set()
    tries = 0

    while len(valid_smiles) < n and tries < max_total_tries:
        smiles = generate_smiles(
            model,
            start_token=start_token,
            max_len=max_len,
            stop_chance=stop_chance,
            min_len=min_len
        )
        tries += 1
        if is_valid_smiles(smiles) and len(smiles) <= max_output_len:
            valid_smiles.add(smiles)

    return list(valid_smiles)


In [56]:
valid_molecules = generate_n_valid_smiles(model, n=100)
print(f"Сгенерировано валидных молекул: {len(valid_molecules)}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[22:02:36] ~~~^
[22:02:36] SMILES Parse Error: Failed parsing SMILES 'CCl)c1)c1)c1' for input: 'CCl)c1)c1)c1'
[22:02:36] SMILES Parse Error: extra close parentheses while parsing: CCc1)c1)n2)c
[22:02:36] SMILES Parse Error: check for mistakes around position 5:
[22:02:36] CCc1)c1)n2)c
[22:02:36] ~~~~^
[22:02:36] SMILES Parse Error: Failed parsing SMILES 'CCc1)c1)n2)c' for input: 'CCc1)c1)n2)c'
[22:02:36] SMILES Parse Error: extra close parentheses while parsing: CNc1)c1)c1)c1
[22:02:36] SMILES Parse Error: check for mistakes around position 5:
[22:02:36] CNc1)c1)c1)c1
[22:02:36] ~~~~^
[22:02:36] SMILES Parse Error: Failed parsing SMILES 'CNc1)c1)c1)c1' for input: 'CNc1)c1)c1)c1'
[22:02:37] SMILES Parse Error: extra close parentheses while parsing: CN=Cl)c1)c1)c1)
[22:02:37] SMILES Parse Error: check for mistakes around position 6:
[22:02:37] CN=Cl)c1)c1)c1)
[22:02:37] ~~~~~^
[22:02:37] SMILES Parse Error: Failed parsing S

Сгенерировано валидных молекул: 100


In [57]:
for i, smi in enumerate(valid_molecules[:100]):
    print(f"{i+1}: {smi}")

1: CCS=CS=CS=CCNC
2: CCCCS=CCCCCCO
3: CNC=S=S=S=CCCCN
4: CCCCCCCCC=S=CO
5: CCCCNCCC=S=C
6: CCS=S=CNCCCCC
7: CCCOC#CCCCCC
8: Cl
9: CS=CCCOCNCNCCCC
10: CS=CCCCCCCCCC
11: CCCC=S=CCNCC
12: CCCCOCCS=CNS
13: CCC=S=CCCCCCC
14: C=CCCCCS=S=N
15: CNCCCCCCCCCN
16: CCNCCN=CCCCNCCO
17: CCCS=CCNCCCl
18: CCCCOCS=CCCC
19: CC=CCCCCCCCCC
20: ClCCCCCCCNNC
21: C=CCNNS=CNCN
22: COCC=CCCOCCS
23: COCCCCC=CC=C
24: CC#CCCNCNCCNC
25: CCCS=CNS=CCC=C
26: CCCCCNCCS=CC
27: CS=NNS=CCCCCN
28: CCCCOCCCNCCC
29: CS=S=CCCCCCCNCCC
30: CNS=CS=S=S=C
31: CCN=CCCNCCCN
32: CNS=COCCCCCCO
33: CCCCCOCCS=CO
34: COCCNC=CCCS=CCl
35: CNS=CCCCCOCCCCC
36: C#CCCCCCCCOC
37: CCCCCCS=CCCS
38: COCCCCCCOCCC
39: CCCC=CCCCOCCCCS=C
40: CCCCOCCCNS=S=COCCCCO
41: CCNCCCC=CS=CC
42: CCS=CCCCC=CC
43: CCCCCCCCCCCCC
44: CCS=S=CS=CCN
45: CNCCNCCC=CCC
46: CCCCCCCCCS=CCC
47: CCC=S=C=CNCC
48: CS=S=CCCCCCC
49: CCCNCCCS=CNS=C
50: CCCNCCCCCCCC
51: CCS=NS=CCCCCC
52: CS=C=CS=CCCO
53: CCCS=CCC=S=C
54: CCCNCCCCCCCCCCN
55: CS=ICCCNS=COCN
56: CCNCOCS=COCCCC
57: CS=

In [58]:
torch.save(model.state_dict(), 'smiles_lstm.pth')


In [59]:
vocab_size

34

In [60]:
all_chars = sorted(list(set(''.join(smiles_list))))

In [62]:
all_chars

['#',
 '(',
 ')',
 '+',
 '-',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '=',
 'A',
 'B',
 'C',
 'F',
 'H',
 'I',
 'N',
 'O',
 'P',
 'S',
 '[',
 ']',
 'c',
 'e',
 'i',
 'l',
 'n',
 'o',
 'r',
 's']

In [61]:
char2idx = {ch: i+1 for i, ch in enumerate(all_chars)}  # 0 — паддинг

In [63]:
char2idx

{'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '1': 6,
 '2': 7,
 '3': 8,
 '4': 9,
 '5': 10,
 '6': 11,
 '7': 12,
 '=': 13,
 'A': 14,
 'B': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'S': 23,
 '[': 24,
 ']': 25,
 'c': 26,
 'e': 27,
 'i': 28,
 'l': 29,
 'n': 30,
 'o': 31,
 'r': 32,
 's': 33}

In [64]:
idx2char = {i: ch for ch, i in char2idx.items()}

In [65]:
idx2char

{1: '#',
 2: '(',
 3: ')',
 4: '+',
 5: '-',
 6: '1',
 7: '2',
 8: '3',
 9: '4',
 10: '5',
 11: '6',
 12: '7',
 13: '=',
 14: 'A',
 15: 'B',
 16: 'C',
 17: 'F',
 18: 'H',
 19: 'I',
 20: 'N',
 21: 'O',
 22: 'P',
 23: 'S',
 24: '[',
 25: ']',
 26: 'c',
 27: 'e',
 28: 'i',
 29: 'l',
 30: 'n',
 31: 'o',
 32: 'r',
 33: 's'}

In [67]:
vocab_size = len(char2idx) + 1

In [68]:
vocab_size

34