# Импорт данных

In [None]:
!pip install rdkit



In [None]:
import numpy as np
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset and the model directory
# used below are addressed through the relative path drive/MyDrive/...
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the prepared molecule table (SMILES plus three numeric properties)
# from the mounted Drive and preview its first rows.
dataset_csv = 'drive/MyDrive/df_diploma_model.csv'
df = pd.read_csv(dataset_csv)
df.head()

Unnamed: 0,smiles,"Масса , Да",Липофильность,TPSA
0,c1ccc(Nc2nc(OCC3CCCCC3)c3[nH]cnc3n2)cc1,323.17461,4.0556,75.72
1,Cc1ccc(NC(=O)c2cccnc2)cc1Nc1nccc(-c2cccnc2)n1,382.154209,4.23792,92.69
2,NS(=O)(=O)c1ccc(Nc2nc(OCC3CCCCC3)c3[nH]cnc3n2)cc1,402.14741,2.703,135.88
3,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,493.259009,4.59032,86.28
4,O=C(O)c1ccccc1Nc1ccnc(Nc2ccccc2)n1,306.111676,3.662,87.14


In [None]:
import os

# Directory that receives the vocabulary, scaler, model and generated molecules.
model_dir = 'drive/MyDrive/model'
# makedirs(..., exist_ok=True) also creates missing parent directories and does
# not fail when the directory already exists, so the cell is safe to re-run.
os.makedirs(model_dir, exist_ok=True)

# Подготовка данных к модели

## Векторизация строк и стандартизация признаков

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

class DataPreprocessing:
    """Turns a table of SMILES strings and numeric properties into model input.

    Each SMILES is wrapped in '<' (start) and '>' (end) markers, mapped to
    integer token ids and right-padded with the '>' id to a common length.
    The property columns are standardized with a ``StandardScaler`` that is
    fitted in ``feature_preprocessing`` and reused by ``prop_scaler``.
    """

    def __init__(self, df, smiles_col='smiles', prop_col=['Масса , Да', 'Липофильность', 'TPSA'], smiles_len = None):
        """
        Args:
            df: DataFrame holding the SMILES column and the property columns.
            smiles_col: name of the column with SMILES strings.
            prop_col: name(s) of the numeric property columns.
            smiles_len: maximum SMILES length without the start/end markers;
                defaults to the longest string found in ``df[smiles_col]``.
        """
        self.df = df
        self.smiles_col = smiles_col
        # Copy the names so neither the shared default list nor the caller's
        # list is aliased; a single column name is accepted as well.
        self.prop_col = [prop_col] if isinstance(prop_col, str) else list(prop_col)
        self.scaler = StandardScaler()
        self.char_to_int = dict()
        self.int_to_char = dict()
        if smiles_len is None:
            smiles_len = df[smiles_col].str.len().to_numpy().max()
        # Longest SMILES without markers. Kept separately so that
        # feature_preprocessing() can be called repeatedly with the same result.
        self._raw_smiles_len = int(smiles_len)
        self.smiles_len = self._raw_smiles_len
        self._build_char_dictionaries()

    def _build_char_dictionaries(self):
        """Initializes self.char_to_int, self.int_to_char and self.vocab_size."""
        all_smiles = ''.join(self.df[self.smiles_col])
        # The markers are appended last; drop them from the data characters so
        # they can never appear twice in the vocabulary.
        unique_chars = sorted(set(all_smiles) - {'<', '>'})
        unique_chars.append('<')  # start-of-sequence marker
        unique_chars.append('>')  # end-of-sequence marker, also used as padding
        self.char_to_int = {char: idx for idx, char in enumerate(unique_chars)}
        self.int_to_char = {idx: char for idx, char in enumerate(unique_chars)}
        self.vocab_size = len(unique_chars)

    def feature_preprocessing(self):
        """Fits the scaler on the properties and encodes every SMILES string.

        Returns:
            Tuple ``(X, char_to_int, int_to_char, vocab_size, smiles_len,
            prop_len)``. ``X`` holds the padded token ids first and the scaled
            properties last; ``smiles_len`` is the padded sequence length
            including both markers and ``prop_len`` the number of properties.

        Raises:
            ValueError: if a SMILES string is longer than the configured
                maximum length.
        """
        # Use the frame given to the constructor, not a notebook-level global.
        features_scaled = self.scaler.fit_transform(self.df[self.prop_col])

        seq_len = self._raw_smiles_len + 2  # room for '<' and '>'
        start_id = self.char_to_int['<']
        end_id = self.char_to_int['>']

        smiles_arrays = []
        for smiles in self.df[self.smiles_col]:
            if len(smiles) > self._raw_smiles_len:
                raise ValueError(
                    f"SMILES of length {len(smiles)} exceeds smiles_len={self._raw_smiles_len}: {smiles!r}"
                )
            int_array = [start_id] + [self.char_to_int[char] for char in smiles] + [end_id]
            # Right-pad with the end marker up to the common length.
            int_array += [end_id] * (seq_len - len(int_array))
            smiles_arrays.append(int_array)

        smiles_vectorized = np.array(smiles_arrays)

        # Token ids first, scaled numeric properties last.
        X_combined = np.hstack([smiles_vectorized, features_scaled])
        # Assigned rather than incremented, so repeated calls agree.
        self.smiles_len = seq_len
        self.prop_len = X_combined.shape[1] - self.smiles_len
        return X_combined, self.char_to_int, self.int_to_char, self.vocab_size, self.smiles_len, self.prop_len

    def prop_scaler(self, prop_str):
        """Scales a property string such as '400 4 90' with the fitted scaler.

        The values must be whitespace-separated and given in the order of
        ``self.prop_col``. Returns the scaled values as a 1-D array.
        """
        prop_list = list(map(float, prop_str.split()))
        prop = pd.DataFrame([prop_list], columns=self.prop_col)
        return self.scaler.transform(prop)[0]

    def int_array_to_smiles(self, int_array):
        """Converts a sequence of token ids back to a SMILES string.

        Start and end markers (and therefore the padding) are stripped.
        """
        # int() lets float-typed ids (e.g. a row slice of X) index the dictionary.
        smiles = ''.join([self.int_to_char[int(i)] for i in int_array])
        smiles = smiles.replace('<', '').replace('>', '')
        return smiles


In [None]:
# Encode the whole dataset and show one row next to its source record as a
# sanity check that encoding followed by decoding round-trips.
data_preprocessor = DataPreprocessing(df)
X, char_to_int, int_to_char, vocab_size, smiles_len, prop_len = data_preprocessor.feature_preprocessing()
print(X.shape)
print(f"vocab_size: {vocab_size}, smiles_len: {smiles_len}, prop_len: {prop_len}")
print(X[0])
print(data_preprocessor.int_array_to_smiles(X[0][:smiles_len]))
# 60-dash separator. Written without an f-string because nesting the same quote
# type inside an f-string only parses on Python 3.12+.
print('-' * 60)
print(df.iloc[0])

(112620, 72)
vocab_size: 23, smiles_len: 69, prop_len: 3
[21.         18.          4.         18.         18.         18.
  1.         13.         18.          5.         19.         18.
  1.         14.         10.         10.          6.         10.
 10.         10.         10.         10.          6.          2.
 18.          6.         16.         19.         12.         17.
 18.         19.         18.          6.         19.          5.
  2.         18.         18.          4.         22.         22.
 22.         22.         22.         22.         22.         22.
 22.         22.         22.         22.         22.         22.
 22.         22.         22.         22.         22.         22.
 22.         22.         22.         22.         22.         22.
 22.         22.         22.         -1.52075618 -0.1360609  -0.52894888]
c1ccc(Nc2nc(OCC3CCCCC3)c3[nH]cnc3n2)cc1
------------------------------------------------------------
smiles           c1ccc(Nc2nc(OCC3CCCCC3)c3[nH]cnc3n2)

In [None]:
# Standardize an example property string with the fitted scaler and show it.
example_scaled = data_preprocessor.prop_scaler('400 4 90')
print(example_scaled)

[-0.35393256 -0.18845111  0.07375604]


## Сохранение StandardScaler, char_to_int, int_to_char

In [None]:
import json
import joblib

# Save both vocabulary mappings as UTF-8 JSON files inside model_dir.
for file_name, mapping in (
    ("/char_to_int.json", data_preprocessor.char_to_int),
    ("/int_to_char.json", data_preprocessor.int_to_char),
):
    with open(model_dir + file_name, "w", encoding="utf-8") as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

# Save the fitted scaler; as the last expression, the written path is displayed.
joblib.dump(data_preprocessor.scaler, model_dir + '/scaler.joblib')

# To restore the artifacts later:
# with open(model_dir + "/char_to_int.json", "r", encoding="utf-8") as f:
#     char_to_int = json.load(f)
# with open(model_dir + "/int_to_char.json", "r", encoding="utf-8") as f:
#     # JSON object keys are always strings, so convert them back to ints.
#     int_to_char = {int(idx): char for idx, char in json.load(f).items()}
# loaded_scaler = joblib.load(model_dir + '/scaler.joblib')

['drive/MyDrive/model/scaler.joblib']

## Создание датасета

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class VAEDataset(Dataset):
    """Dataset of (decoder input, decoder target, properties) triples.

    ``X`` is a 2-D array whose first ``smiles_len`` columns are token ids and
    whose last ``prop_len`` columns are the property values.
    """

    def __init__(self, X, smiles_len, prop_len):
        token_columns = X[:, :smiles_len]
        property_columns = X[:, -prop_len:]
        self.sequences = torch.LongTensor(token_columns)
        self.properties = torch.FloatTensor(property_columns)

    def __len__(self):
        # One sample per row of the token matrix.
        return self.sequences.shape[0]

    def __getitem__(self, idx):
        tokens = self.sequences[idx]
        # Input drops the last token and target drops the first, so that
        # target[t] is the token following input[t].
        return tokens[:-1], tokens[1:], self.properties[idx]

In [None]:
# Wrap the encoded matrix in a Dataset and derive an 80/20 train/test split.
dataset = VAEDataset(X, smiles_len, prop_len)
total_size = len(dataset)
train_size = int(0.8 * total_size)
# The remainder goes to the test set so both sizes always sum to the total.
test_size = total_size - train_size
print(train_size, test_size, sep='\n')

90096
22524


## Создание даталоадера

In [None]:
from torch.utils.data import random_split

# Seeded generator so the train/test split is reproducible between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Options shared by both loaders; only the shuffling differs.
loader_options = dict(batch_size=32, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Модель

## Конфигурация и Гиперпараметры

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [None]:
class Config:
    """Hyperparameters and special token ids for the molecular VAE."""

    # --- data-dependent sizes ---
    vocab_size = 23       # number of distinct tokens, including both markers
    smiles_len = 69       # maximum sequence length
    prop_len = 3          # number of conditioning properties

    # --- network sizes ---
    embedding_dim = 64
    hidden_dim = 256
    latent_dim = 128

    # --- optimisation ---
    batch_size = 16
    learning_rate = 1e-3
    epochs = 10
    beta = 0.01           # weight of the KL divergence term in the loss

    # --- special tokens ---
    SOS_TOKEN = 21            # start of sequence
    EOS_TOKEN = 22            # end of sequence
    PAD_TOKEN = EOS_TOKEN     # padding shares the end-of-sequence id

## Архитектура Модели (VAE)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class MolecularVAE(nn.Module):
    """GRU variational autoencoder over token sequences, conditioned on properties.

    The property vector is concatenated both to the encoder summary (before
    the latent projection) and to the latent sample (before the decoder's
    initial hidden state is computed).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        emb_dim = config.embedding_dim
        hid_dim = config.hidden_dim
        enc_features = hid_dim + config.prop_len
        dec_features = config.latent_dim + config.prop_len

        # Encoder: one embedding table (also used by the decoder) and a GRU
        # whose final state, joined with the properties, yields mu and logvar.
        self.embedding = nn.Embedding(config.vocab_size, emb_dim, padding_idx=config.PAD_TOKEN)
        self.encoder_gru = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.fc_mu = nn.Linear(enc_features, config.latent_dim)
        self.fc_logvar = nn.Linear(enc_features, config.latent_dim)

        # Decoder: latent sample + properties -> initial hidden state -> GRU
        # -> per-step log-probabilities over the vocabulary.
        self.decoder_init = nn.Linear(dec_features, hid_dim)
        self.decoder_gru = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, config.vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=2)

    def reparameterize(self, mu, logvar):
        """Samples z = mu + eps * std in training mode; returns mu otherwise."""
        if not self.training:
            return mu
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def forward(self, x_in, properties):
        """
        x_in: (batch, seq_len-1) - input sequence, already without its last token
        properties: (batch, prop_len)

        Returns (log_probs, mu, logvar); log_probs holds one distribution over
        the vocabulary for every position of x_in.
        """
        # --- ENCODER ---
        # The encoder reads the same truncated sequence that feeds the decoder.
        _, enc_state = self.encoder_gru(self.embedding(x_in))
        enc_summary = torch.cat((enc_state[-1], properties), dim=1)

        mu = self.fc_mu(enc_summary)
        logvar = self.fc_logvar(enc_summary)
        z = self.reparameterize(mu, logvar)

        # --- DECODER ---
        dec_state = self.decoder_init(torch.cat((z, properties), dim=1)).unsqueeze(0)

        # Teacher forcing: the decoder input is the ground-truth x_in.
        dec_steps, _ = self.decoder_gru(self.embedding(x_in), dec_state)
        log_probs = self.log_softmax(self.fc_out(dec_steps))

        return log_probs, mu, logvar

## Функция генерации

In [None]:

def generate_smiles(model, properties, max_len=50):
    """
    Generates token sequences conditioned on the given properties.

    Args:
        model: trained VAE exposing ``config``, ``decoder_init``, ``embedding``,
            ``decoder_gru``, ``fc_out`` and ``log_softmax``.
        properties: tensor of shape (prop_len,), (1, prop_len) or
            (batch, prop_len) with the scaled property values.
        max_len: maximum number of tokens generated per sequence.

    Returns:
        For a single property row: a list of token ids. Generation stops after
        the first EOS token, which is included. For a batch of several rows:
        a list with one such list per row.
    """
    model.eval()
    # Take the device from the model instead of relying on a notebook-level
    # ``device`` variable that may not exist yet (e.g. after loading a model).
    device = next(model.parameters()).device
    sos_token = model.config.SOS_TOKEN
    eos_token = model.config.EOS_TOKEN

    with torch.no_grad():
        if properties.dim() == 1:
            properties = properties.unsqueeze(0)
        properties = properties.to(device)

        batch_size = properties.size(0)

        # 1. Sample z from the prior N(0, I).
        z = torch.randn(batch_size, model.config.latent_dim).to(device)

        # 2. Initialise the decoder hidden state from z and the properties.
        z_conditioned = torch.cat([z, properties], dim=1)
        hidden = model.decoder_init(z_conditioned).unsqueeze(0)

        # 3. Every sequence starts with the SOS token.
        input_token = torch.full((batch_size, 1), sos_token, dtype=torch.long, device=device)  # (batch, 1)

        generated_seqs = [[] for _ in range(batch_size)]
        finished = [False] * batch_size

        for _ in range(max_len):
            embeddings = model.embedding(input_token)  # (batch, 1, emb_dim)
            output, hidden = model.decoder_gru(embeddings, hidden)
            logits = model.fc_out(output)  # (batch, 1, vocab_size)
            log_prob = model.log_softmax(logits)

            # Greedy decoding: take the most probable token at every step.
            # For more diversity one could sample with
            # torch.multinomial(torch.exp(log_prob.squeeze(1)), 1) instead.
            _, topi = log_prob.topk(1, dim=2)
            next_token = topi.squeeze(2)  # (batch, 1)

            for row, token in enumerate(next_token.squeeze(1).tolist()):
                if finished[row]:
                    continue  # this sequence already emitted EOS
                generated_seqs[row].append(token)
                if token == eos_token:
                    finished[row] = True

            input_token = next_token

            if all(finished):
                break

        # A single request keeps the flat-list return value.
        return generated_seqs[0] if batch_size == 1 else generated_seqs

## Цикл обучения

In [None]:
def train(model, train_loader, val_loader, config, device):
    """Trains ``model`` with Adam and prints per-sample train/validation loss.

    Args:
        model: module whose forward(x_in, props) returns (log_probs, mu, logvar).
        train_loader, val_loader: loaders yielding (x_in, x_out, props) batches.
        config: object providing ``learning_rate``, ``epochs``, ``beta``,
            ``PAD_TOKEN`` and optionally ``EOS_TOKEN``.
        device: device the batches are moved to.

    The loss is the summed token NLL plus ``beta`` times the KL divergence.
    Padding targets are excluded from the NLL. When padding and EOS share one
    id, the first EOS of every sequence is still scored; otherwise the model
    never receives a signal to end a sequence.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    pad_token = config.PAD_TOKEN
    eos_token = getattr(config, 'EOS_TOKEN', None)
    eos_doubles_as_pad = eos_token is not None and eos_token == pad_token
    ignore_label = -100  # sentinel target value skipped by nll_loss

    def loss_function(log_probs, target, mu, logvar, beta):
        """Returns (total, reconstruction, kl) for one batch, summed over the batch."""
        is_padding = target == pad_token
        if eos_doubles_as_pad:
            # A position is padding only if the previous target was already the
            # end marker; the first end marker is a real prediction target.
            previous_is_end = torch.zeros_like(is_padding)
            previous_is_end[:, 1:] = is_padding[:, :-1]
            is_padding = is_padding & previous_is_end
        masked_target = target.masked_fill(is_padding, ignore_label)

        # nll_loss expects (batch, vocab, seq_len).
        log_probs = log_probs.transpose(1, 2)
        recon_loss = F.nll_loss(log_probs, masked_target, reduction='sum', ignore_index=ignore_label)
        kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return recon_loss + beta * kl_loss, recon_loss, kl_loss

    print("Starting training...")

    # Guard the per-sample averages against empty datasets.
    n_train = max(len(train_loader.dataset), 1)
    n_val = max(len(val_loader.dataset), 1)

    for epoch in range(config.epochs):
        model.train()
        train_loss = 0

        # Each batch is a triple: decoder input, decoder target, properties.
        for batch_idx, (x_in, x_out, props) in enumerate(train_loader):
            x_in = x_in.to(device)
            x_out = x_out.to(device)
            props = props.to(device)

            optimizer.zero_grad()

            log_probs, mu, logvar = model(x_in, props)
            loss, recon, kl = loss_function(log_probs, x_out, mu, logvar, config.beta)

            loss.backward()
            # Clip the gradient norm to keep the recurrent layers stable.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()

            train_loss += loss.item()

        # --- Validation loop ---
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x_in, x_out, props in val_loader:
                x_in, x_out, props = x_in.to(device), x_out.to(device), props.to(device)
                log_probs, mu, logvar = model(x_in, props)
                loss, _, _ = loss_function(log_probs, x_out, mu, logvar, config.beta)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}: Train Loss {train_loss/n_train:.4f}, "
              f"Val Loss {val_loss/n_val:.4f}")



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = Config()

# Config hard-codes values that feature_preprocessing() derives from the data.
# Fail fast if they drift apart instead of training a mis-sized model.
assert config.vocab_size == vocab_size, f"Config.vocab_size={config.vocab_size}, data has {vocab_size}"
assert config.prop_len == prop_len, f"Config.prop_len={config.prop_len}, data has {prop_len}"
assert config.SOS_TOKEN == char_to_int['<'], "Config.SOS_TOKEN does not match the '<' token id"
assert config.EOS_TOKEN == char_to_int['>'], "Config.EOS_TOKEN does not match the '>' token id"

model = MolecularVAE(config).to(device)
train(model, train_loader, test_loader, config, device)

Starting training...
Epoch 1: Train Loss 25.9715, Val Loss 16.1170
Epoch 2: Train Loss 14.1676, Val Loss 12.3578
Epoch 3: Train Loss 11.4267, Val Loss 10.6041
Epoch 4: Train Loss 9.9591, Val Loss 9.3835
Epoch 5: Train Loss 9.0028, Val Loss 8.7068
Epoch 6: Train Loss 8.3101, Val Loss 8.2709
Epoch 7: Train Loss 7.7883, Val Loss 7.9214
Epoch 8: Train Loss 7.3601, Val Loss 7.5558
Epoch 9: Train Loss 7.0141, Val Loss 7.2775
Epoch 10: Train Loss 6.7440, Val Loss 6.9466


In [None]:
# Generate one molecule for the target properties '400 4 90'.
scaled_props = data_preprocessor.prop_scaler('400 4 90')
# Convert the ndarray directly and add the batch axis afterwards; wrapping an
# ndarray in a Python list sends torch.tensor down a slow path that warns.
prop_gen = torch.as_tensor(scaled_props, dtype=torch.float32).unsqueeze(0)
prop_gen = prop_gen.to(device)
smiles_int = generate_smiles(model, prop_gen, max_len=69)
smiles_str = data_preprocessor.int_array_to_smiles(smiles_int)
print(smiles_str)
print(len(smiles_str))

N3C)nc1)cc1C(N)=O)(C)C1(C)C)c1C=O)CC1(C)C)c1C=O)CC1(C)C)c1C=O)CC1(C)C
69


In [None]:
# Save the whole model object (not only its state_dict) as model.pth in model_dir.
torch.save(model, model_dir + '/' + 'model.pth')

# Генерация SMILES

In [None]:
# Load the full model object saved earlier. map_location moves the stored
# tensors onto the current device, so a checkpoint written on a GPU runtime
# also loads on a CPU-only one.
# NOTE: weights_only=False unpickles arbitrary objects; only load files you
# created yourself.
model = torch.load(model_dir + '/' + 'model.pth', map_location=device, weights_only=False)
model.eval()

MolecularVAE(
  (embedding): Embedding(23, 64, padding_idx=22)
  (encoder_gru): GRU(64, 256, batch_first=True)
  (fc_mu): Linear(in_features=259, out_features=128, bias=True)
  (fc_logvar): Linear(in_features=259, out_features=128, bias=True)
  (decoder_init): Linear(in_features=131, out_features=256, bias=True)
  (decoder_gru): GRU(64, 256, batch_first=True)
  (fc_out): Linear(in_features=256, out_features=23, bias=True)
  (log_softmax): LogSoftmax(dim=2)
)

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger
from rdkit.Chem import Descriptors, Crippen

# Number of generation attempts.
n_samples = 1000

# Silence RDKit's parser messages once; invalid strings are expected here.
RDLogger.DisableLog('rdApp.*')


def smiles_to_mol(smiles):
    """Parses a SMILES string; returns the RDKit molecule or None if unusable."""
    if smiles is None or smiles == "":
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Treat any parser failure as "invalid" without swallowing
        # KeyboardInterrupt/SystemExit the way a bare except would.
        return None
    if mol is None or mol.GetNumAtoms() == 0:
        return None
    return mol


def is_valid_smile(smiles):
    """
    Returns True when the SMILES string parses to a molecule with at least one atom.
    Args:
        smiles: SMILES string
    """
    return smiles_to_mol(smiles) is not None


# Convert the ndarray directly and add the batch axis afterwards; wrapping an
# ndarray in a Python list sends torch.tensor down a slow path that warns.
prop_gen = torch.as_tensor(data_preprocessor.prop_scaler('400 4 90'), dtype=torch.float32).unsqueeze(0)
prop_gen = prop_gen.to(device)
valid_smiles = []
for i in range(n_samples):
    smiles_int = generate_smiles(model, prop_gen, max_len=50)
    smiles_str = data_preprocessor.int_array_to_smiles(smiles_int)
    # Parse once and reuse the molecule for the descriptor calculations.
    mol = smiles_to_mol(smiles_str)
    if mol is not None:
        valid_smiles.append(smiles_str)
        molecular_weight = Descriptors.MolWt(mol)
        logp = Crippen.MolLogP(mol)
        tpsa = Descriptors.TPSA(mol)
        print(f"{molecular_weight:.2f}, {logp:.2f}, {tpsa:.2f}, {smiles_str}")
print("Валидных SMILES:", f'{len(valid_smiles)}/{n_samples}')


418.50, 2.85, 100.20, CNCc1ccc(OC)c(Nc2nccc(C(=O)Nc3ccc4c(c3)CNCC4)n2)c1
401.44, 5.92, 55.63, Cc1cc(Nc2nccc(C(F)(F)F)n2)cc(-c2cnn(C3CCCCC3)c2)c1
404.41, 3.27, 111.96, COc1ccc(Nc2ncc(F)c(Nc3ccc4c(c3)C(=O)NCC4)n2)cc1C#N
455.57, 4.85, 101.22, CCN(c1ccnc(Nc2ccc(OC)c(NCc3cccnc3)c2)n1)c1ccccc1CN
421.43, 5.20, 55.63, CC#CCn1c(Cc2ccccc2)nc2ccc(Nc3ncc(C(F)(F)F)cn3)cc21
403.49, 3.65, 81.59, Cc1cnc(Nc2cccc(C(=O)O)c2)nc1-c1ccc(N2CCN(C)CC2)cc1
434.55, 5.92, 85.66, CC(CNc1ccnc(Nc2ccc(NCc3ccccc3)c(C#N)c2)n1)c1ccccc1
420.42, 5.00, 91.83, CNc1ccc(Nc2nccc(C(=O)Nc3cccc4cccnc34)n2)cc1C(F)(F)
421.50, 4.71, 64.90, N=C(Cc1ccccc1)c1cnc(Nc2cccc(CCN3CCC(F)(F)C3)c2)nc1
420.49, 3.99, 91.83, CCCNC(=O)c1ccc(F)c(CNc2nccc(Nc3cnc4c(c3)CCC4)n2)c1
407.50, 3.45, 99.04, N=S(C)(=O)c1ccc(Nc2nccc(C(=O)N3CCc4ccccc4C3)n2)cc1
423.41, 5.13, 62.64, COc1cnc(N(C)c2ccc(Nc3nccc(C(F)(F)F)n3)cc2F)CC1(C)C
427.53, 3.55, 89.35, CCn1ccc(S(=O)(=O)OCc2ccnc(Nc3cccc(N4CCCC4)c3)n2)c1
398.43, 3.87, 108.90, N(C)c1ccc(Nc2nccc(C(=O)Nc3cccc4cccnc34)

In [None]:
# Recompute the three descriptors for every valid SMILES and store them as CSV.
smiles_column = pd.DataFrame({'SMILES': valid_smiles})['SMILES']
# Parsed molecules are only an intermediate and are not written to the file.
molecules = smiles_column.apply(Chem.MolFromSmiles)

data_save = pd.DataFrame({
    'SMILES': smiles_column,
    'Масса': molecules.apply(Descriptors.MolWt),
    'Липофильность': molecules.apply(Crippen.MolLogP),
    'TPSA': molecules.apply(Descriptors.TPSA),
})

data_save.to_csv(model_dir + '/molecules.csv')