In [None]:
!pip install transformers



In [None]:
!pip install --upgrade scikit-learn



In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [None]:
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence


from torch.nn import TransformerEncoder, TransformerEncoderLayer

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
num_components = 37
embedding_dim = 32
component_embed = nn.Embedding(num_components, embedding_dim)

In [None]:
data1 = pd.read_csv('/content/my_data.csv')

data2 = pd.read_csv('/content/NTO_smiles_encoded.csv')

In [None]:
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

df = df1.merge(df2, on="component_name", how="left")

In [None]:
import numpy as np
from numpy import ndarray
from rdkit import Chem
from rdkit.Chem import (
    Descriptors, rdMolDescriptors,
    Mol, PeriodicTable,
    AllChem, rdPartialCharges
    )
from math import pi


class Smiles2Descriptors:

    def __init__(self, smiles: str) -> None:
        self.smiles: str = smiles
        self.molecule: Mol = Chem.MolFromSmiles(smiles)

        if self.molecule is None:
            raise ValueError("Неверный SMILES")

        # Молекула с водородом
        self.mol_with_h: Mol = Chem.AddHs(self.molecule)

        # Основные дескрипторы
        # липофильность
        self.logp: float = Descriptors.MolLogP(self.molecule)
        # площадь поверхности полярных участков
        self.tpsa: float = rdMolDescriptors.CalcTPSA(self.molecule)
        # молярная масса
        self.molwt: float = Descriptors.MolWt(self.molecule)

        # Характеристики связей
        self.num_bounds: int = self.molecule.GetNumBonds()
        self.num_rotatable_bounds: int = rdMolDescriptors.CalcNumRotatableBonds(
            self.mol_with_h
        )
        self.num_non_rotatable_bounds: int = self.num_bounds - self.num_rotatable_bounds
        # доля не вращающихся связей
        self.fraction_non_rotatable_bounds: float = (
            self.num_non_rotatable_bounds / self.num_bounds
        )

        # Объем по Ван-дер-Ваальсу
        self.vdw_volume: float = self._calculate_vdw_volume(self.mol_with_h)

        self.num_atoms: int = self._find_atomic_number(self.mol_with_h)

        # Степень разветвленности (алгоритм Тараса Бондаренко)
        self.degree_of_branching: int = (
            self.smiles.count('(') / self.molecule.GetNumAtoms())

        # Дипольный момент
        # self.dipole_moment: float = self.calculate_dipole()

        # fingerprints
        # self.fp_morgan = AllChem.GetMorganFingerprintAsBitVect(
        #     self.molecule, radius=2, nBits=1024
        # )
        # self.fp_maccs = Chem.MACCSkeys.GenMACCSKeys(self.molecule)
        # self.fp_topological = Chem.RDKFingerprint(self.molecule)

    @staticmethod
    def _calculate_vdw_volume(mol: Mol) -> float:
        volume: float = 0.0
        periodic_table: PeriodicTable = Chem.GetPeriodicTable()

        for atom in mol.GetAtoms():
            atomic_number: int = atom.GetAtomicNum()
            radius: float = periodic_table.GetRvdw(atomic_number)
            volume += (4 / 3) * pi * (radius ** 3)

        return volume

    @staticmethod
    def _find_atomic_number(mol: Mol) -> int:
        if mol is not None:
            num_atoms = mol.GetNumAtoms()

        return num_atoms

    def calculate_dipole(self) -> float:
        try:
            mol = Chem.MolFromSmiles(self.smiles)
            if mol is None:
                raise ValueError("Invalid SMILES")
            mol = Chem.AddHs(mol)
            AllChem.EmbedMolecule(mol)
            AllChem.MMFFOptimizeMolecule(mol)
            rdPartialCharges.ComputeGasteigerCharges(mol)
            conf = mol.GetConformer()
            dipole = np.zeros(3)
            for atom in mol.GetAtoms():
                charge = atom.GetDoubleProp("_GasteigerCharge") * 1.602176634e-19  # e -> C
                pos = conf.GetAtomPosition(atom.GetIdx())
                pos_m = np.array([pos.x, pos.y, pos.z]) * 1e-10  # A -> m
                dipole += charge * pos_m
            magnitude = np.linalg.norm(dipole)
            return magnitude / 3.33564e-30  # Перевод в Debye
        except Exception as e:
            print(f"Error calculating dipole moment: {str(e)}")
            return None

    def as_vector(self) -> ndarray:
        return np.array([
            self.logp,
            self.tpsa,
            self.molwt,
            self.vdw_volume,
            self.fraction_non_rotatable_bounds,
            self.num_atoms,
            self.degree_of_branching,
            # self.dipole_moment
        ])

    def __repr__(self) -> str:
        return (
            f"SMILES: {self.smiles}\n"
            f"Descriptors:\n"
            f"  LogP: {self.logp:.2f}\n"
            f"  TPSA: {self.tpsa:.2f}\n"
            f"  Molecular Weight: {self.molwt:.2f}\n"
            f"  Total Bonds: {self.num_bounds}\n"
            f"  Rotatable Bonds: {self.num_rotatable_bounds}\n"
            f"  Non-Rotatable Bonds: {self.num_non_rotatable_bounds}\n"
            f"  Van der Waals Volume: {self.vdw_volume:.2f}\n"
            f"  Fraction of non-rotatable bounds: {self.fraction_non_rotatable_bounds:.2f}\n"
            f"  Num atoms: {self.num_atoms}\n"
            f"  Degree of branching: {self.degree_of_branching}\n"
            f"  Dipole Moment: {self.dipole_moment:.2f} D\n"
            # f"  Morgan fingerprint: {self.fp_morgan}\n"
            # f"  MACCS fingerprint: {self.fp_maccs}\n"
            # f"  Topological fingerprint: {self.fp_topological}\n"
        )


In [None]:
def calculate_descriptors(smiles):
    try:
        descriptor = Smiles2Descriptors(smiles)
        return descriptor.as_vector()
    except ValueError as e:
        print(f"Ошибка при обработке {smiles}: {e}")
        return [None] * 8
    except Exception as e:
        print(f"Неизвестная ошибка для {smiles}: {e}")
        return [None] * 8

In [None]:
descriptor_labels = [
    "LogP",
    "TPSA",
    "MolWt",
    "Van_Der_Waals volume"
    "Fraction_non_rotatable_bounds",
    "num_atoms",
    "Degree_of_branching"
]

In [None]:
descriptors_data = []
for smiles in df['smiles']:
    descriptors = calculate_descriptors(smiles)
    descriptors_data.append(descriptors)

[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:42:48] Explicit valence for atom # 6 C, 5, is g

Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный

[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is g

Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный

[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:12] Explicit valence for atom # 6 C, 5, is greater than permitted


Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES


[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:15] Explicit valence for atom # 6 C, 5, is g

Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный

[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is g

Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный

[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is g

Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный

[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is g

Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный

[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is g

Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный

[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:16] Explicit valence for atom # 6 C, 5, is greater than permitted
[08:43:17] Explicit valence for atom # 6 C, 5, is greater than permitted


Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES
Ошибка при обработке CCC(C(OC[CH2](C)CCCCCCC[CH2])=O)C: Неверный SMILES


In [None]:
result_df = pd.DataFrame(
    data={
        'blend_id': df['blend_id'],
        'component_name': df['component_name'],
        'mass_fraction': df['mass_fraction'],
        'component_type_title': df['component_type_title'],
        'log_transformed': df['log_transformed'],
        'smiles': df['smiles'],
        **{name: [d[i] for d in descriptors_data] for i, name in enumerate(descriptor_labels)},
        'oil_property_value': df['oil_property_value'],
    }
)
result_df

Unnamed: 0,blend_id,component_name,mass_fraction,component_type_title,log_transformed,smiles,LogP,TPSA,MolWt,Van_Der_Waals volumeFraction_non_rotatable_bounds,num_atoms,Degree_of_branching,oil_property_value
0,c090b033-87c0-4342-b737-fa7289b6f976,3a03d432-d849-417d-92e7-9a604187a096,67.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCCC(C)CCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52
1,c090b033-87c0-4342-b737-fa7289b6f976,c33ebc5c-0935-4c6b-a489-f1404f88be22,33.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCC(C)CCCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52
2,c090b033-87c0-4342-b737-fa7289b6f976,3a03d432-d849-417d-92e7-9a604187a096,67.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCCC(C)CCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52
3,c090b033-87c0-4342-b737-fa7289b6f976,3a03d432-d849-417d-92e7-9a604187a096,67.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCCC(C)CCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52
4,c090b033-87c0-4342-b737-fa7289b6f976,c33ebc5c-0935-4c6b-a489-f1404f88be22,33.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCC(C)CCCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94870,3826f353-92cb-4759-9625-8de04fd31f92,0962b43d-14b3-4bc5-b7b4-34500629ff48,49.69,2b8465f2-f5e2-4d0a-b468-52ec4467def1,2.406044,CCC(C)(C)CC,2.8326,0.0,100.205,259.868356,0.0,23.0,10.09
94871,3826f353-92cb-4759-9625-8de04fd31f92,7bc3fcf4-7f3f-4ff6-9bdd-a62fc39828d3,19.80,3c132c45-5aea-4626-8a81-becbb02b10b0,2.406044,CCCCCCC(CCCCCCCC)CCCCCCCC,9.0742,0.0,324.637,820.764119,0.0,71.0,10.09
94872,3826f353-92cb-4759-9625-8de04fd31f92,6b4f217d-2c74-4b84-ae57-7e6f79a6d6ad,29.70,3c132c45-5aea-4626-8a81-becbb02b10b0,2.406044,CCCCCCCC(C)C(CCCCC)C(C)CCCCC,8.3959,0.0,310.610,785.708134,0.0,68.0,10.09
94873,3826f353-92cb-4759-9625-8de04fd31f92,6b4f217d-2c74-4b84-ae57-7e6f79a6d6ad,29.70,3c132c45-5aea-4626-8a81-becbb02b10b0,2.406044,CCCCCCCC(C)C(CCCCC)C(C)CCCCC,8.3959,0.0,310.610,785.708134,0.0,68.0,10.09


In [None]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94875 entries, 0 to 94874
Data columns (total 13 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   blend_id                                           94875 non-null  object 
 1   component_name                                     94875 non-null  object 
 2   mass_fraction                                      94875 non-null  float64
 3   component_type_title                               94875 non-null  object 
 4   log_transformed                                    94875 non-null  float64
 5   smiles                                             94875 non-null  object 
 6   LogP                                               94435 non-null  float64
 7   TPSA                                               94435 non-null  float64
 8   MolWt                                              94435 non-null  float64
 9   Van_De

In [None]:
df = result_df

In [None]:
df.fillna({
    'log_transformed': df['log_transformed'].median(),
    'LogP': df['LogP'].mean(),
    'TPSA': df['TPSA'].median(),
    'MolWt': df['MolWt'].mean(),
    'Van_Der_Waals volumeFraction_non_rotatable_bounds': df['Van_Der_Waals volumeFraction_non_rotatable_bounds'].mean(),
    'num_atoms': df['num_atoms'].median(),
    'Degree_of_branching': df['Degree_of_branching'].mean()
}, inplace=True)


In [None]:
component_encoder = LabelEncoder()
type_encoder = LabelEncoder()
smiles_encoder = LabelEncoder()

df['component_idx'] = component_encoder.fit_transform(df['component_name'])
df['type_idx'] = type_encoder.fit_transform(df['component_type_title'])
df['smiles_idx'] = smiles_encoder.fit_transform(df['smiles'])

df


Unnamed: 0,blend_id,component_name,mass_fraction,component_type_title,log_transformed,smiles,LogP,TPSA,MolWt,Van_Der_Waals volumeFraction_non_rotatable_bounds,num_atoms,Degree_of_branching,oil_property_value,component_idx,type_idx,smiles_idx
0,c090b033-87c0-4342-b737-fa7289b6f976,3a03d432-d849-417d-92e7-9a604187a096,67.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCCC(C)CCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52,10,5,14
1,c090b033-87c0-4342-b737-fa7289b6f976,c33ebc5c-0935-4c6b-a489-f1404f88be22,33.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCC(C)CCCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52,31,5,12
2,c090b033-87c0-4342-b737-fa7289b6f976,3a03d432-d849-417d-92e7-9a604187a096,67.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCCC(C)CCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52,10,5,14
3,c090b033-87c0-4342-b737-fa7289b6f976,3a03d432-d849-417d-92e7-9a604187a096,67.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCCC(C)CCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52,10,5,14
4,c090b033-87c0-4342-b737-fa7289b6f976,c33ebc5c-0935-4c6b-a489-f1404f88be22,33.00,5360a311-e081-4972-9215-26b5d9072f65,1.708378,CCC(C)CCCC,3.2227,0.0,114.232,294.924341,0.0,26.0,4.52,31,5,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94870,3826f353-92cb-4759-9625-8de04fd31f92,0962b43d-14b3-4bc5-b7b4-34500629ff48,49.69,2b8465f2-f5e2-4d0a-b468-52ec4467def1,2.406044,CCC(C)(C)CC,2.8326,0.0,100.205,259.868356,0.0,23.0,10.09,0,2,9
94871,3826f353-92cb-4759-9625-8de04fd31f92,7bc3fcf4-7f3f-4ff6-9bdd-a62fc39828d3,19.80,3c132c45-5aea-4626-8a81-becbb02b10b0,2.406044,CCCCCCC(CCCCCCCC)CCCCCCCC,9.0742,0.0,324.637,820.764119,0.0,71.0,10.09,19,3,23
94872,3826f353-92cb-4759-9625-8de04fd31f92,6b4f217d-2c74-4b84-ae57-7e6f79a6d6ad,29.70,3c132c45-5aea-4626-8a81-becbb02b10b0,2.406044,CCCCCCCC(C)C(CCCCC)C(C)CCCCC,8.3959,0.0,310.610,785.708134,0.0,68.0,10.09,13,3,24
94873,3826f353-92cb-4759-9625-8de04fd31f92,6b4f217d-2c74-4b84-ae57-7e6f79a6d6ad,29.70,3c132c45-5aea-4626-8a81-becbb02b10b0,2.406044,CCCCCCCC(C)C(CCCCC)C(C)CCCCC,8.3959,0.0,310.610,785.708134,0.0,68.0,10.09,13,3,24


In [None]:
df = df1.drop(columns=['log_transformed'])

In [None]:
numerical_cols = [
    'mass_fraction', 'oil_property_value'
]

scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [None]:
class BlendPredictor:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.component_encoder = LabelEncoder()
        self.type_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.model = None
        self.max_components = 0  # Для паддинга

    class BlendDataset(Dataset):
        def __init__(self, df, predictor):
            self.groups = df.groupby('blend_id')
            self.predictor = predictor

        def __len__(self):
            return len(self.groups.groups)

        def __getitem__(self, idx):
            blend_id = list(self.groups.groups.keys())[idx]
            group = self.groups.get_group(blend_id)

            features = []
            for _, row in group.iterrows():
                features.append([
                    self.predictor.component_encoder.transform([row['component_name']])[0],
                    self.predictor.type_encoder.transform([row['component_type_title']])[0],
                    row['mass_fraction']
                ])

            return torch.FloatTensor(features)

    def collate_fn(self, batch):
        batch.sort(key=lambda x: len(x), reverse=True)

        sequences = [item for item in batch]
        padded = pad_sequence(sequences, batch_first=True)

        return padded.to(self.device)

    def fit(self, train_df):
        self.component_encoder.fit(train_df['component_name'])
        self.type_encoder.fit(train_df['component_type_title'])
        self.scaler.fit(train_df[['mass_fraction']])

        self.max_components = train_df.groupby('blend_id').size().max()
        print(f"Max components per blend: {self.max_components}")

        self.model = nn.Sequential(
            nn.TransformerEncoder(
                nn.TransformerEncoderLayer(
                    d_model=3,
                    nhead=3,
                    dim_feedforward=64,
                    batch_first=True
                ),
                num_layers=2
            ),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(3, 1)
        ).to(self.device)

        dataset = self.BlendDataset(train_df, self)
        dataloader = DataLoader(
            dataset,
            batch_size=32,
            collate_fn=self.collate_fn,
            shuffle=True
        )

        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        criterion = nn.L1Loss()

        for epoch in range(100):
            self.model.train()
            total_loss = 0
            for batch in dataloader:
                targets = train_df.groupby('blend_id')['oil_property_value'].first().values[:len(batch)]
                targets = torch.FloatTensor(targets).to(self.device)

                optimizer.zero_grad()
                outputs = self.model(batch).squeeze()
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            print(f"Epoch {epoch+1} Loss: {total_loss/len(dataloader):.4f}")

    def predict(self, components):
        data = pd.DataFrame({
            'blend_id': ['predict_blend'] * len(components),
            'component_name': [c['name'] for c in components],
            'component_type_title': [c['type'] for c in components],
            'mass_fraction': [c['mass'] for c in components]
        })

        data['mass_fraction'] = self.scaler.transform(data[['mass_fraction']]).flatten()
        dataset = self.BlendDataset(data, self)
        dataloader = DataLoader(
            dataset,
            batch_size=1,
            collate_fn=self.collate_fn
        )

        self.model.eval()
        with torch.no_grad():
            for batch in dataloader:
                return self.model(batch).squeeze().item()

In [None]:

    # Инициализация
    trainer = BlendPredictor()

    # Пример обучающих данных
    train_data = df

    # Обучение
    trainer.fit(train_data)


Max components per blend: 330




RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x330 and 3x1)

In [None]:

    # Пример предсказания
new_components = [
]

prediction = trainer.predict(new_components)
print(f"Predicted value: {prediction:.2f}")

In [None]:
# for epoch in range(100):
#     total_loss = 0
#     for inputs, targets in dataloader:
#         inputs = inputs.to(device)
#         targets = targets.to(device)

#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs.squeeze(), targets)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     avg_loss = total_loss / len(dataloader)
#     print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")


Epoch 1, Loss: 6.5229
Epoch 2, Loss: 5.8096
Epoch 3, Loss: 5.2134
Epoch 4, Loss: 4.7431
Epoch 5, Loss: 4.3723
Epoch 6, Loss: 4.0886
Epoch 7, Loss: 3.8032
Epoch 8, Loss: 3.4847
Epoch 9, Loss: 3.1887
Epoch 10, Loss: 2.8742
Epoch 11, Loss: 2.5317
Epoch 12, Loss: 2.2263
Epoch 13, Loss: 1.9057
Epoch 14, Loss: 1.6189
Epoch 15, Loss: 1.4437
Epoch 16, Loss: 1.3597
Epoch 17, Loss: 1.2749
Epoch 18, Loss: 1.2464
Epoch 19, Loss: 1.2319
Epoch 20, Loss: 1.2301
Epoch 21, Loss: 1.2258
Epoch 22, Loss: 1.2267
Epoch 23, Loss: 1.2154
Epoch 24, Loss: 1.2056
Epoch 25, Loss: 1.1815
Epoch 26, Loss: 1.1554
Epoch 27, Loss: 1.1051
Epoch 28, Loss: 1.0439
Epoch 29, Loss: 0.9208
Epoch 30, Loss: 0.7823
Epoch 31, Loss: 0.6481
Epoch 32, Loss: 0.5429
Epoch 33, Loss: 0.5270
Epoch 34, Loss: 0.4323
Epoch 35, Loss: 0.3699
Epoch 36, Loss: 0.3225
Epoch 37, Loss: 0.2920
Epoch 38, Loss: 0.2801
Epoch 39, Loss: 0.2336
Epoch 40, Loss: 0.2161
Epoch 41, Loss: 0.1967
Epoch 42, Loss: 0.1834
Epoch 43, Loss: 0.1773
Epoch 44, Loss: 0.16

KeyboardInterrupt: 

In [None]:
# torch.save(model.state_dict(),'model_first_try.pth')

In [None]:
# def load_model(path):
#     checkpoint = torch.load(path, map_location=device)
#     model = BlendTransformer().to(device)
#     model.load_state_dict(checkpoint['model_state_dict'])
#     return model

In [None]:
# # Перевод модели в режим оценки
# model.eval()

# predictions = []
# true_values = []

# with torch.no_grad():  # Отключение вычисления градиентов
#     for features, targets in dataloader:
#         features, targets = features.to(device), targets.to(device)

#         # Предсказание
#         outputs = model(features)

#         # Сохранение результатов
#         predictions.extend(outputs.cpu().numpy())
#         true_values.extend(targets.cpu().numpy())

# # Преобразование в массивы NumPy
# predictions = np.array(predictions).squeeze()
# true_values = np.array(true_values)

# # Вывод результатов
# print("Predictions:", predictions)
# print("True Values:", true_values)

Predictions: [10.095041   8.333366  10.19422   10.01713    5.4499516  4.511152
  8.269084   6.7128863  6.2261257  5.64046    9.172999  10.012323
  6.760896   8.505348   8.786482   6.142917   3.622222   6.6109786
  8.698145   6.9571753  5.3773193  6.1087956 10.057497   6.565551
  6.5891747  6.9715533  5.6643915  7.7088294  6.9791255  5.8214793
  6.88275    9.200432  10.136037   6.9527593  6.9782457  6.863877
  8.131656   7.958987  10.161758   6.7829504  5.747252   6.9293494
  7.920833   4.4951577  7.413735   4.9529657  7.740026   5.3576455
  5.6606793  6.7172365  6.3736944  4.33911    6.78375    8.934355
  6.107484   7.045359   7.477793   5.701645   5.7215133  4.4711804
  9.580505   4.948862   6.163697   7.445796   8.918499   5.7819695
  5.883141   7.492828   6.36673    6.6870103  6.8470345  5.8073053
  6.2929516  6.808868   6.844104   5.694687   8.018608   6.6053205
  8.854675   7.9095345  5.8606353  9.3308115  5.6362176  5.789807
  9.150806   6.263161   6.9128847  9.09112    8.026887 

In [None]:
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score

# # Вычисление метрик
# mse = mean_absolute_error(true_values, predictions)

# print(f': {mse:.4f}')

: 0.0867
