In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [None]:
import kagglehub
import os

# Kaggle credentials: keep any values already present in the environment
# instead of overwriting them with the placeholders below (an unconditional
# assignment of '' would wipe out a real KAGGLE_KEY).
os.environ.setdefault('KAGGLE_USERNAME', 'kirimusha')
os.environ.setdefault('KAGGLE_KEY', '')

# Download the competition dataset; `path` is the local directory with the files
path = kagglehub.competition_download("spbstu-polyml-competition")
print("Path to competition files:", path)

Downloading from https://www.kaggle.com/api/v1/competitions/data/download-all/spbstu-polyml-competition...


100%|██████████| 462k/462k [00:00<00:00, 64.9MB/s]

Extracting files...
Path to competition files: /root/.cache/kagglehub/competitions/spbstu-polyml-competition





In [None]:
import pandas as pd

# Read the competition tables from the directory returned by kagglehub
# (`path`) rather than from a hard-coded cache location, so the cell keeps
# working when the cache lives somewhere else.
train = pd.read_csv(os.path.join(path, "train.csv"))
test = pd.read_csv(os.path.join(path, "test.csv"))

print(f"Размер обучающей выборки: {train.shape}")
print(f"Размер тестовой выборки: {test.shape}")

Размер обучающей выборки: (22180, 2)
Размер тестовой выборки: (9506, 2)


In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, Crippen, Fragments
from rdkit.Chem import AllChem
from rdkit.Chem.EState import EState_VSA
import numpy as np

# Numbered surface-area descriptor families exposed by rdkit.Chem.Descriptors:
# (name prefix, number of bins). Order here defines the feature order.
_VSA_DESCRIPTOR_FAMILIES = (
    ('PEOE_VSA', 14),
    ('SMR_VSA', 10),
    ('SlogP_VSA', 12),
)

# Number of EState_VSA bins taken from rdkit.Chem.EState.EState_VSA
_ESTATE_VSA_BINS = 11

# Functional-group counters from rdkit.Chem.Fragments, in feature order
_FRAGMENT_DESCRIPTOR_NAMES = (
    'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO',
    'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2',
    'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine',
    'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1',
    'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde',
    'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide',
    'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo',
    'fr_barbitur', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo',
    'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan',
    'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole',
    'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone',
    'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
    'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom',
    'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime',
    'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond',
    'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine',
    'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN',
    'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
    'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
    'fr_unbrch_alkane', 'fr_urea',
)


def get_comprehensive_descriptors(smiles):
    """
    Compute a flat dictionary of RDKit descriptors for one SMILES string.

    The features are produced in a fixed order: whole-molecule properties,
    atom/bond/ring counts, hybridization counts, partial charges, the
    PEOE/SMR/SlogP/EState VSA bins, functional-group fragment counts,
    Lipinski counts, and finally formal charge / radical electrons.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        dict mapping feature name to its value, or None when the SMILES
        cannot be parsed or any descriptor raises (the error is printed).
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        # Basic descriptors
        descriptors = {
            # Molecular properties
            'MolWt': Descriptors.MolWt(mol),
            'HeavyAtomMolWt': Descriptors.HeavyAtomMolWt(mol),
            'ExactMolWt': Descriptors.ExactMolWt(mol),
            'NumValenceElectrons': Descriptors.NumValenceElectrons(mol),

            # Polarity and solubility
            'TPSA': Descriptors.TPSA(mol),
            'MolLogP': Descriptors.MolLogP(mol),
            'MolMR': Descriptors.MolMR(mol),

            # Hydrogen-bond donors / acceptors
            'NumHDonors': Descriptors.NumHDonors(mol),
            'NumHAcceptors': Descriptors.NumHAcceptors(mol),

            # Atoms
            'HeavyAtomCount': Descriptors.HeavyAtomCount(mol),
            'NumHeteroatoms': Descriptors.NumHeteroatoms(mol),
            'NumAtoms': mol.GetNumAtoms(),

            # Bonds
            'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
            'NumAmideBonds': Lipinski.NumAmideBonds(mol),

            # Rings
            'RingCount': Descriptors.RingCount(mol),
            'NumAromaticRings': Descriptors.NumAromaticRings(mol),
            'NumAliphaticRings': Descriptors.NumAliphaticRings(mol),
            'NumSaturatedRings': Descriptors.NumSaturatedRings(mol),
            'NumAromaticCarbocycles': Lipinski.NumAromaticCarbocycles(mol),
            'NumAromaticHeterocycles': Lipinski.NumAromaticHeterocycles(mol),
            'NumAliphaticCarbocycles': Lipinski.NumAliphaticCarbocycles(mol),
            'NumAliphaticHeterocycles': Lipinski.NumAliphaticHeterocycles(mol),

            # Hybridization.
            # NOTE: despite the "C" in the names, the three counts below
            # include atoms of every element with the given hybridization,
            # not only carbons. Kept as-is so the feature values do not change.
            'FractionCSP3': Descriptors.FractionCSP3(mol),
            'NumCSP3': sum(1 for atom in mol.GetAtoms() if atom.GetHybridization() == Chem.HybridizationType.SP3),
            'NumCSP2': sum(1 for atom in mol.GetAtoms() if atom.GetHybridization() == Chem.HybridizationType.SP2),
            'NumCSP': sum(1 for atom in mol.GetAtoms() if atom.GetHybridization() == Chem.HybridizationType.SP),

            # Partial charges
            'MaxAbsPartialCharge': Descriptors.MaxAbsPartialCharge(mol),
            'MaxPartialCharge': Descriptors.MaxPartialCharge(mol),
            'MinPartialCharge': Descriptors.MinPartialCharge(mol),

            # Surface area
            'LabuteASA': Descriptors.LabuteASA(mol),
        }

        # PEOE_VSA1..14, SMR_VSA1..10, SlogP_VSA1..12
        for prefix, bins in _VSA_DESCRIPTOR_FAMILIES:
            for bin_index in range(1, bins + 1):
                feature_name = f'{prefix}{bin_index}'
                descriptors[feature_name] = getattr(Descriptors, feature_name)(mol)

        # EState_VSA1..11
        for bin_index in range(1, _ESTATE_VSA_BINS + 1):
            feature_name = f'EState_VSA{bin_index}'
            descriptors[feature_name] = getattr(EState_VSA, feature_name)(mol)

        # Functional-group fragment counts
        for feature_name in _FRAGMENT_DESCRIPTOR_NAMES:
            descriptors[feature_name] = getattr(Fragments, feature_name)(mol)

        # Lipinski counts. NHOH / NO counts use their dedicated functions;
        # previously they repeated NumHDonors / NumHAcceptors and therefore
        # duplicated LipinskiHBD / LipinskiHBA exactly.
        descriptors.update({
            'LipinskiHBD': Lipinski.NumHDonors(mol),
            'LipinskiHBA': Lipinski.NumHAcceptors(mol),
            'LipinskiRotatableBonds': Lipinski.NumRotatableBonds(mol),
            'LipinskiNHOH': Lipinski.NHOHCount(mol),
            'LipinskiNOCount': Lipinski.NOCount(mol),
        })

        # Charge information
        descriptors.update({
            'formal_charge': Chem.GetFormalCharge(mol),
            'num_radical_electrons': Descriptors.NumRadicalElectrons(mol),
        })

        return descriptors

    except Exception as e:
        # Best-effort featurization: report the molecule and let the caller
        # treat it as a failed row.
        print(f"Error processing {smiles}: {e}")
        return None

In [None]:
def add_fingerprint_features(smiles, fp_size=512):
    """
    Build Morgan fingerprint features (radius 2) for one SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        fp_size: number of fingerprint bits to generate.

    Returns:
        dict mapping 'fp_<bit index>' to 0/1 for every bit, or None when the
        SMILES cannot be parsed or fingerprint generation raises (the error
        is printed).
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is not None:
            bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=fp_size)

            # One feature per bit position
            features = {}
            for position, bit in enumerate(bit_vector):
                features[f'fp_{position}'] = int(bit)
            return features

        return None

    except Exception as e:
        print(f"Error generating fingerprint for {smiles}: {e}")
        return None

def get_all_features(smiles, include_fingerprints=True, fp_size=256):
    """
    Combine the descriptor features and (optionally) fingerprint bits.

    Args:
        smiles: SMILES string of the molecule.
        include_fingerprints: when True, append the 'fp_<i>' bit features.
        fp_size: number of fingerprint bits requested.

    Returns:
        dict of all features for the molecule, or None when the descriptors
        could not be computed, or when fingerprints were requested but could
        not be generated.
    """
    # Main descriptors
    descriptors = get_comprehensive_descriptors(smiles)
    if descriptors is None:
        return None

    # Fingerprints
    if include_fingerprints:
        fp_features = add_fingerprint_features(smiles, fp_size)
        if fp_features is None:
            # A descriptor-only row would silently turn into NaN fingerprint
            # columns once rows are stacked into a DataFrame; report the
            # molecule as failed so callers apply their usual fallback.
            return None
        descriptors.update(fp_features)

    return descriptors

In [None]:
# Featurize every training SMILES. Failed molecules stay in the list as None
# so that train_descriptors remains positionally aligned with the rows of `train`.
train_descriptors = [get_all_features(smiles) for smiles in train['smiles']]

# SMILES strings for which featurization failed
failed_smiles = [
    smiles
    for smiles, descr in zip(train['smiles'], train_descriptors)
    if descr is None
]

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[12:06:16] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13
[12:06:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[12:06:16] Can't kekulize mol.  Unkekulized atoms: 4 5 8 9 10
[12:06:16] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[12:06:17] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[12:06:18] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[12:06:18] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[12:06:19] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13 19 20 21 22
[12:06:19] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6
[12:06:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[12:06:20] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 8 9 10 11
[12:06:20] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
[12:06:20] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 1

In [None]:
# Attach the per-row feature dicts (assign works on a copy of `train`) and
# keep only the rows whose featurization succeeded.
train = train.assign(descriptors=train_descriptors)
train = train[train['descriptors'].notna()]

print(f"Успешно обработано: {len(train)} из {len(train_descriptors)} молекул")

Успешно обработано: 21963 из 22180 молекул


In [None]:
# Expand the per-row descriptor dicts into the feature matrix.
# Reuse the index of `train` so that X_train and y_train carry the same row
# labels; without it X_train gets a fresh 0..n-1 index while y_train keeps
# the gapped index left after dropping failed rows, and any label-based
# operation combining the two would misalign.
X_train = pd.DataFrame(train['descriptors'].tolist(), index=train.index)
y_train = train['mpC']

In [None]:
# Feature extraction for the test set
print("Извлечение дескрипторов для тестовой выборки...")
test_descriptors = []

# Fingerprint size used for the test set (matches the get_all_features default)
TEST_FP_SIZE = 256

# Collect the successfully computed training descriptors
successful_train_descriptors = [desc for desc in train_descriptors if desc is not None]

# Per-feature means over the successfully featurized training molecules
if successful_train_descriptors:
    # Use a dedicated name here: rebinding `train` would destroy the labelled
    # training table (its 'descriptors' and 'mpC' columns), so earlier cells
    # could no longer be re-run.
    train_feature_table = pd.DataFrame(successful_train_descriptors)
    mean_values = train_feature_table.mean().to_dict()
else:
    # Fallback values when no training molecule was featurized
    mean_values = {
        'MolWt': 200, 'TPSA': 50, 'MolLogP': 2,
        'NumHDonors': 1, 'NumHAcceptors': 2,
        'NumRotatableBonds': 4, 'NumAromaticRings': 1,
        'FractionCSP3': 0.5, 'HeavyAtomCount': 20, 'RingCount': 2
    }

# Placeholder row for test molecules that cannot be featurized: training
# means for the descriptors and all fingerprint bits set to zero. It does not
# depend on the molecule, so it is built once outside the loop.
default_desc_template = dict(mean_values)
default_desc_template.update({f'fp_{bit}': 0 for bit in range(TEST_FP_SIZE)})

# Process the test set; every row gets an entry so predictions stay aligned with `test`
failed_count = 0
for i, smiles in enumerate(test['smiles']):
    desc = get_all_features(smiles, include_fingerprints=True, fp_size=TEST_FP_SIZE)
    if desc is not None:
        test_descriptors.append(desc)
    else:
        failed_count += 1
        test_descriptors.append(default_desc_template.copy())

        if failed_count <= 5:  # show only the first 5 failures
            print(f"Ошибка обработки SMILES {i}: {smiles}")

print(f"Успешно обработано: {len(test_descriptors) - failed_count}/{len(test)}")
print(f"Ошибок: {failed_count}")

Извлечение дескрипторов для тестовой выборки...


[12:06:49] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15


Ошибка обработки SMILES 20: CN=C(/NC#N)NCCSCc1ncnc1C


[12:06:49] Explicit valence for atom # 1 Br, 3, is greater than permitted


Ошибка обработки SMILES 70: FBr(F)F


[12:06:50] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 8 9 10 11


Ошибка обработки SMILES 204: [O-][N+](=O)c1ccc2ncnc2c1


[12:06:50] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12


Ошибка обработки SMILES 276: Clc1cccc(c1Cl)c2cncc2C#N
Ошибка обработки SMILES 298: Cc2cnc1ccccc12


[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[12:07:06] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 12
[12:07:06] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[12:07:06] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 8
[12:07:07] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[12:07:07] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[12:07:07] Can't kekulize mol.  Unkekulized atoms: 12 13 14 15 16 17 18 19 20
[12:07:08] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13 14 15 16 17
[12:07:08] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 9 10
[12:07:08] Can't kekulize mol.  Unkekulized atoms: 12 13 15 16 17 18 19 20 21
[12:07:09] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10
[12:07:10] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[12:07:10] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13
[12:07:10] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[12:07:11] Can't kekulize mo

Успешно обработано: 9406/9506
Ошибок: 100




In [None]:
# Build the test feature matrix
X_test = pd.DataFrame(test_descriptors)

# Give X_test exactly the training columns, in the training order:
# columns missing from the test frame are added filled with 0, and columns
# the training frame does not have are dropped.
if 'X_train' in locals():
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and validation parts: 20% of the rows are held out
# for validation, and the fixed random_state makes the split repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [None]:
# Report the (rows, features) shape of the training and validation matrices
print(f"Тренировочная выборка: {X_tr.shape}")
print(f"Валидационная выборка: {X_val.shape}")

Тренировочная выборка: (17570, 424)
Валидационная выборка: (4393, 424)


In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


Вариант модели номер 1: CatBoost

In [None]:
from catboost import CatBoostRegressor

# CatBoost hyper-parameters, grouped by purpose
catboost_params = {
    # Boosting
    'iterations': 500,
    'learning_rate': 0.015,
    'depth': 12,

    # Regularization
    'l2_leaf_reg': 1,
    'random_strength': 1.2,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.85,
    'rsm': 0.7,

    # Trees
    'grow_policy': 'Lossguide',
    'min_data_in_leaf': 2,
    'max_leaves': 256,
    'max_bin': 254,
    'score_function': 'L2',

    # Optimization
    'leaf_estimation_iterations': 15,
    'leaf_estimation_method': 'Newton',
    'model_size_reg': 0.3,

    # Training
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 200,
    'use_best_model': True,

    'random_seed': 42,
    'verbose': 200,
    'thread_count': -1,
}

model = CatBoostRegressor(**catboost_params)

In [None]:
# Fit on the training part and track the metric on the validation part.
# Logging frequency and best-iteration selection are taken from the model's
# constructor settings (verbose=200, use_best_model=True); passing
# verbose=True here overrode that and logged every single iteration.
model.fit(
    X_tr, y_tr,
    eval_set=(X_val, y_val)
)

0:	learn: 96.2177993	test: 96.1181629	best: 96.1181629 (0)	total: 511ms	remaining: 4m 14s
1:	learn: 95.0598978	test: 94.9779109	best: 94.9779109 (1)	total: 992ms	remaining: 4m 7s
2:	learn: 93.9084484	test: 93.8706129	best: 93.8706129 (2)	total: 1.51s	remaining: 4m 10s
3:	learn: 92.7860724	test: 92.7880827	best: 92.7880827 (3)	total: 2.01s	remaining: 4m 9s
4:	learn: 91.6794053	test: 91.7209907	best: 91.7209907 (4)	total: 2.53s	remaining: 4m 10s
5:	learn: 90.5928626	test: 90.6762875	best: 90.6762875 (5)	total: 3.02s	remaining: 4m 8s
6:	learn: 89.5219786	test: 89.6408463	best: 89.6408463 (6)	total: 3.52s	remaining: 4m 8s
7:	learn: 88.4683858	test: 88.6250640	best: 88.6250640 (7)	total: 4.01s	remaining: 4m 6s
8:	learn: 87.4317002	test: 87.6233165	best: 87.6233165 (8)	total: 4.54s	remaining: 4m 7s
9:	learn: 86.4100159	test: 86.6486090	best: 86.6486090 (9)	total: 5.03s	remaining: 4m 6s
10:	learn: 85.4067849	test: 85.6914683	best: 85.6914683 (10)	total: 5.54s	remaining: 4m 6s
11:	learn: 84.41

<catboost.core.CatBoostRegressor at 0x7e22b3203890>

Вариант модели номер 2: Random Forest


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Tuned Random Forest on the full feature set
rf_params = {
    'n_estimators': 300,
    'max_depth': None,        # do not limit tree depth
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 0.36,
    'max_samples': 1.0,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_params)

model.fit(X_tr, y_tr)

In [None]:
# Collect the depth of every fitted tree in the forest and summarize it
tree_depths = []
for estimator in model.estimators_:
    tree_depths.append(estimator.tree_.max_depth)

depth_total = sum(tree_depths)
tree_count = len(tree_depths)

print("Tree depths:", tree_depths)
print("Average depth:", depth_total / tree_count)
print("Max depth:", max(tree_depths))
print("Min depth:", min(tree_depths))

Tree depths: [38, 37, 43, 38, 38, 42, 40, 48, 39, 34, 40, 35, 48, 37, 41, 42, 42, 39, 39, 33, 34, 42, 43, 40, 34, 39, 35, 36, 35, 35, 37, 38, 37, 44, 41, 35, 37, 39, 36, 40, 41, 34, 38, 40, 41, 36, 38, 45, 33, 43, 38, 40, 37, 46, 38, 41, 35, 38, 34, 43, 40, 42, 41, 42, 51, 41, 33, 33, 36, 34, 41, 38, 36, 37, 42, 37, 37, 39, 38, 39, 42, 36, 35, 37, 39, 42, 48, 38, 37, 41, 41, 40, 40, 39, 35, 37, 43, 50, 35, 33]
Average depth: 38.94
Max depth: 51
Min depth: 33


Теперь оценим модель на валидационной выборке

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Model evaluation: predict on the held-out validation part and report
# mean absolute error and R² against the true targets.
# Note: `model` is whichever estimator was fitted last in the kernel.
y_pred_val = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred_val)
r2 = r2_score(y_val, y_pred_val)

print(f"\nРезультаты на валидационной выборке:")
print(f"MAE: {mae:.2f}°C")
print(f"R²: {r2:.4f}")


Результаты на валидационной выборке:
MAE: 21.77°C
R²: 0.8763


In [None]:
# Prediction for the test set (X_test must have the same columns as the
# matrix the current `model` was fitted on)
print("Предсказание для тестовой выборки...")
test_predictions = model.predict(X_test)

Предсказание для тестовой выборки...


In [None]:
# Build the submission table: the test ids plus the predicted 'mpC' column
submission = test[['id']].assign(mpC=test_predictions)

In [None]:
from google.colab import files

# Save the results: write the submission CSV (without the index column),
# print the range of predicted values as a quick sanity check, then trigger
# a browser download of the file through the google.colab `files` helper.
submission.to_csv('submission.csv', index=False)
print(f"\nПредсказания сохранены в submission.csv")
print(f"Диапазон предсказанных значений: {test_predictions.min():.2f}°C - {test_predictions.max():.2f}°C")
files.download('submission.csv')


Предсказания сохранены в submission.csv
Диапазон предсказанных значений: -178.00°C - 472.44°C


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# First check that the importance vector matches the feature columns
print(f"Длина X_train.columns: {len(X_train.columns)}")
print(f"Длина model.feature_importances_: {len(model.feature_importances_)}")

# Evaluate the size check once, before X_train is narrowed further down
names_match = len(X_train.columns) == len(model.feature_importances_)

if names_match:
    feature_labels = X_train.columns
else:
    # Sizes differ: fall back to positional labels 'feature_<index>'
    print("Размерности не совпадают! Используем индексы...")
    feature_labels = [f'feature_{i}' for i in range(len(model.feature_importances_))]

# Importance table, most important feature first
feature_importance = pd.DataFrame({
    'feature': feature_labels,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nВажность признаков:")
print(feature_importance.head())

# Importance threshold: features with importance of at most 0.1% are dropped
importance_threshold = 0.001

# Keep the important features (still sorted by decreasing importance)
selected_features = feature_importance[feature_importance['importance'] > importance_threshold]
important_features = selected_features['feature'].tolist()

print(f"\nИсходное количество признаков: {len(X_train.columns)}")
print(f"Отобрано признаков с важностью > {importance_threshold}: {len(important_features)}")

# Narrow both matrices to the selected features.
# NOTE: this rebinds X_train / X_test, so the cell is not idempotent;
# re-create the full matrices before running it again.
if names_match:
    # Labels are real column names
    X_train = X_train[important_features]
    X_test = X_test[important_features]
else:
    # Labels are positional: recover the column positions from 'feature_<index>'
    feature_indices = [int(feat.split('_')[1]) for feat in important_features]
    X_train = X_train.iloc[:, feature_indices]
    X_test = X_test.iloc[:, feature_indices]
    # Rename the columns to the positional labels for convenience
    X_train.columns = important_features
    X_test.columns = important_features

print(f"\nНовая размерность X_train: {X_train.shape}")
print(f"Новая размерность X_test: {X_test.shape}")

# List the selected features; names and importances are read in one pass
# from the filtered table instead of re-scanning the table per feature.
print(f"\nОтобранные признаки (важность > {importance_threshold}):")
for i, (feature, importance_value) in enumerate(
    zip(selected_features['feature'], selected_features['importance']), 1
):
    print(f"{i:2d}. {feature}: {importance_value:.4f}")

Длина X_train.columns: 424
Длина model.feature_importances_: 424

Важность признаков:
            feature  importance
4              TPSA    0.161679
24          NumCSP2    0.097465
14        RingCount    0.070864
268          fp_100    0.051915
9    HeavyAtomCount    0.035912

Исходное количество признаков: 424
Отобрано признаков с важностью > 0.005: 30

Новая размерность X_train: (21963, 30)
Новая размерность X_test: (9506, 30)

Отобранные признаки (важность > 0.005):
 1. TPSA: 0.1617
 2. NumCSP2: 0.0975
 3. RingCount: 0.0709
 4. fp_100: 0.0519
 5. HeavyAtomCount: 0.0359
 6. HeavyAtomMolWt: 0.0307
 7. LabuteASA: 0.0300
 8. LipinskiHBD: 0.0292
 9. LipinskiNHOH: 0.0289
10. NumHDonors: 0.0282
11. NumAtoms: 0.0242
12. NumHeteroatoms: 0.0223
13. MaxPartialCharge: 0.0151
14. ExactMolWt: 0.0133
15. MolWt: 0.0115
16. SMR_VSA10: 0.0110
17. MolMR: 0.0108
18. fr_bicyclic: 0.0099
19. SlogP_VSA2: 0.0089
20. MinPartialCharge: 0.0088
21. FractionCSP3: 0.0080
22. MolLogP: 0.0080
23. MaxAbsPartialCha

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and validation parts again, now on the current
# X_train; test_size and random_state are the same as in the earlier split.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Tuned Random Forest on the selected features
rf_params = {
    'n_estimators': 300,
    'max_depth': None,        # do not limit tree depth
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 0.3,      # fewer features per split for more diverse trees
    'max_samples': 0.8,       # row subsample used for bagging
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_params)

model.fit(X_tr, y_tr)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Model evaluation: predict on the held-out validation part and report
# mean absolute error and R² against the true targets.
y_pred_val = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred_val)
r2 = r2_score(y_val, y_pred_val)

print(f"\nРезультаты на валидационной выборке:")
print(f"MAE: {mae:.2f}°C")
print(f"R²: {r2:.4f}")


Результаты на валидационной выборке:
MAE: 22.62°C
R²: 0.8625


In [None]:
# Prediction for the test set (X_test must have the same columns as the
# matrix the current `model` was fitted on)
print("Предсказание для тестовой выборки...")
test_predictions = model.predict(X_test)

Предсказание для тестовой выборки...


In [None]:
# Build the submission table: the test ids plus the predicted 'mpC' column
submission = test[['id']].assign(mpC=test_predictions)

In [None]:
from google.colab import files

# Save the results: write the submission CSV (without the index column),
# print the range of predicted values as a quick sanity check, then trigger
# a browser download of the file through the google.colab `files` helper.
# Note: this overwrites the 'submission.csv' written by the earlier cell.
submission.to_csv('submission.csv', index=False)
print(f"\nПредсказания сохранены в submission.csv")
print(f"Диапазон предсказанных значений: {test_predictions.min():.2f}°C - {test_predictions.max():.2f}°C")
files.download('submission.csv')


Предсказания сохранены в submission.csv
Диапазон предсказанных значений: -184.11°C - 414.18°C


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>