In [2]:
# Standard library
import os
import subprocess
import time
import shutil
import json
import socket
from datetime import datetime, timedelta

# Third-party scientific computing & visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use a font with Chinese glyphs so CJK text in figures is not garbled
plt.rcParams['font.sans-serif'] = ['SimHei']        # SimHei (Heiti) typeface
plt.rcParams['axes.unicode_minus'] = False          # keep the minus sign from rendering as a box

# Machine learning & optimization
import xgboost as xgb
import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer

# Cheminformatics (RDKit)
from rdkit import Chem, RDLogger
from rdkit.Chem import (
    Descriptors, Crippen, rdMolDescriptors,
    MACCSkeys, RDKFingerprint, rdFingerprintGenerator
)
from rdkit.Chem.AtomPairs import Pairs, Torsions

# Silence RDKit warnings (disables every log channel matching rdApp.*)
RDLogger.DisableLog('rdApp.*')

# Avalon fingerprints (optional): record whether the module can be imported
try:
    from rdkit.Avalon import pyAvalonTools
    avalon_available = True
except ImportError:
    avalon_available = False
print(f"Avalon available: {avalon_available}")

# Kaggle API
from kaggle.api.kaggle_api_extended import KaggleApi



  from .autonotebook import tqdm as notebook_tqdm


Avalon available: True


In [3]:
# Initialization: connection settings, competition identifiers and working directories.

# Connection settings (presumably for the database named below — confirm against
# the cells that consume them). Each value can be overridden through an
# environment variable so credentials do not have to live in the notebook; the
# literals remain only as backward-compatible fallbacks.
# NOTE(review): a password literal is still committed here — rotate it and drop
# the fallback once every environment sets MELTING_POINT_PASSWORD.
host = os.environ.get("MELTING_POINT_HOST", "10.162.147.95")
user = os.environ.get("MELTING_POINT_USER", "user1")
password = os.environ.get("MELTING_POINT_PASSWORD", "123456")

database_name = 'thermophysical_property_melting_point'  # database name
competition = database_name  # competition name (same value as the database name)
kaggle_competition_name = "melting-point"
study_save_name = "self"

# Project root: a fixed data drive on the workstation named 'hao-2', otherwise
# the current working directory.
# NOTE: `dir` shadows the Python builtin; the name is kept so that any other
# cell referring to it keeps working.
if socket.gethostname() == 'hao-2':
    dir = rf'D:\数据\Kaggle_\{competition}'
else:
    dir = os.getcwd()

# Working directories used by the notebook, keyed by role.
DIRS = {
    "dir":              dir,
    "DATA_DIR000":      os.path.join(dir, "DATA_DIR000"),
    "HISTORY":          os.path.join(dir, "HISTORY", study_save_name),
    "SUBMISSION":       os.path.join(dir, "SUBMISSION", study_save_name),
}

# Create every directory up front (no error if it already exists).
for path in DIRS.values():
    os.makedirs(path, exist_ok=True)

# Report the resolved locations, one per line.
print("✅ 路径已创建：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")


✅ 路径已创建：

dir          : D:\数据\Kaggle_\thermophysical_property_melting_point
DATA_DIR000  : D:\数据\Kaggle_\thermophysical_property_melting_point\DATA_DIR000
HISTORY      : D:\数据\Kaggle_\thermophysical_property_melting_point\HISTORY\self
SUBMISSION   : D:\数据\Kaggle_\thermophysical_property_melting_point\SUBMISSION\self


In [26]:
# Load the Kaggle training/test sets and the public Bradley melting-point datasets.
# Every frame gets an `origin_info` column recording which source it came from.

# Kaggle-provided train and test sets
train_df = pd.read_csv(
    os.path.join(DIRS['DATA_DIR000'], "train.csv")
).assign(origin_info='Kaggle')
test_df = pd.read_csv(
    os.path.join(DIRS['DATA_DIR000'], "test.csv")
).assign(origin_info='Kaggle')

# External Bradley melting-point datasets (Excel workbooks)
bradley_df = pd.read_excel(
    os.path.join(DIRS['DATA_DIR000'], "BradleyMeltingPointDataset.xlsx")
).assign(origin_info='bradley')
bradleyplus_df = pd.read_excel(
    os.path.join(DIRS['DATA_DIR000'], "BradleyDoublePlusGoodMeltingPointDataset.xlsx")
).assign(origin_info='bradleyplus')

# Print each dataset's shape to confirm that loading succeeded
for _label, _frame in (
    ("Train                        shape:", train_df),
    ("Test                         shape:", test_df),
    ("Bradley dataset              shape:", bradley_df),
    ("Bradley Plus Good dataset    shape:", bradleyplus_df),
):
    print(_label, _frame.shape)

Train                        shape: (2662, 428)
Test                         shape: (666, 427)
Bradley dataset              shape: (28645, 10)
Bradley Plus Good dataset    shape: (3041, 12)


In [24]:
# Preview the first rows of the Bradley dataset to inspect its columns.
bradley_df.head()

Unnamed: 0,key,name,smiles,mpC,csid,link,source,donotuse,donotusebecause,origin_info
0,1,"2-(2,4-dinitrobenzyl)pyridine",c1ccnc(c1)Cc2ccc(cc2[N+](=O)[O-])[N+](=O)[O-],92.0,64018,http://www.alfa.com/en/GP100W.pgm?DSSTK=B24192,Alfa Aesar,,,bradley
1,2,2-(1-piperidinyl)aniline,c1ccc(c(c1)N)N2CCCCC2,46.0,403764,http://www.alfa.com/en/GP100W.pgm?DSSTK=A13073,Alfa Aesar,,,bradley
2,3,2-(1-piperazinyl)pyrimidine,c1cnc(nc1)N2CCNCC2,33.0,80080,http://www.alfa.com/en/GP100W.pgm?DSSTK=L15884,Alfa Aesar,,,bradley
3,4,2-(1-piperazinyl)phenol,c1ccc(c(c1)N2CCNCC2)O,125.0,63701,http://www.alfa.com/en/GP100W.pgm?DSSTK=B20252,Alfa Aesar,,,bradley
4,5,2-(1-cyclohexenyl)ethylamine,C1CCC(=CC1)CCN,-55.0,69388,http://www.alfa.com/en/GP100W.pgm?DSSTK=L08261,Alfa Aesar,,,bradley


数据处理


In [6]:
def canonicalize(smile):
    """Return the RDKit canonical SMILES for ``smile``, or ``None`` if unusable.

    Parameters
    ----------
    smile : str
        SMILES string to normalize.

    Returns
    -------
    str or None
        Canonical SMILES produced by ``Chem.MolToSmiles``. ``None`` when the
        input is not text (for example ``None`` or NaN from a missing cell),
        when RDKit cannot parse it, or when RDKit raises while processing it.
    """
    # Reject non-text input explicitly instead of relying on RDKit to raise.
    if not isinstance(smile, (str, bytes)):
        return None
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best effort: any RDKit failure maps to None. Catching Exception
        # (not a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return None

In [8]:
# Add a canonical-SMILES column to both splits; canonicalize() yields None for
# entries it cannot convert.
for _split in (train_df, test_df):
    _split["SMILES_normalized"] = _split["SMILES"].apply(canonicalize)

# Show the raw and normalized SMILES side by side for the first rows of each split.
print(train_df.head()[["SMILES", "SMILES_normalized", 'Tm']])
print(test_df.head()[["id", "SMILES", "SMILES_normalized"]])

                        SMILES               SMILES_normalized      Tm
0        FC1=C(F)C(F)(F)C1(F)F           FC1=C(F)C(F)(F)C1(F)F  213.15
1  c1ccc2c(c1)ccc3Nc4ccccc4c23  c1ccc2c(c1)ccc1[nH]c3ccccc3c12  407.15
2          CCN1C(C)=Nc2ccccc12              CCn1c(C)nc2ccccc21  324.15
3                   CC#CC(=O)O                      CC#CC(=O)O  351.15
4                    CCCCC(S)C                       CCCCC(C)S  126.15
     id               SMILES    SMILES_normalized
0  1022  CCOC(=O)c1ccc(O)cc1  CCOC(=O)c1ccc(O)cc1
1  1146   CCCCCCc1ccc(O)cc1O   CCCCCCc1ccc(O)cc1O
2    79                ClCBr                ClCBr
3  2279          C=CCCCCCCCC          C=CCCCCCCCC
4  1342  Fc1ccc(cc1)C(F)(F)F  Fc1ccc(C(F)(F)F)cc1


In [21]:
# Summarize molecules that occur more than once after SMILES canonicalization.
# NOTE(review): merge_df is not created in any cell shown before this one —
# confirm it exists when the notebook is run top to bottom on a fresh kernel.

# Every row whose canonical SMILES is shared with at least one other row,
# ordered so that identical canonical SMILES sit next to each other.
_is_repeated = merge_df.duplicated('SMILES_normalized', keep=False)
duplicates = (
    merge_df.loc[_is_repeated, ["SMILES", "SMILES_normalized", "Tm"]]
    .sort_values(by="SMILES_normalized")
    .reset_index(drop=True)
)

# Output column -> (source column, aggregation) for each canonical SMILES group.
_group_stats = {
    "count_rows": ("SMILES", "count"),                # rows in the group
    "unique_SMILES": ("SMILES", pd.Series.nunique),   # distinct raw SMILES spellings
    "Tm_min": ("Tm", "min"),                          # lowest Tm
    "Tm_max": ("Tm", "max"),                          # highest Tm
    "Tm_mean": ("Tm", "mean"),                        # mean Tm
    "Tm_std": ("Tm", "std"),                          # standard deviation of Tm
}

summary = (
    duplicates
    .groupby("SMILES_normalized")
    .agg(**_group_stats)
    .reset_index()
    # Spread of the reported Tm values within a group (max - min).
    .assign(Tm_range=lambda stats: stats["Tm_max"] - stats["Tm_min"])
)

print(summary)


                    SMILES_normalized  count_rows  unique_SMILES  Tm_min  \
0                       B1Oc2ccccc2O1           2              2  285.15   
1                                  Br           2              1  186.15   
2                                BrBr           2              1  265.90   
3                           BrC(Br)Br           3              2  281.15   
4     BrC(c1ccccc1)(c1ccccc1)c1ccccc1           2              2  426.15   
...                               ...         ...            ...     ...   
5104                         c1cnccn1           3              2  326.15   
5105                         c1cncnc1           4              3  294.15   
5106      c1csc(-c2ccc(-c3cccs3)s2)c1           2              2  365.15   
5107                c1csc(-c2cccs2)c1           3              3  305.15   
5108                         c1ncncn1           2              2  359.15   

      Tm_max     Tm_mean    Tm_std  Tm_range  
0     285.15  285.150000  0.000000      