In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, GridSearchCV
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm, sys, os, gc, re, argparse, warnings
import optuna

warnings.filterwarnings('ignore')

In [2]:

train = pd.read_excel('./dataset-new/traindata-new.xlsx')
test = pd.read_excel('./dataset-new/testdata-new.xlsx')

# test数据不包含 DC50 (nM) 和 Dmax (%)
train = train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1)

# 定义了一个空列表drop_cols，用于存储在测试数据集中非空值小于10个的列名。
drop_cols = []
for f in test.columns:
    if test[f].notnull().sum() < 10:
        drop_cols.append(f)

# 使用drop方法从训练集和测试集中删除了这些列，以避免在后续的分析或建模中使用这些包含大量缺失值的列
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

# 使用pd.concat将清洗后的训练集和测试集合并成一个名为data的DataFrame，便于进行统一的特征工程处理
data = pd.concat([train, test], axis=0, ignore_index=True)
cols = data.columns[2:]

In [3]:

# 将SMILES转换为分子对象列表,并转换为SMILES字符串列表
data['smiles_list'] = data['Smiles'].apply(
    lambda x: [Chem.MolToSmiles(mol, isomericSmiles=True) for mol in [Chem.MolFromSmiles(x)]])
data['smiles_list'] = data['smiles_list'].map(lambda x: ' '.join(x))

# 使用TfidfVectorizer计算TF-IDF
tfidf = TfidfVectorizer(max_df=0.9, min_df=1, sublinear_tf=True)
res = tfidf.fit_transform(data['smiles_list'])

# 将结果转为dataframe格式
tfidf_df = pd.DataFrame(res.toarray())
tfidf_df.columns = [f'smiles_tfidf_{i}' for i in range(tfidf_df.shape[1])]

# 按列合并到data数据
data = pd.concat([data, tfidf_df], axis=1)


# 自然数编码
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))


for col in cols:
    if data[col].dtype == 'object':
        data[col] = label_encode(data[col])

train = data[data.Label.notnull()].reset_index(drop=True)
test = data[data.Label.isnull()].reset_index(drop=True)

# 特征筛选
features = [f for f in train.columns if f not in ['uuid', 'Label', 'smiles_list']]

# 构建训练集和测试集
x_train = train[features]
x_test = test[features]

# 训练集标签
y_train = train['Label'].astype(int)


def optimize_catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 20000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 100),
        'random_seed': 2022,
        'task_type': 'CPU',
        'eval_metric': 'AUC',
        'allow_writing_files': False,
    }
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif params['bootstrap_type'] == 'Bernoulli':
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1)

    model = CatBoostClassifier(**params)

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
    scores = []

    for train_index, valid_index in kf.split(x_train, y_train):
        trn_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
        trn_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(trn_x, trn_y, eval_set=(val_x, val_y), verbose=False, early_stopping_rounds=100)

        preds = model.predict_proba(val_x)[:, 1]
        score = f1_score(val_y, np.where(preds > 0.5, 1, 0))
        scores.append(score)

    return np.mean(scores)


study = optuna.create_study(direction='maximize')
study.optimize(optimize_catboost, n_trials=100)

print('Best trial:')
trial = study.best_trial
print(trial.values)
print(trial.params)


[I 2024-07-16 18:40:52,951] A new study created in memory with name: no-name-7c1c5df0-7a11-4282-9f16-06c087031144
[I 2024-07-16 18:41:00,046] Trial 0 finished with value: 0.8767043797888091 and parameters: {'iterations': 7850, 'depth': 5, 'learning_rate': 0.022789853452140924, 'l2_leaf_reg': 7.145051492375322, 'bootstrap_type': 'MVS', 'od_type': 'IncToDec', 'od_wait': 79}. Best is trial 0 with value: 0.8767043797888091.
[I 2024-07-16 18:41:06,801] Trial 1 finished with value: 0.8871691480562449 and parameters: {'iterations': 4130, 'depth': 9, 'learning_rate': 0.01045943528414114, 'l2_leaf_reg': 1.1969579641381038, 'bootstrap_type': 'Bayesian', 'od_type': 'Iter', 'od_wait': 11, 'bagging_temperature': 3.521168290080058}. Best is trial 1 with value: 0.8871691480562449.
[I 2024-07-16 18:41:50,758] Trial 2 finished with value: 0.8917685708850005 and parameters: {'iterations': 8832, 'depth': 10, 'learning_rate': 0.06680489057158719, 'l2_leaf_reg': 3.243419633865654, 'bootstrap_type': 'MVS', 

Best trial:
[0.9092771535178203]
{'iterations': 8139, 'depth': 10, 'learning_rate': 0.1384900486794972, 'l2_leaf_reg': 1.8557295276737344, 'bootstrap_type': 'MVS', 'od_type': 'IncToDec', 'od_wait': 38}


In [4]:

def cv_model(clf, train_x, train_y, test_x, clf_name, params, seed=2022):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} {}************************************'.format(str(i + 1),
                                                                                                      str(seed)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], \
        train_y[valid_index]

        model = clf(**params)
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=100,
                  cat_features=[],
                  use_best_model=True,
                  verbose=1)

        val_pred = model.predict_proba(val_x)[:, 1]
        test_pred = model.predict_proba(test_x)[:, 1]

        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(f1_score(val_y, np.where(val_pred > 0.5, 1, 0)))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test


best_params = trial.params
best_params['iterations'] = 20000
cat_train, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test, "cat", best_params)

pd.DataFrame(
    {
        'uuid': test['uuid'],
        'Label': np.where(cat_test > 0.5, 1, 0)
    }
).to_csv('submit.csv', index=None)

************************************ 1 2022************************************
0:	learn: 0.5651178	test: 0.6002490	best: 0.6002490 (0)	total: 15.1ms	remaining: 5m 2s
100:	learn: 0.0228432	test: 0.3709048	best: 0.3709048 (100)	total: 1.54s	remaining: 5m 2s
200:	learn: 0.0114088	test: 0.4107524	best: 0.3709048 (100)	total: 3.09s	remaining: 5m 4s
300:	learn: 0.0092459	test: 0.4244404	best: 0.3709048 (100)	total: 4.65s	remaining: 5m 4s
400:	learn: 0.0089932	test: 0.4294323	best: 0.3709048 (100)	total: 6.22s	remaining: 5m 3s
500:	learn: 0.0089183	test: 0.4301204	best: 0.3709048 (100)	total: 7.76s	remaining: 5m 2s
600:	learn: 0.0086831	test: 0.4328610	best: 0.3709048 (100)	total: 9.31s	remaining: 5m
700:	learn: 0.0082770	test: 0.4405221	best: 0.3709048 (100)	total: 10.9s	remaining: 4m 59s
800:	learn: 0.0082295	test: 0.4421865	best: 0.3709048 (100)	total: 12.4s	remaining: 4m 57s
900:	learn: 0.0081688	test: 0.4428698	best: 0.3709048 (100)	total: 14s	remaining: 4m 57s
1000:	learn: 0.0080843	te