In [49]:
!pip install lightgbm openpyxl

Defaulting to user installation because normal site-packages is not writeable


In [50]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm, sys, os, gc, re, argparse, warnings
from lightgbm import log_evaluation, early_stopping
warnings.filterwarnings('ignore')


In [51]:
import pandas as pd
import re

train = pd.read_excel('./dataset-new/traindata-new.xlsx')
test = pd.read_excel('./dataset-new/testdata-new.xlsx')

# test数据不包含 DC50 (nM) 和 Dmax (%)
train = train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1)

# 定义了一个空列表drop_cols，用于存储在测试数据集中非空值小于10个的列名。
drop_cols = []
for f in test.columns:
    if test[f].notnull().sum() < 10:
        drop_cols.append(f)
        
# 使用drop方法从训练集和测试集中删除了这些列，以避免在后续的分析或建模中使用这些包含大量缺失值的列
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

# 使用pd.concat将清洗后的训练集和测试集合并成一个名为data的DataFrame，便于进行统一的特征工程处理
data = pd.concat([train, test], axis=0, ignore_index=True)
cols = data.columns[2:]

# 将SMILES转换为分子对象列表,并转换为SMILES字符串列表
data['smiles_list'] = data['Smiles'].apply(lambda x:[Chem.MolToSmiles(mol, isomericSmiles=True) for mol in [Chem.MolFromSmiles(x)]])
data['smiles_list'] = data['smiles_list'].map(lambda x: ' '.join(x))  

# 使用TfidfVectorizer计算TF-IDF
tfidf = TfidfVectorizer(max_df = 0.9, min_df = 1, sublinear_tf = True)
res = tfidf.fit_transform(data['smiles_list'])

# 将结果转为dataframe格式
tfidf_df = pd.DataFrame(res.toarray())
tfidf_df.columns = [f'smiles_tfidf_{i}' for i in range(tfidf_df.shape[1])]

# 按列合并到data数据
data = pd.concat([data, tfidf_df], axis=1)

# 自然数编码
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))

for col in cols:
    if data[col].dtype == 'object':
        data[col]  = label_encode(data[col])
        
train = data[data.Label.notnull()].reset_index(drop=True)
test = data[data.Label.isnull()].reset_index(drop=True)

# 特征筛选
features = [f for f in train.columns if f not in ['uuid','Label','smiles_list']]

# 构建训练集和测试集
x_train = train[features]
x_test = test[features]

# 训练集标签
y_train = train['Label'].astype(int)



atomic_masses = {
    'H': 1.008, 'He': 4.002602, 'Li': 6.94, 'Be': 9.0122, 'B': 10.81, 'C': 12.01,
    'N': 14.01, 'O': 16.00, 'F': 19.00, 'Ne': 20.180, 'Na': 22.990, 'Mg': 24.305,
    'Al': 26.982, 'Si': 28.085, 'P': 30.97, 'S': 32.07, 'Cl': 35.45, 'Ar': 39.95,
    'K': 39.10, 'Ca': 40.08, 'Sc': 44.956, 'Ti': 47.867, 'V': 50.942, 'Cr': 52.00,
    'Mn': 54.938, 'Fe': 55.845, 'Co': 58.933, 'Ni': 58.69, 'Cu': 63.55, 'Zn': 65.38
}

def parse_inchi(row):
    inchi_str = row['InChI']
    if not isinstance(inchi_str, str):
        inchi_str = str(inchi_str)
    formula = ''
    molecular_weight = 0
    element_counts = {}

    formula_match = re.search(r"InChI=1S/([^/]+)/c", inchi_str)
    if formula_match:
        formula = formula_match.group(1)

    for element, count in re.findall(r"([A-Z][a-z]*)([0-9]*)", formula):
        count = int(count) if count else 1
        element_mass = atomic_masses.get(element.upper(), 0)
        molecular_weight += element_mass * count
        element_counts[element.upper()] = count

    return pd.Series({
        'Formula': formula,
        'MolecularWeight': molecular_weight,
        'ElementCounts': element_counts
    })

train[['Formula', 'MolecularWeight', 'ElementCounts']] = train.apply(parse_inchi, axis=1)

# 定义存在的key
keys = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn']

# 创建一个空的DataFrame，列名为keys
df_expanded = pd.DataFrame({key: pd.Series() for key in keys})

# 遍历数据，填充DataFrame
for index, item in enumerate(train['ElementCounts'].values):
    for key in keys:
        # 将字典中的值填充到相应的列中
        df_expanded.at[index, key] = item.get(key, 0)
        
df_expanded = pd.DataFrame(df_expanded)

In [52]:
def clean_feature_names(df):
    df.columns = [re.sub('[^A-Za-z0-9_]+', '', col) for col in df.columns]
    return df

def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2023):
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        # 清理特征名，以防万一再清理验证集和测试集
        trn_x = clean_feature_names(trn_x.copy())
        val_x = clean_feature_names(val_x.copy())
        test_x = clean_feature_names(test_x.copy())
        
        try:
            if clf_name == "lgb":
                train_matrix = clf.Dataset(trn_x, label=trn_y)
                valid_matrix = clf.Dataset(val_x, label=val_y)
                params = {
                    'boosting_type': 'gbdt',
                    'objective': 'binary',
                    'min_child_weight': 6,
                    'num_leaves': 2 ** 6,
                    'lambda_l2': 10,
                    'feature_fraction': 0.8,
                    'bagging_fraction': 0.8,
                    'bagging_freq': 4,
                    'learning_rate': 0.35,
                    'seed': 2024,
                    'nthread' : 16,
                    'verbose' : -1,
                }
                model = clf.train(
                    params, 
                    train_matrix, 
                    2000, 
                    valid_sets=[train_matrix, valid_matrix],
                    categorical_feature=[], 
                    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=1000)]
                )
                val_pred = model.predict(val_x, num_iteration=model.best_iteration)
                test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            if clf_name == "xgb":
                xgb_params = {
                  'booster': 'gbtree', 
                  'objective': 'binary:logistic',
                  'max_depth': 5,
                  'lambda': 10,
                  'subsample': 0.7,
                  'colsample_bytree': 0.7,
                  'colsample_bylevel': 0.7,
                  'eta': 0.35,
                  'tree_method': 'hist',
                  'seed': 520,
                  'nthread': 16
                }
                train_matrix = xgb.DMatrix(trn_x, label=trn_y)
                valid_matrix = xgb.DMatrix(val_x, label=val_y)
                test_matrix = xgb.DMatrix(test_x)
                
                watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
                
                model = xgb.train(xgb_params, train_matrix, num_boost_round=2000, evals=watchlist, verbose_eval=1000, early_stopping_rounds=100)
                val_pred = model.predict(valid_matrix)
                test_pred = model.predict(test_matrix)
            
            if clf_name == "cat":
                params = {'learning_rate': 0.35, 'depth': 5, 'bootstrap_type': 'Bernoulli', 'random_seed': 2024,
                          'od_type': 'Iter', 'od_wait': 100, 'allow_writing_files': False}
                
                model = CatBoostClassifier(iterations=2000, **params)
                model.fit(trn_x, trn_y, eval_set=[(val_x, val_y)],
                          logging_level='Verbose',
                          use_best_model=True, 
                          cat_features=[])
                
                val_pred = model.predict_proba(val_x)[:, 1]
                test_pred = model.predict_proba(test_x)[:, 1]
            
            oof[valid_index] = val_pred
            test_predict += test_pred / kf.n_splits
            
            F1_score = f1_score(val_y, np.where(val_pred > 0.5, 1, 0))
            cv_scores.append(F1_score)
            print(cv_scores)
        
        except Exception as e:
            print(f"Error in fold {i+1}: {str(e)}")
        
    return oof, test_predict

# 示例数据
# 请确保 x_train, y_train, x_test 变量已经被正确定义并赋值
# x_train = ...
# y_train = ...
# x_test = ...
# test = ...

# 选择LightGBM模型
try:
    lgb_oof, lgb_test = cv_model(lgb, x_train, y_train, x_test, 'lgb')
except Exception as e:
    print(f"Error with LightGBM: {str(e)}")

# 选择XGBoost模型
try:
    xgb_oof, xgb_test = cv_model(xgb, x_train, y_train, x_test, 'xgb')
except Exception as e:
    print(f"Error with XGBoost: {str(e)}")

# 选择CatBoost模型
try:
    cat_oof, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test, 'cat')
except Exception as e:
    print(f"Error with CatBoost: {str(e)}")

# 进行取平均融合
try:
    final_test = (lgb_test + xgb_test + cat_test) / 3
    # 保存结果文件到本地
    pd.DataFrame(
        {
            'uuid': test['uuid'],
            'Label': np.where(final_test > 0.5, 1, 0)  # 根据预测概率生成标签
        }
    ).to_csv('submit.csv', index=None)
except Exception as e:
    print(f"Error during final fusion and saving: {str(e)}")

************************************ 1 ************************************
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[128]	training's binary_logloss: 0.176065	valid_1's binary_logloss: 0.344137
[0.9009009009009009]
************************************ 2 ************************************
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	training's binary_logloss: 0.247589	valid_1's binary_logloss: 0.429617
[0.9009009009009009, 0.8421052631578947]
************************************ 3 ************************************
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[159]	training's binary_logloss: 0.176589	valid_1's binary_logloss: 0.290863
[0.9009009009009009, 0.8421052631578947, 0.9302325581395349]
************************************ 4 ************************************
Training until validation scores don't improve for 1