In [None]:
import pandas as pd
import numpy as np
import sklearn
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score,mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Each workbook is read twice, once per sheet ("training" and "test").
activity_path = "../data/ERα_activity.xlsx"
descriptor_path = "../data/Molecular_Descriptor.xlsx"

# Activity workbook -> label frames.
train_label_df, test_label_df = (
    pd.read_excel(activity_path, sheet_name=sheet) for sheet in ("training", "test")
)
# Descriptor workbook -> feature frames.
train_data_df, test_data_df = (
    pd.read_excel(descriptor_path, sheet_name=sheet) for sheet in ("training", "test")
)

In [None]:
# Show every row when a frame is displayed (no truncation).
pd.options.display.max_rows = None
# Render floats with two decimals instead of scientific notation, for readability.
pd.options.display.float_format = lambda value: "%.2f" % value

In [None]:
# Join the descriptor rows with their activity labels on the SMILES string.
train = train_data_df.merge(train_label_df, on="SMILES", how="outer")
# Remove outliers: keep only rows whose IC50_nM is below 60000
# (rows with a missing IC50_nM compare False and are dropped as well).
is_inlier = train["IC50_nM"] < 60000
train = train.loc[is_inlier]
# Renumber the surviving rows 0..n-1 and discard the old index column.
train = train.reset_index().drop(columns=["index"])

In [None]:
# Join the test descriptors with the test label sheet on the SMILES string.
test = pd.merge(test_data_df, test_label_df, on="SMILES", how="outer")
# No outlier filter here. The previous version indexed `test` with the mask
# `train.IC50_nM < 60000`, which belongs to a different frame: `train` has
# already been filtered by that condition, so the mask is all True (a no-op),
# and pandas raises an IndexingError when `test` has index labels that `train`
# does not. Every test row must be kept so that each one gets a prediction.
test = test.reset_index(drop=True)

In [None]:
# Find the columns (features) of train that are 0 in every row -- they carry no information.
all_zero_mask = train.eq(0).all(axis=0)
df1 = train.loc[:, all_zero_mask]
zero_features = df1.columns.tolist()
print(f"全部为0的列有{len(zero_features)}列")
# Columns that must not be used as model inputs: the identifier, the two
# target columns, and the all-zero features found above.
exclude_col = ["SMILES", "IC50_nM", "pIC50", *zero_features]

In [None]:
## Prepare the training and test data
excluded = set(exclude_col)  # set membership instead of a list scan per column
all_cols = [col for col in train.columns if col not in excluded]

# Feature matrices share the same column list and order.
x_train = train.loc[:, all_cols]
x_test = test.loc[:, all_cols]

# Two targets are kept: the raw IC50 (nM) column and the pIC50 column.
y_IC_train = train["IC50_nM"]
y_PIC_train = train["pIC50"]

In [None]:
import eli5
import numpy as np
import pandas as pd
# PermutationImportance was used below without ever being imported (NameError on a fresh kernel).
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split

# Select the features for the LightGBM model.
# Hold out 10% of the training rows; the importance of each feature is measured
# on this held-out part, not on the rows the model was fitted on.
train_X, val_X, train_y, val_y = train_test_split(x_train, y_PIC_train, test_size=0.1,random_state=1)
print(f"训练集数{len(train_X)},测试集数{len(val_X)}")

# Fit a default LightGBM regressor on the training part (target: pIC50).
model_2 = lgb.LGBMRegressor(objective='rmse',random_state=1024).fit(train_X, train_y)
# Permutation importance evaluated on the held-out rows.
perm = PermutationImportance(model_2, random_state = 1).fit(val_X,val_y)
# Show the 20 highest-weighted features (last expression -> rendered by the notebook).
eli5.show_weights(perm,feature_names = val_X.columns.tolist(),top=20)

In [None]:
# Hand-picked descriptor columns used as inputs of the LightGBM model.
choose_features = [
    "maxHsOH",
    "MDEC-23",
    "LipoaffinityIndex",
    "MLFER_A",
    "C1SP2",
    "BCUTc-1l",
    "C3SP2",
    "minsssN",
    "XLogP",
    "VPC-6",
    "maxsF",
    "nHBAcc",
    "MDEC-33",
    "BCUTp-1l",
    "maxssO",
    "SdssC",
    "SHBint10",
    "nC",
    "TopoPSA",
    "BCUTc-1h",
]
# Sanity check: number of selected features.
print(len(choose_features))

In [None]:
# Restrict both feature matrices to the selected columns, in the same order.
lgb_x_train = x_train.loc[:, choose_features]
lgb_x_test = x_test.loc[:, choose_features]

In [None]:
# LightGBM model: K-fold training and test-set prediction (used for pIC50)
def cv_model(train_x, train_y, test_x, clf_name='lgb', folds=40, seed=2021):
    """Train one LightGBM regressor per K-fold split and average the test predictions.

    Args:
        train_x: Training features as a DataFrame (rows are selected with ``.iloc``).
        train_y: Training target, one value per row of ``train_x``. It is
            converted to a NumPy array so folds are always selected by
            position, whatever index a Series argument carries.
        test_x: Features to predict; every fold's model predicts all of them.
        clf_name: Label used only as a prefix in the printed score summary.
        folds: Number of KFold splits (default 40, the value that used to be
            hard-coded).
        seed: Seed for the KFold shuffle and for LightGBM's ``seed`` parameter
            (default 2021, the value that used to be hard-coded).

    Returns:
        Tuple ``(oof_pred, test_pred)``: the out-of-fold prediction for every
        training row, and the mean over all folds of the test predictions.
    """
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # Positional view of the target; the fold indices from KFold are positions.
    target = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])
    cv_scores = []

    # Loop-invariant, so built once instead of once per fold.
    # NOTE(review): 'nthread' and 'n_jobs' are both aliases of LightGBM's
    # num_threads and contradict each other here -- confirm which one is
    # intended and keep only that one.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': "rmse",
        'min_child_weight': 0.2,
        'num_leaves': 20,
        'lambda_l2': 1,
        "max_depth": 50,
        "max_bin": 255,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': seed,
        'nthread': 28,
        'n_jobs': -1,
        'verbose': -1,
    }

    for fold, (train_index, valid_index) in enumerate(kf.split(train_x, target), start=1):
        print('************************************ {} ************************************'.format(fold))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = target[train_index], target[valid_index]

        train_matrix = lgb.Dataset(trn_x, label=trn_y)
        valid_matrix = lgb.Dataset(val_x, label=val_y)

        # LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds`
        # keyword arguments of lgb.train in favour of callbacks. Use the
        # callbacks when this installation provides them and fall back to the
        # legacy keywords otherwise. Fresh callbacks are created for each fold.
        if hasattr(lgb, 'log_evaluation'):
            fit_kwargs = {
                'callbacks': [
                    lgb.early_stopping(stopping_rounds=500),
                    lgb.log_evaluation(period=100),
                ]
            }
        else:
            fit_kwargs = {'verbose_eval': 100, 'early_stopping_rounds': 500}

        model = lgb.train(params,
                          train_matrix,
                          num_boost_round=10000,
                          valid_sets=[train_matrix, valid_matrix],
                          **fit_kwargs)

        # Predict with the iteration chosen by early stopping.
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        oof_pred[valid_index] = val_pred
        # Each fold contributes an equal share to the averaged test prediction.
        test_pred_mean += test_pred / kf.n_splits
        # Per-fold RMSE on the held-out rows.
        cv_scores.append(np.sqrt(mean_squared_error(val_y, val_pred)))

        print(cv_scores)

    # The label below used to read "scotrainre_list" (a garbled "score_list").
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean

# Run the cross-validated LightGBM model on the selected features with pIC50 as
# the target; returns the per-row training predictions and the test predictions.
lgb_train, lgb_test = cv_model(
    train_x=lgb_x_train,
    train_y=y_PIC_train,
    test_x=lgb_x_test,
)

In [None]:
import matplotlib.pyplot as plt

# Compare the cross-validated LightGBM predictions with the measured pIC50 of
# the training rows, both drawn against the row position.
f, ax = plt.subplots(figsize=(20, 6))  # figsize is (width, height)
ax.grid(True)
line1, = ax.plot(lgb_train, label='lgb_forecast')
line2, = ax.plot(y_PIC_train, label='ground_truth')

# The labels used to read 'Date' / 'price actual', left over from an unrelated
# plot; the x axis is the sample position and the y axis is pIC50.
ax.set_xlabel(xlabel='Sample index', fontsize=15)
ax.set_ylabel(ylabel='pIC50', fontsize=14)
ax.legend()
plt.show()