In [None]:
# run_cb_rfa_cv_pc.py

In [None]:
#!/data2/zhoujb/anaconda3/envs/PyTorchTabular/bin/python
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing

import catboost as cb
import lightgbm as lgb
import shap

from mpire import WorkerPool

# Directory that holds the input table (raw_data_PC.txt is read from here).
RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Directory that receives the selected-feature list written at the end of the run.
OUT_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PC/"

# Emit INFO-level (and above) log records to stderr, prefixed with a
# month-day hour:minute:second timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
    stream=sys.stderr)

def _get_cb_test_score(indice):
    """Return the 5-fold cross-validated RMSE of a CatBoost regressor.

    The raw table ``raw_data_PC.txt`` is read from ``RAW_PATH``; only the
    ``fea_``-prefixed columns selected by ``indice`` are used as predictors
    and ``'GZ-PC'`` is the regression target (the original note describes it
    as pod length -- TODO confirm). Rows without a target value are dropped.

    Args:
        indice: A positional index, or a list of positional indices, into the
            ``fea_``-prefixed columns of the raw table (in file order).

    Returns:
        The RMSE computed over the pooled out-of-fold predictions of all
        five folds, rounded to 4 decimals.
    """
    # A bare integer (Python or NumPy) selects one feature. Wrap it so the
    # fancy-indexing below always yields an array of names; otherwise a
    # single name would be split into characters by ``list``.
    if isinstance(indice, (int, np.integer)):
        indice = [indice]

    # Load the raw table and resolve the requested feature subset.
    raw_data = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)
    all_feat = np.array([x for x in raw_data.columns if x.startswith("fea_")])
    feat_col = list(all_feat[indice])

    target_col = ['GZ-PC']
    raw_data = raw_data.dropna(subset=target_col)

    # 5-fold CV with a fixed seed so every candidate subset sees the same folds.
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
    y_test_final, y_pred_final = [], []
    for train_index, test_index in kf.split(raw_data):
        data_train = raw_data.iloc[train_index].copy()
        data_test = raw_data.iloc[test_index].copy()

        # Standardise using statistics of the training fold only.
        scale_tool = preprocessing.StandardScaler()
        scale_tool.fit(data_train[feat_col])
        # Replace the columns wholesale instead of writing through
        # ``.loc[:, cols]``: the scaled values are floats, and in-place
        # assignment of floats into integer-typed columns is deprecated in
        # recent pandas. NOTE(review): whether the fea_ columns are
        # integer-typed depends on the input file.
        data_train[feat_col] = scale_tool.transform(data_train[feat_col])
        data_test[feat_col] = scale_tool.transform(data_test[feat_col])

        # Hold out 20% of the training fold as a validation split.
        train_sel = data_train.sample(frac=0.8, random_state=0)
        val_sel = data_train.drop(train_sel.index).copy()

        X_train = train_sel[feat_col].copy()
        y_train = train_sel[target_col].values.ravel()

        X_val = val_sel[feat_col].copy()
        y_val = val_sel[target_col].values.ravel()

        X_test = data_test[feat_col].copy()
        y_test = data_test[target_col].values.ravel()

        # CatBoost regressor trained on RMSE with a fixed seed.
        clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE', verbose=0)

        # Both the fitting split and the validation split are passed as eval
        # sets. NOTE(review): confirm against the CatBoost docs which of the
        # two drives best-iteration selection.
        clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0, plot=False)

        # Collect out-of-fold predictions; the score is computed once on
        # the pooled predictions rather than averaged per fold.
        y_pred = clf_model.predict(X_test)
        y_test_final.extend(y_test)
        y_pred_final.extend(y_pred)

    score_rmse = round(metrics.root_mean_squared_error(y_test_final, y_pred_final), 4)

    return score_rmse

def add_feature(indices, n_jobs=12):
    """Greedily add the single feature that minimises the CV RMSE.

    Every feature not yet in ``indices`` is tried in turn: the candidate set
    ``indices + [i]`` is scored with ``_get_cb_test_score`` in a worker pool,
    and the index with the lowest score is appended to ``indices`` in place.

    Args:
        indices: Positional indices (into the ``fea_``-prefixed columns) that
            are already selected. Mutated: the winning index is appended.
        n_jobs: Number of worker processes used to score the candidates.

    Returns:
        The lowest RMSE obtained among the candidate sets.

    Raises:
        ValueError: If every feature is already selected, so there is no
            candidate left to add.
    """
    # Read the table only to learn how many fea_ columns exist.
    raw_data_1 = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)
    feat_col = [x for x in raw_data_1.columns if x.startswith("fea_")]

    # Enumerate candidates in ascending index order. Iterating a set
    # difference gives an implementation-dependent order, which would make
    # the tie-break between equal (rounded) scores hard to reproduce.
    chosen = set(indices)
    new_indices = [i for i in range(len(feat_col)) if i not in chosen]
    if not new_indices:
        raise ValueError("add_feature: no unselected feature left to add")

    # Each task is wrapped in an extra list so the candidate index list
    # reaches the worker as one argument (presumably because mpire unpacks
    # list/tuple task entries into positional arguments -- confirm against
    # the mpire docs).
    params_list = [[indices + [i]] for i in new_indices]

    with WorkerPool(n_jobs=n_jobs) as pool:
        scores = pool.map(_get_cb_test_score, params_list, progress_bar=False)

    # Lower RMSE is better; on ties the lowest feature index wins.
    best_score = min(scores)
    indices.append(new_indices[scores.index(best_score)])
    return best_score

if __name__ == "__main__":
    # Forward feature addition: start from the (optional) seed features and
    # add one feature per round, keeping the subset with the lowest CV RMSE.
    features = []
    best_score_flag = np.inf
    raw_data_2 = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)
    all_features = np.array([x for x in raw_data_2.columns if x.startswith("fea_")])

    # Translate seed feature names into positional indices (list built once).
    all_feature_names = list(all_features)
    indices = [all_feature_names.index(feature) for feature in features]

    # Start from the seed subset so the output step below always has a value,
    # even if no round runs or no round improves on the initial score.
    best_features = all_features[indices]
    best_num = 1

    # Only as many rounds as there are unselected features: each round
    # consumes exactly one candidate.
    for i in range(len(all_features) - len(indices)):
        score_tmp = add_feature(indices, n_jobs=12)
        feature_order_list = all_features[indices]
        if score_tmp < best_score_flag:
            best_score_flag = score_tmp
            best_features = all_features[indices]
            best_num = i + 1

        logging.info("Round {}: Score:{}, best score:{}({}), feature list:{}".format(
            i + 1, score_tmp, best_score_flag, best_num, feature_order_list))

        # Stop early once more than 50 consecutive rounds bring no improvement.
        if (i + 1) - best_num > 50:
            break

    # One feature name per line, in the order the features were added.
    with open(os.path.join(OUT_PATH, "cb_rfa_cv_pc_rmse"), "w") as feat_f:
        for item in best_features:
            print(item, file=feat_f)

    logging.info("JOB DONE")


In [None]:
# run_cb_rfe_cv_pc.py

In [None]:
#!/data2/zhoujb/anaconda3/envs/PyTorchTabular/bin/python
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing

import catboost as cb
import lightgbm as lgb
import shap

# Directory that holds the input table (raw_data_PC.txt is read from here).
RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Directory that receives the selected-feature list written at the end of the run.
OUT_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PC/"

# Emit INFO-level (and above) log records to stderr, prefixed with a
# month-day hour:minute:second timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
    stream=sys.stderr)

def getCBShapLessFeat(feat_col=None):
    """Score a feature set with 5-fold CV and name its least important feature.

    A CatBoost regressor is trained per fold on the given features of
    ``raw_data_PC.txt`` (target ``'GZ-PC'``; rows without a target are
    dropped). Despite the function name, importance comes from CatBoost's
    ``feature_importances_`` attribute, averaged over the folds -- no SHAP
    values are computed here.

    Args:
        feat_col: Feature column names to use. ``None`` selects every
            ``fea_``-prefixed column of the raw table. Not modified.

    Returns:
        A ``(score_rmse, less_import_feat)`` tuple: the RMSE over the pooled
        out-of-fold predictions rounded to 4 decimals, and the name of the
        feature with the lowest mean importance.
    """
    raw_data = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)

    target_col = ['GZ-PC']
    raw_data = raw_data.dropna(subset=target_col)

    if feat_col is None:
        feat_col = [x for x in raw_data.columns if x.startswith("fea_")]

    # Fixed seed: every call evaluates on the same folds.
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
    y_test_final, y_pred_final = [], []
    fold_importances = []
    for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
        data_train = raw_data.iloc[train_index].copy()
        data_test = raw_data.iloc[test_index].copy()

        # Standardise using statistics of the training fold only.
        scale_tool = preprocessing.StandardScaler()
        scale_tool.fit(data_train[feat_col])
        # Replace the columns wholesale instead of writing through
        # ``.loc[:, cols]``: the scaled values are floats, and in-place
        # assignment of floats into integer-typed columns is deprecated in
        # recent pandas. NOTE(review): whether the fea_ columns are
        # integer-typed depends on the input file.
        data_train[feat_col] = scale_tool.transform(data_train[feat_col])
        data_test[feat_col] = scale_tool.transform(data_test[feat_col])

        # Hold out 20% of the training fold as a validation split.
        train_sel = data_train.sample(frac=0.8, random_state=0)
        val_sel = data_train.drop(train_sel.index).copy()

        X_train = train_sel[feat_col].copy()
        y_train = train_sel[target_col].values.ravel()

        X_val = val_sel[feat_col].copy()
        y_val = val_sel[target_col].values.ravel()

        X_test = data_test[feat_col].copy()
        y_test = data_test[target_col].values.ravel()

        # CatBoost regressor trained on RMSE with a fixed seed.
        clf_model = cb.CatBoostRegressor(random_state=0, thread_count=48, loss_function='RMSE', verbose=0)

        # Both the fitting split and the validation split are passed as eval
        # sets. NOTE(review): confirm against the CatBoost docs which of the
        # two drives best-iteration selection.
        clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0, plot=False)

        y_pred = clf_model.predict(X_test)

        y_test_final.extend(y_test)
        y_pred_final.extend(y_pred)

        # One importance column per fold, labelled 1..5.
        fold_importances.append(
            pd.DataFrame(clf_model.feature_importances_, index=feat_col).rename(columns={0: i + 1}))

    score_rmse = round(metrics.root_mean_squared_error(y_test_final, y_pred_final), 4)

    # Average importance across folds and sort descending; the last row is
    # the least important feature.
    fs_val_df = pd.concat(fold_importances, axis=1, sort=False)
    fs_val_df_mean = fs_val_df.mean(axis=1).to_frame()
    fs_val_df_mean = fs_val_df_mean.sort_values(by=[0], ascending=False)
    less_import_feat = fs_val_df_mean.index[-1]
    return score_rmse, less_import_feat

if __name__ == "__main__":
    # Backward feature elimination: repeatedly drop the least important
    # feature and keep the subset with the lowest CV RMSE.
    min_feats = 10
    raw_data = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)

    feat_col = [x for x in raw_data.columns if x.startswith("fea_")]
    logging.info("Run number of features:{}".format(len(feat_col)))
    # Baseline: score with every feature and find the first one to drop.
    score_rmse, less_import_feat = getCBShapLessFeat()

    test_score_flag = score_rmse
    best_round = len(feat_col)
    feat_list_best = feat_col.copy()
    round_msg = "Round:{}, The less important feature is: {}, test_score:{}, best_score:{}({})"
    for num_feats in range(len(feat_col) - 1, min_feats - 1, -1):
        # Report the set that was just evaluated (num_feats + 1 features)
        # before shrinking it.
        logging.info(round_msg.format(num_feats + 1, less_import_feat, score_rmse,
                                      test_score_flag, best_round))
        feat_col.remove(less_import_feat)
        score_rmse, less_import_feat = getCBShapLessFeat(feat_col=feat_col)
        if score_rmse < test_score_flag:
            test_score_flag = score_rmse
            best_round = num_feats
            feat_list_best = feat_col.copy()

        # Stop early once more than 20 consecutive removals bring no improvement.
        if best_round - num_feats > 20:
            break

    # Report the last evaluated set, which the loop has not logged yet. Its
    # size is len(feat_col); using the loop variable here would both mislabel
    # the round and fail when the loop body never ran.
    logging.info(round_msg.format(len(feat_col), less_import_feat, score_rmse,
                                  test_score_flag, best_round))

    logging.info("The number of best features:{}".format(len(feat_list_best)))
    logging.info("Best features:{}".format(feat_list_best))
    # One feature name per line.
    with open(os.path.join(OUT_PATH, "cb_rfe_cv_pc_rmse"), "w") as feat_f:
        for item in feat_list_best:
            print(item, file=feat_f)

    logging.info("JOB DONE")
