In [1]:
#!/data2/zhoujb/anaconda3/envs/PyTorchTabular/bin/python
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing, feature_selection

import torch
from tabpfn import TabPFNRegressor

from mpire import WorkerPool

from scipy.stats import spearmanr

# Directory holding the input table ("raw_data_PL.txt" is read from here by every cell).
# NOTE(review): absolute, machine-specific path -- the notebook only runs on this host.
RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Directory where the selected-feature lists are written.
OUT_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PL/"

# Send timestamped INFO-level progress messages to stderr so that each
# selection round shows up in the cell output while the search is running.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
    stream=sys.stderr)

In [2]:
def _get_tabPFN_test_score(feat_col):
    """Score a feature subset with 5-fold cross-validated TabPFN regression.

    The input table is re-read from ``RAW_PATH`` on every call, rows without a
    ``HZ-PL`` target are dropped, and for each fold the features are
    standardised with statistics fitted on the training part only.
    Out-of-fold predictions from all folds are pooled and compared with the
    pooled targets.

    Parameters
    ----------
    feat_col : list-like of str
        Names of the feature columns to use (a list or a pandas Index).

    Returns
    -------
    float
        Spearman rank correlation between pooled out-of-fold predictions and
        targets, rounded to 4 decimals. The local name ``score_pear`` is
        historical; the statistic is Spearman's rho, not Pearson's r.
    """
    # Read data
    raw_data = pd.read_table(os.path.join(RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)
    # set target
    target_col = ['HZ-PL']
    raw_data = raw_data.dropna(subset=target_col)

    feat_names = list(feat_col)

    # Loop-invariant configuration, hoisted out of the fold loop.
    os.environ["TABPFN_MODEL_CACHE_DIR"] = "/data2/zhoujb/package/tabpfn_ckpt"
    # The pretraining limits are only lifted for wide inputs.
    # NOTE(review): 500 presumably mirrors TabPFN's feature-count limit -- confirm.
    allow_wide_input = len(feat_names) > 500

    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
    y_test_final, y_pred_final = [], []
    for train_index, test_index in kf.split(raw_data):
        data_train = raw_data.iloc[train_index]
        data_test = raw_data.iloc[test_index]

        # Fit the scaler on the training fold only, then build fresh float
        # frames from the scaled values. Writing the scaled floats back into
        # the original columns with ``.loc[:, cols] = ...`` is a
        # dtype-incompatible assignment whenever a feature column is integer.
        scale_tool = preprocessing.StandardScaler()
        scale_tool.fit(data_train[feat_names])
        X_train = pd.DataFrame(scale_tool.transform(data_train[feat_names]),
                               index=data_train.index, columns=feat_names)
        X_test = pd.DataFrame(scale_tool.transform(data_test[feat_names]),
                              index=data_test.index, columns=feat_names)

        y_train = data_train[target_col].values.ravel()
        y_test = data_test[target_col].values.ravel()

        clf_model = TabPFNRegressor(device="cpu", random_state=0, n_jobs=-1,
                                    ignore_pretraining_limits=allow_wide_input)

        # Fit model
        clf_model.fit(X_train, y_train)

        y_pred = clf_model.predict(X_test)
        y_test_final.extend(y_test)
        y_pred_final.extend(y_pred)

    score_pear = round(spearmanr(y_test_final, y_pred_final)[0], 4)
    return score_pear

def _get_feat_and_score(k):
    """Pick the ``k`` top-ranked features by univariate F-test and score them.

    All ``fea_*`` columns of the input table are ranked with
    ``f_regression`` against the ``HZ-PL`` target (missing feature values are
    replaced by -999 for the ranking only), the ``k`` best are kept, and that
    subset is scored by ``_get_tabPFN_test_score``.

    NOTE(review): the selector is fitted on every labelled row before the
    cross-validation inside ``_get_tabPFN_test_score`` splits the data, so the
    held-out folds take part in feature selection and the reported score is
    likely optimistic.

    Parameters
    ----------
    k : int
        Number of features to keep.

    Returns
    -------
    (pandas.Index, float)
        The selected column names and the score of that subset.
    """
    target_col = ['HZ-PL']

    # Load the table and keep only rows that have a target value.
    table = pd.read_table(os.path.join(RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)
    candidate_cols = [name for name in table.columns if name.startswith("fea_")]
    labelled = table.dropna(subset=target_col)

    X_1 = labelled[candidate_cols].fillna(-999)
    y_1 = labelled[target_col].values.ravel()

    # Only the fitted support mask is needed, not the transformed matrix.
    selector = feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=k)
    selector.fit(X_1, y_1)
    feat_sel_list = X_1.columns[selector.get_support()]

    return feat_sel_list, _get_tabPFN_test_score(feat_sel_list)

if __name__ == "__main__":
    # Sweep over the number k of top-ranked features, keep the k whose subset
    # reaches the highest cross-validated score, and write that subset to disk.
    best_num = 1
    # Higher is better for the correlation score; subsets scoring <= 0 are
    # never accepted. (An RMSE variant would start at np.inf and use "<".)
    best_score_flag = 0
    # Initialised up front so the write step below cannot hit an unbound name
    # when no round beats the starting threshold.
    best_features = []

    # Read data
    raw_data_2 = pd.read_table(os.path.join(RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)
    feat_col_2 = [x for x in raw_data_2.columns if x.startswith("fea_")]

    # Every k in the lower half of the range, then steps of 50 in the upper half.
    half = len(feat_col_2) // 2
    need_run_list = list(range(1, half)) + list(range(half, len(feat_col_2), 50))
    for i in need_run_list:
        feat_sel_list, score_pear = _get_feat_and_score(i)
        if score_pear > best_score_flag:
            best_score_flag = score_pear
            best_num = i
            best_features = feat_sel_list
        logging.info("Round {}: Score:{}, best score:{}({})".format(i,
                                                                    score_pear,
                                                                    best_score_flag,
                                                                    best_num))

    # The original repeated the comparison and the log line once more after the
    # loop. That copy could never change the result, duplicated the last log
    # entry, and raised NameError when the run list was empty, so it is gone.
    if len(best_features) == 0:
        logging.warning("No feature subset scored above {}; writing an empty feature list".format(best_score_flag))

    # NOTE(review): the file name says "rmse" although the score used here is a
    # rank correlation, and the forward-selection cell writes to the same path.
    with open(os.path.join(OUT_PATH, "tabPFN_rfa_cv_pl_rmse"), "w") as feat_f:
        for item in best_features:
            print(item, file=feat_f)

    logging.info("JOB DONE")

05-08 17:32:32 INFO     Round 1: Score:0.1146, best score:0.1146(1)
05-08 17:32:54 INFO     Round 2: Score:0.3357, best score:0.3357(2)
05-08 17:33:18 INFO     Round 3: Score:0.3432, best score:0.3432(3)
05-08 17:33:43 INFO     Round 4: Score:0.3629, best score:0.3629(4)
05-08 17:34:09 INFO     Round 5: Score:0.459, best score:0.459(5)
05-08 17:34:37 INFO     Round 6: Score:0.572, best score:0.572(6)
05-08 17:35:06 INFO     Round 7: Score:0.5659, best score:0.572(6)
05-08 17:35:41 INFO     Round 8: Score:0.568, best score:0.572(6)
05-08 17:36:14 INFO     Round 9: Score:0.5907, best score:0.5907(9)
05-08 17:36:48 INFO     Round 10: Score:0.6259, best score:0.6259(10)
05-08 17:37:21 INFO     Round 11: Score:0.6292, best score:0.6292(11)
05-08 17:37:58 INFO     Round 12: Score:0.6319, best score:0.6319(12)
05-08 17:38:34 INFO     Round 13: Score:0.6289, best score:0.6319(12)
05-08 17:39:14 INFO     Round 14: Score:0.6337, best score:0.6337(14)
05-08 17:39:51 INFO     Round 15: Score:0.625

In [None]:




def add_feature(indices, n_jobs=12):
    """Run one forward-selection step: add the single best remaining feature.

    Every feature position not yet in ``indices`` is tried in turn by scoring
    ``indices + [candidate]`` in a worker pool. The candidate with the lowest
    score is appended to ``indices`` in place.

    NOTE(review): the scoring function ``_get_cb_test_score`` is not defined
    in the visible part of this notebook; it presumably takes a list of
    feature positions and returns an error-type score (lower is better) --
    confirm before running on a fresh kernel.

    Parameters
    ----------
    indices : list of int
        Positions (within the ``fea_*`` columns) selected so far. Mutated.
    n_jobs : int
        Number of worker processes.

    Returns
    -------
    The lowest score among the evaluated candidates.
    """
    table = pd.read_table(os.path.join(RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)
    n_features = len([name for name in table.columns if name.startswith("fea_")])

    # Positions that are still available to be added.
    new_indices = list(set(range(n_features)) - set(indices))
    # Each work item is a one-element argument list holding the trial subset.
    params_list = [[indices + [candidate]] for candidate in new_indices]

    with WorkerPool(n_jobs=n_jobs) as pool:
        scores = pool.map(_get_cb_test_score, params_list, progress_bar=False)

    lowest = min(scores)
    indices.append(new_indices[scores.index(lowest)])
    return lowest

if __name__ == "__main__":
    # Greedy forward selection: add one feature per round via add_feature()
    # and remember the subset with the lowest score seen so far.
    features = []
    best_score_flag = np.inf
    raw_data_2 = pd.read_table(os.path.join(RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)
    all_features = np.array([x for x in raw_data_2.columns if x.startswith("fea_")])
    indices = [list(all_features).index(feature) for feature in features]

    best_num = 1
    # Start from the seed subset so the write step below is always defined,
    # even if no round improves on the initial score (e.g. NaN scores) or the
    # table has no feature columns.
    best_features = all_features[indices]
    for i in range(len(all_features)):
        # add_feature() appends the chosen position to `indices` in place.
        score_tmp = add_feature(indices, n_jobs=6)
        feature_order_list = all_features[indices]
        if score_tmp < best_score_flag:
            best_score_flag = score_tmp
            best_features = all_features[indices]
            best_num = i + 1

        logging.info("Round {}: Score:{}, best score:{}({}), feature list:{}".format(i+1, score_tmp,
                                                                                   best_score_flag,
                                                                                   best_num,
                                                                                   feature_order_list))

        # Stop once more than 10 rounds have passed since the best round.
        if (i + 1) - best_num > 10:
            break

    # NOTE(review): this is the same output path the filter-sweep cell writes
    # to, so whichever cell runs last overwrites the other's result.
    with open(os.path.join(OUT_PATH, "tabPFN_rfa_cv_pl_rmse"), "w") as feat_f:
        for item in best_features:
            print(item, file=feat_f)

    logging.info("JOB DONE")


In [None]:
#!/data2/zhoujb/anaconda3/envs/PyTorchTabular/bin/python
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing

#import catboost as cb
#import lightgbm as lgb
#import shap

import torch
from tabpfn import TabPFNRegressor

# Same input/output locations as the first cell, repeated so this cell can be
# run as a standalone script.
# NOTE(review): absolute, machine-specific paths.
RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
OUT_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PL/"

# Timestamped INFO-level logging to stderr.
# NOTE(review): if an earlier cell in the same kernel has already configured
# the root logger, this call is expected to have no further effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M:%S',
    stream=sys.stderr)

def getCBShapLessFeat(feat_col=None, n_repeats=5):
    """Cross-validate TabPFN on a feature subset and name its weakest feature.

    The input table is re-read from ``RAW_PATH``, rows without a ``HZ-PL``
    target are dropped, and a 5-fold cross-validation is run with per-fold
    standardisation fitted on the training part only. Feature importances are
    collected per fold and averaged; the feature with the lowest mean
    importance is reported as the next candidate for removal.

    The function name is historical (it was written for a CatBoost model).
    The original body read ``clf_model.feature_importances_``, which the
    public TabPFN API does not document. That attribute is still used when
    the fitted model exposes it; otherwise permutation importance on the
    held-out fold (scored by negative RMSE) is used instead.

    Parameters
    ----------
    feat_col : list of str, optional
        Feature columns to use. Defaults to every ``fea_*`` column.
    n_repeats : int, optional
        Number of shuffles per feature for the permutation-importance
        fallback. Each shuffle costs one extra prediction pass per feature
        and fold, so this dominates the run time for wide inputs.

    Returns
    -------
    (float, str)
        RMSE over the pooled out-of-fold predictions (rounded to 4 decimals)
        and the name of the least important feature.
    """
    # Local import: sklearn is already a dependency of this notebook.
    from sklearn.inspection import permutation_importance

    raw_data = pd.read_table(os.path.join(RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)

    target_col = ['HZ-PL']
    raw_data = raw_data.dropna(subset=target_col)

    if feat_col is None:
        feat_col = [x for x in raw_data.columns if x.startswith("fea_")]
    feat_names = list(feat_col)

    # Loop-invariant configuration, hoisted out of the fold loop.
    os.environ["TABPFN_MODEL_CACHE_DIR"] = "/data2/zhoujb/package/tabpfn_ckpt"
    # NOTE(review): 500 presumably mirrors TabPFN's feature-count limit -- confirm.
    allow_wide_input = len(feat_names) > 500

    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
    y_test_final, y_pred_final = [], []
    fold_importances = {}
    for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
        data_train = raw_data.iloc[train_index]
        data_test = raw_data.iloc[test_index]

        # Scale with training-fold statistics and build fresh float frames
        # instead of writing floats back into possibly-integer columns.
        scale_tool = preprocessing.StandardScaler()
        scale_tool.fit(data_train[feat_names])
        X_train = pd.DataFrame(scale_tool.transform(data_train[feat_names]),
                               index=data_train.index, columns=feat_names)
        X_test = pd.DataFrame(scale_tool.transform(data_test[feat_names]),
                              index=data_test.index, columns=feat_names)

        y_train = data_train[target_col].values.ravel()
        y_test = data_test[target_col].values.ravel()

        clf_model = TabPFNRegressor(device="cpu", random_state=0, n_jobs=-1,
                                    ignore_pretraining_limits=allow_wide_input)

        # Fit model
        clf_model.fit(X_train, y_train)

        y_pred = clf_model.predict(X_test)

        y_test_final.extend(y_test)
        y_pred_final.extend(y_pred)

        # Prefer native importances when the model provides them; otherwise
        # measure how much the held-out error degrades when a feature is shuffled.
        importances = getattr(clf_model, "feature_importances_", None)
        if importances is None:
            perm_result = permutation_importance(
                clf_model, X_test, y_test,
                scoring="neg_root_mean_squared_error",
                n_repeats=n_repeats, random_state=0)
            importances = perm_result.importances_mean
        # One column per fold, labelled 1..5 as before.
        fold_importances[i + 1] = pd.Series(importances, index=feat_names)

    score_rmse = round(metrics.root_mean_squared_error(y_test_final, y_pred_final), 4)

    # Assemble all folds at once instead of concatenating inside the loop.
    fs_val_df = pd.DataFrame(fold_importances)
    fs_val_df_mean = fs_val_df.mean(axis=1).to_frame()
    fs_val_df_mean = fs_val_df_mean.sort_values(by=[0], ascending=False)
    less_import_feat = fs_val_df_mean.index[-1]
    return score_rmse, less_import_feat

if __name__ == "__main__":
    # Recursive feature elimination: start from every fea_* column, drop the
    # least important feature each round, and keep the subset with the lowest
    # cross-validated RMSE.
    min_feats = 10
    raw_data = pd.read_table(os.path.join(RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)

    feat_col = [x for x in raw_data.columns if x.startswith("fea_")]
    logging.info("Run number of features:{}".format(len(feat_col)))

    round_msg = "Round:{}, The less important feature is: {}, test_score:{}, best_score:{}({})"

    # Baseline round with the full feature set.
    score_rmse, less_import_feat = getCBShapLessFeat()

    test_score_flag = score_rmse
    best_round = len(feat_col)
    feat_list_best = feat_col.copy()
    logging.info(round_msg.format(len(feat_col), less_import_feat, score_rmse,
                                  test_score_flag, best_round))

    # Each round is logged right after it is evaluated, labelled with the
    # number of features actually used. The original logged one iteration
    # late, so its final line carried a round number that was off by one and
    # it raised NameError when the loop body never ran.
    for num_feats in range(len(feat_col)-1, min_feats-1, -1):
        feat_col.remove(less_import_feat)
        score_rmse, less_import_feat = getCBShapLessFeat(feat_col=feat_col)
        if score_rmse < test_score_flag:
            test_score_flag = score_rmse
            best_round = num_feats
            feat_list_best = feat_col.copy()

        logging.info(round_msg.format(num_feats, less_import_feat, score_rmse,
                                      test_score_flag, best_round))

        # Stop once more than 10 features have been removed since the best round.
        if best_round - num_feats > 10:
            break

    logging.info("The number of best features:{}".format(len(feat_list_best)))
    logging.info("Best features:{}".format(feat_list_best))
    with open(os.path.join(OUT_PATH, "tabPFN_rfe_cv_pl_rmse"), "w") as feat_f:
        for item in feat_list_best:
            print(item, file=feat_f)

    logging.info("JOB DONE")