In [None]:
# run_tabPFN_rfa_cv_pc.py

In [None]:
#!/data2/zhoujb/anaconda3/envs/PyTorchTabular/bin/python
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing, feature_selection

import torch
from tabpfn import TabPFNRegressor

from mpire import WorkerPool

from scipy.stats import spearmanr

# Directory the input table "raw_data_PC.txt" is read from.
RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Directory the selected-feature list is written to.
OUT_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PC/"

# Send timestamped INFO-level progress messages to stderr.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    datefmt='%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

def _get_tabPFN_test_score(feat_col):
    """Cross-validate a TabPFN regressor on the given features and return its RMSE.

    Reads ``raw_data_PC.txt`` from ``RAW_PATH``, drops rows whose ``GZ-PC``
    target is missing and runs a shuffled 5-fold split (``random_state=0``).
    In each fold the features are standardised with statistics fitted on the
    training rows only, a ``TabPFNRegressor`` is fitted on CPU and the
    held-out rows are predicted.

    Parameters
    ----------
    feat_col : list-like of str
        Names of the feature columns to use.

    Returns
    -------
    float
        RMSE computed once over the pooled out-of-fold predictions of all
        folds, rounded to 4 decimals.
    """
    raw_data = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)
    target_col = ['GZ-PC']
    raw_data = raw_data.dropna(subset=target_col)

    # Loop-invariant setup, done once instead of once per fold.
    # NOTE(review): some tabpfn releases may read this variable at import
    # time, in which case setting it here has no effect -- confirm.
    os.environ["TABPFN_MODEL_CACHE_DIR"] = "/data2/zhoujb/package/tabpfn_ckpt"
    cols = list(feat_col)
    # The pretraining-limit check is only bypassed above 500 features.
    ignore_limits = len(cols) > 500

    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
    y_test_final, y_pred_final = [], []
    for train_index, test_index in kf.split(raw_data):
        data_train = raw_data.iloc[train_index]
        data_test = raw_data.iloc[test_index]

        # Build fresh float frames from the scaler output rather than writing
        # the scaled values back into the original columns: assigning floats
        # into integer-typed columns via .loc is deprecated in pandas.
        scale_tool = preprocessing.StandardScaler()
        X_train = pd.DataFrame(scale_tool.fit_transform(data_train[cols]),
                               index=data_train.index, columns=cols)
        X_test = pd.DataFrame(scale_tool.transform(data_test[cols]),
                              index=data_test.index, columns=cols)

        y_train = data_train[target_col].values.ravel()
        y_test = data_test[target_col].values.ravel()

        clf_model = TabPFNRegressor(device="cpu", random_state=0, n_jobs=-1,
                                    ignore_pretraining_limits=ignore_limits)
        clf_model.fit(X_train, y_train)

        y_test_final.extend(y_test)
        y_pred_final.extend(clf_model.predict(X_test))

    return round(metrics.root_mean_squared_error(y_test_final, y_pred_final), 4)

def _get_feat_and_score(k):
    """Select the ``k`` best features by univariate F-test and score them.

    Reads ``raw_data_PC.txt`` from ``RAW_PATH``, keeps the rows with a
    non-missing ``GZ-PC`` target, ranks every ``fea_*`` column with
    ``f_regression`` (missing feature values replaced by -999 for the ranking
    only) and evaluates the top ``k`` columns with
    ``_get_tabPFN_test_score``.

    NOTE(review): the selector is fitted on all rows, including the rows that
    later serve as held-out folds in ``_get_tabPFN_test_score``, so the
    returned score is not free of selection leakage.

    Parameters
    ----------
    k : int
        Number of features to keep.

    Returns
    -------
    tuple
        ``(feat_sel_list, score_rmse)``: the selected column names as a
        pandas ``Index`` and the cross-validated RMSE for them.
    """
    raw_data_1 = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)
    feat_col_1 = [x for x in raw_data_1.columns if x.startswith("fea_")]
    target_col = ['GZ-PC']
    raw_data_1 = raw_data_1.dropna(subset=target_col)

    # fillna already returns a new frame, so no explicit copy is needed.
    X_1 = raw_data_1[feat_col_1].fillna(-999)
    y_1 = raw_data_1[target_col].values.ravel()

    # Only the support mask is needed, so fit without building the
    # transformed matrix.
    selector = feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=k)
    selector.fit(X_1, y_1)

    feat_sel_list = X_1.columns[selector.get_support()]
    score_rmse = _get_tabPFN_test_score(feat_sel_list)
    return feat_sel_list, score_rmse

if __name__ == "__main__":
    # Search over the number of top-ranked features k and keep the k with the
    # lowest cross-validated RMSE (lower is better, hence the +inf start).
    best_num = 1
    best_score_flag = np.inf
    best_features = []

    # The table is read here only to count the candidate "fea_" columns.
    raw_data_2 = pd.read_table(os.path.join(RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0)
    feat_col_2 = [x for x in raw_data_2.columns if x.startswith("fea_")]
    n_feat = len(feat_col_2)
    half = n_feat // 2

    # Every k below half, then every 50th k from half up to (excluding) n_feat.
    # max(1, half) keeps k=0 out of the schedule when there is a single feature.
    need_run_list = list(range(1, half)) + list(range(max(1, half), n_feat, 50))
    if not need_run_list:
        raise SystemExit("No feature counts to evaluate: found {} 'fea_' column(s)".format(n_feat))

    for i in need_run_list:
        feat_sel_list, score_rmse = _get_feat_and_score(i)
        if score_rmse < best_score_flag:
            best_score_flag = score_rmse
            best_num = i
            best_features = feat_sel_list
        logging.info("Round {}: Score:{}, best score:{}({})".format(i,
                                                                    score_rmse,
                                                                    best_score_flag,
                                                                    best_num))

    # One selected feature name per line.
    with open(os.path.join(OUT_PATH, "tabPFN_rfa_cv_pc_rmse"), "w") as feat_f:
        for item in best_features:
            print(item, file=feat_f)

    logging.info("JOB DONE")

In [None]:
#!/bin/bash
# Run the five feature-selection scripts in fs_PC one after another.
# Job number N writes its stdout and stderr to ./N.log.

source /data2/zhoujb/anaconda3/etc/profile.d/conda.sh
# Stop right away if the working directory is missing; otherwise the scripts
# would be looked up (and the logs written) in whatever directory we are in.
cd /data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PC || exit 1

PYTHON=/data2/zhoujb/anaconda3/envs/PyTorchTabular/bin/python

conda activate PyTorchTabular

log_id=0
for job in run_cb_rfa_cv_pc run_lgb_rfa_cv_pc run_cb_rfe_cv_pc run_lgb_rfe_cv_pc run_tabPFN_rfa_cv_pc; do
    log_id=$((log_id + 1))
    # Report DONE only when the job exited successfully; a failing job does
    # not stop the remaining ones.
    if "$PYTHON" "${job}.py" > "./${log_id}.log" 2>&1; then
        echo "${job} DONE"
    else
        echo "${job} FAILED (exit code $?), see ./${log_id}.log" >&2
    fi
done

conda deactivate

echo "ALL DONE"
