In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings, time
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

#import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

import torch
from tabpfn import TabPFNRegressor
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor

# Directory holding the raw input table (raw_data_PL.txt is read from here).
ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Directory holding the selected-feature list (tabPFN_rfa_cv_pl_rmse is read from here).
FS_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PL/"
# Directory where the per-fold score table is pickled (PL_tabPFN_FS.pickle).
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [2]:
# Read the selected feature names, one per line. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped so they do not turn into an
# empty-string column name.
with open(os.path.join(FS_PATH, "tabPFN_rfa_cv_pl_rmse")) as f:
    feat_col = [line.strip() for line in f if line.strip()]

# Tab-separated table; the first column is used as the row index.
raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)

# Fail early with a readable message instead of an opaque KeyError later on,
# when the features are first selected from the table.
missing_feats = [col for col in feat_col if col not in raw_data.columns]
if missing_feats:
    raise KeyError(
        "Selected features missing from raw_data_PL.txt: {}".format(missing_feats)
    )

# Regression target; rows without a target value cannot be used for training
# or evaluation and are dropped.
target_col = ['HZ-PL']
raw_data = raw_data.dropna(subset=target_col)

# 5-fold cross-validation of TabPFNRegressor on the selected features.
# For every fold the held-out targets and the predictions are collected in
# y_test_final / y_pred_final (one array per fold) for scoring afterwards.

# Loop-invariant: tell TabPFN where its checkpoints are cached once, before
# the first model is created.
os.environ["TABPFN_MODEL_CACHE_DIR"] = "/data2/zhoujb/package/tabpfn_ckpt"

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    print("Round {} start...".format(i))
    start_time = time.time()

    data_train = raw_data.iloc[train_index]
    data_test = raw_data.iloc[test_index]

    # Fit the scaler on the training fold only, then apply it to both folds,
    # so no statistics from the held-out rows leak into training.
    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(data_train[feat_col])

    # Build fresh float frames from the scaler output instead of assigning
    # back with .loc[:, feat_col]: writing floats into existing (possibly
    # integer) columns relies on implicit dtype upcasting.
    X_train = pd.DataFrame(
        scale_tool.transform(data_train[feat_col]),
        index=data_train.index,
        columns=feat_col,
    )
    y_train = data_train[target_col].values.ravel()

    X_test = pd.DataFrame(
        scale_tool.transform(data_test[feat_col]),
        index=data_test.index,
        columns=feat_col,
    )
    y_test = data_test[target_col].values.ravel()

    # Initialize the TabPFN regressor.
    # NOTE(review): the original note ">500 ignore_pretraining_limits=True"
    # presumably means the flag is needed once the input exceeds TabPFN's
    # 500-feature pretraining limit -- confirm against the TabPFN docs.
    clf_model = TabPFNRegressor(device="cpu", random_state=0, n_jobs=-1, ignore_pretraining_limits=True)
    # Alternative kept for reference:
    #clf_model = AutoTabPFNRegressor(max_time=100, device="cpu", random_state=0)

    # Fit model
    clf_model.fit(X_train, y_train)

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

    end_time = time.time()
    print("Round {}:{}".format(i, end_time-start_time))

Round 0 start...
Round 0:15.799893617630005
Round 1 start...
Round 1:13.613173007965088
Round 2 start...
Round 2:14.005930185317993
Round 3 start...
Round 3:14.692450523376465
Round 4 start...
Round 4:15.060964822769165


In [3]:
# Score every fold and store the results in long format:
# one row per (fold, metric) with columns Model / Times / Score / Type.
# Rows are collected in a list and the frame is built once, instead of
# enlarging the DataFrame row by row with res_df.loc[len(res_df)] = ...
score_rows = []
for fold_no, (y_true, y_hat) in enumerate(zip(y_test_final, y_pred_final), start=1):
    ## Get score
    # "R" is the Spearman rank correlation between observed and predicted values.
    score_spear = spearmanr(y_true, y_hat)[0]
    score_rmse = metrics.root_mean_squared_error(y_true, y_hat)
    # RMSE normalised by the standard deviation of the held-out targets.
    score_nrmse = score_rmse / np.std(y_true)

    score_rows.append(["tabPFN_FS", fold_no, score_spear, "R"])
    score_rows.append(["tabPFN_FS", fold_no, score_rmse, "RMSE"])
    score_rows.append(["tabPFN_FS", fold_no, score_nrmse, "NRMSE"])

res_df = pd.DataFrame(score_rows, columns=["Model", "Times", "Score", "Type"])

# Persist the score table so it can be reloaded without re-running the CV.
with open(os.path.join(TEST_RES_PATH, "PL_tabPFN_FS.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [4]:
# Mean of each metric (R / RMSE / NRMSE) across the folds.
res_df.groupby("Type")["Score"].mean()

Type
NRMSE    0.486319
R        0.848131
RMSE     7.894737
Name: Score, dtype: float64

In [3]:
# Reload the pickled per-fold score table from disk.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
result_path = os.path.join(TEST_RES_PATH, "PL_tabPFN_FS.pickle")
with open(result_path, "rb") as in_f:
    res_df = pickle.load(in_f)

# Display the table as the cell output.
res_df

Unnamed: 0,Model,Times,Score,Type
0,tabPFN_FS,1,0.874296,R
1,tabPFN_FS,1,7.591552,RMSE
2,tabPFN_FS,1,0.456161,NRMSE
3,tabPFN_FS,2,0.816849,R
4,tabPFN_FS,2,9.757565,RMSE
5,tabPFN_FS,2,0.547337,NRMSE
6,tabPFN_FS,3,0.880479,R
7,tabPFN_FS,3,8.729176,RMSE
8,tabPFN_FS,3,0.536206,NRMSE
9,tabPFN_FS,4,0.74504,R


In [4]:
# Mean of each metric (R / RMSE / NRMSE) across the folds of the reloaded table.
res_df.groupby("Type")["Score"].mean()

Type
NRMSE    0.521345
R        0.836881
RMSE     8.459383
Name: Score, dtype: float64