In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings, time
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

#from shaphypetune import BoostRFE, BoostBoruta

# Data locations. The defaults are the original absolute paths; set the
# COWPEA_ML_RAW_PATH / COWPEA_TEST_RES_PATH environment variables to run the
# notebook on another machine without editing the code.
ML_RAW_PATH = os.environ.get(
    "COWPEA_ML_RAW_PATH",
    "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/",
)
TEST_RES_PATH = os.environ.get(
    "COWPEA_TEST_RES_PATH",
    "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/",
)

In [5]:
# 5-fold cross-validated CatBoost regression on the 'HZ-PL' target.
# Produces two parallel lists, one entry per fold:
#   y_test_final - observed target values of the held-out fold
#   y_pred_final - model predictions for the same rows

# Tab-separated table; the first column becomes the index.
raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)

# Rows without a target value cannot be used for training or scoring.
target_col = ['HZ-PL']
raw_data = raw_data.dropna(subset=target_col)

# Feature columns are identified by their "fea_" name prefix.
feat_col = [x for x in raw_data.columns if x.startswith("fea_")]

# Shuffled 5-fold split with a fixed seed so fold membership is repeatable.
kf = model_selection.KFold(n_splits=5, shuffle=True,  random_state=0)
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    print("Round {} start...".format(i))
    start_time = time.time()
    
    # Copies, so the in-place scaling below does not modify raw_data.
    data_train = raw_data.iloc[train_index].copy()
    data_test = raw_data.iloc[test_index].copy()

    # The scaler is fitted on the training fold only and then applied to both
    # folds, so no statistics of the held-out fold enter the scaling.
    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(data_train.loc[:, feat_col])
    data_train.loc[:, feat_col] = scale_tool.transform(data_train.loc[:, feat_col])
    data_test.loc[:, feat_col] = scale_tool.transform(data_test.loc[:, feat_col])

    # Inner split of the training fold: 80% to fit the model, the remaining
    # 20% as a validation set passed to CatBoost via eval_set.
    # NOTE(review): drop() removes rows by index label, so this assumes the
    # index labels are unique - confirm for this data set.
    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index).copy()

    X_train = train_sel[feat_col].copy()
    y_train = train_sel[target_col].values.ravel()

    X_val = val_sel[feat_col].copy()
    y_val = val_sel[target_col].values.ravel()

    X_test = data_test[feat_col].copy()
    y_test = data_test[target_col].values.ravel()

    # Initialize the CatBoost regressor (RMSE loss, fixed seed, 4 threads)
    clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE')
    # Fit model; both the fitting subset and the validation subset are passed
    # as eval sets.
    # NOTE(review): confirm which of the two eval sets CatBoost uses for
    # best-iteration selection when several are given.
    clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0, plot=False)

    # Predict the held-out fold; it was used neither for fitting nor as eval set.
    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

    # Elapsed wall-clock seconds for this fold (data preparation + fit + predict).
    end_time = time.time()
    print("Round {}:{}".format(i, end_time-start_time))

Round 0 start...
Round 0:18.383485794067383
Round 1 start...
Round 1:18.021739959716797
Round 2 start...
Round 2:16.468129873275757
Round 3 start...
Round 3:17.537561416625977
Round 4 start...
Round 4:17.223567724227905


In [6]:
# Score every cross-validation fold and store the result in long format:
# one row per (fold, metric) with the columns Model / Times / Score / Type.
#   "R"     - Spearman rank correlation between observed and predicted values
#             (note: Spearman, not Pearson's r)
#   "RMSE"  - root mean squared error
#   "NRMSE" - RMSE divided by the standard deviation of the observed values
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end; enlarging a frame with `.loc[len(df)] = ...` reallocates it on every
# insert.
score_records = []
for i in range(len(y_test_final)):
    score_spear = spearmanr(y_test_final[i], y_pred_final[i])[0]
    score_rmse = metrics.root_mean_squared_error(y_test_final[i], y_pred_final[i])
    score_nrmse = score_rmse / np.std(y_test_final[i])

    # "Times" is the 1-based fold number.
    score_records.extend([
        ["CB_ALL", i+1, score_spear, "R"],
        ["CB_ALL", i+1, score_rmse, "RMSE"],
        ["CB_ALL", i+1, score_nrmse, "NRMSE"],
    ])

# Passing `columns` explicitly keeps the four expected columns even when no
# fold has been scored (empty list).
res_df = pd.DataFrame(score_records, columns=["Model", "Times", "Score", "Type"])

# Persist the scores so they can be reloaded without re-running the training.
with open(os.path.join(TEST_RES_PATH, "PL_CB_ALL.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [7]:
# Average of each metric (NRMSE / R / RMSE) over the cross-validation folds.
res_df.groupby("Type")["Score"].mean()

Type
NRMSE    0.583562
R        0.809264
RMSE     9.518654
Name: Score, dtype: float64

In [3]:
# Reload the saved per-fold scores from disk. This rebinds `res_df` to the
# on-disk copy, so run it after the cell that writes the pickle; otherwise a
# file left over from an earlier run is what gets loaded.
res_pickle_path = os.path.join(TEST_RES_PATH, "PL_CB_ALL.pickle")
with open(res_pickle_path, "rb") as f:
    res_df = pickle.load(f)

res_df

Unnamed: 0,Model,Times,Score,Type
0,CB_ALL,1,0.829795,R
1,CB_ALL,1,9.060302,RMSE
2,CB_ALL,1,0.544415,NRMSE
3,CB_ALL,2,0.768419,R
4,CB_ALL,2,11.130413,RMSE
5,CB_ALL,2,0.624345,NRMSE
6,CB_ALL,3,0.845719,R
7,CB_ALL,3,9.462149,RMSE
8,CB_ALL,3,0.58123,NRMSE
9,CB_ALL,4,0.778272,R


In [4]:
# Per-metric mean over folds for whatever `res_df` currently holds.
res_df.groupby("Type")["Score"].mean()

Type
NRMSE    0.583419
R        0.799206
RMSE     9.469988
Name: Score, dtype: float64