In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
FS_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PL/"
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [2]:
feat_col = []
with open(os.path.join(FS_PATH, "cb_rfe_cv_pl_rmse")) as f:
    for line in f:
        feat_col.append(line.strip())

raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PL.txt"), sep="\t", index_col=0)

target_col = ['HZ-PL']
raw_data = raw_data.dropna(subset=target_col)

#cat_col_names_raw = ["heading_stage_d", "leaf_blast_rep1", "leaf_blast_rep2", "leaf_blast_average", "GR_D3", "GR_D2", "GR_D1"]
#cat_col_names_raw = ["feat_{}".format(x) for x in cat_col_names_raw]
#cat_col_names = [i for i in feat_col if i in cat_col_names_raw]
#num_col_names = [i for i in feat_col if (i.startswith("feat")) and (i not in cat_col_names)]
#cat_col_num = []
#for item in cat_col_names:
    #cat_col_num.append(feat_col.index(item))

#for col in cat_col_names:
    #raw_data[col] = raw_data[col].astype("str")

kf = model_selection.KFold(n_splits=5, shuffle=True,  random_state=0)
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    data_train = raw_data.iloc[train_index].copy()
    data_test = raw_data.iloc[test_index].copy()

    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(data_train.loc[:, feat_col])
    data_train.loc[:, feat_col] = scale_tool.transform(data_train.loc[:, feat_col])
    data_test.loc[:, feat_col] = scale_tool.transform(data_test.loc[:, feat_col])

    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index).copy()

    X_train = train_sel[feat_col].copy()
    y_train = train_sel[target_col].values.ravel()

    X_val = val_sel[feat_col].copy()
    y_val = val_sel[target_col].values.ravel()

    X_test = data_test[feat_col].copy()
    y_test = data_test[target_col].values.ravel()

    # Initialize CatBoostClassifier
    clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE')
    # Fit model
    clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=1, plot=False)

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

Learning rate set to 0.039298
0:	learn: 16.1845380	test: 16.1845380	test1: 16.4694117	best: 16.4694117 (0)	total: 95.6ms	remaining: 1m 35s
1:	learn: 15.9241743	test: 15.9241743	test1: 16.2464045	best: 16.2464045 (1)	total: 123ms	remaining: 1m 1s
2:	learn: 15.6695984	test: 15.6695984	test1: 16.0372086	best: 16.0372086 (2)	total: 146ms	remaining: 48.6s
3:	learn: 15.4492256	test: 15.4492256	test1: 15.9202608	best: 15.9202608 (3)	total: 167ms	remaining: 41.7s
4:	learn: 15.1866081	test: 15.1866081	test1: 15.7128845	best: 15.7128845 (4)	total: 186ms	remaining: 37.1s
5:	learn: 14.9732778	test: 14.9732778	test1: 15.5776638	best: 15.5776638 (5)	total: 205ms	remaining: 34s
6:	learn: 14.6898237	test: 14.6898237	test1: 15.3579003	best: 15.3579003 (6)	total: 225ms	remaining: 32s
7:	learn: 14.4765950	test: 14.4765950	test1: 15.1977731	best: 15.1977731 (7)	total: 245ms	remaining: 30.3s
8:	learn: 14.2839800	test: 14.2839800	test1: 15.0703071	best: 15.0703071 (8)	total: 264ms	remaining: 29s
9:	learn: 1

In [5]:
res_df = pd.DataFrame(columns=["Model", "Times", "Score", "Type"])
for i in range(len(y_test_final)):
    ## Get score
    #score_pear = pearsonr(y_test_final[i], y_pred_final[i])[0]
    score_spear = spearmanr(y_test_final[i], y_pred_final[i])[0]
    #score_rmse = metrics.mean_squared_error(y_test_final[i], y_pred_final[i], squared=False)
    score_rmse = metrics.root_mean_squared_error(y_test_final[i], y_pred_final[i])
    score_nrmse = score_rmse / np.std(y_test_final[i])

    #res_df.loc[len(res_df)] = ["CB_RFE", i+1, score_pear, "R"]
    res_df.loc[len(res_df)] = ["CB_RFE", i+1, score_spear, "R"]
    res_df.loc[len(res_df)] = ["CB_RFE", i+1, score_rmse, "RMSE"]
    res_df.loc[len(res_df)] = ["CB_RFE", i+1, score_nrmse, "NRMSE"]

with open(os.path.join(TEST_RES_PATH, "PL_CB_RFE.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [6]:
res_df.groupby(["Type"])["Score"].mean()

Type
NRMSE    0.570448
R        0.821370
RMSE     9.303675
Name: Score, dtype: float64

In [2]:
with open(os.path.join(TEST_RES_PATH, "PL_CB_RFE.pickle"), "rb") as f:
    res_df = pickle.load(f)

res_df

Unnamed: 0,Model,Times,Score,Type
0,CB_RFE,1,0.840741,R
1,CB_RFE,1,9.143219,RMSE
2,CB_RFE,1,0.549397,NRMSE
3,CB_RFE,2,0.795482,R
4,CB_RFE,2,10.66686,RMSE
5,CB_RFE,2,0.598343,NRMSE
6,CB_RFE,3,0.859973,R
7,CB_RFE,3,9.507094,RMSE
8,CB_RFE,3,0.583991,NRMSE
9,CB_RFE,4,0.784385,R


In [3]:
res_df.groupby(["Type"])["Score"].mean()

Type
NRMSE    0.582375
R        0.810115
RMSE     9.442856
Name: Score, dtype: float64