In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
FS_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PStarch/"
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [2]:
feat_col = []
with open(os.path.join(FS_PATH, "cb_rfa_cv_pstarch_rmse")) as f:
    for line in f:
        feat_col.append(line.strip())

raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PStarch.txt"), sep="\t", index_col=0)

target_col = ['HZ-PStarch']
raw_data = raw_data.dropna(subset=target_col)

#cat_col_names_raw = ["heading_stage_d", "leaf_blast_rep1", "leaf_blast_rep2", "leaf_blast_average", "GR_D3", "GR_D2", "GR_D1"]
#cat_col_names_raw = ["feat_{}".format(x) for x in cat_col_names_raw]
#cat_col_names = [i for i in feat_col if i in cat_col_names_raw]
#num_col_names = [i for i in feat_col if (i.startswith("feat")) and (i not in cat_col_names)]
#cat_col_num = []
#for item in cat_col_names:
    #cat_col_num.append(feat_col.index(item))

#for col in cat_col_names:
    #raw_data[col] = raw_data[col].astype("str")

kf = model_selection.KFold(n_splits=5, shuffle=True,  random_state=0)
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    data_train = raw_data.iloc[train_index].copy()
    data_test = raw_data.iloc[test_index].copy()

    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(data_train.loc[:, feat_col])
    data_train.loc[:, feat_col] = scale_tool.transform(data_train.loc[:, feat_col])
    data_test.loc[:, feat_col] = scale_tool.transform(data_test.loc[:, feat_col])

    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index).copy()

    X_train = train_sel[feat_col].copy()
    y_train = train_sel[target_col].values.ravel()

    X_val = val_sel[feat_col].copy()
    y_val = val_sel[target_col].values.ravel()

    X_test = data_test[feat_col].copy()
    y_test = data_test[target_col].values.ravel()

    # Initialize CatBoostClassifier
    clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE')
    # Fit model
    clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=1, plot=False)

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

Learning rate set to 0.039266
0:	learn: 1.7975737	test: 1.7975737	test1: 1.9076402	best: 1.9076402 (0)	total: 49.4ms	remaining: 49.4s
1:	learn: 1.7800934	test: 1.7800934	test1: 1.8932731	best: 1.8932731 (1)	total: 51.4ms	remaining: 25.6s
2:	learn: 1.7660220	test: 1.7660220	test1: 1.8758963	best: 1.8758963 (2)	total: 53.2ms	remaining: 17.7s
3:	learn: 1.7534214	test: 1.7534214	test1: 1.8678957	best: 1.8678957 (3)	total: 54.9ms	remaining: 13.7s
4:	learn: 1.7398321	test: 1.7398321	test1: 1.8530235	best: 1.8530235 (4)	total: 56.6ms	remaining: 11.3s
5:	learn: 1.7286436	test: 1.7286436	test1: 1.8425690	best: 1.8425690 (5)	total: 58.3ms	remaining: 9.65s
6:	learn: 1.7185812	test: 1.7185812	test1: 1.8363997	best: 1.8363997 (6)	total: 59.9ms	remaining: 8.5s
7:	learn: 1.7040608	test: 1.7040608	test1: 1.8235466	best: 1.8235466 (7)	total: 61.5ms	remaining: 7.63s
8:	learn: 1.6889317	test: 1.6889317	test1: 1.8131548	best: 1.8131548 (8)	total: 63.2ms	remaining: 6.96s
9:	learn: 1.6772123	test: 1.6772123

In [3]:
res_df = pd.DataFrame(columns=["Model", "Times", "Score", "Type"])
for i in range(len(y_test_final)):
    ## Get score
    #score_pear = pearsonr(y_test_final[i], y_pred_final[i])[0]
    score_spear = spearmanr(y_test_final[i], y_pred_final[i])[0]
    #score_rmse = metrics.mean_squared_error(y_test_final[i], y_pred_final[i], squared=False)
    score_rmse = metrics.root_mean_squared_error(y_test_final[i], y_pred_final[i])
    score_nrmse = score_rmse / np.std(y_test_final[i])

    #res_df.loc[len(res_df)] = ["CB_RFA", i+1, score_pear, "R"]
    res_df.loc[len(res_df)] = ["CB_RFA", i+1, score_spear, "R"]
    res_df.loc[len(res_df)] = ["CB_RFA", i+1, score_rmse, "RMSE"]
    res_df.loc[len(res_df)] = ["CB_RFA", i+1, score_nrmse, "NRMSE"]

with open(os.path.join(TEST_RES_PATH, "PStarch_CB_RFA.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [3]:
with open(os.path.join(TEST_RES_PATH, "PStarch_CB_RFA.pickle"), "rb") as f:
    res_df = pickle.load(f)

res_df

Unnamed: 0,Model,Times,Score,Type
0,CB_RFA,1,0.658461,R
1,CB_RFA,1,1.389991,RMSE
2,CB_RFA,1,0.74457,NRMSE
3,CB_RFA,2,0.711142,R
4,CB_RFA,2,1.422375,RMSE
5,CB_RFA,2,0.735408,NRMSE
6,CB_RFA,3,0.642234,R
7,CB_RFA,3,1.275113,RMSE
8,CB_RFA,3,0.740638,NRMSE
9,CB_RFA,4,0.605613,R


In [4]:
res_df.groupby(["Type"])["Score"].mean()

Type
NRMSE    0.765513
R        0.652829
RMSE     1.364247
Name: Score, dtype: float64