In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

# Root directory of the ML workspace. Defaults to the original absolute
# location; set the COWPEA_ML_ROOT environment variable to run the notebook
# against a copy of the data on another machine.
ML_ROOT = os.environ.get("COWPEA_ML_ROOT", "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML")

# The trailing "" makes os.path.join end each directory with a path separator,
# so the default values are identical to the previously hard-coded strings.
ML_RAW_PATH = os.path.join(ML_ROOT, "rawData", "")
FS_PATH = os.path.join(ML_ROOT, "fs_PSugar", "")
TEST_RES_PATH = os.path.join(ML_ROOT, "tesRes", "")

In [2]:
# Load the selected feature names (one per line). Blank lines are skipped so
# that a trailing newline cannot produce an empty column name.
with open(os.path.join(FS_PATH, "cb_rfe_cv_psugar_rmse")) as f:
    feat_col = [line.strip() for line in f if line.strip()]

raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PSugar.txt"), sep="\t", index_col=0)

target_col = ['GZ-PSugar']
# Rows without a target value cannot be used for training or evaluation.
raw_data = raw_data.dropna(subset=target_col)

# Fail early, naming the offending columns, instead of hitting a KeyError
# in the middle of the cross-validation loop.
missing_feats = [col for col in feat_col if col not in raw_data.columns]
if missing_feats:
    raise KeyError("Selected features missing from raw data: {}".format(missing_feats))

# All selected features are standardised and passed to CatBoost as numeric
# columns; no categorical features are declared.

# 5-fold cross-validation; the test-fold targets and predictions of every
# fold are collected for scoring afterwards.
kf = model_selection.KFold(n_splits=5, shuffle=True,  random_state=0)
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    data_train = raw_data.iloc[train_index].copy()
    data_test = raw_data.iloc[test_index].copy()

    # Fit the scaler on the training fold only, then apply it to both folds.
    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(data_train[feat_col])
    # Replace the columns (df[cols] = ...) rather than writing in place with
    # .loc[:, cols] = ...: the scaled values are floats, and an in-place write
    # into an integer-typed column is a dtype-incompatible assignment.
    data_train[feat_col] = scale_tool.transform(data_train[feat_col])
    data_test[feat_col] = scale_tool.transform(data_test[feat_col])

    # Hold out 20% of the training fold as a validation set.
    # NOTE(review): drop() works on index labels, so this assumes the index
    # of raw_data is unique - confirm.
    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index).copy()

    X_train = train_sel[feat_col].copy()
    y_train = train_sel[target_col].values.ravel()

    X_val = val_sel[feat_col].copy()
    y_val = val_sel[target_col].values.ravel()

    X_test = data_test[feat_col].copy()
    y_test = data_test[target_col].values.ravel()

    # Initialise the CatBoost regressor with an RMSE objective.
    clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE')
    # Fit the model; both the training subset and the validation subset are
    # passed as evaluation sets so their metrics are logged every iteration.
    clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=1, plot=False)

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

Learning rate set to 0.039298
0:	learn: 1.9963955	test: 1.9963955	test1: 2.5369668	best: 2.5369668 (0)	total: 93.6ms	remaining: 1m 33s
1:	learn: 1.9738575	test: 1.9738575	test1: 2.5230479	best: 2.5230479 (1)	total: 120ms	remaining: 59.9s
2:	learn: 1.9587333	test: 1.9587333	test1: 2.5171372	best: 2.5171372 (2)	total: 144ms	remaining: 47.9s
3:	learn: 1.9383792	test: 1.9383792	test1: 2.5050060	best: 2.5050060 (3)	total: 165ms	remaining: 41.2s
4:	learn: 1.9164441	test: 1.9164441	test1: 2.4978511	best: 2.4978511 (4)	total: 186ms	remaining: 36.9s
5:	learn: 1.8916672	test: 1.8916672	test1: 2.4797888	best: 2.4797888 (5)	total: 205ms	remaining: 34s
6:	learn: 1.8713494	test: 1.8713494	test1: 2.4665833	best: 2.4665833 (6)	total: 225ms	remaining: 31.9s
7:	learn: 1.8493987	test: 1.8493987	test1: 2.4577943	best: 2.4577943 (7)	total: 245ms	remaining: 30.3s
8:	learn: 1.8282422	test: 1.8282422	test1: 2.4493986	best: 2.4493986 (8)	total: 265ms	remaining: 29.2s
9:	learn: 1.8062296	test: 1.8062296	test1: 

In [3]:
# Score every cross-validation fold and persist the long-format result table.
# For each fold three rows are produced, in this order:
#   "R"     - Spearman rank correlation between observed and predicted values
#   "RMSE"  - root mean squared error
#   "NRMSE" - RMSE divided by the standard deviation of the observed values
score_records = []
for fold_no, (y_true, y_hat) in enumerate(zip(y_test_final, y_pred_final), start=1):
    score_spear = spearmanr(y_true, y_hat)[0]
    score_rmse = metrics.root_mean_squared_error(y_true, y_hat)
    score_nrmse = score_rmse / np.std(y_true)

    score_records.append(["CB_RFE", fold_no, score_spear, "R"])
    score_records.append(["CB_RFE", fold_no, score_rmse, "RMSE"])
    score_records.append(["CB_RFE", fold_no, score_nrmse, "NRMSE"])

# Build the frame once from the collected records instead of enlarging it
# one row at a time.
res_df = pd.DataFrame(score_records, columns=["Model", "Times", "Score", "Type"])

# Make sure the output directory exists so a missing folder does not discard
# the results of the training run.
os.makedirs(TEST_RES_PATH, exist_ok=True)
with open(os.path.join(TEST_RES_PATH, "PSugar_CB_RFE.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [4]:
# Average each metric ("Type") over all cross-validation folds; the bare
# expression on the last line is what the cell displays.
mean_score_by_type = res_df.groupby("Type")["Score"].mean()
mean_score_by_type

Type
NRMSE    0.791122
R        0.627674
RMSE     1.690476
Name: Score, dtype: float64