In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

# Directory holding the raw input table ("raw_data_PStarch.txt" is read from here).
ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Directory holding the feature-selection output (the "cb_rfe_cv_pstarch_rmse" feature list is read from here).
FS_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/fs_PStarch/"
# Directory that receives the pickled per-fold score table ("PStarch_CB_RFE.pickle").
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [2]:
# Selected feature names, one per line of the feature-selection output file.
# Blank lines are skipped so they can never turn into empty column labels.
with open(os.path.join(FS_PATH, "cb_rfe_cv_pstarch_rmse")) as f:
    feat_col = [line.strip() for line in f if line.strip()]

if not feat_col:
    raise ValueError("No feature names were read from the feature-selection file.")

raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PStarch.txt"), sep="\t", index_col=0)

target_col = ['HZ-PStarch']

# Fail early, with the offending names, instead of hitting a KeyError later
# in the middle of cross-validation.
missing_cols = [col for col in feat_col + target_col if col not in raw_data.columns]
if missing_cols:
    raise KeyError("Columns missing from raw data: {}".format(missing_cols))

# Samples without a measured target cannot be used for training or scoring.
raw_data = raw_data.dropna(subset=target_col)

#cat_col_names_raw = ["heading_stage_d", "leaf_blast_rep1", "leaf_blast_rep2", "leaf_blast_average", "GR_D3", "GR_D2", "GR_D1"]
#cat_col_names_raw = ["feat_{}".format(x) for x in cat_col_names_raw]
#cat_col_names = [i for i in feat_col if i in cat_col_names_raw]
#num_col_names = [i for i in feat_col if (i.startswith("feat")) and (i not in cat_col_names)]
#cat_col_num = []
#for item in cat_col_names:
    #cat_col_num.append(feat_col.index(item))

#for col in cat_col_names:
    #raw_data[col] = raw_data[col].astype("str")

def _scaled_features(scaler, frame, columns):
    """Return a new DataFrame holding ``scaler.transform(frame[columns])``.

    The result keeps ``frame``'s index and the given column labels. Building a
    fresh frame (rather than assigning the floats back into ``frame`` with
    ``.loc``) avoids a dtype-incompatible in-place assignment when a feature
    column is integer-typed.
    """
    return pd.DataFrame(scaler.transform(frame[columns]), index=frame.index, columns=columns)


# 5-fold cross-validation of a CatBoost regressor on the selected features.
# For every fold the true and predicted targets of the held-out test rows are
# collected in y_test_final / y_pred_final (one array per fold).
kf = model_selection.KFold(n_splits=5, shuffle=True,  random_state=0)
y_test_final, y_pred_final = [], []
for train_index, test_index in kf.split(raw_data):
    data_train = raw_data.iloc[train_index]
    data_test = raw_data.iloc[test_index]

    # Hold out 20% of the training fold as a validation set for CatBoost.
    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index)

    # Fit the scaler on the rows the model is trained on only, so neither the
    # validation rows nor the test rows contribute to the scaling statistics.
    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(train_sel[feat_col])

    X_train = _scaled_features(scale_tool, train_sel, feat_col)
    y_train = train_sel[target_col].values.ravel()

    X_val = _scaled_features(scale_tool, val_sel, feat_col)
    y_val = val_sel[target_col].values.ravel()

    X_test = _scaled_features(scale_tool, data_test, feat_col)
    y_test = data_test[target_col].values.ravel()

    # Initialize the CatBoost regressor (RMSE objective, fixed seed).
    clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE')
    # Fit model; metrics are reported on both the training and validation sets.
    clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=1, plot=False)

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

Learning rate set to 0.039266
0:	learn: 1.7946546	test: 1.7946546	test1: 1.9077563	best: 1.9077563 (0)	total: 97ms	remaining: 1m 36s
1:	learn: 1.7879943	test: 1.7879943	test1: 1.9080097	best: 1.9077563 (0)	total: 126ms	remaining: 1m 2s
2:	learn: 1.7759404	test: 1.7759404	test1: 1.9007032	best: 1.9007032 (2)	total: 152ms	remaining: 50.6s
3:	learn: 1.7681880	test: 1.7681880	test1: 1.8937268	best: 1.8937268 (3)	total: 177ms	remaining: 44.1s
4:	learn: 1.7522984	test: 1.7522984	test1: 1.8795698	best: 1.8795698 (4)	total: 200ms	remaining: 39.8s
5:	learn: 1.7373270	test: 1.7373270	test1: 1.8763690	best: 1.8763690 (5)	total: 224ms	remaining: 37s
6:	learn: 1.7342904	test: 1.7342904	test1: 1.8760481	best: 1.8760481 (6)	total: 247ms	remaining: 35s
7:	learn: 1.7214080	test: 1.7214080	test1: 1.8725826	best: 1.8725826 (7)	total: 270ms	remaining: 33.4s
8:	learn: 1.7114906	test: 1.7114906	test1: 1.8668847	best: 1.8668847 (8)	total: 292ms	remaining: 32.2s
9:	learn: 1.7018491	test: 1.7018491	test1: 1.86

In [3]:
# Per-fold test scores in long format: one row per (fold, metric).
# Rows are collected in a list and the DataFrame is built once, instead of
# growing the frame row by row.
score_rows = []
for fold_no, (y_true, y_hat) in enumerate(zip(y_test_final, y_pred_final), start=1):
    # Spearman rank correlation is what gets stored under Type "R".
    score_spear = spearmanr(y_true, y_hat)[0]
    score_rmse = metrics.root_mean_squared_error(y_true, y_hat)
    # RMSE normalised by the standard deviation of the fold's true values.
    score_nrmse = score_rmse / np.std(y_true)

    score_rows.append(["CB_RFE", fold_no, score_spear, "R"])
    score_rows.append(["CB_RFE", fold_no, score_rmse, "RMSE"])
    score_rows.append(["CB_RFE", fold_no, score_nrmse, "NRMSE"])

res_df = pd.DataFrame(score_rows, columns=["Model", "Times", "Score", "Type"])

# Persist the score table for later use.
with open(os.path.join(TEST_RES_PATH, "PStarch_CB_RFE.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [4]:
# Mean score across folds for each metric type.
res_df.groupby("Type")["Score"].mean()

Type
NRMSE    0.878472
R        0.484629
RMSE     1.572729
Name: Score, dtype: float64