In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

from shaphypetune import BoostRFE, BoostBoruta

# Input directory: the raw tab-separated table (raw_data_PStarch.txt) is read from here.
# NOTE(review): absolute, machine-specific path — adjust before running on another host.
ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Output directory: the pickled per-fold score table is written here.
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [2]:
# --- Load data ---------------------------------------------------------------
# Tab-separated table; the first column is used as the row index.
raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PStarch.txt"), sep="\t", index_col=0)

# Regression target. Rows without a target value are dropped up front because
# they can be used neither for fitting nor for scoring.
target_col = ['HZ-PStarch']
raw_data = raw_data.dropna(subset=target_col)

# Every column whose name starts with "fea_" is a model input.
feat_col = [x for x in raw_data.columns if x.startswith("fea_")]

# Standardised values are floats. Cast the feature columns to float64 once, so
# the in-place `.loc[:, feat_col] = ...` writes below never depend on pandas
# silently upcasting integer-typed columns (deprecated behaviour; the warning
# would be hidden by the global warnings filter set in the import cell).
raw_data = raw_data.astype({col: "float64" for col in feat_col})

# --- 5-fold cross-validation ---------------------------------------------------
kf = model_selection.KFold(n_splits=5, shuffle=True,  random_state=0)
# One array of observed / predicted target values per outer fold.
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    # Copies, so scaling a fold never writes back into raw_data.
    data_train = raw_data.iloc[train_index].copy()
    data_test = raw_data.iloc[test_index].copy()

    # Fit the scaler on the training fold only, then apply the same transform
    # to the held-out fold (no test-fold statistics enter the scaler).
    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(data_train.loc[:, feat_col])
    data_train.loc[:, feat_col] = scale_tool.transform(data_train.loc[:, feat_col])
    data_test.loc[:, feat_col] = scale_tool.transform(data_test.loc[:, feat_col])

    # Inner split of the training fold: 80 % to fit the model, the remaining
    # rows (selected by index label) as validation set.
    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index).copy()

    X_train = train_sel[feat_col].copy()
    y_train = train_sel[target_col].values.ravel()

    X_val = val_sel[feat_col].copy()
    y_val = val_sel[target_col].values.ravel()

    X_test = data_test[feat_col].copy()
    y_test = data_test[target_col].values.ravel()

    # Initialize the CatBoost regressor (the `clf_model` name is kept as-is
    # in case other cells refer to it).
    clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE')
    # Fit model. The training pair is passed as the first eval set and the
    # validation pair as the second, so both metrics appear in the log.
    clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=1, plot=False)

    # Predict the outer test fold and keep observed/predicted values for scoring.
    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

Learning rate set to 0.039266
0:	learn: 1.8034255	test: 1.8034255	test1: 1.9114224	best: 1.9114224 (0)	total: 85ms	remaining: 1m 24s
1:	learn: 1.7884429	test: 1.7884429	test1: 1.8980877	best: 1.8980877 (1)	total: 109ms	remaining: 54.5s
2:	learn: 1.7786750	test: 1.7786750	test1: 1.8964170	best: 1.8964170 (2)	total: 131ms	remaining: 43.7s
3:	learn: 1.7695152	test: 1.7695152	test1: 1.8896177	best: 1.8896177 (3)	total: 153ms	remaining: 38s
4:	learn: 1.7584497	test: 1.7584497	test1: 1.8796969	best: 1.8796969 (4)	total: 173ms	remaining: 34.5s
5:	learn: 1.7441112	test: 1.7441112	test1: 1.8649784	best: 1.8649784 (5)	total: 194ms	remaining: 32.1s
6:	learn: 1.7311820	test: 1.7311820	test1: 1.8565914	best: 1.8565914 (6)	total: 214ms	remaining: 30.4s
7:	learn: 1.7163037	test: 1.7163037	test1: 1.8531670	best: 1.8531670 (7)	total: 235ms	remaining: 29.2s
8:	learn: 1.7082850	test: 1.7082850	test1: 1.8501062	best: 1.8501062 (8)	total: 256ms	remaining: 28.2s
9:	learn: 1.6962047	test: 1.6962047	test1: 1.

In [3]:
# Score every outer fold separately and collect the results in long format:
# one row per (fold, score type).
#   "R"     - Spearman rank correlation between observed and predicted values
#   "RMSE"  - root mean squared error
#   "NRMSE" - RMSE divided by the standard deviation of the fold's observed values
score_rows = []
for fold_no, (y_true, y_hat) in enumerate(zip(y_test_final, y_pred_final), start=1):
    ## Get score
    score_spear = spearmanr(y_true, y_hat)[0]
    score_rmse = metrics.root_mean_squared_error(y_true, y_hat)
    score_nrmse = score_rmse / np.std(y_true)

    score_rows.append(("CB_ALL", fold_no, score_spear, "R"))
    score_rows.append(("CB_ALL", fold_no, score_rmse, "RMSE"))
    score_rows.append(("CB_ALL", fold_no, score_nrmse, "NRMSE"))

# Build the frame once from the collected records instead of enlarging an
# empty frame row by row; column dtypes are then inferred from the full data.
res_df = pd.DataFrame(score_rows, columns=["Model", "Times", "Score", "Type"])

# Persist the score table for later comparison/plotting.
with open(os.path.join(TEST_RES_PATH, "PStarch_CB_ALL.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [4]:
# Average each score type over the cross-validation folds.
res_df.groupby("Type")["Score"].agg("mean")

Type
NRMSE    0.895632
R        0.470711
RMSE     1.602218
Name: Score, dtype: float64