In [2]:
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

from shaphypetune import BoostRFE, BoostBoruta

ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [3]:
raw_data = pd.read_table(os.path.join(ML_RAW_PATH, "raw_data_PSugar.txt"), sep="\t", index_col=0)

target_col = ['GZ-PSugar']
raw_data = raw_data.dropna(subset=target_col)

feat_col = [x for x in raw_data.columns if x.startswith("fea_")]

kf = model_selection.KFold(n_splits=5, shuffle=True,  random_state=0)
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    data_train = raw_data.iloc[train_index].copy()
    data_test = raw_data.iloc[test_index].copy()

    scale_tool = preprocessing.StandardScaler()
    scale_tool.fit(data_train.loc[:, feat_col])
    data_train.loc[:, feat_col] = scale_tool.transform(data_train.loc[:, feat_col])
    data_test.loc[:, feat_col] = scale_tool.transform(data_test.loc[:, feat_col])

    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index).copy()

    X_train = train_sel[feat_col].copy()
    y_train = train_sel[target_col].values.ravel()

    X_val = val_sel[feat_col].copy()
    y_val = val_sel[target_col].values.ravel()

    X_test = data_test[feat_col].copy()
    y_test = data_test[target_col].values.ravel()

    # Initialize CatBoostClassifier
    clf_model = cb.CatBoostRegressor(random_state=0, thread_count=4, loss_function='RMSE')
    # Fit model
    clf_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=1, plot=False)

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

Learning rate set to 0.039298
0:	learn: 1.9930421	test: 1.9930421	test1: 2.5442395	best: 2.5442395 (0)	total: 93.7ms	remaining: 1m 33s
1:	learn: 1.9695538	test: 1.9695538	test1: 2.5263135	best: 2.5263135 (1)	total: 119ms	remaining: 59.6s
2:	learn: 1.9507558	test: 1.9507558	test1: 2.5135021	best: 2.5135021 (2)	total: 142ms	remaining: 47.1s
3:	learn: 1.9300036	test: 1.9300036	test1: 2.4992588	best: 2.4992588 (3)	total: 162ms	remaining: 40.5s
4:	learn: 1.9123955	test: 1.9123955	test1: 2.4830352	best: 2.4830352 (4)	total: 182ms	remaining: 36.3s
5:	learn: 1.8887588	test: 1.8887588	test1: 2.4641121	best: 2.4641121 (5)	total: 202ms	remaining: 33.4s
6:	learn: 1.8715098	test: 1.8715098	test1: 2.4573747	best: 2.4573747 (6)	total: 222ms	remaining: 31.5s
7:	learn: 1.8551314	test: 1.8551314	test1: 2.4465854	best: 2.4465854 (7)	total: 241ms	remaining: 29.9s
8:	learn: 1.8369067	test: 1.8369067	test1: 2.4307618	best: 2.4307618 (8)	total: 260ms	remaining: 28.6s
9:	learn: 1.8166749	test: 1.8166749	test1

In [4]:
res_df = pd.DataFrame(columns=["Model", "Times", "Score", "Type"])
for i in range(len(y_test_final)):
    ## Get score
    #score_pear = pearsonr(y_test_final[i], y_pred_final[i])[0]
    score_spear = spearmanr(y_test_final[i], y_pred_final[i])[0]
    #score_rmse = metrics.mean_squared_error(y_test_final[i], y_pred_final[i], squared=False)
    score_rmse = metrics.root_mean_squared_error(y_test_final[i], y_pred_final[i])
    score_nrmse = score_rmse / np.std(y_test_final[i])

    #res_df.loc[len(res_df)] = ["CB_ALL", i+1, score_pear, "R"]
    res_df.loc[len(res_df)] = ["CB_ALL", i+1, score_spear, "R"]
    res_df.loc[len(res_df)] = ["CB_ALL", i+1, score_rmse, "RMSE"]
    res_df.loc[len(res_df)] = ["CB_ALL", i+1, score_nrmse, "NRMSE"]

with open(os.path.join(TEST_RES_PATH, "PSugar_CB_ALL.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [5]:
res_df.groupby(["Type"])["Score"].mean()

Type
NRMSE    0.806842
R        0.600500
RMSE     1.723509
Name: Score, dtype: float64