In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb
import lightgbm as lgb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

from shaphypetune import BoostRFE, BoostBoruta

# Input directory: the raw data table (raw_data_PSugar.txt) is read from here.
ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Output directory: the pickled per-fold score table is written here.
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [2]:
# --- Load the table and keep only the rows that have an observed target -------
target_col = ['GZ-PSugar']
raw_data = pd.read_table(
    os.path.join(ML_RAW_PATH, "raw_data_PSugar.txt"),
    sep="\t",
    index_col=0,
).dropna(subset=target_col)

# Predictor columns are recognised by their "fea_" name prefix.
feat_col = [col for col in raw_data.columns if col.startswith("fea_")]


def _split_xy(frame):
    """Return (copy of the feature columns, 1-D target array) for ``frame``."""
    return frame[feat_col].copy(), frame[target_col].to_numpy().ravel()


# --- 5-fold outer cross-validation ---------------------------------------------
# Per-fold observed / predicted test values; one array per fold, in fold order.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    data_train = raw_data.iloc[train_index].copy()
    data_test = raw_data.iloc[test_index].copy()

    # The scaler is fitted on the outer-training rows only and then applied to
    # both the training and the held-out rows of this fold.
    scale_tool = preprocessing.StandardScaler().fit(data_train.loc[:, feat_col])
    for fold_part in (data_train, data_test):
        fold_part.loc[:, feat_col] = scale_tool.transform(fold_part.loc[:, feat_col])

    # Inner split of the outer-training rows: 80% to fit the model, the
    # remaining 20% as the validation set passed to ``eval_set``.
    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(train_sel.index).copy()

    X_train, y_train = _split_xy(train_sel)
    X_val, y_val = _split_xy(val_sel)
    X_test, y_test = _split_xy(data_test)

    clf_model = lgb.LGBMRegressor(
        boosting_type="gbdt",
        n_estimators=1000,
        random_state=0,
        n_jobs=4,
        num_leaves=5,
    )

    # Log the evaluation metric every 10 rounds; early stopping uses a
    # patience of 10 rounds.
    clf_model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        callbacks=[
            lgb.log_evaluation(period=10),
            lgb.early_stopping(stopping_rounds=10),
        ],
    )

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15598
[LightGBM] [Info] Number of data points in the train set: 192, number of used features: 512
[LightGBM] [Info] Start training from score 12.519382
Training until validation scores don't improve for 10 rounds
[10]	training's l2: 1.94141	valid_1's l2: 4.68651
[20]	training's l2: 1.21275	valid_1's l2: 4.17862
[30]	training's l2: 0.818753	valid_1's l2: 4.16434
[40]	training's l2: 0.55816	valid_1's l2: 4.00933
[50]	training's l2: 0.390461	valid_1's l2: 3.97357
Early stopping, best iteration is:
[46]	training's l2: 0.450807	valid_1's l2: 3.95959
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15648
[LightGBM] [Info] Number of data points in the train set: 192, number of used features: 514
[LightGBM] [Info] Start training from score 12.346826
Training until validation scores don't improve for 10 rounds
[10]	training's l2: 1.81844	valid_1's l2: 4.92465
[20]	trainin

In [3]:
# Score every cross-validation fold on its held-out test values and store the
# results in long format: one row per (fold, metric).
#   Model : fixed tag "LB_ALL"
#   Times : 1-based fold number
#   Score : metric value
#   Type  : "R"     - Spearman rank correlation of observed vs. predicted
#           "RMSE"  - root mean squared error
#           "NRMSE" - RMSE divided by np.std (default ddof=0) of the fold's
#                     observed values
score_rows = []
for fold_no, (y_true, y_hat) in enumerate(zip(y_test_final, y_pred_final), start=1):
    score_spear = spearmanr(y_true, y_hat)[0]
    score_rmse = metrics.root_mean_squared_error(y_true, y_hat)
    score_nrmse = score_rmse / np.std(y_true)

    score_rows.extend([
        ("LB_ALL", fold_no, score_spear, "R"),
        ("LB_ALL", fold_no, score_rmse, "RMSE"),
        ("LB_ALL", fold_no, score_nrmse, "NRMSE"),
    ])

# Build the frame once from the collected records instead of enlarging it one
# row at a time with ``.loc``; rows and column order are the same as before.
res_df = pd.DataFrame(score_rows, columns=["Model", "Times", "Score", "Type"])

# Persist the score table for later use.
with open(os.path.join(TEST_RES_PATH, "PSugar_LB_ALL.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [4]:
# Mean of each metric type across all folds (displayed as the cell result).
res_df.groupby("Type")["Score"].mean()

Type
NRMSE    0.796767
R        0.625229
RMSE     1.701180
Name: Score, dtype: float64