In [1]:
import os, pickle, logging, pickle, joblib, sys, warnings, time
warnings.simplefilter('ignore')
from scipy import stats
import numpy as np
import pandas as pd

import catboost as cb
import lightgbm as lgb

from sklearn import ensemble, metrics, pipeline, preprocessing, impute, model_selection
from scipy.stats import pearsonr, spearmanr

#from shaphypetune import BoostRFE, BoostBoruta

# Directory holding the raw input tables read by this notebook (e.g. "raw_data_PC.txt").
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
ML_RAW_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/rawData/"
# Directory where the per-fold evaluation results are pickled and later reloaded.
TEST_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/tesRes/"

In [None]:
# Name of the regression target column; rows without a target value are discarded.
target_col = ['GZ-PC']

# Read the tab-separated table (first column becomes the index) and drop
# every row whose target is missing.
raw_data = pd.read_table(
    os.path.join(ML_RAW_PATH, "raw_data_PC.txt"), sep="\t", index_col=0
).dropna(subset=target_col)

# Model inputs are all columns whose name carries the "fea_" prefix.
feat_col = [col_name for col_name in raw_data.columns if col_name.startswith("fea_")]

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
# Cross-validation loop. For every fold: standardize the features with
# statistics from the training part, split off part of the training data as a
# validation set for early stopping, fit a LightGBM regressor and predict the
# held-out test fold. True and predicted targets are collected per fold.
y_test_final, y_pred_final = [], []
for i, (train_index, test_index) in enumerate(kf.split(raw_data)):
    print("Round {} start...".format(i))
    start_time = time.time()

    # Copies, so scaling below never writes back into raw_data.
    data_train, data_test = (raw_data.iloc[rows].copy() for rows in (train_index, test_index))

    # The scaler is fitted on the training fold only and then applied to both parts.
    scale_tool = preprocessing.StandardScaler().fit(data_train.loc[:, feat_col])
    data_train.loc[:, feat_col] = scale_tool.transform(data_train.loc[:, feat_col])
    data_test.loc[:, feat_col] = scale_tool.transform(data_test.loc[:, feat_col])

    # 80% of the training fold is sampled for fitting; the rows left after
    # dropping the sampled index labels form the validation set.
    # NOTE(review): dropping by label assumes the index labels are unique — confirm.
    train_sel = data_train.sample(frac=0.8, random_state=0)
    val_sel = data_train.drop(index=train_sel.index).copy()

    X_train, X_val, X_test = (
        part[feat_col].copy() for part in (train_sel, val_sel, data_test)
    )
    y_train, y_val, y_test = (
        part[target_col].values.ravel() for part in (train_sel, val_sel, data_test)
    )

    clf_model = lgb.LGBMRegressor(
        boosting_type="gbdt",
        num_leaves=5,
        n_estimators=1000,
        n_jobs=4,
        random_state=0,
    )

    # Metrics are logged every 10 iterations; early stopping uses a patience of 10 rounds.
    clf_model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        callbacks=[
            lgb.log_evaluation(period=10),
            lgb.early_stopping(stopping_rounds=10),
        ],
    )

    y_pred = clf_model.predict(X_test)

    y_test_final.append(y_test)
    y_pred_final.append(y_pred)

    end_time = time.time()
    print("Round {}:{}".format(i, end_time-start_time))

In [None]:
# Score every cross-validation fold and store the results in long format:
# one row per (fold, metric) with columns Model / Times / Score / Type.
#   "R"     - Spearman rank correlation between true and predicted targets
#   "RMSE"  - root mean squared error
#   "NRMSE" - RMSE divided by the (population) standard deviation of the true targets
#
# The rows are collected in a plain list and the DataFrame is built once at the
# end, instead of enlarging an empty frame one row at a time; this avoids the
# repeated reallocation and lets pandas infer numeric dtypes for Times/Score.
score_records = []
for i, (y_true_fold, y_pred_fold) in enumerate(zip(y_test_final, y_pred_final)):
    score_spear = spearmanr(y_true_fold, y_pred_fold)[0]
    score_rmse = metrics.root_mean_squared_error(y_true_fold, y_pred_fold)
    score_nrmse = score_rmse / np.std(y_true_fold)

    # Same row order per fold as before: R, RMSE, NRMSE. Folds are numbered from 1.
    for score_value, score_type in (
        (score_spear, "R"),
        (score_rmse, "RMSE"),
        (score_nrmse, "NRMSE"),
    ):
        score_records.append(
            {"Model": "LB_ALL", "Times": i + 1, "Score": score_value, "Type": score_type}
        )

# Passing `columns` keeps the expected layout even when there are no folds.
res_df = pd.DataFrame(score_records, columns=["Model", "Times", "Score", "Type"])

# Persist the table so later cells (or other notebooks) can reload it without retraining.
with open(os.path.join(TEST_RES_PATH, "PC_LB_ALL.pickle"), "wb") as out_f:
    pickle.dump(res_df, out_f)

In [4]:
# Reload the per-fold score table written by the evaluation cell and display it.
# NOTE: unpickling executes arbitrary code, so only load files produced by this project.
res_pickle_path = os.path.join(TEST_RES_PATH, "PC_LB_ALL.pickle")
with open(res_pickle_path, "rb") as f:
    res_df = pickle.load(f)

res_df

Unnamed: 0,Model,Times,Score,Type
0,LB_ALL,1,0.367944,R
1,LB_ALL,1,9.212,RMSE
2,LB_ALL,1,0.908138,NRMSE
3,LB_ALL,2,0.374652,R
4,LB_ALL,2,10.222444,RMSE
5,LB_ALL,2,0.946494,NRMSE
6,LB_ALL,3,0.335849,R
7,LB_ALL,3,13.542716,RMSE
8,LB_ALL,3,0.94447,NRMSE
9,LB_ALL,4,0.199203,R


In [5]:
# Average each metric (R / RMSE / NRMSE) over all cross-validation folds.
res_df.groupby("Type")["Score"].mean()

Type
NRMSE     0.931973
R         0.340523
RMSE     11.156104
Name: Score, dtype: float64