In [1]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
import playground.optivarfuncs as of
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
is_offline = False 
is_train = True  
is_infer = True 
max_lookback = np.nan 
split_day = 435  

# Load Data

In [2]:
df=pd.read_parquet("./tmpalldfgb.parquet")

# Simple evaluation

In [4]:
#what is the average target
_,_,_,target,_,_=of.getDatasets(df,dep_var='target')
av_target_train=target.mean()
av_target_train

df shape=(5236560, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0


-0.045557416726880184

In [5]:
params = {
    'random_seed':123
}
def evaluate_simple(model, X_train, X_val, X_tst, y_train, y_val, y_tst):   
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),lgb.callback.log_evaluation(period=100)])

    y_pred = model.predict(X_tst)
    return len(y_pred), mean_absolute_error(y_pred, y_tst)
    
def average_target(av_target_train, X_train, X_val, X_tst, y_train, y_val, y_tst):   
    # model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    return len(y_tst), mean_absolute_error([av_target_train]*len(y_tst), y_tst)

In [6]:
def evaluate_2models(df,params):
    #inferred
    df1=df[df.syn_near_price==1]
    model = lgb.LGBMRegressor(**params)
    # l1,mae1=evaluate_simple(model, *of.getDatasets(df1,dep_var='target'))
    l1,mae1=average_target(av_target_train,*of.getDatasets(df1,dep_var='target'))
    
    #not infered
    df1=df[df.syn_near_price==0]
    model = lgb.LGBMRegressor(**params)
    # l1,mae1=evaluate_simple(model, *of.getDatasets(df1,dep_var='target'))
    l2,mae2=evaluate_simple(model, *of.getDatasets(df1,dep_var='target'))

    #combine
    return (l1*mae1 + l2*mae2)/(l1 + l2)

In [6]:
#synthetic score only
df1=df[df.syn_near_price==1]
model = lgb.LGBMRegressor(**params)
l2,mae2=average_target(model, *of.getDatasets(df1,dep_var='target'))
print(f"l2={l2}, mae2={mae2}")

df shape=(2855760, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
l2=581940, mae2=6.7156331435713295


In [24]:
#synthetic score only, average target
df1=df[df.syn_near_price==1]
average_target(av_target_train,*of.getDatasets(df1,dep_var='target'))

df shape=(2855760, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0


(581940, 6.715444192813097)

In [7]:
#non synthetic score only
df1=df[df.syn_near_price==0]
model = lgb.LGBMRegressor()
# l1,mae1=evaluate_simple(model, *of.getDatasets(df1,dep_var='target'))
evaluate_simple(model, *of.getDatasets(df1,dep_var='target'))

df shape=(2380800, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.323632 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19884
[LightGBM] [Info] Number of data points in the train set: 1695850, number of used features: 90
[LightGBM] [Info] Start training from score -0.034302


(484950, 5.189393988997699)

In [10]:
#do it all with 1 model
model = lgb.LGBMRegressor()
evaluate_simple(model, *of.getDatasets(df,dep_var='target'))

df shape=(5236560, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.946039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19916
[LightGBM] [Info] Number of data points in the train set: 3729670, number of used features: 91
[LightGBM] [Info] Start training from score -0.045557


(1066890, 5.965150822011987)

In [26]:
# use 2 models
print(f"MAE={evaluate_2models(df,params)}")

df shape=(2855760, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
df shape=(2380800, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.356607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19887
[LightGBM] [Info] Number of data points in the train set: 1695850, number of used features: 90
[LightGBM] [Info] Start training from score -0.034302
MAE=6.020388626083104


In [13]:
# use 2 models 
print(f"MAE={evaluate_2models(df,params)}")

df shape=(2855760, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.425242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19886
[LightGBM] [Info] Number of data points in the train set: 2033820, number of used features: 89
[LightGBM] [Info] Start training from score -0.054942
df shape=(2380800, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.349780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19887
[LightGBM] [Info] Number of data points in the train set: 1695850, number of used features: 90
[LightGBM] [Info] Start training from score -0.034302
MAE=5.963954970292424


In [63]:
#REMOVE all rows with synthesized near_price and far_price
params = {
    'random_seed':123
}
# df=pd.read_parquet("./tmpall.parquet")  #MAE=5.193618272135373 
df=pd.read_parquet("./tmpalldfgb.parquet") #MAE=5.189393988997699
# remove syn generated near_price and far price rows
df=df[df.syn_near_price==0]
bs.remove_syn_columns(df)
X_train, X_val,X_tst, y_train, y_val,y_tst=getDatasets(df,dep_var='target')

model = lgb.LGBMRegressor()
print(f"MAE={evaluate_simple(model)}")

min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.403960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19882
[LightGBM] [Info] Number of data points in the train set: 1695850, number of used features: 89
[LightGBM] [Info] Start training from score -0.034302
MAE=5.189393988997699


# From other notebook

In [7]:
lgb_params = {
        "objective": "mae",
        "n_estimators": 6000,
        "num_leaves": 256,
        "subsample": 0.6,
        "colsample_bytree": 0.8,
#         "learning_rate": 0.00871,
        "learning_rate": 0.01,
        'max_depth': 11,
        "n_jobs": 4,
        "device": "gpu",
        "verbosity": -1,
        "importance_type": "gain",
#         "reg_alpha": 0.1,
        "reg_alpha": 0.2,
        "reg_lambda": 3.25
    }


In [8]:
%%time
# Train a LightGBM model for the current fold
model = lgb.LGBMRegressor(**lgb_params)
evaluate_simple(model, *of.getDatasets(df,dep_var='target'))

df shape=(5236560, 92), min_date=0.0,max_date=480.0, val_start=344.0, tst_start=384.0
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 6.34834
[200]	valid_0's l1: 6.32739
[300]	valid_0's l1: 6.31635
[400]	valid_0's l1: 6.30841
[500]	valid_0's l1: 6.3042
[600]	valid_0's l1: 6.30241
[700]	valid_0's l1: 6.30134
[800]	valid_0's l1: 6.30041
[900]	valid_0's l1: 6.29987
[1000]	valid_0's l1: 6.29935
[1100]	valid_0's l1: 6.29875
[1200]	valid_0's l1: 6.299
Early stopping, best iteration is:
[1138]	valid_0's l1: 6.29869
CPU times: user 16min 9s, sys: 4.11 s, total: 16min 13s
Wall time: 4min 16s


(1066890, 5.946751250435294)