In [2]:
import pandas as pd
# from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
from numerapi import NumerAPI
import pickle

"""Era Split Model"""
from sklearn.ensemble import EraHistGradientBoostingRegressor

'''Baseline Model'''
from sklearn.ensemble import HistGradientBoostingRegressor

napi = NumerAPI()

Path("./v4").mkdir(parents=False, exist_ok=True)
# napi.download_dataset("v4/train.parquet")
# napi.download_dataset("v4/validation.parquet")
# napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
# napi.download_dataset("v4/validation_example_preds.parquet")
# napi.download_dataset("v4/features.json")

print('Reading minimal training data')
# read the feature metadata and get a feature set (or all the features)
with open("v4/features.json", "r") as f:
    feature_metadata = json.load(f)
features = feature_metadata["feature_sets"]['small']



Reading minimal training data


In [3]:
def run_save( model, trial_identifier, fit_eras = False ):
    if fit_eras:
        model.fit(training_data[ features ], training_data[ TARGET_COL ], training_data['era'].values)
    else:
        model.fit(training_data[ features ], training_data[ TARGET_COL ])
    validation_data.loc[:, f"preds_{trial_identifier}"] = model.predict( validation_data[features] )

    corrs = validation_data.groupby('era').apply( 
        lambda x: x[[f"preds_{trial_identifier}", TARGET_COL]].corr().iloc[0,1] 
    )

    desc = corrs.describe()
    desc['sharpe'] = corrs.mean()/corrs.std()
    desc['win_rate'] = ( corrs.dropna() > 0 ).sum() / len(corrs)

    # Save the trained model with trial identifier
    model_filename = f"{trial_identifier}_model.pkl"
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)

    # Save the desc DataFrame with trial identifier
    desc_filename = f"{trial_identifier}_desc.csv"
    desc.to_csv(desc_filename, index=True)

    # Save the validation data with predictions and trial identifier
    validation_data_filename = f"{trial_identifier}_validation_data_with_preds.csv"
    validation_data[[f'preds_{trial_identifier}']].to_csv(validation_data_filename, index=False)


In [4]:
TARGET_COL="target_cyrus_v4_20"

training_data = pd.read_parquet('v4/train.parquet')
validation_data = pd.read_parquet('v4/validation.parquet')

# features = [ f for f in list(training_data) if 'feature' in f ]

n_iter = 100

# Define a trial identifier string
trial_identifier = "baseline"  # You can change this to identify different trials

model = HistGradientBoostingRegressor( 
    early_stopping=False, 
    max_iter=n_iter, 
    max_depth=5, 
    learning_rate=.1, 
    colsample_bytree=.1, 
    max_leaf_nodes=32, 

)

run_save( model, trial_identifier )



In [5]:
'''Era Split'''

training_data['era'] = training_data['era'].astype('int')

trial_identifier = "erasplit"

model = EraHistGradientBoostingRegressor( 
    early_stopping=False, 
    boltzmann_alpha=0, 
    max_iter=n_iter, 
    max_depth=5, 
    learning_rate=.1, 
    colsample_bytree=.1, 
    max_leaf_nodes=32, 
    gamma=0, 
    blama=0, 
    era_boosting=False,
    gain_debug=False,
    vanna=0
)
run_save( model, trial_identifier, fit_eras=True )



In [6]:
'''Era Split Mixin'''

trial_identifier = "erasplit_mixin"

training_data['era'] = training_data['era'].astype('int')

era_model = EraHistGradientBoostingRegressor( 
    early_stopping=False, 
    boltzmann_alpha=0, 
    max_iter=n_iter, 
    max_depth=5, 
    learning_rate=.1, 
    colsample_bytree=.1, 
    max_leaf_nodes=32, 
    gamma=0.5, 
    blama=0, 
    era_boosting=False,
    gain_debug=False,
    vanna=0
)
run_save( model, trial_identifier, fit_eras=True )


In [None]:
'''Directional Era Split'''

trial_identifier = "directional_erasplit"

training_data['era'] = training_data['era'].astype('int')

era_model = EraHistGradientBoostingRegressor( 
    early_stopping=False, 
    boltzmann_alpha=0, 
    max_iter=n_iter, 
    max_depth=5, 
    learning_rate=.1, 
    colsample_bytree=.1, 
    max_leaf_nodes=32, 
    gamma=0, 
    blama=1, 
    era_boosting=False,
    gain_debug=False,
    vanna=0
)
run_save( model, trial_identifier, fit_eras=True )


In [8]:
'''Directional Era Split Mixin'''

trial_identifier = "directional_erasplit_mixin"

training_data['era'] = training_data['era'].astype('int')

era_model = EraHistGradientBoostingRegressor( 
    early_stopping=False, 
    boltzmann_alpha=0, 
    max_iter=n_iter, 
    max_depth=5, 
    learning_rate=.1, 
    colsample_bytree=.1, 
    max_leaf_nodes=32, 
    gamma=0.5, 
    blama=0.5, 
    era_boosting=False,
    gain_debug=False,
    vanna=0
)
run_save( model, trial_identifier, fit_eras=True )

In [9]:
import pickle
import pandas as pd

trials = [
    "baseline",
    "erasplit",
    "erasplit_mixin",
    "directional_erasplit",
    "directional_erasplit_mixin"
]

loaded_desc = []

for trial in trials:

    model = f"{trial}_model.pkl"
    desc = f"{trial}_desc.csv"
    validation = f"{trial}_validation_data_with_preds.csv"

#     loaded_model = pickle.load(open(model, "rb"))
#     loaded_desc = pd.read_csv(desc, index_col=0)
#     loaded_validation = pd.read_csv(validation)

    # Now you can use loaded_model, loaded_desc, and loaded_validation as needed
    loaded_desc.append( pd.read_csv(desc, index_col=0) )

In [11]:
results = pd.concat(loaded_desc, axis=1)
results.columns = trials
results

Unnamed: 0,baseline,erasplit,erasplit_mixin,directional_erasplit,directional_erasplit_mixin
count,499.0,499.0,499.0,499.0,499.0
mean,0.014349,0.011998,0.013206,0.012214,0.012306
std,0.020185,0.019749,0.019858,0.020063,0.020065
min,-0.049736,-0.04333,-0.044734,-0.042902,-0.048945
25%,0.000213,-0.001809,-0.000583,-0.002111,-0.002547
50%,0.014734,0.012376,0.012476,0.011249,0.011073
75%,0.028303,0.026073,0.026765,0.025079,0.026942
max,0.071494,0.066212,0.066284,0.070439,0.069449
sharpe,0.71087,0.607492,0.665031,0.60876,0.613338
win_rate,0.742063,0.71627,0.730159,0.712302,0.702381
