In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multiclass import OneVsRestClassifier
import gc
import json
from halo import Halo
from numerapi import NumerAPI
from sklearn.metrics import roc_auc_score

In [3]:
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    validation_metrics,
    get_time_series_cross_val_splits,
    save_model_config,
    load_model_config,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)
napi = NumerAPI()



## All training and model configs

In [4]:
model_configs = {
    "LGBM_cfg1": {
        "n_estimators": 2000,
        "learning_rate": 0.01,
        "max_depth": 5,
        "num_leaves": 2 ** 5,
        "colsample_bytree": 0.1
    },
    "LGBM_cfg2": {
        "n_estimators": 3000,
        "learning_rate": 0.005,
        "max_depth": 5,
        "num_leaves": 2 ** 5,
        "colsample_bytree": 0.1
    },
    "RF_cfg1": {
        "n_estimators": 1000,
        "criterion": "squared_error",
        "max_depth": 5,
        "min_samples_leaf": 5
    },
    "RF_cfg2": {
        "n_estimators": 2000,
        "criterion": "squared_error",
        "max_depth": 3,
        "min_samples_leaf": 5
    },
    "XGB_cfg1": {
        "n_estimators": 2000,
        "learning_rate": 0.01,
        "max_depth": 5,
        "colsample_bytree": 0.1
    },
    "RF_cfg2": {
        "n_estimators": 3000,
        "learning_rate": 0.005,
        "max_depth": 5,
        "colsample_bytree": 0.1
    },
}



In [11]:
training_configs = {
    "FEATURE_SET": "small",
    "MODEL_CONFIG": "LGBM_cfg1",
}



In [12]:
current_round = napi.get_current_round(tournament=8)  # tournament 8 is the primary Numerai Tournament
# napi.download_dataset("numerai_tournament_data.parquet", f"data/tournament_data_{current_round}.parquet")



In [13]:
spinner = Halo(text='', spinner='dots')



## Feature set information

In [14]:
with open("data/features.json", "r") as f:
    feature_metadata = json.load(f)



In [15]:
feature_set = training_configs["FEATURE_SET"]
features = feature_metadata["feature_sets"][feature_set]



In [16]:
len(feature_metadata["feature_sets"]["small"])

38



In [17]:
feature_metadata["feature_sets"].keys()

dict_keys(['legacy', 'small', 'medium'])



In [18]:
len(features)

38



In [19]:
# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
training_data = pd.read_parquet('data/numerai_training_data.parquet', columns=read_columns)



In [13]:
training_data["target_nomi_20"].value_counts()

0.50    1206036
0.75     482458
0.25     482411
0.00     120613
1.00     120587
Name: target_nomi_20, dtype: int64



In [14]:
model_config = training_configs["MODEL_CONFIG"]



## Train Regression models (LGBM, RF, XGBOOST)

In [12]:
model_name = "model_{}_{}".format(feature_set, model_config)
print(f"Checking for existing model '{model_name}'")
model = load_model(model_name)
if not model:
    print(f"model not found, creating new one")
    params = model_configs[model_config]

    if model_config.startswith("LGBM"):
        model = LGBMRegressor(**params)
    elif model_config.startswith("RF"):
        model = RandomForestRegressor(**params)
    else:
        model = XGBRegressor(**params)

    # train on all of train and save the model so we don't have to train next time
    spinner.start('Training model')
    model.fit(training_data.filter(like='feature_', axis='columns'),
              training_data[TARGET_COL])
    print(f"saving new model: {model_name}")
    save_model(model, model_name)
    spinner.succeed()


Checking for existing model 'model_medium_LGBM_cfg1'


In [13]:
gc.collect()

94



In [29]:
print('Reading minimal features of validation and tournament data...')
validation_data = pd.read_parquet('data/numerai_validation_data.parquet',
                                  columns=read_columns)

Reading minimal features of validation and tournament data...


In [30]:
tournament_data = pd.read_parquet(f'data/tournament_data_{current_round}.parquet',
                                  columns=read_columns)
nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()



In [16]:
# check for nans and fill nans
if nans_per_col.any():
    total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    tournament_data.loc[:, features].fillna(0.5, inplace=True)
else:
    print("No nans in the features this week!")


Number of nans per column this week: target_nomi_20    5384
dtype: int64
out of 5384 total rows
filling nans with 0.5


In [17]:
spinner.start('Predicting on validation and tournament data')
# double check the feature that the model expects vs what is available to prevent our
# pipeline from failing if Numerai adds more data and we don't have time to retrain!
model_expected_features = model.booster_.feature_name()
if set(model_expected_features) != set(features):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(
    validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{model_name}"] = model.predict(
    tournament_data.loc[:, model_expected_features])
spinner.succeed()

gc.collect()

✔ Predicting on validation and tournament data


23



In [18]:
all_feature_corrs = training_data.groupby(ERA_COL).apply(
    lambda era: era[features].corrwith(era[TARGET_COL])
)
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)



In [19]:
spinner.start('Neutralizing to risky features')

# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
    df=validation_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
    df=tournament_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)
spinner.succeed()


✔ Neutralizing to risky features


<halo.halo.Halo at 0x1231d3340>



In [28]:
current_round

292



In [22]:
model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"pred/validation_predictions_{model_name}_{current_round}.csv")
tournament_data["prediction"].to_csv(f"pred/tournament_predictions_{model_name}_{current_round}.csv")




In [82]:
model_to_submit = f"preds_{model_name}"

validation_data["prediction_w_risk"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction_w_risk"] = tournament_data[model_to_submit].rank(pct=True)
validation_data["prediction_w_risk"].to_csv(f"pred/validation_predictions_{model_name}_{current_round}.csv")
tournament_data["prediction_w_risk"].to_csv(f"pred/tournament_predictions_{model_name}_{current_round}.csv")



In [None]:
model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"pred/validation_predictions_{model_name}_{current_round}.csv")
tournament_data["prediction"].to_csv(f"pred/tournament_predictions_{model_name}_{current_round}.csv")

In [27]:
validation_data.head()

Unnamed: 0_level_0,feature_dichasial_hammier_spawner,feature_rheumy_epistemic_prancer,feature_pert_performative_hormuz,feature_hillier_unpitied_theobromine,feature_perigean_bewitching_thruster,feature_renegade_undomestic_milord,feature_koranic_rude_corf,feature_demisable_expiring_millepede,feature_unscheduled_malignant_shingling,feature_clawed_unwept_adaptability,...,feature_dipped_sent_giuseppe,feature_undivorced_unsatisfying_praetorium,feature_reclinate_cruciform_lilo,era,data_type,target_nomi_20,preds_model_medium_LGBM_cfg1,preds_model_medium_LGBM_cfg1_neutral_riskiest_50,prediction,example_preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000777698096000,0.5,0.5,0.25,0.25,0.0,0.0,0.0,0.75,1.0,0.25,...,1.0,0.5,1.0,857,validation,0.25,0.475515,-1.015812,0.141486,
n0009793a3b91c27,0.75,0.25,0.5,0.75,1.0,0.0,0.25,0.25,1.0,0.25,...,0.5,0.25,0.5,857,validation,0.5,0.50963,0.296737,0.623966,
n00099ccd6698ab0,0.25,0.75,0.0,0.75,1.0,0.75,1.0,1.0,0.75,1.0,...,1.0,0.25,1.0,857,validation,0.0,0.517587,0.562613,0.725224,
n0019e36bbb8702b,0.5,1.0,0.25,0.75,0.75,0.5,0.0,0.75,0.0,0.25,...,0.75,0.25,1.0,857,validation,0.5,0.507401,0.124685,0.552808,
n0028cb874439df8,0.0,0.75,0.0,0.25,1.0,0.5,0.0,0.0,0.0,0.5,...,1.0,1.0,1.0,857,validation,0.5,0.486497,-0.633783,0.247748,




In [124]:
validation_data[["prediction_w_risk", "prediction", "preds_model_medium_LGBM_cfg1", "preds_model_medium_LGBM_cfg1_neutral_riskiest_50", "target_nomi_20"]].head()

Unnamed: 0_level_0,prediction_w_risk,prediction,preds_model_medium_LGBM_cfg1,preds_model_medium_LGBM_cfg1_neutral_riskiest_50,target_nomi_20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
n000777698096000,0.026383,0.141486,0.475515,-1.015812,0.25
n0009793a3b91c27,0.766374,0.623966,0.50963,0.296737,0.5
n00099ccd6698ab0,0.90605,0.725224,0.517587,0.562613,0.0
n0019e36bbb8702b,0.71168,0.552808,0.507401,0.124685,0.5
n0028cb874439df8,0.129547,0.247748,0.486497,-0.633783,0.5




In [33]:
validation_data["target_nomi_20"].value_counts()

0.50    269876
0.25    107918
0.75    107918
1.00     26980
0.00     26966
Name: target_nomi_20, dtype: int64



## Regression to Classification

In [39]:
def val2class(x):
    if 0 <= x < 0.125:
        return 0
    elif 0.125 <= x < 0.375:
        return 1
    elif 0.375 <= x < 0.625:
        return 2
    elif 0.625 <= x < 0.875:
        return 3
    else:
        return 4




In [86]:
# model as multiclass classification
y_class = validation_data["target_nomi_20"].apply(val2class).values.reshape(-1, 1)
y_pred = validation_data["prediction_w_risk"].apply(val2class).values.reshape(-1, 1)



In [87]:
print(y_class.shape)
print(y_pred.shape)

(539658, 1)
(539658, 1)


In [88]:
ohe = OneHotEncoder()
y_class = ohe.fit_transform(y_class).toarray()
y_pred = ohe.transform(y_pred).toarray()



In [89]:
print(y_class.shape)
print(y_pred.shape)

(539658, 5)
(539658, 5)


In [90]:
roc_auc_score(y_class, y_pred, multi_class="ovr")

0.5125012650278737



## Train classifiers

In [20]:
model_clf_configs = {
    "LGBM_cfg1": {
        "n_estimators": 2000,
        "learning_rate": 0.01,
        "max_depth": 5,
        "num_leaves": 2 ** 5,
        "colsample_bytree": 0.1
    },
    "LGBM_cfg2": {
        "n_estimators": 3000,
        "learning_rate": 0.005,
        "max_depth": 5,
        "num_leaves": 2 ** 5,
        "colsample_bytree": 0.1
    },
}



In [21]:
model_clf_config = "LGBM_cfg1"



In [25]:
model_name = "model_clf_{}_{}".format(feature_set, model_clf_config)
print(f"Checking for existing model '{model_name}'")
model = load_model(model_name)
if not model:
    print(f"model not found, creating new one")
    params = model_clf_configs[model_clf_config]

    if model_clf_config.startswith("LGBM"):
        model = LGBMClassifier(**params)
    else:
        print("Not support")
    # elif model_config.startswith("RF"):
    #     model = RandomForestRegressor(**params)
    # else:
    #     model = XGBRegressor(**params)

    # train on all of train and save the model so we don't have to train next time
    spinner.start('Training model')
    y = training_data[TARGET_COL].values.reshape(-1, 1)
    ohe = OneHotEncoder()
    y = ohe.fit_transform(y).toarray()
    
    model = OneVsRestClassifier(model).fit(training_data.filter(like='feature_', axis='columns'), y)#.predict(X)
    # model.fit(training_data.filter(like='feature_', axis='columns'), y)
    print(f"saving new model: {model_name}")
    save_model(model, model_name)
    spinner.succeed()


Checking for existing model 'model_clf_small_LGBM_cfg1'
model not found, creating new one
⠋
✔ Training model


In [35]:
y = validation_data[TARGET_COL].values.reshape(-1, 1)
ohe = OneHotEncoder()
y = ohe.fit_transform(y).toarray()



In [33]:
# model_expected_features = model.booster_.feature_name()
y_pred = model.predict(validation_data.filter(like='feature_', axis='columns'))



In [36]:
roc_auc_score(y, y_pred, multi_class="ovr")

0.51803957892829



In [None]:
spinner.start('Reading example validation predictions')
# validation_preds = pd.read_parquet(f"pred/validation_predictions_{model_name}_{current_round}.csv")
validation_preds = pd.read_csv(f"pred/validation_predictions_{model_name}_{current_round}.csv")
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
spinner.succeed()

# get some stats about each of our models to compare...
# fast_mode=True so that we skip some of the stats that are slower to calculate
# validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
# print(validation_stats[["mean", "sharpe"]].to_markdown())
# print(validation_stats)

## Advanced Model

In [41]:
advanced_model_configs = {
    "LGBM_cfg1": {
        "n_estimators": 2000,
        "learning_rate": 0.01,
        "max_depth": 5,
        "num_leaves": 2 ** 5,
        "colsample_bytree": 0.1
    },
    "LGBM_cfg2": {
        "n_estimators": 3000,
        "learning_rate": 0.005,
        "max_depth": 5,
        "num_leaves": 2 ** 5,
        "colsample_bytree": 0.1
    },
    "RF_cfg1": {
        "n_estimators": 1000,
        "criterion": "squared_error",
        "max_depth": 5,
        "min_samples_leaf": 5
    },
    "RF_cfg2": {
        "n_estimators": 2000,
        "criterion": "squared_error",
        "max_depth": 3,
        "min_samples_leaf": 5
    },

}



In [42]:
advanced_training_configs = {
    "FEATURE_SET": "medium",
    "MODEL_CONFIG": "LGBM_cfg1",
    "downsample_cross_val": 20,
    "downsample_full_train": 1,
    "model_selection_loop": True,
    "model_config_name": "advanced_example_model",
}



In [None]:
model_config_name = "LGBM_cfg1"
model_params = advanced_model_configs[model_config_name]

print("Entering model selection loop.  This may take awhile.")
if advanced_training_configs["model_selection_loop"]:
    model_config = {}
    print('downloading training_data')

    # keep track of some prediction columns
    ensemble_cols = set()
    pred_cols = set()

    # pick some targets to use
    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
    # randomly pick a handful of targets
    # this can be vastly improved
    targets = ["target_nomi_20"]

    # all the possible features to train on
    feature_cols = [c for c in training_data if c.startswith("feature_")]

    """ do cross val to get out of sample training preds"""
    cv = 3
    train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)
    # get out of sample training preds via embargoed time series cross validation
    # optionally downsample training data to speed up this section.
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {cv}")
        train_split, test_split = train_test_split
        train_split_index = training_data[ERA_COL].isin(train_split)
        test_split_index = training_data[ERA_COL].isin(test_split)
        downsampled_train_split_index = train_split_index[train_split_index].index[::advanced_training_configs["downsample_cross_val"]]

        # getting the per era correlation of each feature vs the primary target across the training split
        print("getting feature correlations over time and identifying riskiest features")
        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
        # there are probably more clever ways to do this
        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)

        print(f"entering model training loop for split {split+1}")
        for target in targets:
            model_name = f"model_{target}"
            print(f"model: {model_name}")

            # train a model on the training split (and save it for future use)
            downsample_cross_val = advanced_training_configs["downsample_cross_val"]
            split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
            split_model = load_model(split_model_name)
            if not split_model:
                print(f"training model: {model_name}")
                split_model = LGBMRegressor(**model_params)
                split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols],
                                training_data.loc[downsampled_train_split_index,
                                                  [target]])
                save_model(split_model, split_model_name)
            # now we can predict on the test part of the split
            model_expected_features = split_model.booster_.feature_name()
            if set(model_expected_features) != set(feature_cols):
                print(f"New features are available! Might want to retrain model {split_model_name}.")
            print(f"predicting {model_name}")
            training_data.loc[test_split_index, f"preds_{model_name}"] = \
                split_model.predict(training_data.loc[test_split_index, model_expected_features])

            # do neutralization
            print("doing neutralization to riskiest features")
            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
                df=training_data.loc[test_split_index, :],
                columns=[f"preds_{model_name}"],
                neutralizers=riskiest_features_split,
                proportion=1.0,
                normalize=True,
                era_col=ERA_COL)[f"preds_{model_name}"]

            # remember that we made all of these different pred columns
            pred_cols.add(f"preds_{model_name}")
            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")

        print("creating ensembles")
        # ranking per era for all of our pred cols so we can combine safely on the same scales
        training_data[list(pred_cols)] = training_data.groupby(ERA_COL).apply(
            lambda d: d[list(pred_cols)].rank(pct=True))
        # do ensembles
        training_data["ensemble_neutral_riskiest_50"] = sum(
            [training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
            pct=True)
        training_data["ensemble_not_neutral"] = sum(
            [training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
        training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

        ensemble_cols.add("ensemble_neutral_riskiest_50")
        ensemble_cols.add("ensemble_not_neutral")
        ensemble_cols.add("ensemble_all")

    """ Now get some stats and pick our favorite model"""
    print("gathering validation metrics for out of sample training results")
    all_model_cols = list(pred_cols) + list(ensemble_cols)
    # use example_col preds_model_target as an estimates since no example preds provided for training
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
                                        fast_mode=True)
    print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

    # pick the model that has the highest correlation sharpe
    best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
    print(f"selecting model {best_pred_col} as our highest sharpe model in validation")

    """ Now do a full train"""
    print("entering full training section")
    # getting the per era correlation of each feature vs the target across all of training data
    print("getting feature correlations with target and identifying riskiest features")
    all_feature_corrs = training_data.groupby(ERA_COL).apply(
        lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
    # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

    for target in targets:
        gc.collect()
        downsample_full_train = advanced_training_configs["downsample_full_train"]
        model_name = f"model_{target}_downsample{downsample_full_train}"
        model = load_model(model_name)
        if not model:
            print(f"training {model_name}")
            model = LGBMRegressor(**model_params)
            # train on all of train, predict on val, predict on tournament
            model.fit(training_data.iloc[::advanced_training_configs["downsample_full_train"]].loc[:, feature_cols],
                      training_data.iloc[::advanced_training_configs["downsample_full_train"]][target])
            save_model(model, model_name)
        gc.collect()

    model_config["feature_cols"] = feature_cols
    model_config["targets"] = targets
    model_config["best_pred_col"] = best_pred_col
    model_config["riskiest_features"] = riskiest_features
    print(f"saving model config for {model_config_name}")
    save_model_config(model_config, model_config_name)
else:
    # load model config from previous model selection loop
    print(f"loading model config for {model_config_name}")
    model_config = load_model_config(model_config_name)
    feature_cols = model_config["feature_cols"]
    targets = model_config["targets"]
    best_pred_col = model_config["best_pred_col"]
    riskiest_features = model_config["riskiest_features"]


""" Things that we always do even if we've already trained """
gc.collect()