# Model: Cobra v0

**Model Parameters:**
* LightGBM
* Trained on large features
* Top 60 riskiest features neuralized
* Params:
    * "n_estimators": 2000
    * "learning_rate": 0.01
    * "max_depth": 5
    * "num_leaves": 2 ** 5
    * "colsample_bytree": 0.1

**Note:**
* This is the base model with the numerai given params
* Updated: 2022-04-29

---
# Load data & libraries

In [30]:
# TOP VERSION
MODEL_NAME = "cobra"
N_FEATURES_NEUTRALIZED = 60
VERSION = 0

In [31]:
# import dependencies
import itertools
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
from numerapi import NumerAPI
from utils.utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
)
from utils.api_keys import (
    PUBLIC_ID,
    SECRET_KEY,
    COBRA_MODEL_ID
)

In [32]:
# instantiate numerai api
napi = NumerAPI(public_id=PUBLIC_ID, secret_key=SECRET_KEY)
current_round = napi.get_current_round()
print(f"Current round #: {current_round}")

Current round #: 313


In [33]:
# read the feature metadata and get the "large" feature set
print('Reading training data...')

with open("data/features.json", "r") as f:
    feature_metadata = json.load(f)

small_features = feature_metadata["feature_sets"]["small"]
medium_features = feature_metadata["feature_sets"]["medium"]
all_features_values = feature_metadata["feature_sets"].values()
all_features = list(set(list(itertools.chain(*all_features_values))))
features = [x for x in all_features if x not in small_features and x not in medium_features]
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

training_data = pd.read_parquet('data/train.parquet', columns=read_columns)
validation_data = pd.read_parquet('data/validation.parquet', columns=read_columns)
live_data = pd.read_parquet(f'data/live_{current_round}.parquet', columns=read_columns)

print('...Done.')

Reading training data


---
# Feature selection

In [34]:
# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(
    lambda era: era[features].corrwith(era[TARGET_COL])
)

In [35]:
# get riskiest features (biggest change in correlation to target between h1 era vs h2 era)
riskiest_features = get_biggest_change_features(all_feature_corrs, N_FEATURES_NEUTRALIZED)

In [36]:
# "garbage collection" (gc) gets rid of unused data and frees up memory
gc.collect()

8

---
# Create model

In [37]:
# model name
model_name = f"dh_{MODEL_NAME}_v{VERSION}"
model_name

'dh_cobra_v0'

In [38]:
# check existing model, if not, train it
print(f"Checking for existing model: {model_name}...")
model = load_model(model_name)

if not model:
    print(f"Model not found, creating new one")
    params = {"n_estimators": 2000,
              "learning_rate": 0.01,
              "max_depth": 5,
              "num_leaves": 2 ** 5,
              "colsample_bytree": 0.1,
              "verbosity": 1,
              "n_jobs": -1}
    model = LGBMRegressor(**params)
    
    print(f"Training model: {model_name}...")
    model.fit(
        training_data.filter(like='feature_', axis='columns'),
        training_data[TARGET_COL]
    )
    
    print(f"Saving new model: {model_name}...")
    save_model(model, model_name)

Checking for existing model: dh_cobra_v0...
Model not found, creating new one
Training model: dh_cobra_v0...
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3040
[LightGBM] [Info] Number of data points in the train set: 2420521, number of used features: 608
[LightGBM] [Info] Start training from score 0.500001
Saving new model: dh_cobra_v0...


In [39]:
# clear memory of unused data
gc.collect()

48

---
# Create predictions

In [40]:
# check for nans
nans_per_col = live_data[live_data["data_type"] == "live"].isna().sum()

if nans_per_col.any():
    total_rows = len(live_data[live_data["data_type"] == "live"])
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
else:
    print("No nans in the features this week!")

Number of nans per column this week: target_nomi_v4_20    5324
dtype: int64
out of 5324 total rows
filling nans with 0.5


In [41]:
# check feature name
model_expected_features = model.booster_.feature_name()

if set(model_expected_features) != set(features):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(
    validation_data.loc[:, model_expected_features])
live_data.loc[:, f"preds_{model_name}"] = model.predict(
    live_data.loc[:, model_expected_features])

In [42]:
# clear memory of unused data
gc.collect()

23

In [43]:
# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_{N_FEATURES_NEUTRALIZED}"] = neutralize(
    df=validation_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

live_data[f"preds_{model_name}_neutral_riskiest_{N_FEATURES_NEUTRALIZED}"] = neutralize(
    df=live_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

In [44]:
# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
model_to_submit = f"preds_{model_name}_neutral_riskiest_{N_FEATURES_NEUTRALIZED}"

validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"predictions/{model_name}_val_preds_{current_round}.csv")
live_data["prediction"].to_csv(f"predictions/{model_name}_live_preds_{current_round}.csv")

---
# Submit predictions

In [45]:
# submit validation file for diagnostics
napi.upload_diagnostics(
    file_path=f"predictions/{model_name}_val_preds_{current_round}.csv",
    model_id=COBRA_MODEL_ID
)

print(f"Submitted validation prediction file '{model_name}_val_preds_{current_round}.csv'")

2022-04-29 14:54:42,254 INFO numerapi.base_api: uploading diagnostics...


Submitted validation prediction file 'dh_cobra_v0_val_preds_313.csv'


In [46]:
# submit live predictions
napi.upload_predictions(
    file_path=f"predictions/{model_name}_live_preds_{current_round}.csv", 
    model_id=COBRA_MODEL_ID
)

print(f"Submitted live prediction file '{model_name}_live_preds_{current_round}.csv'")