# Model: BeautyBeast v0

---
# Load data & libraries

In [2]:
# model identity: base name and version number, combined further down
# into the "dh_<name>_v<version>" identifier
VERSION = 0
MODEL_NAME = "beautybeast"

In [3]:
# import dependencies
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
from numerapi import NumerAPI
from utils.utils import (
    save_model,
    load_model,
    ERA_COL, 
    DATA_TYPE_COL
)
from utils.api_keys import PUBLIC_ID, SECRET_KEY

In [4]:
# create the Numerai API client with the credentials imported from utils.api_keys
napi = NumerAPI(public_id=PUBLIC_ID, secret_key=SECRET_KEY)
# ask the API for the current round number; it is used further down to pick the
# live data file (data/live_<round>.parquet) and to name the live predictions file
current_round = napi.get_current_round()
print(f"Current round #: {current_round}")

Current round #: 316


In [5]:
# load the feature metadata file and keep only the "medium" feature set
with open("data/features.json", "r") as metadata_file:
    feature_metadata = json.load(metadata_file)
features = feature_metadata["feature_sets"]["medium"]

# the ten 20-day v4 targets, spelled out from their short names
targets = [
    f"target_{short_name}_v4_20"
    for short_name in (
        "nomi", "jerome", "janet", "ben", "alan",
        "paul", "george", "william", "arthur", "thomas",
    )
]

# only these columns are read from disk: features, targets, era and data-type columns
read_columns = [*features, *targets, ERA_COL, DATA_TYPE_COL]

# read the three datasets in order (train, validation, live for the current round),
# each restricted to the same column list
training_data, validation_data, live_data = (
    pd.read_parquet(parquet_path, columns=read_columns)
    for parquet_path in (
        'data/train.parquet',
        'data/validation.parquet',
        f'data/live_{current_round}.parquet',
    )
)

---
# Target selection

In [6]:
# set the main target; every other loaded column whose name ends in "_20"
# is treated as an auxiliary target (column order of training_data is kept)
main_target = "target_nomi_v4_20"
aux_targets = [
    column_name
    for column_name in training_data.columns
    if column_name != main_target and column_name.endswith("_20")
]

In [7]:
# force a full garbage-collection pass to release objects that are no longer referenced;
# the value displayed as the cell output is the number of unreachable objects the collector found
gc.collect()

0

---
# Create auxiliary models

In [8]:
# build the model identifier from MODEL_NAME and VERSION; it prefixes the
# prediction file names written at the end of the notebook
model_name = f"dh_{MODEL_NAME}_v{VERSION}"
# display the identifier as the cell output
model_name

'dh_beautybeast_v0'

In [9]:
# For every auxiliary target: reuse a previously saved model when load_model finds one,
# otherwise train a new LightGBM regressor on the training data and save it.
# The resulting models are collected in model_list, in the same order as aux_targets.
#
# NOTE(review): models are stored and looked up by target name only, so a model saved
# from a different feature set or hyper-parameters would presumably be reused as-is --
# confirm against utils.load_model / utils.save_model.
# NOTE(review): if an auxiliary target column contains missing values they are passed to
# fit() unchanged -- confirm that is intended (the validation check fills NaNs with 0.5).

# hyper-parameters shared by every auxiliary model (identical for all targets, so defined once)
params = {
    "n_estimators": 500,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 2 ** 5,       # 32 leaves = the most a depth-5 tree can have
    "colsample_bytree": 0.1,    # each tree is built on a 10% sample of the feature columns
    "verbosity": 0
}

model_list = []

for t in aux_targets:
    print(f"Checking for existing auxiliary model: {t}...")
    model = load_model(t)

    # a falsy result from load_model is treated as "no saved model"
    if not model:
        print("Auxiliary model not found, creating new one.")
        model = LGBMRegressor(**params)

        print("Training auxiliary model...")
        # train on exactly the same column list (and order) that the prediction
        # cells pass to predict(); LightGBM matches columns by position
        model.fit(
            training_data[features],
            training_data[t]
        )

        print("Saving new auxiliary model...")
        save_model(model, t)

    print("Appending trained auxiliary model to list.")
    model_list.append(model)

Checking for existing auxiliary model: target_jerome_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_janet_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_ben_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_alan_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_paul_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_george_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_william_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_arthur_v4_20...
Appending trained auxiliary model to list.
Checking for existing auxiliary model: target_thomas_v4_20...
Appending trained auxiliary model to list.


In [10]:
# run a garbage-collection pass after the model loop;
# the displayed value is the number of unreachable objects found
gc.collect()

0

---
# Create auxiliary predictions on validation data

In [None]:
# run every auxiliary model over the validation features and collect one
# single-column frame per target (column named after the target, default integer index)
val_preds_list = []

for t, m in zip(aux_targets, model_list):
    print(f"Creating auxiliary predictions for {t}...")
    raw_val_scores = m.predict(validation_data[features])
    val_preds_list.append(pd.DataFrame(raw_val_scores, columns=[t]))

print("Concatenating predictions into a single dataframe...")
# place the per-target frames side by side: one column per auxiliary target
val_preds_all = pd.concat(val_preds_list, axis="columns")

In [None]:
# ensemble the auxiliary models: row-wise mean of their predictions, converted to
# percentile ranks (ties broken by order of appearance)
val_preds_avg_ranked = (
    val_preds_all
    .mean(axis="columns")
    .rank(method="first", pct=True)
)
# main target with missing values replaced by 0.5
val_target_filled = validation_data[main_target].fillna(0.5)
# Pearson correlation between target and ranked ensemble; np.corrcoef pairs the two
# inputs by position, and [0, 1] picks the off-diagonal entry of the 2x2 matrix
np.corrcoef(val_target_filled, val_preds_avg_ranked)[0, 1]

In [None]:
# run a garbage-collection pass after the validation predictions;
# the displayed value is the number of unreachable objects found
gc.collect()

---
# Create auxiliary predictions on live data

In [11]:
# run every auxiliary model over the live features and collect one
# single-column frame per target (column named after the target, default integer index)
live_preds_list = []

for t, m in zip(aux_targets, model_list):
    print(f"Creating auxiliary predictions for {t}...")
    raw_live_scores = m.predict(live_data[features])
    live_preds_list.append(pd.DataFrame(raw_live_scores, columns=[t]))

print("Concatenating predictions into a single dataframe...")
# place the per-target frames side by side: one column per auxiliary target
live_preds_all = pd.concat(live_preds_list, axis="columns")

Creating auxiliary predictions for target_jerome_v4_20...
Creating auxiliary predictions for target_janet_v4_20...
Creating auxiliary predictions for target_ben_v4_20...
Creating auxiliary predictions for target_alan_v4_20...
Creating auxiliary predictions for target_paul_v4_20...
Creating auxiliary predictions for target_george_v4_20...
Creating auxiliary predictions for target_william_v4_20...
Creating auxiliary predictions for target_arthur_v4_20...
Creating auxiliary predictions for target_thomas_v4_20...
Concatenating predictions into a single dataframe...


In [12]:
# ensemble the live predictions: row-wise mean across the auxiliary models, converted to
# percentile ranks (method="first" breaks ties by order of appearance);
# no correlation is computed in this cell
live_preds_avg_ranked = live_preds_all.mean(axis=1).rank(pct=True, method="first")

In [13]:
# run a garbage-collection pass after the live predictions;
# the displayed value is the number of unreachable objects found
gc.collect()

32

---
# Save predictions

In [14]:
# Write the ranked ensemble predictions to CSV: one "prediction" column holding the
# percentile ranks, re-indexed with the index of the data the predictions were made on.
#
# NOTE: val_preds_avg_ranked comes from the validation cells above; they must be run
# before this cell, otherwise the name is undefined.

# make sure the output folder exists so to_csv does not fail on a fresh checkout
Path("predictions").mkdir(parents=True, exist_ok=True)

# to_frame names the column explicitly, so the result does not depend on the Series being unnamed
val_preds = val_preds_avg_ranked.to_frame("prediction").set_index(validation_data.index)
live_preds = live_preds_avg_ranked.to_frame("prediction").set_index(live_data.index)

val_preds.to_csv(f"predictions/{model_name}_val_preds.csv")
live_preds.to_csv(f"predictions/{model_name}_live_preds_{current_round}.csv")