# Base Model

Updated: 2022-04-09

---
# Load data & libraries

In [1]:
# import dependencies
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
from numerapi import NumerAPI
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)



In [2]:
# instantiate numerai api
napi = NumerAPI()
current_round = napi.get_current_round()
print(f"Current round #: {current_round}")

Current round #: 312


In [4]:
# read the feature metadata and get the "medium" feature set
print('Reading minimal training data')

with open("data/features.json", "r") as f:
    feature_metadata = json.load(f)
features = feature_metadata["feature_sets"]["medium"]
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

training_data = pd.read_parquet('data/train.parquet', columns=read_columns)
validation_data = pd.read_parquet('data/validation.parquet', columns=read_columns)
live_data = pd.read_parquet(f'data/live_{current_round}.parquet', columns=read_columns)

Reading minimal training data


---
# Feature selection

In [5]:
# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(
    lambda era: era[features].corrwith(era[TARGET_COL])
)



In [6]:
# get riskiest features
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
riskiest_features

['feature_censorial_leachier_rickshaw',
 'feature_basophil_urdy_matzo',
 'feature_unsustaining_chewier_adnoun',
 'feature_confusable_pursy_plosion',
 'feature_protonematal_springtime_varioloid',
 'feature_choosier_uncongenial_coachwood',
 'feature_scissile_dejected_kainite',
 'feature_unimaginable_sec_kaka',
 'feature_percipient_atelectatic_cinnamon',
 'feature_narcotized_collectivist_evzone',
 'feature_pruinose_raploch_roubaix',
 'feature_trespassing_unmacadamized_villeneuve',
 'feature_holy_chic_cali',
 'feature_revitalizing_rutilant_swastika',
 'feature_pulverized_unified_dupery',
 'feature_skim_unmeant_bandsman',
 'feature_southerly_assonant_amicability',
 'feature_illuvial_algebraic_modem',
 'feature_unsizable_ancestral_collocutor',
 'feature_unreversed_fain_jute',
 'feature_coraciiform_sciurine_reef',
 'feature_galactopoietic_luckiest_protecting',
 'feature_lateral_confervoid_belgravia',
 'feature_flawed_demonological_toady',
 'feature_unrecognisable_waxier_paging',
 'feature_pal



In [7]:
# "garbage collection" (gc) gets rid of unused data and frees up memory
gc.collect()

0



---
# Create model

In [9]:
model_name = f"base_model"
print(f"Checking for existing model: {model_name}...")
model = load_model(model_name)

if not model:
    print(f"Model not found, creating new one")
    params = {"n_estimators": 2000,
              "learning_rate": 0.01,
              "max_depth": 5,
              "num_leaves": 2 ** 5,
              "colsample_bytree": 0.1,
              "verbosity": 1}
    model = LGBMRegressor(**params)
    
    print(f"Training model: {model_name}...")
    model.fit(
        training_data.filter(like='feature_', axis='columns'),
        training_data[TARGET_COL]
    )
    
    print(f"Saving new model: {model_name}...")
    save_model(model, model_name)

Checking for existing model: base_model...


In [10]:
# clear memory of unused data
gc.collect()

32



---
# Create predictions

In [11]:
# check for nans
nans_per_col = live_data[live_data["data_type"] == "live"].isna().sum()

if nans_per_col.any():
    total_rows = len(live_data[live_data["data_type"] == "live"])
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
else:
    print("No nans in the features this week!")

Number of nans per column this week: target_nomi_v4_20    5341
dtype: int64
out of 5341 total rows
filling nans with 0.5


In [12]:
# check feature name
model_expected_features = model.booster_.feature_name()

if set(model_expected_features) != set(features):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(
    validation_data.loc[:, model_expected_features])
live_data.loc[:, f"preds_{model_name}"] = model.predict(
    live_data.loc[:, model_expected_features])



In [13]:
# clear memory of unused data
gc.collect()

24



In [14]:
# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
    df=validation_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
    df=live_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)



In [15]:
# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"predictions/validation_predictions_{current_round}.csv")
live_data["prediction"].to_csv(f"predictions/live_predictions_{current_round}.csv")



In [16]:
print(f'''
Done! Next steps:
    1. Go to numer.ai/tournament (make sure you have an account)
    2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
    3. Submit live_predictions_{current_round}.csv to the "Upload Predictions" button
''')


Done! Next steps:
    1. Go to numer.ai/tournament (make sure you have an account)
    2. Submit validation_predictions_312.csv to the diagnostics tool
    3. Submit live_predictions_312.csv to the "Upload Predictions" button

