# Base Model

---
# Load data & libraries

In [47]:
# import dependencies
import pandas as pd
import numpy as np
import scipy
from pathlib import Path
from halo import Halo
import gc
import json
from numerapi import NumerAPI
from sklearn.ensemble import HistGradientBoostingRegressor

from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)





In [39]:
# functions
def get_biggest_change_features(corrs, n):
    all_eras = corrs.index.sort_values()
    h1_eras = all_eras[:len(all_eras) // 2]
    h2_eras = all_eras[len(all_eras) // 2:]
    h1_corr_means = corrs.loc[h1_eras, :].mean()
    h2_corr_means = corrs.loc[h2_eras, :].mean()
    corr_diffs = h2_corr_means - h1_corr_means
    worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
    return worst_n

def load_model(name):
    path = Path(f"{MODEL_FOLDER}/{name}.pkl")
    if path.is_file():
        model = pd.read_pickle(f"{MODEL_FOLDER}/{name}.pkl")
    else:
        model = False
    return model

def save_model(model, name):
    try:
        Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
    except Exception as ex:
        pass
    pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")

def neutralize(df, columns, neutralizers=None, proportion=1.0, normalize=True, era_col="era"):
    if neutralizers is None:
        neutralizers = []
    unique_eras = df[era_col].unique()
    computed = []
    for u in unique_eras:
        df_era = df[df[era_col] == u]
        scores = df_era[columns].values
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))
        scores /= scores.std(ddof=0)
        computed.append(scores)
    return pd.DataFrame(np.concatenate(computed), columns=columns, index=df.index)



In [2]:
# instantiate numerai api
napi = NumerAPI()
current_round = napi.get_current_round()
print(f"Current round #: {current_round}")

Current round #: 310


In [5]:
# read the feature metadata and get the "small" feature set
print('Reading minimal training data')

with open("data/features.json", "r") as f:
    feature_metadata = json.load(f)
features = feature_metadata["feature_sets"]["small"]
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

training_data = pd.read_parquet('data/training_data.parquet', columns=read_columns)

Reading minimal training data


---
# Feature selection

In [6]:
# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(
    lambda era: era[features].corrwith(era[TARGET_COL])
)

In [10]:
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
riskiest_features

['feature_undivorced_unsatisfying_praetorium',
 'feature_exorbitant_myeloid_crinkle',
 'feature_unvaried_social_bangkok',
 'feature_haziest_lifelike_horseback',
 'feature_lofty_acceptable_challenge',
 'feature_unaired_operose_lactoprotein',
 'feature_canalicular_peeling_lilienthal',
 'feature_apomictical_motorized_vaporisation',
 'feature_travelled_semipermeable_perruquier',
 'feature_silver_handworked_scauper',
 'feature_antichristian_slangiest_idyllist',
 'feature_slack_calefacient_tableau',
 'feature_jerkwater_eustatic_electrocardiograph',
 'feature_unforbidden_highbrow_kafir',
 'feature_flintier_enslaved_borsch',
 'feature_assenting_darn_arthropod',
 'feature_univalve_abdicant_distrail',
 'feature_beery_somatologic_elimination',
 'feature_bhutan_imagism_dolerite',
 'feature_glare_factional_assessment',
 'feature_unsealed_suffixal_babar',
 'feature_grandmotherly_circumnavigable_homonymity',
 'feature_winsome_irreproachable_milkfish',
 'feature_branched_dilatory_sunbelt',
 'feature_s

In [13]:
# "garbage collection" (gc) gets rid of unused data and frees up memory
gc.collect()

25

---
# Create model

In [28]:
model_name = f"base_model"
print(f"Checking for existing model '{model_name}'")
model = load_model(model_name)

if not model:
    print(f"model not found, creating new one")
    params = {"max_iter": 2000,
              "learning_rate": 0.01,
              "max_depth": 5,
              "max_leaf_nodes": 2 ** 5}

    model = HistGradientBoostingRegressor(**params)
    print(f"Training model: {model_name}")
    model.fit(
        training_data.filter(like='feature_', axis='columns'),
        training_data[TARGET_COL]
    )
    print(f"saving new model: {model_name}")
    save_model(model, model_name)

Checking for existing model 'base_model'
model not found, creating new one
Training model: base_model
saving new model: base_model


In [29]:
# clear memory of unused data
gc.collect()

0



---
# Create predictions

In [31]:
print('Reading minimal features of validation and tournament data...')
validation_data = pd.read_parquet('data/validation_data.parquet', columns=read_columns)
tournament_data = pd.read_parquet(f'data/tournament_data_{current_round}.parquet', columns=read_columns)

Reading minimal features of validation and tournament data...


In [32]:
nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()

if nans_per_col.any():
    total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
else:
    print("No nans in the features this week!")

Number of nans per column this week: target_nomi_20    5351
dtype: int64
out of 5351 total rows
filling nans with 0.5


In [36]:
# check feature name
model_expected_features = model.feature_names_in_

if set(model_expected_features) != set(features):
    print(f"New features are available! Might want to retrain model {model_name}.")
validation_data.loc[:, f"preds_{model_name}"] = model.predict(
    validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{model_name}"] = model.predict(
    tournament_data.loc[:, model_expected_features])



In [37]:
# clear memory of unused data
gc.collect()

4361



In [42]:
# neutralize our predictions to the riskiest features
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
    df=validation_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

tournament_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
    df=tournament_data,
    columns=[f"preds_{model_name}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)



In [46]:
# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
model_to_submit = f"preds_{model_name}_neutral_riskiest_50"
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv(f"predictions/validation_predictions_{current_round}.csv")
tournament_data["prediction"].to_csv(f"predictions/tournament_predictions_{current_round}.csv")



In [45]:
print(f'''
Done! Next steps:
    1. Go to numer.ai/tournament (make sure you have an account)
    2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
    3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
''')


Done! Next steps:
    1. Go to numer.ai/tournament (make sure you have an account)
    2. Submit validation_predictions_310.csv to the diagnostics tool
    3. Submit tournament_predictions_310.csv to the "Upload Predictions" button

