In [1]:
# from numerapi import NumerAPI
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing
import lightgbm
import json
from collections import Counter
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
import gc
from pathlib import Path

def save_model(model, name):
    """Pickle ``model`` to ``models/<name>.pkl`` relative to the working directory.

    The ``models`` directory is created on demand.

    Args:
        model: Any object that ``pd.to_pickle`` can serialize.
        name: File stem; the ``.pkl`` suffix is appended here.

    Raises:
        OSError: If ``models`` cannot be created (for example because a regular
            file with that name already exists) or the pickle cannot be written.
    """
    # Directory-creation errors propagate on purpose: if the folder cannot be
    # created the write below would fail anyway, only with a less useful error.
    Path("models").mkdir(exist_ok=True, parents=True)
    pd.to_pickle(model, f"models/{name}.pkl")


def load_model(model_folder, name):
    """Load the pickled object stored at ``<model_folder>/<name>.pkl``.

    Args:
        model_folder: Directory that holds the pickle file.
        name: File stem, without the ``.pkl`` suffix.

    Returns:
        The unpickled object, or ``False`` when no such file exists.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        from a trusted source.
    """
    location = f"{model_folder}/{name}.pkl"
    if not Path(location).is_file():
        # Callers receive a False sentinel rather than an exception.
        return False
    return pd.read_pickle(location)

def save_prediction(prediction, name):
    """Pickle ``prediction`` to ``predictions/<name>.pkl`` relative to the working directory.

    The ``predictions`` directory is created on demand.

    Args:
        prediction: Any object that ``pd.to_pickle`` can serialize.
        name: File stem; the ``.pkl`` suffix is appended here.

    Raises:
        OSError: If ``predictions`` cannot be created (for example because a
            regular file with that name already exists) or the pickle cannot
            be written.
    """
    # Directory-creation errors propagate on purpose: if the folder cannot be
    # created the write below would fail anyway, only with a less useful error.
    Path("predictions").mkdir(exist_ok=True, parents=True)
    pd.to_pickle(prediction, f"predictions/{name}.pkl")

# napi = NumerAPI()

In [2]:
# napi.download_dataset("v4.1/train_int8.parquet", "train_int8.parquet")
# napi.download_dataset("v4.1/validation_int8.parquet", "validation_int8.parquet")
# napi.download_dataset("v4.1/features.json", "features.json")

# Directory prefix for the v4.1 dataset files. Later cells build file names by
# plain string concatenation (path + "..."), so the trailing slash is required.
path = "/kaggle/input/numerai-latest-tournament-data/v4.1/"

In [3]:
# Read the feature metadata JSON that sits next to the parquet files.
with open(path+"features.json", "r") as metadata_file:
    feature_metadata = json.load(metadata_file)

# Use the "medium" feature set and report how many columns it contains.
features = feature_metadata["feature_sets"]["medium"]
print(len(features))

# Every listed target except the first one, bucketed by the suffix of its name.
targets = feature_metadata["targets"][1:]
targets_v20 = list(filter(lambda target_name: target_name.endswith('20'), targets))
targets_v60 = list(filter(lambda target_name: target_name.endswith('60'), targets))

print(len(targets_v20), len(targets_v60))

641
18 18


In [4]:
# Selects which target family to train on: the names ending in "20" or in "60".
TARGET = 60

# `divide` is the era stride used by the loading cell (rows are kept only when
# era % divide == 0): every 4th era for the 20 targets, every 12th for the 60s.
if TARGET == 20:
    targets = targets_v20
    divide = 4
elif TARGET == 60:
    targets = targets_v60
    divide = 12
else:
    # Fail fast: merely printing and continuing would leave `divide` undefined
    # and `targets` still pointing at the full, unfiltered target list.
    raise ValueError(f"error, TARGET has to be 20 or 60 (got {TARGET!r})")

In [5]:
def _load_era_subset(filename):
    """Read one parquet split and keep only every ``divide``-th era.

    Args:
        filename: File name relative to ``path`` (e.g. ``'train_int8.parquet'``).

    Returns:
        DataFrame holding the requested id/era/target/feature columns, with
        ``era`` converted to int and rows restricted to eras that are a
        multiple of ``divide``.
    """
    frame = pd.read_parquet(path+filename, columns=['id', 'era'] + targets + features).reset_index()
    # Assign the column directly: `.loc[:, "era"] = ...` writes into the existing
    # column and, on recent pandas versions, may keep its original dtype.
    frame["era"] = frame["era"].astype(int)
    return frame.loc[frame["era"] % divide == 0]


# Each split is filtered right after it is read, so the full train and full
# validation frames are never held in memory at the same time.
train_df = _load_era_subset('train_int8.parquet')
validation_df = _load_era_subset('validation_int8.parquet')

# ignore_index gives the combined frame a unique RangeIndex; without it the two
# inputs contribute overlapping integer labels from their own reset_index().
all_data = pd.concat([train_df, validation_df], ignore_index=True)
print(all_data.shape)
del train_df, validation_df

(412928, 661)


In [6]:
# List the distinct era values left after the era % divide == 0 filter above.
print(all_data['era'].unique())

[  12   24   36   48   60   72   84   96  108  120  132  144  156  168
  180  192  204  216  228  240  252  264  276  288  300  312  324  336
  348  360  372  384  396  408  420  432  444  456  468  480  492  504
  516  528  540  552  564  576  588  600  612  624  636  648  660  672
  684  696  708  720  732  744  756  768  780  792  804  816  828  840
  852  864  876  888  900  912  924  936  948  960  972  984  996 1008
 1020 1032 1044 1056 1068]


In [7]:
# Missing feature values (pd.NA in the Int8 columns) do not work well with the
# models, so each one is replaced by the median of its own column.
print("Cleaning up NAs")
feature_medians = all_data[features].median(skipna=True)
# NOTE(review): the result is cast back to int8, so a fractional median would
# not be stored unchanged -- confirm the medians are integral for this data.
all_data[features] = all_data[features].fillna(feature_medians).astype("int8")
# An alternative would be to turn the affected columns into floats and use np.nan.
print("Cleaned up NAs")

Cleaning up NAs
Cleaned up NAs


## Train models on the combined train and validation data

In [8]:
# Hyper-parameters handed to the LightGBM regressor below: many trees with a
# very small learning rate, each tree sampling 10% of the feature columns.
# params_name = "lg_lgbm"
params = dict(
    n_estimators=20000,
    learning_rate=0.001,
    max_depth=6,
    num_leaves=2**6,
    colsample_bytree=0.1,
)
# Very small fast params
# params_name = "vsm_lgbm"
# params = {"n_estimators": 2,
#           "learning_rate": 1,
#           "max_depth": 2,
#           "num_leaves": 2 ** 2,
#           "colsample_bytree": 0.1}

# One estimator instance; the training cell re-fits this same object per model.
model_obj = lightgbm.LGBMRegressor(**params)

In [9]:
%%time
# corr = []

# for counter, train_eras in enumerate(train_data_library):
# Suffix appended to every saved model name. It is derived from TARGET so the
# label stays truthful if the target family is switched ("v60" when TARGET == 60).
version_tag = f"v{TARGET}"

# Exclusive upper bounds on `era` for the training rows.
# NOTE(review): 10000 is above every era printed earlier, so it presumably means
# "use all eras", while 888 leaves the later eras out -- confirm the intent.
era_cutoffs = [10000, 888]

for target in targets[:3]:  # only the first three selected targets are trained
    for subset in era_cutoffs:
        model_name = f"model_{target}_{subset}_{version_tag}"
        print(f"training {model_name}")
        # Build the row mask once and reuse it for both features and labels,
        # instead of filtering the whole frame twice per fit.
        in_subset = all_data["era"] < subset
        model_obj.fit(all_data.loc[in_subset, features], all_data.loc[in_subset, target])
        # The same estimator object is reused for the next fit, so persist it now.
        save_model(model_obj, model_name)
        print(f'Completed: training {model_name}')

# Legacy evaluation snippet kept for reference. It is not runnable as-is: it
# uses `counter` and `live_df`, which no visible cell defines, and
# `validation_df`, which the loading cell deletes.
#     # Load model
#     model_obj = load_model("/kaggle/input/numerai-recent-eras-hypo-train-on-val-set/models/",f"model_{counter}")
#     # Predict on validation data to check performance (we want to maximize corr with validation data but also not overfit)
#     print("Predicting on validation")
#     pred_hillclimb = model_obj.predict(validation_df.loc[:, features])
#     validation_df.loc[:, 'pred_{}'.format(counter)] = pred_hillclimb
#     # Predict on live to get correlation with live model (we want to minimize corr with live model)
#     print("Predicting on live")
#     pred_live = model_obj.predict(live_df.loc[:, features])
#     live_df.loc[:, 'pred_{}'.format(counter)] = pred_live

training model_target_nomi_v4_60_10000_v60
Completed: training model_target_nomi_v4_60_10000_v60
training model_target_nomi_v4_60_888_v60
Completed: training model_target_nomi_v4_60_888_v60
training model_target_tyler_v4_60_10000_v60
Completed: training model_target_tyler_v4_60_10000_v60
training model_target_tyler_v4_60_888_v60
Completed: training model_target_tyler_v4_60_888_v60
training model_target_victor_v4_60_10000_v60
Completed: training model_target_victor_v4_60_10000_v60
training model_target_victor_v4_60_888_v60
Completed: training model_target_victor_v4_60_888_v60
CPU times: user 14h 11min 11s, sys: 15min 59s, total: 14h 27min 11s
Wall time: 4h 5min 17s


In [10]:
# pred_cols = [c for c in validation_df.columns if (c.startswith('pred'))]
# (pred_cols)

### Save predictions

In [11]:
# validation_df[pred_cols].to_parquet("validation_pred.parquet")
# live_df[pred_cols].to_parquet("live_pred.parquet")

### Make an ensemble

In [12]:
# # make an ensemble
# validation_df.loc[:, "pred_equal_weight"] = validation_df[pred_cols].mean(axis=1)
# live_df["pred_equal_weight"] = live_df[pred_cols].mean(axis=1)
# pred_cols.append("pred_equal_weight")

### Neutralize with risky features

In [13]:
# # compute feature correlations with target 
# all_feature_corrs = train_df.groupby('era').apply(lambda d: d[features].astype(float).corrwith(d["target"]))
# # compute the volatility of the feature correlations
# feature_corr_volatility = all_feature_corrs.std()

# risky_feat_library = {}
# for pred in pred_cols:
#     print(pred)
#     # calculate the feature exposures of the predictions
#     feature_exposure_list = []
#     for feature in features:
#         feature_exposure_list.append(np.corrcoef(validation_df[feature].astype(float), validation_df[pred])[0,1])
#     feature_exposure_list = pd.Series(feature_exposure_list, index=features)
#     # get list of  riskiest features
#     risky_feat_library[pred] = (feature_exposure_list.abs()*feature_corr_volatility).sort_values(ascending=False).index.tolist()

In [14]:
# for pred in pred_cols:
#     for prop in [0.25, 0.5, 0.75]:
#         for risky_feat in [100, 200, 300]:
#             print(f"{pred}_{prop}_neutralized_{risky_feat}")
#             # make a feature-neutral variation of the model at proportion `prop`
#             validation_df[f"{pred}_{prop}_neutralized_{risky_feat}"] = neutralize(
#                 df=validation_df,
#                 columns=[f"{pred}"],
#                 neutralizers=risky_feat_library[pred][:risky_feat],
#                 proportion=prop,
#                 normalize=True,
#                 era_col="era",
#                 verbose=True,
#             )

#             # do the same for live data
#             live_df[f"{pred}_{prop}_neutralized_{risky_feat}"] = neutralize(
#                 df=live_df,
#                 columns=[f"{pred}"],
#                 neutralizers=risky_feat_library[pred][:risky_feat],
#                 proportion=prop,
#                 normalize=True,
#                 era_col="era",
#                 verbose=True,
#             )

# pred_cols = [c for c in validation_df.columns if (c.startswith('pred'))]
# (pred_cols)

In [15]:
# ## Print correlation performance of each prediction with target
# validation_perf_df = pd.DataFrame(columns=['corr_w_target', 'sharpe'])
# for pred in pred_cols:
#     corr_across_eras = validation_df.groupby('era').apply(lambda df: df[pred].corr(df['target'], method='spearman'))
#     validation_perf_df.loc[pred,'corr_w_target'] = corr_across_eras.mean()
#     validation_perf_df.loc[pred,'sharpe'] = corr_across_eras.mean() / corr_across_eras.std(ddof=0)

# validation_perf_df.sort_values(by='corr_w_target', ascending=False)

In [16]:
# # Correlation with live prediction
# live_perf_dict = {}
# for pred in pred_cols:
#     corr_mean = live_df[pred].corr(live_example_preds.reset_index(drop=True), method='spearman')
#     live_perf_dict[pred] = corr_mean
# live_perf_df = pd.DataFrame.from_dict(live_perf_dict, orient='index', columns=['corr_w_live_predictions'])
# live_perf_df.sort_values(by='corr_w_live_predictions', ascending=False)

In [17]:
# results_df = validation_perf_df.join(live_perf_df)
# results_df.sort_values(by='corr_w_target', ascending=False)