In [7]:
import pandas as pd
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
from utils import save_model, load_model, load_model_production, neutralize, get_biggest_change_features, validation_metrics, download_data
import numpy as np

EXAMPLE_PREDS_COL = "example_preds"
TARGET_COL = "target_nomi_20"

downsample_cross_val = 5
downsample_full_train = 1

cv = 4

# std for gaussian noise
std = 0.3



In [2]:
# '''I needed to restart kernel'''
# training_data = pd.read_parquet('tmp_data.parquet')

napi = NumerAPI()

print('downloading training_data')
download_data(napi, 'numerai_training_data.parquet', 'numerai_training_data.parquet')

print("reading training data from local file")
training_data = pd.read_parquet('numerai_training_data.parquet').dropna()

downloading training_data
⠋ Downloading numerai_training_data.parquet⠙ Downloading numerai_training_data.parquet⠹ Downloading numerai_training_data.parquet⠸ Downloading numerai_training_data.parquet⠼ Downloading numerai_training_data.parquet⠴ Downloading numerai_training_data.parquet⠦ Downloading numerai_training_data.parquet⠧ Downloading numerai_training_data.parquet⠇ Downloading numerai_training_data.parquet⠏ Downloading numerai_training_data.parquet

2022-03-16 12:26:44,822 INFO numerapi.utils: target file already exists
2022-03-16 12:26:44,824 INFO numerapi.utils: download complete


✔ Downloading numerai_training_data.parquet
reading training data from local file


2022-03-16 12:26:57,731 INFO numexpr.utils: Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-03-16 12:26:57,732 INFO numexpr.utils: NumExpr defaulting to 8 threads.




In [3]:

# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
model_params = {"n_estimators": 2000,
                "learning_rate": .01,
                "max_depth": 5,
                "num_leaves": 2 ** 5,
                "colsample_bytree": 0.1}

# pick some targets to use
targets = ["target_nomi_20"]#[c for c in training_data.columns if c.startswith("target_")]
print("TARGETS", targets)

# all the possible features to train on
feature_cols = [c for c in training_data if c.startswith("feature_")]#

gc.collect()

TARGETS ['target_nomi_20']


0



In [4]:
'''Preprocess data'''
np.save( 'eras.npy', training_data['era'].astype('int').to_numpy() )
np.save( 'train_data.npy', training_data[ feature_cols ].to_numpy() )
np.save( 'train_targets.npy', training_data[targets].to_numpy() )
np.save( 'feature_cols.npy', np.array( feature_cols ) )



In [None]:
training_data.drop( columns=feature_cols, inplace=True )

In [9]:
train_data = np.load('train_data.npy', mmap_mode='r+')
train_targets = np.load('train_targets.npy', mmap_mode='r+')
eras = np.load('eras.npy', mmap_mode='r+')



In [10]:
def get_time_series_cross_val_splits(all_train_eras, cv = 3, embargo = 12):
    len_split = len(all_train_eras) // cv
    test_splits = [all_train_eras[i * len_split:(i + 1) * len_split] for i in range(cv)]
    # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv
    test_splits[-1] = np.append(test_splits[-1], all_train_eras[-1])

    train_splits = []
    for test_split in test_splits:
        test_split_max = int(np.max(test_split))
        test_split_min = int(np.min(test_split))
        # get all of the eras that aren't in the test split
        train_split_not_embargoed = [e for e in all_train_eras if not (test_split_min <= int(e) <= test_split_max)]
        # embargo the train split so we have no leakage.
        # one era is length 5, so we need to embargo by target_length/5 eras.
        # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras.
        train_split = [e for e in train_split_not_embargoed if abs(int(e) - test_split_max) > embargo and abs(int(e) - test_split_min) > embargo]
        train_splits.append(train_split)

    # convenient way to iterate over train and test splits
    train_test_zip = zip(train_splits, test_splits)
    return train_test_zip



In [11]:
print("entering time series cross validation loop")

train_test_zip = get_time_series_cross_val_splits(np.unique( eras ), cv=cv, embargo=12)

ensemble_cols = set()
pred_cols = set()

predictions = { "orig": np.empty(train_data.shape[0]), "noisy": np.empty(train_data.shape[0]), "both": np.empty(train_data.shape[0]) }

for split, train_test_split in enumerate(train_test_zip):
    
    gc.collect()
    print(f"doing split {split+1} out of {cv}")
    train_split, test_split = train_test_split
    train_split_index = np.isin(eras, train_split)
    test_split_index = np.isin(eras, test_split)
    downsampled_train_split_index = train_split_index
    print( train_split_index.shape, test_split_index.shape )

    print(f"entering model training loop for split {split+1}")
    for target in targets:
        
        """original features"""
        
        model_name = f"model_{target}_original"
        print(f"model: {model_name}")

        split_model = LGBMRegressor(**model_params)
        split_model.fit(
            train_data[downsampled_train_split_index][::downsample_cross_val],
            train_targets[downsampled_train_split_index][::downsample_cross_val].flatten()
        )

        print(f"predicting {model_name}")
        predictions['orig'][test_split_index] = split_model.predict(train_data[test_split_index])
        
        """new features"""
        
        model_name = f"model_{target}_with_noise"
        print(f"model: {model_name}")

        split_model = LGBMRegressor(**model_params)
        split_model.fit( 
            train_data[downsampled_train_split_index][::downsample_cross_val] + np.random.normal(0, std, train_data[downsampled_train_split_index][::downsample_cross_val].shape ) , 
            train_targets[downsampled_train_split_index][::downsample_cross_val].flatten() 
        )

        print(f"predicting {model_name}")
        predictions['noisy'][test_split_index] = split_model.predict( train_data[test_split_index] )
        
        """both"""
        
        model_name = f"model_{target}_both"
        print(f"model: {model_name}")

        split_model = LGBMRegressor(**model_params)
        split_model.fit( np.vstack([
            train_data[downsampled_train_split_index][::downsample_cross_val],
            train_data[downsampled_train_split_index][::downsample_cross_val] + np.random.normal(0, std, train_data[downsampled_train_split_index][::downsample_cross_val].shape )
        ]) , np.vstack([
            train_targets[downsampled_train_split_index][::downsample_cross_val],
            train_targets[downsampled_train_split_index][::downsample_cross_val]
        ]).flatten())

        print(f"predicting {model_name}")
        predictions['both'][test_split_index] = split_model.predict( train_data[test_split_index] )
        gc.collect()


entering time series cross validation loop
doing split 1 out of 4
(2390952,) (2390952,)
entering model training loop for split 1
model: model_target_nomi_20_original
predicting model_target_nomi_20_original
model: model_target_nomi_20_with_noise
predicting model_target_nomi_20_with_noise
model: model_target_nomi_20_both
predicting model_target_nomi_20_both
doing split 2 out of 4
(2390952,) (2390952,)
entering model training loop for split 2
model: model_target_nomi_20_original
predicting model_target_nomi_20_original
model: model_target_nomi_20_with_noise
predicting model_target_nomi_20_with_noise
model: model_target_nomi_20_both
predicting model_target_nomi_20_both
doing split 3 out of 4
(2390952,) (2390952,)
entering model training loop for split 3
model: model_target_nomi_20_original
predicting model_target_nomi_20_original
model: model_target_nomi_20_with_noise
predicting model_target_nomi_20_with_noise
model: model_target_nomi_20_both
predicting model_target_nomi_20_both
doing spl

In [12]:
''''''
# training_data.to_parquet('tmp_data.parquet')
training_data['preds_orig'] = predictions['orig']
training_data['preds_noisy'] = predictions['noisy']
training_data['preds_both'] = predictions['both']



In [13]:
training_stats = validation_metrics(training_data.dropna(), ['preds_orig', 'preds_noisy', 'preds_both'], example_col=f"preds_orig", fast_mode=True)
training_stats   

Unnamed: 0,mean,std,sharpe,max_drawdown,apy,mmc_mean,corr_plus_mmc_sharpe,corr_with_example_preds
preds_orig,0.054377,0.028398,1.914835,-0.201473,1215.404782,6.4e-05,1.846502,1.0
preds_noisy,0.050325,0.030708,1.638831,-0.280166,985.666244,0.004314,1.39911,0.810352
preds_both,0.055178,0.030051,1.836121,-0.260541,1262.290479,0.003292,1.701947,0.929724


