In [1]:
import json

import numerapi
import numpy as np
import pandas as pd

In [45]:
current_round = 254
directory = f'../round{current_round}/data'

with open('../dtypes.json') as f:
    dtypes = json.load(f)

full_path = f"{directory}/numerai_dataset_{current_round}/"
train_path = full_path + "numerai_training_data.csv"
test_path = full_path + "numerai_tournament_data.csv"
train = pd.read_csv(train_path, dtype=dtypes)
test = pd.read_csv(test_path, dtype=dtypes)

In [3]:
full = train#.append(test).reset_index(drop=True)
full['era'] = full.era.str.extract('(\d+|X)$', expand=False).str.zfill(4)

In [4]:
full.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25
2,n001c93979ac41d4,1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,...,0.25,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.25
3,n0034e4143f22a13,1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,...,1.0,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.25
4,n00679d1a636062f,1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,...,0.75,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75


In [5]:
train = full[full.data_type.isin(['train', 'validation'])]

In [6]:
features = [f for f in train.columns if f.startswith('feature')]
len(features)

310

In [22]:
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.metrics import make_scorer

from scipy.stats import spearmanr

In [23]:
from lightgbm import LGBMRegressor

In [33]:
param = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "l2",
    "device": "gpu",
    "num_iterations": 2000,
    "learning_rate": 0.006,
    "lambda_l1": 1.4,
    "lambda_l2": 1.0,
    "bagging_fraction": 0.55,
    "bagging_freq": 1,
    "num_leaves": 107,
    "max_depth": 15,
    "verbose": 0,
    "random_state": 0,
}

In [34]:
def spearman(y_true, y_pred, axis=0):
    """ Calculate Spearman correlation """
    return spearmanr(y_true, y_pred, axis=axis)[0]

In [35]:
cv = GroupKFold(n_splits=10)

model = LGBMRegressor(**param)

X = train[features]
y = train.target
groups = train.era

In [None]:
n_scores = cross_val_score(model, X, y, groups=groups, scoring=make_scorer(spearman), cv=cv)

In [37]:
n_scores

array([0.03784321, 0.03493898, 0.04653282, 0.04852294, 0.03696135,
       0.05303415, 0.04603644, 0.05357357, 0.04637529, 0.0465557 ])

In [41]:
train_t = train[train.data_type == 'train']
model.fit(train_t[features], train_t.target)



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 501808, number of used features: 310
[LightGBM] [Info] Using GPU Device: GeForce GTX 1080 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 308 dense feature groups (74.66 MB) transferred to GPU in 0.038982 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.499997


LGBMRegressor(bagging_fraction=0.55, bagging_freq=1, device='gpu',
              lambda_l1=1.4, lambda_l2=1.0, learning_rate=0.006, max_depth=15,
              metric='l2', num_iterations=2000, num_leaves=107,
              objective='regression', random_state=0, verbose=1)

In [49]:
val = test[test.data_type == 'validation'].copy()
val['prediction'] = model.predict(val[features])

In [79]:
from sklearn.metrics import mean_absolute_error

def spearmanr(target, pred):
    return np.corrcoef(target, pred.rank(pct=True, method="first"))[0, 1]

def sharpe_ratio(corrs: pd.Series) -> np.float32:
        """
        Calculate the Sharpe ratio for Numerai by using grouped per-era data

        :param corrs: A Pandas Series containing the Spearman correlations for each era
        :return: A float denoting the Sharpe ratio of your predictions.
        """
        return corrs.mean() / corrs.std()

#https://parmarsuraj99.medium.com/evaluating-financial-machine-learning-models-on-numerai-3562da8fd90
def calculate_feature_exposure(df, feature_names, prediction_name='prediction') -> list:
    """
    Example:
    -----
    feature_exposure, max_feat_exposure, square_sum_feature_exposure = calculate_feature_exposure(df, feature_names)
    """
    
    exposures = []
    for feature_name in feature_names:
        exposures.append(spearmanr(df[feature_name], df[prediction_name]))
        
    max_feat_exposure = np.max(np.abs(exposures))
    square_sum_feature_exposure = np.sum([e**2 for e in exposures])
    feature_exposure = np.std(exposures)

    return [feature_exposure, max_feat_exposure, square_sum_feature_exposure]

#Calculating Max Drawdown
def max_drawdown(df, prediction_name='prediction', target_name='target'):
    scores_per_era = df.groupby("era").apply(
        lambda x: spearmanr(x[prediction_name], x[target_name]))

    rolling_max = (scores_per_era+1).cumprod().rolling(window=100, min_periods=1).max()
    daily_value = (scores_per_era+1).cumprod()
    max_drawdown = (rolling_max - daily_value).max()

    return max_drawdown

def evaluate(df: pd.DataFrame, features) -> tuple:
        """
        Evaluate and display relevant metrics for Numerai 

        :param df: A Pandas DataFrame containing the columns "era", "target" and a column for predictions
        :param pred_col: The column where the predictions are stored
        :return: A tuple of float containing the metrics
        """
        def _score(sub_df: pd.DataFrame) -> np.float32:
            """Calculates Spearman correlation"""
            return spearmanr(sub_df["target"], sub_df["prediction"])

        # Calculate metrics
        corrs = df.groupby("era").apply(_score)
        print(corrs)
        payout_raw = (corrs / 0.2).clip(-1, 1)
        spearman = round(corrs.mean(), 4)

        payout = round(payout_raw.mean(), 4)
        numerai_sharpe = round(sharpe_ratio(corrs), 4)
        mae = mean_absolute_error(df["target"], df["prediction"]).round(4)
        drawdown = max_drawdown(df)
        fe, max_fe, square_sum_fe = calculate_feature_exposure(df, features)

        # Display metrics
        print(f"Spearman Correlation: {spearman}")
        print(f"Average Payout: {payout}")
        print(f"Sharpe Ratio: {numerai_sharpe}")
        print(f"Mean Absolute Error (MAE): {mae}")
        print(f"Max drawdown: {drawdown}")
        print(f"Feature exposure: {fe}, Max Feature Exposure: {max_fe}, Square Sum: {square_sum_fe}")
        return spearman, payout, numerai_sharpe, mae
        
feature_spearman_val = [spearmanr(val["prediction"], val[f]) for f in features]
feature_exposure_val = np.std(feature_spearman_val).round(4)
spearman, payout, numerai_sharpe, mae = evaluate(val, features)

era
era121    0.043639
era122    0.020491
era123    0.041825
era124    0.061213
era125    0.022969
era126    0.039504
era127    0.013370
era128    0.063521
era129   -0.016328
era130    0.068619
era131    0.037344
era132    0.068893
era197    0.032808
era198    0.013265
era199   -0.026361
era200   -0.000153
era201    0.004489
era202    0.037945
era203    0.030135
era204    0.022000
era205    0.001084
era206   -0.012192
era207    0.050778
era208    0.055984
era209    0.042172
era210   -0.019489
era211   -0.025487
era212    0.025028
dtype: float64
Spearman Correlation: 0.0249
Average Payout: 0.1245
Sharpe Ratio: 0.8725
Mean Absolute Error (MAE): 0.1542
Max drawdown: 0.08634456532468171
Feature exposure: 0.0841855355399951, Max Feature Exposure: 0.3064697812725709, Square Sum: 2.411603982498413


In [73]:
    scores_per_era = val.groupby("era").apply(
        lambda df: spearmanr(df['prediction'], df['target']))

In [74]:
    rolling_max = (scores_per_era+1).cumprod().rolling(window=100, min_periods=1).max()

In [75]:
    daily_value = (scores_per_era+1).cumprod()
    max_drawdown = (rolling_max - daily_value).max()

In [76]:
max_drawdown

0.08634456532468171

-0.017578986239771176