Meta Learning

NumerAi

Tanmay Singh
2021569
CSAI
Class of '25

Importing the Dependencies

In [None]:
!pip install scipy
!pip install numpy
!pip install pandas
!pip install xgboost
!pip install seaborn
!pip install pyarrow
!pip install numerapi
!pip install imblearn
!pip install catboost
!pip install lightgbm
!pip install matplotlib
!pip install cloudpickle
!pip install mplcyberpunk
!pip install scikit-learn
!pip install torchsummary

In [None]:
import os
import gc
import time
import json
import pickle
import numpy as np
import pandas as pd
import seaborn as sb
import xgboost as xgb
import lightgbm as lgb
import cloudpickle as cp
import matplotlib.pyplot as plt

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
from torchsummary import summary
from torch.utils.data import DataLoader, Dataset

In [None]:
from tqdm import tqdm
from scipy import stats
from numerapi import NumerAPI
from scipy.stats import pearsonr
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
from sklearn.tree import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.decomposition import *
from sklearn.preprocessing import *
from sklearn.neural_network import *
from sklearn.model_selection import *
from sklearn.cluster._kmeans import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

Function to compute NumerAi Correlation

In [None]:
def numerai_corr(preds, target):
  ranked_preds = (preds.rank(method="average").values - 0.5) / preds.count()
  gauss_ranked_preds = stats.norm.ppf(ranked_preds)

  centered_target = target - target.mean()

  preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
  target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5

  return np.corrcoef(preds_p15, target_p15)[0, 1]

Loading the Dataset

In [8]:
NumerAi = NumerAPI()

In [9]:
all_datasets = NumerAi.list_datasets()
dataset_versions = list(set(d.split('/')[0] for d in all_datasets))
print("Available versions:\n", dataset_versions)

Available versions:
 ['v5.0']


In [10]:
DATA_VERSION = "v5.0"

current_version_files = [f for f in all_datasets if f.startswith(DATA_VERSION)]
print("availbable", DATA_VERSION, "files:\n", current_version_files)

availbable v5.0 files:
 ['v5.0/features.json', 'v5.0/live.parquet', 'v5.0/live_benchmark_models.parquet', 'v5.0/live_example_preds.csv', 'v5.0/live_example_preds.parquet', 'v5.0/meta_model.parquet', 'v5.0/train.parquet', 'v5.0/train_benchmark_models.parquet', 'v5.0/validation.parquet', 'v5.0/validation_benchmark_models.parquet', 'v5.0/validation_example_preds.csv', 'v5.0/validation_example_preds.parquet']


Creating a Feature Set

In [11]:
NumerAi.download_dataset(f"{DATA_VERSION}/features.json")

feature_metadata = json.load(open(f"{DATA_VERSION}/features.json"))
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

2024-11-04 13:40:54,977 INFO numerapi.utils: target file already exists
2024-11-04 13:40:54,978 INFO numerapi.utils: download complete


feature_sets 17
targets 37


In [14]:
feature_sets = feature_metadata["feature_sets"]
for feature_set in ["small", "medium", "all"]:
  print(feature_set, len(feature_sets[feature_set]))

small 42
medium 705
all 2376


In [15]:
feature_sets = feature_metadata["feature_sets"]
feature_sets.keys()

dict_keys(['small', 'medium', 'all', 'v2_equivalent_features', 'v3_equivalent_features', 'fncv3_features', 'intelligence', 'charisma', 'strength', 'dexterity', 'constitution', 'wisdom', 'agility', 'serenity', 'sunshine', 'rain', 'midnight'])

In [16]:
for feature_set in feature_sets:
  print(f'Feature Set: {feature_set:<25}', f'Size: {len(feature_sets[feature_set])}')

Feature Set: small                     Size: 42
Feature Set: medium                    Size: 705
Feature Set: all                       Size: 2376
Feature Set: v2_equivalent_features    Size: 304
Feature Set: v3_equivalent_features    Size: 1000
Feature Set: fncv3_features            Size: 400
Feature Set: intelligence              Size: 35
Feature Set: charisma                  Size: 290
Feature Set: strength                  Size: 135
Feature Set: dexterity                 Size: 51
Feature Set: constitution              Size: 335
Feature Set: wisdom                    Size: 140
Feature Set: agility                   Size: 145
Feature Set: serenity                  Size: 95
Feature Set: sunshine                  Size: 325
Feature Set: rain                      Size: 666
Feature Set: midnight                  Size: 244


Loading the Validation Set, with a 'medium' feature set

In [17]:
feature_set = feature_sets["medium"]

NumerAi.download_dataset(f"{DATA_VERSION}/validation.parquet")

val = pd.read_parquet(
    f"{DATA_VERSION}/validation.parquet",
    columns=["era", "target"] + feature_set
)


2024-11-04 13:40:56,649 INFO numerapi.utils: target file already exists
2024-11-04 13:40:56,650 INFO numerapi.utils: download complete


Preprocessing the Validation Set (in the same manner as the Training Set)

In [18]:
val.rename(columns=lambda x: f'feature {feature_set.index(x)}' if x in feature_set else x, inplace=True)
feature_set = val.columns.drop(["era", "target"])

In [19]:
val['era'] = val['era'].astype('int32')

In [20]:
val

Unnamed: 0_level_0,era,target,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,0.75,2,1,2,4,0,4,0,0,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0.00,0,2,2,3,2,3,0,0,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0.50,0,2,2,3,3,3,1,1,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0.00,0,1,1,4,3,4,0,4,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,0.50,4,2,0,0,3,3,3,4,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffcfe54de4ce8d6,1139,,3,1,4,2,0,4,2,1,...,4,1,0,4,0,0,4,0,4,1
nffdddf405f2c33d,1139,,0,0,3,4,4,3,1,3,...,2,3,4,1,3,4,3,4,0,3
nffe751e00183f5f,1139,,4,4,4,3,1,3,0,2,...,1,1,1,4,2,1,3,1,4,1
nffe7d4abc102a3d,1139,,0,4,2,1,3,4,1,2,...,0,3,1,2,3,2,0,4,2,4


In [21]:
unique_era = val['era'].unique()

In [22]:
val[val['era'] == unique_era[0]]

Unnamed: 0_level_0,era,target,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,0.75,2,1,2,4,0,4,0,0,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0.00,0,2,2,3,2,3,0,0,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0.50,0,2,2,3,3,3,1,1,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0.00,0,1,1,4,3,4,0,4,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,0.50,4,2,0,0,3,3,3,4,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffd46d12142d65e,575,0.25,0,2,4,4,0,0,2,0,...,1,3,3,3,0,2,1,0,0,2
nffdcdda19649863,575,0.50,4,2,0,3,3,4,0,1,...,4,1,2,3,4,1,1,4,1,4
nfff40f4a726b37b,575,0.00,2,3,3,3,1,0,2,0,...,4,2,2,4,3,2,0,0,1,2
nfff6c2150983107,575,0.50,2,1,4,1,1,4,4,0,...,2,2,3,4,2,2,0,1,2,4


In [23]:
test_set = val
test_set

Unnamed: 0_level_0,era,target,feature 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,...,feature 695,feature 696,feature 697,feature 698,feature 699,feature 700,feature 701,feature 702,feature 703,feature 704
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n000101811a8a843,575,0.75,2,1,2,4,0,4,0,0,...,3,0,0,3,4,2,0,0,1,0
n001e1318d5072ac,575,0.00,0,2,2,3,2,3,0,0,...,3,2,3,0,1,1,0,3,4,3
n002a9c5ab785cbb,575,0.50,0,2,2,3,3,3,1,1,...,2,1,2,0,2,3,3,4,2,2
n002ccf6d0e8c5ad,575,0.00,0,1,1,4,3,4,0,4,...,1,1,4,2,0,0,1,3,1,3
n0041544c345c91d,575,0.50,4,2,0,0,3,3,3,4,...,2,3,4,3,2,4,3,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffcfe54de4ce8d6,1139,,3,1,4,2,0,4,2,1,...,4,1,0,4,0,0,4,0,4,1
nffdddf405f2c33d,1139,,0,0,3,4,4,3,1,3,...,2,3,4,1,3,4,3,4,0,3
nffe751e00183f5f,1139,,4,4,4,3,1,3,0,2,...,1,1,1,4,2,1,3,1,4,1
nffe7d4abc102a3d,1139,,0,4,2,1,3,4,1,2,...,0,3,1,2,3,2,0,4,2,4


In [24]:
test_set.isna().any().any()
test_set['target'].value_counts()

target
0.50    1728946
0.25     693777
0.75     693226
1.00     171548
0.00     171188
Name: count, dtype: int64

Encoding the Numeric Values in the Target into corresponding labels (class 0 to class 4)

In [25]:
label_encoder = LabelEncoder()
label_encoder.fit(test_set['target'])
test_set['target'] = label_encoder.transform(test_set['target'])

In [26]:
test_df_x = test_set.drop(['target'], axis=1, inplace=False)
test_df_y = test_set['target']

Loading the Saved Predictions from the 4 Runs

In [27]:
with open('numerai_prediction1.pkl', 'rb') as f:
    prediction1 = pickle.load(f)

print("Prediction 1:", prediction1)

Prediction 1: [2 2 2 ... 3 3 2]


In [28]:
with open('numerai_prediction2.pkl', 'rb') as f:
    prediction2 = pickle.load(f)

print("Prediction 2:", prediction2)

Prediction 2: [2 3 3 ... 2 2 1]


In [29]:
with open('numerai_prediction3.pkl', 'rb') as f:
    prediction3 = pickle.load(f)

print("Prediction 3:", prediction3)

Prediction 3: [3 1 3 ... 3 2 2]


In [30]:
with open('numerai_prediction4.pkl', 'rb') as f:
    prediction4 = pickle.load(f)

print("Prediction 4:", prediction4)

Prediction 4: [3 2 2 ... 3 2 3]


Concatenating the Predictions

In [41]:
arr = []

In [42]:
for i in range(len(prediction1)):
    arr.append(prediction1[i])

len(arr)

1000000

In [43]:
for i in range(len(prediction2)):
    arr.append(prediction2[i])

len(arr)

2000000

In [44]:
for i in range(len(prediction3)):
    arr.append(prediction3[i])

len(arr)

3000000

In [45]:
for i in range(len(prediction4)):
    arr.append(prediction4[i])

len(arr)

3490518

Converting the Concatenated/Stacked Predictions to a Numpy Array

In [56]:
stacked_predictions = np.array(arr)

len(stacked_predictions)

3490518

Computing Relevant Evaluation Metrics

In [57]:
acc = accuracy_score(stacked_predictions, test_df_y)
print("Accuracy on Validation Set: ", acc)

Accuracy on Validation Set:  0.4377908952195634


Pearson's Correlation

In [58]:
pearson_corr, _ = stats.pearsonr(stacked_predictions, test_df_y)
print("Pearson Correlation:", pearson_corr)

Pearson Correlation: 0.013963093068806005


Computing the NumerAi's Correlation Metric

In [None]:
stacked_predictions = pd.Series(stacked_predictions)

In [60]:
actual_corr = numerai_corr(stacked_predictions, test_df_y)
actual_corr

0.013807778180309257